Merge pull request #3378 from cudawarped:replace_texture_ref_with_texture_obj

Fix CUDA texture bugs and replace all instances of CUDA texture references with texture objects
Alexander Smorkalov authored 2 years ago; committed by GitHub
commit 8db3e627fb
33 changed files (changed line counts in parentheses):
  1. modules/cudaarithm/src/cuda/lut.cu (84)
  2. modules/cudaarithm/src/lut.hpp (6)
  3. modules/cudaimgproc/src/cuda/canny.cu (217)
  4. modules/cudaimgproc/src/cuda/corners.cu (55)
  5. modules/cudaimgproc/src/cuda/debayer.cu (48)
  6. modules/cudaimgproc/src/cuda/gftt.cu (52)
  7. modules/cudaimgproc/src/cuda/hough_segments.cu (25)
  8. modules/cudaimgproc/src/cuda/mean_shift.cu (39)
  9. modules/cudaimgproc/src/gftt.cpp (24)
  10. modules/cudaimgproc/test/test_color.cpp (18)
  11. modules/cudaimgproc/test/test_hough.cpp (64)
  12. modules/cudaimgproc/test/test_precomp.hpp (2)
  13. modules/cudalegacy/include/opencv2/cudalegacy/NCV.hpp (4)
  14. modules/cudalegacy/include/opencv2/cudalegacy/NPP_staging.hpp (10)
  15. modules/cudalegacy/src/cuda/NCVBroxOpticalFlow.cu (363)
  16. modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu (352)
  17. modules/cudalegacy/src/cuda/NPP_staging.cu (428)
  18. modules/cudalegacy/src/cuda/bm.cu (24)
  19. modules/cudalegacy/test/TestHypothesesGrow.cpp (3)
  20. modules/cudaobjdetect/src/cuda/hog.cu (52)
  21. modules/cudaobjdetect/test/test_objdetect.cpp (13)
  22. modules/cudaoptflow/src/cuda/pyrlk.cu (306)
  23. modules/cudaoptflow/src/cuda/tvl1flow.cu (113)
  24. modules/cudastereo/src/cuda/stereobm.cu (39)
  25. modules/cudawarping/src/cuda/remap.cu (189)
  26. modules/cudawarping/src/cuda/resize.cu (112)
  27. modules/cudawarping/src/cuda/warp.cu (161)
  28. modules/cudawarping/test/test_precomp.hpp (2)
  29. modules/cudawarping/test/test_resize.cpp (54)
  30. modules/cudev/include/opencv2/cudev/ptr2d/texture.hpp (429)
  31. modules/cudev/include/opencv2/cudev/warp/shuffle.hpp (2)
  32. modules/xfeatures2d/src/cuda/surf.cu (140)
  33. modules/xfeatures2d/src/surf.cuda.cpp (34)

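Every hunk below applies the same pattern: a file-scope texture<...> reference (global state that had to be bound and unbound around each launch, and that could not be used safely from multiple streams or threads) is replaced by an RAII wrapper around a cudaTextureObject_t that is created per call and passed to the kernel as an ordinary argument. A minimal sketch of that pattern, assuming the cv::cudev::Texture / cv::cudev::TexturePtr interfaces this PR adds in texture.hpp; the kernel and launcher names here are illustrative, not from the PR:

#include "opencv2/core/cuda.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>

// Kernels receive the accessor by value instead of reading a global texture
// reference; TexturePtr indexes as (row, column).
__global__ void copyThroughTex(cv::cudev::TexturePtr<float> tex, cv::cuda::PtrStepSzf dst)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < dst.cols && y < dst.rows)
        dst(y, x) = tex(y, x);
}

void launchCopy(cv::cuda::PtrStepSzf src, cv::cuda::PtrStepSzf dst, cudaStream_t stream)
{
    // RAII: the constructor creates a texture object over src and the
    // destructor destroys it, so no bind/unbind bookkeeping survives the call.
    cv::cudev::Texture<float> tex(src);
    const dim3 block(32, 8);
    const dim3 grid((dst.cols + block.x - 1) / block.x, (dst.rows + block.y - 1) / block.y);
    copyThroughTex<<<grid, block, 0, stream>>>(tex, dst); // Texture converts to TexturePtr
}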
@@ -53,6 +53,7 @@
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
#include "opencv2/core/private.cuda.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
using namespace cv;
using namespace cv::cuda;
@@ -60,8 +61,6 @@ using namespace cv::cudev;
namespace cv { namespace cuda {
texture<uchar, cudaTextureType1D, cudaReadModeElementType> texLutTable;
LookUpTableImpl::LookUpTableImpl(InputArray _lut)
{
if (_lut.kind() == _InputArray::CUDA_GPU_MAT)
@@ -73,83 +72,28 @@ namespace cv { namespace cuda {
Mat h_lut = _lut.getMat();
d_lut.upload(Mat(1, 256, h_lut.type(), h_lut.data));
}
CV_Assert( d_lut.depth() == CV_8U );
CV_Assert( d_lut.rows == 1 && d_lut.cols == 256 );
cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
if (cc30)
{
// Use the texture object
cudaResourceDesc texRes;
std::memset(&texRes, 0, sizeof(texRes));
texRes.resType = cudaResourceTypeLinear;
texRes.res.linear.devPtr = d_lut.data;
texRes.res.linear.desc = cudaCreateChannelDesc<uchar>();
texRes.res.linear.sizeInBytes = 256 * d_lut.channels() * sizeof(uchar);
cudaTextureDesc texDescr;
std::memset(&texDescr, 0, sizeof(texDescr));
CV_CUDEV_SAFE_CALL( cudaCreateTextureObject(&texLutTableObj, &texRes, &texDescr, 0) );
}
else
{
// Use the texture reference
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar>();
CV_CUDEV_SAFE_CALL( cudaBindTexture(0, &texLutTable, d_lut.data, &desc) );
}
}
LookUpTableImpl::~LookUpTableImpl()
{
if (cc30)
{
// Use the texture object
cudaDestroyTextureObject(texLutTableObj);
}
else
{
// Use the texture reference
cudaUnbindTexture(texLutTable);
}
szInBytes = 256 * d_lut.channels() * sizeof(uchar);
}
struct LutTablePtrC1
{
typedef uchar value_type;
typedef uchar index_type;
cudaTextureObject_t texLutTableObj;
__device__ __forceinline__ uchar operator ()(uchar, uchar x) const
{
#if CV_CUDEV_ARCH < 300
// Use the texture reference
return tex1Dfetch(texLutTable, x);
#else
// Use the texture object
return tex1Dfetch<uchar>(texLutTableObj, x);
#endif
cv::cudev::TexturePtr<uchar> tex;
__device__ __forceinline__ uchar operator ()(uchar, uchar x) const {
return tex(x);
}
};
struct LutTablePtrC3
{
typedef uchar3 value_type;
typedef uchar3 index_type;
cudaTextureObject_t texLutTableObj;
__device__ __forceinline__ uchar3 operator ()(const uchar3&, const uchar3& x) const
{
#if CV_CUDEV_ARCH < 300
// Use the texture reference
return make_uchar3(tex1Dfetch(texLutTable, x.x * 3), tex1Dfetch(texLutTable, x.y * 3 + 1), tex1Dfetch(texLutTable, x.z * 3 + 2));
#else
// Use the texture object
return make_uchar3(tex1Dfetch<uchar>(texLutTableObj, x.x * 3), tex1Dfetch<uchar>(texLutTableObj, x.y * 3 + 1), tex1Dfetch<uchar>(texLutTableObj, x.z * 3 + 2));
#endif
cv::cudev::TexturePtr<uchar> tex;
__device__ __forceinline__ uchar3 operator ()(const uchar3&, const uchar3& x) const {
return make_uchar3(tex(x.x * 3), tex(x.y * 3 + 1), tex(x.z * 3 + 2));
}
};
@@ -169,20 +113,18 @@ namespace cv { namespace cuda {
{
GpuMat_<uchar> src1(src.reshape(1));
GpuMat_<uchar> dst1(dst.reshape(1));
cv::cudev::Texture<uchar> tex(szInBytes, reinterpret_cast<uchar*>(d_lut.data));
LutTablePtrC1 tbl;
tbl.texLutTableObj = texLutTableObj;
tbl.tex = TexturePtr<uchar>(tex);
dst1.assign(lut_(src1, tbl), stream);
}
else if (lut_cn == 3)
{
GpuMat_<uchar3>& src3 = (GpuMat_<uchar3>&) src;
GpuMat_<uchar3>& dst3 = (GpuMat_<uchar3>&) dst;
cv::cudev::Texture<uchar> tex(szInBytes, reinterpret_cast<uchar*>(d_lut.data));
LutTablePtrC3 tbl;
tbl.texLutTableObj = texLutTableObj;
tbl.tex = TexturePtr<uchar>(tex);
dst3.assign(lut_(src3, tbl), stream);
}

@@ -15,14 +15,10 @@ class LookUpTableImpl : public LookUpTable
{
public:
LookUpTableImpl(InputArray lut);
~LookUpTableImpl();
void transform(InputArray src, OutputArray dst, Stream& stream = Stream::Null()) CV_OVERRIDE;
private:
GpuMat d_lut;
cudaTextureObject_t texLutTableObj;
bool cc30;
size_t szInBytes = 0;
};
} }

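For the LUT path above, the 256-entry table is bound as linear (1D) memory through the size-in-bytes constructor seen in the hunk, and the device functor fetches with TexturePtr::operator()(index). A hedged sketch of the single-channel case; the kernel and helper names are invented for illustration:

#include "opencv2/core/cuda.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>

__global__ void lutC1(cv::cudev::TexturePtr<uchar> lut, cv::cuda::PtrStepSzb src, cv::cuda::PtrStepSzb dst)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < src.cols && y < src.rows)
        dst(y, x) = lut(src(y, x)); // tex1Dfetch through the texture object
}

void applyLutC1(const cv::cuda::GpuMat& d_lut, cv::cuda::PtrStepSzb src, cv::cuda::PtrStepSzb dst, cudaStream_t stream)
{
    // Linear-memory texture over the 256 uchar entries; it is destroyed when
    // tex leaves scope, which is what replaces the old unbind-in-destructor.
    cv::cudev::Texture<uchar> tex(256 * sizeof(uchar), d_lut.data);
    const dim3 block(32, 8);
    const dim3 grid((src.cols + block.x - 1) / block.x, (src.rows + block.y - 1) / block.y);
    lutC1<<<grid, block, 0, stream>>>(tex, src, dst);
}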
@@ -48,6 +48,7 @@
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
using namespace cv::cuda;
using namespace cv::cuda::device;
@@ -90,47 +91,8 @@ namespace cv { namespace cuda { namespace device
namespace canny
{
struct SrcTex
{
virtual ~SrcTex() {}
__host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}
__device__ __forceinline__ virtual int operator ()(int y, int x) const = 0;
int xoff;
int yoff;
};
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
struct SrcTexRef : SrcTex
{
__host__ SrcTexRef(int _xoff, int _yoff) : SrcTex(_xoff, _yoff) {}
__device__ __forceinline__ int operator ()(int y, int x) const override
{
return tex2D(tex_src, x + xoff, y + yoff);
}
};
struct SrcTexObj : SrcTex
{
__host__ SrcTexObj(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : SrcTex(_xoff, _yoff), tex_src_object(_tex_src_object) { }
__device__ __forceinline__ int operator ()(int y, int x) const override
{
return tex2D<uchar>(tex_src_object, x + xoff, y + yoff);
}
cudaTextureObject_t tex_src_object;
};
template <
class T,
class Norm,
typename = typename std::enable_if<std::is_base_of<SrcTex, T>::value>::type
>
__global__ void calcMagnitudeKernel(const T src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
template <class Norm>
__global__ void calcMagnitudeKernel(cv::cudev::TextureOffPtr<uchar> texSrc, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -138,8 +100,8 @@ namespace canny
if (y >= mag.rows || x >= mag.cols)
return;
int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
int dxVal = (texSrc(y - 1, x + 1) + 2 * texSrc(y, x + 1) + texSrc(y + 1, x + 1)) - (texSrc(y - 1, x - 1) + 2 * texSrc(y, x - 1) + texSrc(y + 1, x - 1));
int dyVal = (texSrc(y + 1, x - 1) + 2 * texSrc(y + 1, x) + texSrc(y + 1, x + 1)) - (texSrc(y - 1, x - 1) + 2 * texSrc(y - 1, x) + texSrc(y - 1, x + 1));
dx(y, x) = dxVal;
dy(y, x) = dyVal;
@@ -151,63 +113,20 @@ namespace canny
{
const dim3 block(16, 16);
const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));
bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
if (cc30)
cv::cudev::TextureOff<uchar> texSrc(srcWhole, yoff, xoff);
if (L2Grad)
{
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.addressMode[2] = cudaAddressModeClamp;
cudaTextureObject_t tex = 0;
createTextureObjectPitch2D(&tex, srcWhole, texDesc);
SrcTexObj src(xoff, yoff, tex);
if (L2Grad)
{
L2 norm;
calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
}
else
{
L1 norm;
calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
}
cudaSafeCall( cudaGetLastError() );
if (stream == NULL)
cudaSafeCall( cudaDeviceSynchronize() );
else
cudaSafeCall( cudaStreamSynchronize(stream) );
cudaSafeCall( cudaDestroyTextureObject(tex) );
L2 norm;
calcMagnitudeKernel<<<grid, block, 0, stream>>>(texSrc, dx, dy, mag, norm);
}
else
{
bindTexture(&tex_src, srcWhole);
SrcTexRef src(xoff, yoff);
if (L2Grad)
{
L2 norm;
calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
}
else
{
L1 norm;
calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
}
cudaSafeCall( cudaGetLastError() );
if (stream == NULL)
cudaSafeCall( cudaDeviceSynchronize() );
L1 norm;
calcMagnitudeKernel<<<grid, block, 0, stream>>>(texSrc, dx, dy, mag, norm);
}
if (stream == NULL)
cudaSafeCall(cudaDeviceSynchronize());
}
void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
@@ -229,8 +148,7 @@ namespace canny
namespace canny
{
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
__global__ void calcMapKernel(cv::cudev::TexturePtr<float> texMag, const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
{
const int CANNY_SHIFT = 15;
const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
@@ -245,7 +163,7 @@ namespace canny
int dyVal = dy(y, x);
const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
const float m = tex2D(tex_mag, x, y);
const float m = texMag(y, x);
dxVal = ::abs(dxVal);
dyVal = ::abs(dyVal);
@@ -264,69 +182,17 @@ namespace canny
if (dyVal < tg22x)
{
if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
if (m > texMag(y, x - 1) && m >= texMag(y, x + 1))
edge_type = 1 + (int)(m > high_thresh);
}
else if(dyVal > tg67x)
{
if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
if (m > texMag(y - 1, x) && m >= texMag(y + 1, x))
edge_type = 1 + (int)(m > high_thresh);
}
else
{
if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
edge_type = 1 + (int)(m > high_thresh);
}
}
map(y, x) = edge_type;
}
__global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh, cudaTextureObject_t tex_mag)
{
const int CANNY_SHIFT = 15;
const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
return;
int dxVal = dx(y, x);
int dyVal = dy(y, x);
const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
const float m = tex2D<float>(tex_mag, x, y);
dxVal = ::abs(dxVal);
dyVal = ::abs(dyVal);
// 0 - the pixel can not belong to an edge
// 1 - the pixel might belong to an edge
// 2 - the pixel does belong to an edge
int edge_type = 0;
if (m > low_thresh)
{
const int tg22x = dxVal * TG22;
const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);
dyVal <<= CANNY_SHIFT;
if (dyVal < tg22x)
{
if (m > tex2D<float>(tex_mag, x - 1, y) && m >= tex2D<float>(tex_mag, x + 1, y))
edge_type = 1 + (int)(m > high_thresh);
}
else if(dyVal > tg67x)
{
if (m > tex2D<float>(tex_mag, x, y - 1) && m >= tex2D<float>(tex_mag, x, y + 1))
edge_type = 1 + (int)(m > high_thresh);
}
else
{
if (m > tex2D<float>(tex_mag, x - s, y - 1) && m >= tex2D<float>(tex_mag, x + s, y + 1))
if (m > texMag(y - 1, x - s) && m >= texMag(y + 1, x + s))
edge_type = 1 + (int)(m > high_thresh);
}
}
@@ -338,47 +204,10 @@ namespace canny
{
const dim3 block(16, 16);
const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));
if (deviceSupports(FEATURE_SET_COMPUTE_30))
{
// Use the texture object
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = mag.ptr();
resDesc.res.pitch2D.height = mag.rows;
resDesc.res.pitch2D.width = mag.cols;
resDesc.res.pitch2D.pitchInBytes = mag.step;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.addressMode[2] = cudaAddressModeClamp;
cudaTextureObject_t tex=0;
cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh, tex);
cudaSafeCall( cudaGetLastError() );
if (stream == NULL)
cudaSafeCall( cudaDeviceSynchronize() );
else
cudaSafeCall( cudaStreamSynchronize(stream) );
cudaSafeCall( cudaDestroyTextureObject(tex) );
}
else
{
// Use the texture reference
bindTexture(&tex_mag, mag);
calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh);
cudaSafeCall( cudaGetLastError() );
if (stream == NULL)
cudaSafeCall( cudaDeviceSynchronize() );
}
cv::cudev::Texture<float> texMag(mag);
calcMapKernel<<<grid, block, 0, stream>>>(texMag, dx, dy, map, low_thresh, high_thresh);
if (stream == NULL)
cudaSafeCall( cudaDeviceSynchronize() );
}
}

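The canny hunks also delete the virtual SrcTex / SrcTexRef / SrcTexObj hierarchy: once every kernel takes the accessor as a value parameter, the compute-capability dispatch and the CV_CUDEV_ARCH branches become unnecessary. A sketch of the offset-texture access used by calcMagnitude, assuming the TextureOff / TextureOffPtr types from this PR, with the L2 norm inlined for brevity:

#include "opencv2/core/cuda.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>

__global__ void sobelMagL2(cv::cudev::TextureOffPtr<uchar> src, cv::cuda::PtrStepSzf mag)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= mag.cols || y >= mag.rows)
        return;
    // Border reads are clamped by the texture unit, so the 3x3 window needs
    // no explicit bounds checks.
    const int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1))
                    - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
    const int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1))
                    - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));
    mag(y, x) = sqrtf((float)(dxVal * dxVal + dyVal * dyVal));
}

void launchSobelMagL2(cv::cuda::PtrStepSzb srcWhole, int yoff, int xoff, cv::cuda::PtrStepSzf mag, cudaStream_t stream)
{
    // TextureOff folds the ROI origin into every (y, x) access.
    cv::cudev::TextureOff<uchar> tex(srcWhole, yoff, xoff);
    const dim3 block(16, 16);
    const dim3 grid((mag.cols + block.x - 1) / block.x, (mag.rows + block.y - 1) / block.y);
    sobelMagL2<<<grid, block, 0, stream>>>(tex, mag);
}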
@@ -47,6 +47,7 @@
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
#include "opencv2/opencv_modules.hpp"
@@ -58,10 +59,7 @@ namespace cv { namespace cuda { namespace device
{
/////////////////////////////////////////// Corner Harris /////////////////////////////////////////////////
texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> harrisDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst)
__global__ void cornerHarris_kernel(cv::cudev::TexturePtr<float> texDx, cv::cudev::TexturePtr<float> texDy, const int block_size, const float k, PtrStepSzf dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -81,8 +79,8 @@ namespace cv { namespace cuda { namespace device
{
for (int j = jbegin; j < jend; ++j)
{
float dx = tex2D(harrisDxTex, j, i);
float dy = tex2D(harrisDyTex, j, i);
float dx = texDx(i, j);
float dy = texDy(i, j);
a += dx * dx;
b += dx * dy;
@@ -95,7 +93,7 @@ namespace cv { namespace cuda { namespace device
}
template <typename BR, typename BC>
__global__ void cornerHarris_kernel(const int block_size, const float k, PtrStepSzf dst, const BR border_row, const BC border_col)
__global__ void cornerHarris_kernel(cv::cudev::TexturePtr<float> texDx, cv::cudev::TexturePtr<float> texDy, const int block_size, const float k, PtrStepSzf dst, const BR border_row, const BC border_col)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -119,8 +117,8 @@ namespace cv { namespace cuda { namespace device
{
const int x = border_row.idx_col(j);
float dx = tex2D(harrisDxTex, x, y);
float dy = tex2D(harrisDyTex, x, y);
float dx = texDx(y, x);
float dy = texDy(y, x);
a += dx * dx;
b += dx * dy;
@@ -136,22 +134,20 @@ namespace cv { namespace cuda { namespace device
{
dim3 block(32, 8);
dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
bindTexture(&harrisDxTex, Dx);
bindTexture(&harrisDyTex, Dy);
cv::cudev::Texture<float> texDx(Dx);
cv::cudev::Texture<float> texDy(Dy);
switch (border_type)
{
case BORDER_REFLECT101:
cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
cornerHarris_kernel<<<grid, block, 0, stream>>>(texDx, texDy, block_size, k, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
break;
case BORDER_REFLECT:
cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
cornerHarris_kernel<<<grid, block, 0, stream>>>(texDx, texDy, block_size, k, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
break;
case BORDER_REPLICATE:
cornerHarris_kernel<<<grid, block, 0, stream>>>(block_size, k, dst);
cornerHarris_kernel<<<grid, block, 0, stream>>>(texDx, texDy, block_size, k, dst);
break;
}
@@ -163,10 +159,7 @@ namespace cv { namespace cuda { namespace device
/////////////////////////////////////////// Corner Min Eigen Val /////////////////////////////////////////////////
texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDxTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> minEigenValDyTex(0, cudaFilterModePoint, cudaAddressModeClamp);
__global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst)
__global__ void cornerMinEigenVal_kernel(cv::cudev::TexturePtr<float> texMinEigenValDx, cv::cudev::TexturePtr<float> texMinEigenValDy, const int block_size, PtrStepSzf dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -186,8 +179,8 @@ namespace cv { namespace cuda { namespace device
{
for (int j = jbegin; j < jend; ++j)
{
float dx = tex2D(minEigenValDxTex, j, i);
float dy = tex2D(minEigenValDyTex, j, i);
float dx = texMinEigenValDx(i, j);
float dy = texMinEigenValDy(i, j);
a += dx * dx;
b += dx * dy;
@@ -204,7 +197,7 @@ namespace cv { namespace cuda { namespace device
template <typename BR, typename BC>
__global__ void cornerMinEigenVal_kernel(const int block_size, PtrStepSzf dst, const BR border_row, const BC border_col)
__global__ void cornerMinEigenVal_kernel(cv::cudev::TexturePtr<float> texMinEigenValDx, cv::cudev::TexturePtr<float> texMinEigenValDy, const int block_size, PtrStepSzf dst, const BR border_row, const BC border_col)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -228,8 +221,8 @@ namespace cv { namespace cuda { namespace device
{
int x = border_row.idx_col(j);
float dx = tex2D(minEigenValDxTex, x, y);
float dy = tex2D(minEigenValDyTex, x, y);
float dx = texMinEigenValDx(y, x);
float dy = texMinEigenValDy(y, x);
a += dx * dx;
b += dx * dy;
@@ -248,22 +241,20 @@ namespace cv { namespace cuda { namespace device
{
dim3 block(32, 8);
dim3 grid(divUp(Dx.cols, block.x), divUp(Dx.rows, block.y));
bindTexture(&minEigenValDxTex, Dx);
bindTexture(&minEigenValDyTex, Dy);
cv::cudev::Texture<float> texMinEigenValDx(Dx);
cv::cudev::Texture<float> texMinEigenValDy(Dy);
switch (border_type)
{
case BORDER_REFLECT101:
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(texMinEigenValDx, texMinEigenValDy, block_size, dst, BrdRowReflect101<void>(Dx.cols), BrdColReflect101<void>(Dx.rows));
break;
case BORDER_REFLECT:
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(texMinEigenValDx, texMinEigenValDy, block_size, dst, BrdRowReflect<void>(Dx.cols), BrdColReflect<void>(Dx.rows));
break;
case BORDER_REPLICATE:
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(block_size, dst);
cornerMinEigenVal_kernel<<<grid, block, 0, stream>>>(texMinEigenValDx, texMinEigenValDy, block_size, dst);
break;
}

@@ -48,6 +48,7 @@
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/color.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/cudev/ptr2d/texture.hpp"
namespace cv { namespace cuda { namespace device
{
@@ -389,10 +390,8 @@ namespace cv { namespace cuda { namespace device
//
// ported to CUDA
texture<uchar, cudaTextureType2D, cudaReadModeElementType> sourceTex(false, cudaFilterModePoint, cudaAddressModeClamp);
template <typename DstType>
__global__ void MHCdemosaic(PtrStepSz<DstType> dst, const int2 sourceOffset, const int2 firstRed)
template <typename DstType, class Ptr2D>
__global__ void MHCdemosaic(PtrStepSz<DstType> dst, Ptr2D src, const int2 firstRed)
{
const float kAx = -1.0f / 8.0f, kAy = -1.5f / 8.0f, kAz = 0.5f / 8.0f /*kAw = -1.0f / 8.0f*/;
const float kBx = 2.0f / 8.0f, /*kBy = 0.0f / 8.0f,*/ /*kBz = 0.0f / 8.0f,*/ kBw = 4.0f / 8.0f ;
@@ -408,8 +407,8 @@ namespace cv { namespace cuda { namespace device
return;
int2 center;
center.x = x + sourceOffset.x;
center.y = y + sourceOffset.y;
center.x = x;
center.y = y;
int4 xCoord;
xCoord.x = center.x - 2;
@@ -423,25 +422,26 @@ namespace cv { namespace cuda { namespace device
yCoord.z = center.y + 1;
yCoord.w = center.y + 2;
float C = tex2D(sourceTex, center.x, center.y); // ( 0, 0)
float C = src(center.y, center.x); // ( 0, 0)
float4 Dvec;
Dvec.x = tex2D(sourceTex, xCoord.y, yCoord.y); // (-1,-1)
Dvec.y = tex2D(sourceTex, xCoord.y, yCoord.z); // (-1, 1)
Dvec.z = tex2D(sourceTex, xCoord.z, yCoord.y); // ( 1,-1)
Dvec.w = tex2D(sourceTex, xCoord.z, yCoord.z); // ( 1, 1)
Dvec.x = src(yCoord.y, xCoord.y); // (-1,-1)
Dvec.y = src(yCoord.z, xCoord.y); // (-1, 1)
Dvec.z = src(yCoord.y, xCoord.z); // ( 1,-1)
Dvec.w = src(yCoord.z, xCoord.z); // ( 1, 1)
float4 value;
value.x = tex2D(sourceTex, center.x, yCoord.x); // ( 0,-2) A0
value.y = tex2D(sourceTex, center.x, yCoord.y); // ( 0,-1) B0
value.z = tex2D(sourceTex, xCoord.x, center.y); // (-2, 0) E0
value.w = tex2D(sourceTex, xCoord.y, center.y); // (-1, 0) F0
value.x = src(yCoord.x, center.x); // ( 0,-2) A0
value.y = src(yCoord.y, center.x); // ( 0,-1) B0
value.z = src(center.y, xCoord.x); // (-2, 0) E0
value.w = src(center.y, xCoord.y); // (-1, 0) F0
// (A0 + A1), (B0 + B1), (E0 + E1), (F0 + F1)
value.x += tex2D(sourceTex, center.x, yCoord.w); // ( 0, 2) A1
value.y += tex2D(sourceTex, center.x, yCoord.z); // ( 0, 1) B1
value.z += tex2D(sourceTex, xCoord.w, center.y); // ( 2, 0) E1
value.w += tex2D(sourceTex, xCoord.z, center.y); // ( 1, 0) F1
value.x += src(yCoord.w, center.x); // ( 0, 2) A1
value.y += src(yCoord.z, center.x); // ( 0, 1) B1
value.z += src(center.y, xCoord.w); // ( 2, 0) E1
value.w += src(center.y, xCoord.z); // ( 1, 0) F1
float4 PATTERN;
PATTERN.x = kCx * C;
@@ -527,9 +527,15 @@ namespace cv { namespace cuda { namespace device
const dim3 block(32, 8);
const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
bindTexture(&sourceTex, src);
if (sourceOffset.x || sourceOffset.y) {
cv::cudev::TextureOff<uchar> texSrc(src, sourceOffset.y, sourceOffset.x);
MHCdemosaic<dst_t, cv::cudev::TextureOffPtr<uchar>><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, texSrc, firstRed);
}
else {
cv::cudev::Texture<uchar> texSrc(src);
MHCdemosaic<dst_t, cv::cudev::TexturePtr<uchar>><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, texSrc, firstRed);
}
MHCdemosaic<dst_t><<<grid, block, 0, stream>>>((PtrStepSz<dst_t>)dst, sourceOffset, firstRed);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)

@@ -45,36 +45,36 @@
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
#include <thrust/execution_policy.h>
namespace cv { namespace cuda { namespace device
{
namespace gfft
{
template <class Mask> __global__ void findCorners(float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols, cudaTextureObject_t eigTex, int *g_counter)
template <class Mask> __global__ void findCorners(cv::cudev::TexturePtr<float> tex, float threshold, const Mask mask, float2* corners, int max_count, int rows, int cols, int *g_counter)
{
const int j = blockIdx.x * blockDim.x + threadIdx.x;
const int i = blockIdx.y * blockDim.y + threadIdx.y;
if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1 && mask(i, j))
{
float val = tex2D<float>(eigTex, j, i);
float val = tex(i, j);
if (val > threshold)
{
float maxVal = val;
maxVal = ::fmax(tex2D<float>(eigTex, j - 1, i - 1), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j , i - 1), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j + 1, i - 1), maxVal);
maxVal = ::fmax(tex(i - 1, j - 1), maxVal);
maxVal = ::fmax(tex(i - 1, j), maxVal);
maxVal = ::fmax(tex(i - 1, j + 1), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j - 1, i), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j + 1, i), maxVal);
maxVal = ::fmax(tex(i, j - 1), maxVal);
maxVal = ::fmax(tex(i, j + 1), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j - 1, i + 1), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j , i + 1), maxVal);
maxVal = ::fmax(tex2D<float>(eigTex, j + 1, i + 1), maxVal);
maxVal = ::fmax(tex(i + 1, j - 1), maxVal);
maxVal = ::fmax(tex(i + 1, j), maxVal);
maxVal = ::fmax(tex(i + 1, j + 1), maxVal);
if (val == maxVal)
{
@@ -87,17 +87,18 @@ namespace cv { namespace cuda { namespace device
}
}
int findCorners_gpu(const cudaTextureObject_t &eigTex, const int &rows, const int &cols, float threshold, PtrStepSzb mask, float2* corners, int max_count, int* counterPtr, cudaStream_t stream)
int findCorners_gpu(const PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, int* counterPtr, cudaStream_t stream)
{
cudaSafeCall( cudaMemsetAsync(counterPtr, 0, sizeof(int), stream) );
cv::cudev::Texture<float> tex(eig);
dim3 block(16, 16);
dim3 grid(divUp(cols, block.x), divUp(rows, block.y));
dim3 grid(divUp(eig.cols, block.x), divUp(eig.rows, block.y));
if (mask.data)
findCorners<<<grid, block, 0, stream>>>(threshold, SingleMask(mask), corners, max_count, rows, cols, eigTex, counterPtr);
findCorners<<<grid, block, 0, stream>>>(tex, threshold, SingleMask(mask), corners, max_count, eig.rows, eig.cols, counterPtr);
else
findCorners<<<grid, block, 0, stream>>>(threshold, WithOutMask(), corners, max_count, rows, cols, eigTex, counterPtr);
findCorners<<<grid, block, 0, stream>>>(tex, threshold, WithOutMask(), corners, max_count, eig.rows, eig.cols, counterPtr);
cudaSafeCall( cudaGetLastError() );
@@ -113,27 +114,24 @@ namespace cv { namespace cuda { namespace device
class EigGreater
{
public:
EigGreater(const cudaTextureObject_t &eigTex_) : eigTex(eigTex_)
{
}
__device__ __forceinline__ bool operator()(float2 a, float2 b) const
{
return tex2D<float>(eigTex, a.x, a.y) > tex2D<float>(eigTex, b.x, b.y);
EigGreater(cv::cudev::TexturePtr<float> tex_) : tex(tex_) {}
__device__ __forceinline__ bool operator()(float2 a, float2 b) const{
return tex(a.y, a.x) > tex(b.y, b.x);
}
cudaTextureObject_t eigTex;
cv::cudev::TexturePtr<float> tex;
};
void sortCorners_gpu(const cudaTextureObject_t &eigTex, float2* corners, int count, cudaStream_t stream)
void sortCorners_gpu(const PtrStepSzf eig, float2* corners, int count, cudaStream_t stream)
{
cv::cudev::Texture<float> tex(eig);
thrust::device_ptr<float2> ptr(corners);
#if THRUST_VERSION >= 100802
if (stream)
thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), ptr, ptr + count, EigGreater(eigTex));
thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), ptr, ptr + count, EigGreater(tex));
else
thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()), ptr, ptr + count, EigGreater(eigTex));
thrust::sort(thrust::cuda::par(ThrustAllocator::getAllocator()), ptr, ptr + count, EigGreater(tex));
#else
thrust::sort(ptr, ptr + count, EigGreater(eigTex));
thrust::sort(ptr, ptr + count, EigGreater(tex));
#endif
}
} // namespace optical_flow

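Texture objects also compose with Thrust: TexturePtr is a plain copyable value, so it can live inside a device comparator, something the old global texture references could not do safely. A hedged sketch of the sort performed above (the functor and helper names are illustrative, not the PR's):

#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include "opencv2/core/cuda.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>

struct ScoreGreater
{
    cv::cudev::TexturePtr<float> eig; // copied by value into device code
    __device__ __forceinline__ bool operator()(float2 a, float2 b) const
    {
        return eig(a.y, a.x) > eig(b.y, b.x); // compare eigenvalue scores
    }
};

void sortCornersByScore(cv::cuda::PtrStepSzf eig, float2* corners, int count, cudaStream_t stream)
{
    cv::cudev::Texture<float> tex(eig); // must outlive the sort below
    thrust::device_ptr<float2> ptr(corners);
    thrust::sort(thrust::cuda::par.on(stream), ptr, ptr + count, ScoreGreater{tex}); // Texture -> TexturePtr
}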
@@ -50,7 +50,8 @@ namespace cv { namespace cuda { namespace device
{
namespace hough_segments
{
__global__ void houghLinesProbabilistic(cv::cudev::Texture<uchar> src, const PtrStepSzi accum,
template<class Ptr2D>
__global__ void houghLinesProbabilistic(Ptr2D src, const PtrStepSzi accum,
int4* out, const int maxSize,
const float rho, const float theta,
const int lineGap, const int lineLength,
@@ -219,15 +220,18 @@ namespace cv { namespace cuda { namespace device
const dim3 block(32, 8);
const dim3 grid(divUp(accum.cols - 2, block.x), divUp(accum.rows - 2, block.y));
cv::cudev::GpuMat_<uchar> src_(mask);
cv::cudev::Texture<uchar> tex(src_, false, cudaFilterModePoint, cudaAddressModeClamp);
houghLinesProbabilistic<<<grid, block, 0, stream>>>(tex, accum,
out, maxSize,
rho, theta,
lineGap, lineLength,
mask.rows, mask.cols,
counterPtr);
Size wholeSize;
Point ofs;
mask.locateROI(wholeSize, ofs);
if (ofs.x || ofs.y) {
cv::cudev::TextureOff<uchar> texMask(wholeSize.height, wholeSize.width, mask.datastart, mask.step, ofs.y, ofs.x);
houghLinesProbabilistic<cv::cudev::TextureOffPtr<uchar>><<<grid, block, 0, stream>>>(texMask, accum, out, maxSize, rho, theta, lineGap, lineLength, mask.rows, mask.cols, counterPtr);
}
else {
cv::cudev::Texture<uchar> texMask(mask);
houghLinesProbabilistic<cv::cudev::TexturePtr<uchar>><<<grid, block, 0, stream>>>(texMask, accum, out, maxSize, rho, theta, lineGap, lineLength, mask.rows, mask.cols, counterPtr);
}
cudaSafeCall( cudaGetLastError() );
int totalCount;
@@ -236,7 +240,6 @@ namespace cv { namespace cuda { namespace device
cudaSafeCall( cudaStreamSynchronize(stream) );
totalCount = ::min(totalCount, maxSize);
return totalCount;
}
}

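The mask handling above is the general recipe for texturing a submatrix: locateROI recovers the parent allocation and the view's origin, the texture is created over the whole parent buffer, and the origin becomes a TextureOff offset. Condensed into a hedged sketch (kernel launches elided; the dispatch mirrors the hunk above):

#include <opencv2/cudev/ptr2d/texture.hpp>

void textureOverRoi(const cv::cuda::GpuMat& mask)
{
    cv::Size wholeSize;
    cv::Point ofs;
    mask.locateROI(wholeSize, ofs); // parent extent and this view's origin
    if (ofs.x || ofs.y)
    {
        // Bind the full parent allocation; every (y, x) read is shifted by the
        // ROI origin, so kernels keep using view-relative coordinates.
        cv::cudev::TextureOff<uchar> tex(wholeSize.height, wholeSize.width,
                                         mask.datastart, mask.step, ofs.y, ofs.x);
        // ... launch a kernel taking cv::cudev::TextureOffPtr<uchar>
    }
    else
    {
        cv::cudev::Texture<uchar> tex(mask); // contiguous view: bind directly
        // ... launch a kernel taking cv::cudev::TexturePtr<uchar>
    }
}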
@@ -47,19 +47,16 @@
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
namespace cv { namespace cuda { namespace device
{
namespace imgproc
{
texture<uchar4, 2> tex_meanshift;
__device__ short2 do_mean_shift(int x0, int y0, unsigned char* out,
size_t out_step, int cols, int rows,
int sp, int sr, int maxIter, float eps)
__device__ short2 do_mean_shift(cv::cudev::TexturePtr<uchar4> tex, int x0, int y0, unsigned char* out,size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps)
{
int isr2 = sr*sr;
uchar4 c = tex2D(tex_meanshift, x0, y0 );
uchar4 c = tex(y0, x0);
// iterate meanshift procedure
for( int iter = 0; iter < maxIter; iter++ )
@@ -79,7 +76,7 @@ namespace cv { namespace cuda { namespace device
int rowCount = 0;
for( int x = minx; x <= maxx; x++ )
{
uchar4 t = tex2D( tex_meanshift, x, y );
uchar4 t = tex(y, x);
int norm2 = (t.x - c.x) * (t.x - c.x) + (t.y - c.y) * (t.y - c.y) + (t.z - c.z) * (t.z - c.z);
if( norm2 <= isr2 )
@@ -119,13 +116,13 @@ namespace cv { namespace cuda { namespace device
return make_short2((short)x0, (short)y0);
}
__global__ void meanshift_kernel(unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )
__global__ void meanshift_kernel(cv::cudev::TexturePtr<uchar4> tex, unsigned char* out, size_t out_step, int cols, int rows, int sp, int sr, int maxIter, float eps )
{
int x0 = blockIdx.x * blockDim.x + threadIdx.x;
int y0 = blockIdx.y * blockDim.y + threadIdx.y;
if( x0 < cols && y0 < rows )
do_mean_shift(x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
do_mean_shift(tex, x0, y0, out, out_step, cols, rows, sp, sr, maxIter, eps);
}
void meanShiftFiltering_gpu(const PtrStepSzb& src, PtrStepSzb dst, int sp, int sr, int maxIter, float eps, cudaStream_t stream)
@@ -134,21 +131,15 @@ namespace cv { namespace cuda { namespace device
dim3 threads(32, 8, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
meanshift_kernel<<< grid, threads, 0, stream >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
cv::cudev::Texture<uchar4> tex(src.rows, src.cols, (uchar4*)src.data, src.step);
meanshift_kernel<<< grid, threads, 0, stream >>>( tex, dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void meanshiftproc_kernel(unsigned char* outr, size_t outrstep,
unsigned char* outsp, size_t outspstep,
int cols, int rows,
int sp, int sr, int maxIter, float eps)
__global__ void meanshiftproc_kernel(cv::cudev::TexturePtr<uchar4> tex, unsigned char* outr, size_t outrstep, unsigned char* outsp, size_t outspstep,
int cols, int rows,int sp, int sr, int maxIter, float eps)
{
int x0 = blockIdx.x * blockDim.x + threadIdx.x;
int y0 = blockIdx.y * blockDim.y + threadIdx.y;
@@ -156,7 +147,7 @@ namespace cv { namespace cuda { namespace device
if( x0 < cols && y0 < rows )
{
int basesp = (blockIdx.y * blockDim.y + threadIdx.y) * outspstep + (blockIdx.x * blockDim.x + threadIdx.x) * 2 * sizeof(short);
*(short2*)(outsp + basesp) = do_mean_shift(x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
*(short2*)(outsp + basesp) = do_mean_shift(tex, x0, y0, outr, outrstep, cols, rows, sp, sr, maxIter, eps);
}
}
@@ -166,13 +157,9 @@ namespace cv { namespace cuda { namespace device
dim3 threads(32, 8, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
meanshiftproc_kernel<<< grid, threads, 0, stream >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
cv::cudev::Texture<uchar4> tex(src.rows, src.cols, (uchar4*)src.data, src.step);
meanshiftproc_kernel<<< grid, threads, 0, stream >>>( tex, dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

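mean_shift.cu shows two remaining conveniences: a Texture can be built from raw (rows, cols, pointer, step) arguments when only a PtrStepSz is at hand, and the resulting TexturePtr can be forwarded through ordinary __device__ helpers such as do_mean_shift instead of each helper reading a global reference. A compressed sketch under the same assumed interfaces; names are illustrative:

#include "opencv2/core/cuda.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>

__device__ uchar4 sampleAt(cv::cudev::TexturePtr<uchar4> tex, int x, int y)
{
    return tex(y, x); // device helpers take the accessor as a parameter now
}

__global__ void passThrough(cv::cudev::TexturePtr<uchar4> tex, cv::cuda::PtrStepSz<uchar4> dst)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < dst.cols && y < dst.rows)
        dst(y, x) = sampleAt(tex, x, y);
}

void launchPassThrough(const cv::cuda::PtrStepSzb& src, cv::cuda::PtrStepSz<uchar4> dst, cudaStream_t stream)
{
    // Raw-parameter constructor: rows, cols, device pointer, step in bytes.
    cv::cudev::Texture<uchar4> tex(src.rows, src.cols, (uchar4*)src.data, src.step);
    const dim3 block(32, 8);
    const dim3 grid((dst.cols + block.x - 1) / block.x, (dst.rows + block.y - 1) / block.y);
    passThrough<<<grid, block, 0, stream>>>(tex, dst);
}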
@@ -55,8 +55,8 @@ namespace cv { namespace cuda { namespace device
{
namespace gfft
{
int findCorners_gpu(const cudaTextureObject_t &eigTex_, const int &rows, const int &cols, float threshold, PtrStepSzb mask, float2* corners, int max_count, int* counterPtr, cudaStream_t stream);
void sortCorners_gpu(const cudaTextureObject_t &eigTex_, float2* corners, int count, cudaStream_t stream);
int findCorners_gpu(const PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count, int* counterPtr, cudaStream_t stream);
void sortCorners_gpu(const PtrStepSzf eig, float2* corners, int count, cudaStream_t stream);
}
}}}
@@ -120,31 +120,15 @@ namespace
cudaStream_t stream_ = StreamAccessor::getStream(stream);
ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
//create texture object for findCorners_gpu and sortCorners_gpu
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.readMode = cudaReadModeElementType;
texDesc.filterMode = cudaFilterModePoint;
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.addressMode[2] = cudaAddressModeClamp;
cudaTextureObject_t eigTex_;
PtrStepSzf eig = eig_;
cv::cuda::device::createTextureObjectPitch2D<float>(&eigTex_, eig, texDesc);
int total = findCorners_gpu(eigTex_, eig_.rows, eig_.cols, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols, counterPtr_, stream_);
int total = findCorners_gpu(eig_, static_cast<float>(maxVal * qualityLevel_), mask, tmpCorners_.ptr<float2>(), tmpCorners_.cols, counterPtr_, stream_);
if (total == 0)
{
_corners.release();
cudaSafeCall( cudaDestroyTextureObject(eigTex_) );
return;
}
sortCorners_gpu(eigTex_, tmpCorners_.ptr<float2>(), total, stream_);
cudaSafeCall( cudaDestroyTextureObject(eigTex_) );
sortCorners_gpu(eig_, tmpCorners_.ptr<float2>(), total, stream_);
if (minDistance_ < 1)
{

@@ -2294,14 +2294,15 @@ INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CvtColor, testing::Combine(
///////////////////////////////////////////////////////////////////////////////////////////////////////
// Demosaicing
struct Demosaicing : testing::TestWithParam<cv::cuda::DeviceInfo>
struct Demosaicing : testing::TestWithParam<testing::tuple<cv::cuda::DeviceInfo, bool>>
{
cv::cuda::DeviceInfo devInfo;
bool useRoi;
virtual void SetUp()
{
devInfo = GetParam();
devInfo = GET_PARAM(0);
useRoi = GET_PARAM(1);
cv::cuda::setDevice(devInfo.deviceID());
}
@@ -2419,7 +2420,7 @@ CUDA_TEST_P(Demosaicing, BayerBG2BGR_MHT)
mosaic(img, src, cv::Point(1, 1));
cv::cuda::GpuMat dst;
cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerBG2BGR_MHT);
cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerBG2BGR_MHT);
EXPECT_MAT_SIMILAR(img, dst, 5e-3);
}
@@ -2433,7 +2434,7 @@ CUDA_TEST_P(Demosaicing, BayerGB2BGR_MHT)
mosaic(img, src, cv::Point(0, 1));
cv::cuda::GpuMat dst;
cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerGB2BGR_MHT);
cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerGB2BGR_MHT);
EXPECT_MAT_SIMILAR(img, dst, 5e-3);
}
@@ -2447,7 +2448,7 @@ CUDA_TEST_P(Demosaicing, BayerRG2BGR_MHT)
mosaic(img, src, cv::Point(0, 0));
cv::cuda::GpuMat dst;
cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerRG2BGR_MHT);
cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerRG2BGR_MHT);
EXPECT_MAT_SIMILAR(img, dst, 5e-3);
}
@@ -2461,12 +2462,11 @@ CUDA_TEST_P(Demosaicing, BayerGR2BGR_MHT)
mosaic(img, src, cv::Point(1, 0));
cv::cuda::GpuMat dst;
cv::cuda::demosaicing(loadMat(src), dst, cv::cuda::COLOR_BayerGR2BGR_MHT);
cv::cuda::demosaicing(loadMat(src, useRoi), dst, cv::cuda::COLOR_BayerGR2BGR_MHT);
EXPECT_MAT_SIMILAR(img, dst, 5e-3);
}
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, Demosaicing, ALL_DEVICES);
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, Demosaicing, testing::Combine(ALL_DEVICES, WHOLE_SUBMAT));
///////////////////////////////////////////////////////////////////////////////////////////////////////
// swapChannels

@@ -115,8 +115,20 @@ INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, HoughLines, testing::Combine(
///////////////////////////////////////////////////////////////////////////////////////////////////////
// HoughLines Probabilistic
PARAM_TEST_CASE(HoughLinesProbabilistic, cv::cuda::DeviceInfo, cv::Size, UseRoi)
PARAM_TEST_CASE(HoughLinesProbabilistic, DeviceInfo, Size, UseRoi)
{
cv::cuda::DeviceInfo devInfo;
bool useRoi;
Size size;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
useRoi = GET_PARAM(2);
cv::cuda::setDevice(devInfo.deviceID());
}
static void generateLines(cv::Mat& img)
{
img.setTo(cv::Scalar::all(0));
@@ -140,11 +152,6 @@ PARAM_TEST_CASE(HoughLinesProbabilistic, cv::cuda::DeviceInfo, cv::Size, UseRoi)
CUDA_TEST_P(HoughLinesProbabilistic, Accuracy)
{
const cv::cuda::DeviceInfo devInfo = GET_PARAM(0);
cv::cuda::setDevice(devInfo.deviceID());
const cv::Size size = GET_PARAM(1);
const bool useRoi = GET_PARAM(2);
const float rho = 1.0f;
const float theta = (float) (1.0 * CV_PI / 180.0);
const int minLineLength = 15;
@@ -169,12 +176,55 @@ CUDA_TEST_P(HoughLinesProbabilistic, Accuracy)
}
void HoughLinesProbabilisticThread(const Ptr<HoughSegmentDetector> detector, const GpuMat& imgIn, const std::vector<GpuMat>& linesOut, Stream& stream) {
for (auto& lines : linesOut)
detector->detect(imgIn, lines, stream);
stream.waitForCompletion();
}
CUDA_TEST_P(HoughLinesProbabilistic, Async)
{
constexpr int nThreads = 5;
constexpr int nIters = 5;
vector<Stream> streams(nThreads); // async test only
vector<GpuMat> imgsIn;
vector<Ptr<HoughSegmentDetector>> detectors;
vector<vector<GpuMat>> linesOut(nThreads);
const float rho = 1.0f;
const float theta = (float)(1.0 * CV_PI / 180.0);
const int minLineLength = 15;
const int maxLineGap = 8;
cv::Mat src(size, CV_8UC1);
generateLines(src);
for (int i = 0; i < nThreads; i++) {
imgsIn.push_back(loadMat(src, useRoi));
detectors.push_back(createHoughSegmentDetector(rho, theta, minLineLength, maxLineGap));
linesOut.push_back(vector<GpuMat>(nIters));
}
vector<std::thread> thread(nThreads);
for (int i = 0; i < nThreads; i++) thread.at(i) = std::thread(HoughLinesProbabilisticThread, detectors.at(i), std::ref(imgsIn.at(i)), std::ref(linesOut.at(i)), std::ref(streams.at(i)));
for (int i = 0; i < nThreads; i++) thread.at(i).join();
for (int i = 0; i < nThreads; i++) {
std::vector<cv::Vec4i> linesSegment;
std::vector<cv::Vec2f> lines;
for (const auto& line : linesOut.at(i)) {
line.download(linesSegment);
cv::Mat dst(size, CV_8UC1);
drawLines(dst, linesSegment);
ASSERT_MAT_NEAR(src, dst, 0.0);
}
}
}
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, HoughLinesProbabilistic, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
WHOLE_SUBMAT));
///////////////////////////////////////////////////////////////////////////////////////////////////////
// HoughCircles

@@ -49,4 +49,6 @@
#include "cvconfig.h"
#include <thread>
#endif

@@ -119,9 +119,9 @@ typedef bool NcvBool;
typedef long long Ncv64s;
#if defined(__APPLE__) && !defined(__CUDACC__)
typedef uint64_t Ncv64u;
typedef uint64 Ncv64u;
#else
typedef unsigned long long Ncv64u;
typedef uint64 Ncv64u;
#endif
typedef int Ncv32s;

@@ -174,7 +174,7 @@ NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState);
* \return NCV status code
*/
CV_EXPORTS
NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
NCVStatus nppiStFilterRowBorder_32f_C1R(Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
Ncv32f *pDst,
@@ -182,7 +182,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
Ncv32u nDstStep,
NcvRect32u oROI,
NppStBorderType borderType,
const Ncv32f *pKernel,
Ncv32f *pKernel,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier);
@@ -208,7 +208,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
* \return NCV status code
*/
CV_EXPORTS
NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
NCVStatus nppiStFilterColumnBorder_32f_C1R(Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
Ncv32f *pDst,
@@ -216,7 +216,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
Ncv32u nDstStep,
NcvRect32u oROI,
NppStBorderType borderType,
const Ncv32f *pKernel,
Ncv32f *pKernel,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier);
@@ -319,7 +319,7 @@ NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc,
* \return NCV status code
*/
CV_EXPORTS
NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
NCVStatus nppiStResize_32f_C1R(Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
NcvRect32u srcROI,

@@ -65,9 +65,12 @@
#include "opencv2/cudalegacy/NPP_staging.hpp"
#include "opencv2/cudalegacy/NCVBroxOpticalFlow.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
typedef NCVVectorAlloc<Ncv32f> FloatVector;
typedef cv::cudev::TexturePtr<float> Ptr2D;
typedef cv::cudev::Texture<float> Texture;
/////////////////////////////////////////////////////////////////////////////////////////
// Implementation specific constants
@@ -84,39 +87,6 @@ inline int iDivUp(int a, int b)
return (a + b - 1)/b;
}
/////////////////////////////////////////////////////////////////////////////////////////
// Texture references
/////////////////////////////////////////////////////////////////////////////////////////
texture<float, 2, cudaReadModeElementType> tex_coarse;
texture<float, 2, cudaReadModeElementType> tex_fine;
texture<float, 2, cudaReadModeElementType> tex_I1;
texture<float, 2, cudaReadModeElementType> tex_I0;
texture<float, 2, cudaReadModeElementType> tex_Ix;
texture<float, 2, cudaReadModeElementType> tex_Ixx;
texture<float, 2, cudaReadModeElementType> tex_Ix0;
texture<float, 2, cudaReadModeElementType> tex_Iy;
texture<float, 2, cudaReadModeElementType> tex_Iyy;
texture<float, 2, cudaReadModeElementType> tex_Iy0;
texture<float, 2, cudaReadModeElementType> tex_Ixy;
texture<float, 1, cudaReadModeElementType> tex_u;
texture<float, 1, cudaReadModeElementType> tex_v;
texture<float, 1, cudaReadModeElementType> tex_du;
texture<float, 1, cudaReadModeElementType> tex_dv;
texture<float, 1, cudaReadModeElementType> tex_numerator_dudv;
texture<float, 1, cudaReadModeElementType> tex_numerator_u;
texture<float, 1, cudaReadModeElementType> tex_numerator_v;
texture<float, 1, cudaReadModeElementType> tex_inv_denominator_u;
texture<float, 1, cudaReadModeElementType> tex_inv_denominator_v;
texture<float, 1, cudaReadModeElementType> tex_diffusivity_x;
texture<float, 1, cudaReadModeElementType> tex_diffusivity_y;
/////////////////////////////////////////////////////////////////////////////////////////
// SUPPLEMENTARY FUNCTIONS
/////////////////////////////////////////////////////////////////////////////////////////
@@ -265,8 +235,7 @@ __forceinline__ __device__ void diffusivity_along_y(float *s, int pos, const flo
///\param h number of rows in global memory array
///\param p global memory array pitch in floats
///////////////////////////////////////////////////////////////////////////////
template<int tex_id>
__forceinline__ __device__ void load_array_element(float *smem, int is, int js, int i, int j, int w, int h, int p)
__forceinline__ __device__ void load_array_element(Ptr2D texSrc, float *smem, int is, int js, int i, int j, int w, int h, int p)
{
//position within shared memory array
const int ijs = js * PSOR_PITCH + is;
@@ -276,20 +245,7 @@ __forceinline__ __device__ void load_array_element(float *smem, int is, int js,
j = max(j, -j-1);
j = min(j, h-j+h-1);
const int pos = j * p + i;
switch(tex_id){
case 0:
smem[ijs] = tex1Dfetch(tex_u, pos);
break;
case 1:
smem[ijs] = tex1Dfetch(tex_v, pos);
break;
case 2:
smem[ijs] = tex1Dfetch(tex_du, pos);
break;
case 3:
smem[ijs] = tex1Dfetch(tex_dv, pos);
break;
}
smem[ijs] = texSrc(pos);
}
///////////////////////////////////////////////////////////////////////////////
@@ -301,49 +257,48 @@ __forceinline__ __device__ void load_array_element(float *smem, int is, int js,
///\param h number of rows in global memory array
///\param p global memory array pitch in floats
///////////////////////////////////////////////////////////////////////////////
template<int tex>
__forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, int h, int p)
__forceinline__ __device__ void load_array(Ptr2D texSrc, float *smem, int ig, int jg, int w, int h, int p)
{
const int i = threadIdx.x + 2;
const int j = threadIdx.y + 2;
load_array_element<tex>(smem, i, j, ig, jg, w, h, p);//load current pixel
load_array_element(texSrc, smem, i, j, ig, jg, w, h, p);//load current pixel
__syncthreads();
if(threadIdx.y < 2)
{
//load bottom shadow elements
load_array_element<tex>(smem, i, j-2, ig, jg-2, w, h, p);
load_array_element(texSrc, smem, i, j-2, ig, jg-2, w, h, p);
if(threadIdx.x < 2)
{
//load bottom right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j-2, ig+PSOR_TILE_WIDTH, jg-2, w, h, p);
load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j-2, ig+PSOR_TILE_WIDTH, jg-2, w, h, p);
//load middle right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
}
else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
{
//load bottom left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j-2, ig-PSOR_TILE_WIDTH, jg-2, w, h, p);
load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j-2, ig-PSOR_TILE_WIDTH, jg-2, w, h, p);
//load middle left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
}
}
else if(threadIdx.y >= PSOR_TILE_HEIGHT-2)
{
//load upper shadow elements
load_array_element<tex>(smem, i, j+2, ig, jg+2, w, h, p);
load_array_element(texSrc, smem, i, j+2, ig, jg+2, w, h, p);
if(threadIdx.x < 2)
{
//load upper right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j+2, ig+PSOR_TILE_WIDTH, jg+2, w, h, p);
load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j+2, ig+PSOR_TILE_WIDTH, jg+2, w, h, p);
//load middle right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
}
else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
{
//load upper left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j+2, ig-PSOR_TILE_WIDTH, jg+2, w, h, p);
load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j+2, ig-PSOR_TILE_WIDTH, jg+2, w, h, p);
//load middle left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
}
}
else
@@ -352,12 +307,12 @@ __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, i
if(threadIdx.x < 2)
{
//load middle right shadow elements
load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
load_array_element(texSrc, smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
}
else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
{
//load middle left shadow elements
load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
load_array_element(texSrc, smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
}
}
__syncthreads();
@@ -382,13 +337,9 @@ __forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, i
/// \param alpha (in) alpha in Brox model (flow smoothness)
/// \param gamma (in) gamma in Brox model (edge importance)
///////////////////////////////////////////////////////////////////////////////
__global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y,
float *denominator_u, float *denominator_v,
float *numerator_dudv,
float *numerator_u, float *numerator_v,
int w, int h, int s,
float alpha, float gamma)
__global__ void prepare_sor_stage_1_tex(Ptr2D texU, Ptr2D texV, Ptr2D texDu, Ptr2D texDv, Ptr2D texI0, Ptr2D texI1, Ptr2D texIx, Ptr2D texIxx, Ptr2D texIx0, Ptr2D texIy, Ptr2D texIyy,
Ptr2D texIy0, Ptr2D texIxy, float *diffusivity_x, float *diffusivity_y, float *denominator_u, float *denominator_v, float *numerator_dudv, float *numerator_u, float *numerator_v,
int w, int h, int s, float alpha, float gamma)
{
__shared__ float u[PSOR_PITCH * PSOR_HEIGHT];
__shared__ float v[PSOR_PITCH * PSOR_HEIGHT];
@@ -408,24 +359,24 @@ __global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity
float x = (float)ig + 0.5f;
float y = (float)jg + 0.5f;
//load u and v to smem
load_array<0>(u, ig, jg, w, h, s);
load_array<1>(v, ig, jg, w, h, s);
load_array<2>(du, ig, jg, w, h, s);
load_array<3>(dv, ig, jg, w, h, s);
load_array(texU, u, ig, jg, w, h, s);
load_array(texV, v, ig, jg, w, h, s);
load_array(texDu, du, ig, jg, w, h, s);
load_array(texDv, dv, ig, jg, w, h, s);
//warped position
float wx = (x + u[ijs])/(float)w;
float wy = (y + v[ijs])/(float)h;
x /= (float)w;
y /= (float)h;
//compute image derivatives
const float Iz = tex2D(tex_I1, wx, wy) - tex2D(tex_I0, x, y);
const float Ix = tex2D(tex_Ix, wx, wy);
const float Ixz = Ix - tex2D(tex_Ix0, x, y);
const float Ixy = tex2D(tex_Ixy, wx, wy);
const float Ixx = tex2D(tex_Ixx, wx, wy);
const float Iy = tex2D(tex_Iy, wx, wy);
const float Iyz = Iy - tex2D(tex_Iy0, x, y);
const float Iyy = tex2D(tex_Iyy, wx, wy);
const float Iz = texI1(wy, wx) - texI0(y,x);
const float Ix = texIx(wy, wx);
const float Ixz = Ix - texIx0(y, x);
const float Ixy = texIxy(wy, wx);
const float Ixx = texIxx(wy, wx);
const float Iy = texIy(wy, wx);
const float Iyz = Iy - texIy0(y, x);
const float Iyy = texIyy(wy, wx);
//compute data term
float q0, q1, q2;
q0 = Iz + Ix * du[ijs] + Iy * dv[ijs];
@@ -462,8 +413,7 @@ __global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity
///\param h
///\param s
///////////////////////////////////////////////////////////////////////////////
__global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denominator_v,
int w, int h, int s)
__global__ void prepare_sor_stage_2(Ptr2D texDiffX, Ptr2D texDiffY, float *inv_denominator_u, float *inv_denominator_v, int w, int h, int s)
{
__shared__ float sx[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)];
__shared__ float sy[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)];
@@ -486,8 +436,8 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin
}
if(inside)
{
sx[ijs] = tex1Dfetch(tex_diffusivity_x, ijg);
sy[ijs] = tex1Dfetch(tex_diffusivity_y, ijg);
sx[ijs] = texDiffX(ijg);
sy[ijs] = texDiffY(ijg);
}
else
{
@@ -498,25 +448,17 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin
if(j == PSOR_TILE_HEIGHT-1)
{
if(jg < h-1 && inside)
{
sy[up] = tex1Dfetch(tex_diffusivity_y, ijg + s);
}
sy[up] = texDiffY(ijg + s);
else
{
sy[up] = 0.0f;
}
}
int right = ijs + 1;
if(threadIdx.x == PSOR_TILE_WIDTH-1)
{
if(ig < w-1 && inside)
{
sx[right] = tex1Dfetch(tex_diffusivity_x, ijg + 1);
}
sx[right] = texDiffX(ijg + 1);
else
{
sx[right] = 0.0f;
}
}
__syncthreads();
float diffusivity_sum;
@@ -534,17 +476,8 @@ __global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denomin
// Red-Black SOR
/////////////////////////////////////////////////////////////////////////////////////////
template<int isBlack> __global__ void sor_pass(float *new_du,
float *new_dv,
const float *g_inv_denominator_u,
const float *g_inv_denominator_v,
const float *g_numerator_u,
const float *g_numerator_v,
const float *g_numerator_dudv,
float omega,
int width,
int height,
int stride)
template<int isBlack> __global__ void sor_pass(Ptr2D texU, Ptr2D texV, Ptr2D texDu, Ptr2D texDv, Ptr2D texDiffX, Ptr2D texDiffY, float *new_du, float *new_dv, const float *g_inv_denominator_u,
const float *g_inv_denominator_v, const float *g_numerator_u, const float *g_numerator_v, const float *g_numerator_dudv, float omega, int width, int height, int stride)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
@@ -560,14 +493,14 @@ template<int isBlack> __global__ void sor_pass(float *new_du,
//load smooth term
float s_up, s_left, s_right, s_down;
s_left = tex1Dfetch(tex_diffusivity_x, pos);
s_down = tex1Dfetch(tex_diffusivity_y, pos);
s_left = texDiffX(pos);
s_down = texDiffY(pos);
if(i < width-1)
s_right = tex1Dfetch(tex_diffusivity_x, pos_r);
s_right = texDiffX(pos_r);
else
s_right = 0.0f; //Neumann BC
if(j < height-1)
s_up = tex1Dfetch(tex_diffusivity_y, pos_u);
s_up = texDiffY(pos_u);
else
s_up = 0.0f; //Neumann BC
@@ -577,30 +510,29 @@ template<int isBlack> __global__ void sor_pass(float *new_du,
float du_up, du_left, du_right, du_down, du;
float dv_up, dv_left, dv_right, dv_down, dv;
u_left = tex1Dfetch(tex_u, pos_l);
u_right = tex1Dfetch(tex_u, pos_r);
u_down = tex1Dfetch(tex_u, pos_d);
u_up = tex1Dfetch(tex_u, pos_u);
u = tex1Dfetch(tex_u, pos);
v_left = tex1Dfetch(tex_v, pos_l);
v_right = tex1Dfetch(tex_v, pos_r);
v_down = tex1Dfetch(tex_v, pos_d);
v = tex1Dfetch(tex_v, pos);
v_up = tex1Dfetch(tex_v, pos_u);
du = tex1Dfetch(tex_du, pos);
du_left = tex1Dfetch(tex_du, pos_l);
du_right = tex1Dfetch(tex_du, pos_r);
du_down = tex1Dfetch(tex_du, pos_d);
du_up = tex1Dfetch(tex_du, pos_u);
dv = tex1Dfetch(tex_dv, pos);
dv_left = tex1Dfetch(tex_dv, pos_l);
dv_right = tex1Dfetch(tex_dv, pos_r);
dv_down = tex1Dfetch(tex_dv, pos_d);
dv_up = tex1Dfetch(tex_dv, pos_u);
u_left = texU(pos_l);
u_right = texU(pos_r);
u_down = texU(pos_d);
u_up = texU(pos_u);
u = texU(pos);
v_left = texV(pos_l);
v_right = texV(pos_r);
v_down = texV(pos_d);
v = texV(pos);
v_up = texV(pos_u);
du = texDu(pos);
du_left = texDu(pos_l);
du_right = texDu(pos_r);
du_down = texDu(pos_d);
du_up = texDu(pos_u);
dv = texDv(pos);
dv_left = texDv(pos_l);
dv_right = texDv(pos_r);
dv_down = texDv(pos_d);
dv_up = texDv(pos_u);
float numerator_dudv = g_numerator_dudv[pos];
if((i+j)%2 == isBlack)
@@ -624,52 +556,6 @@ template<int isBlack> __global__ void sor_pass(float *new_du,
///////////////////////////////////////////////////////////////////////////////
// utility functions
///////////////////////////////////////////////////////////////////////////////
void initTexture1D(texture<float, 1, cudaReadModeElementType> &tex)
{
tex.addressMode[0] = cudaAddressModeClamp;
tex.filterMode = cudaFilterModePoint;
tex.normalized = false;
}
void initTexture2D(texture<float, 2, cudaReadModeElementType> &tex)
{
tex.addressMode[0] = cudaAddressModeMirror;
tex.addressMode[1] = cudaAddressModeMirror;
tex.filterMode = cudaFilterModeLinear;
tex.normalized = true;
}
void InitTextures()
{
initTexture2D(tex_I0);
initTexture2D(tex_I1);
initTexture2D(tex_fine); // for downsampling
initTexture2D(tex_coarse); // for prolongation
initTexture2D(tex_Ix);
initTexture2D(tex_Ixx);
initTexture2D(tex_Ix0);
initTexture2D(tex_Iy);
initTexture2D(tex_Iyy);
initTexture2D(tex_Iy0);
initTexture2D(tex_Ixy);
initTexture1D(tex_u);
initTexture1D(tex_v);
initTexture1D(tex_du);
initTexture1D(tex_dv);
initTexture1D(tex_diffusivity_x);
initTexture1D(tex_diffusivity_y);
initTexture1D(tex_inv_denominator_u);
initTexture1D(tex_inv_denominator_v);
initTexture1D(tex_numerator_dudv);
initTexture1D(tex_numerator_u);
initTexture1D(tex_numerator_v);
}
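All of the state that initTexture1D/initTexture2D used to poke into global texture references now travels with each texture object, which is why InitTextures() disappears from this file. A minimal sketch of the equivalent 2D construction, assuming the cv::cudev::Texture constructor used later in this function (rows, cols, ptr, pitch, normalizedCoords, filterMode, addressMode):

// Equivalent of initTexture2D(tex_I0) followed by cudaBindTexture2D(..., tex_I0, ...):
cv::cudev::Texture<float> texI0(kLevelHeight, kLevelWidth, I0->ptr(),
                                kLevelStride * sizeof(float),
                                true,                   // normalized coordinates
                                cudaFilterModeLinear,   // bilinear filtering
                                cudaAddressModeMirror); // mirrored borders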
namespace
{
struct ImagePyramid
@ -804,8 +690,6 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize,
cudaMemcpyHostToDevice), NCV_CUDA_ERROR);
InitTextures();
}
//prepare image pyramid
@ -909,9 +793,6 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertCUDAReturn(cudaMemsetAsync(v.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR);
//select images with lowest resolution
size_t pitch = alignUp(pyr.w.back(), kStrideAlignmentFloat) * sizeof(float);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, pyr.img0.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, pyr.img1.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
FloatVector* ptrU = &u;
@ -941,17 +822,14 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertCUDAReturn(cudaMemsetAsync(du.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemsetAsync(dv.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
//texture format descriptor
cudaChannelFormatDesc ch_desc = cudaCreateChannelDesc<float>();
I0 = *img0Iter;
I1 = *img1Iter;
++img0Iter;
++img1Iter;
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, I0->ptr(), ch_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, I1->ptr(), ch_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
Texture texI0(kLevelHeight, kLevelWidth, I0->ptr(), kLevelStride * sizeof(float), true, cudaFilterModeLinear, cudaAddressModeMirror);
Texture texI1(kLevelHeight, kLevelWidth, I1->ptr(), kLevelStride * sizeof(float), true, cudaFilterModeLinear, cudaAddressModeMirror);
//compute derivatives
dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6));
@ -991,20 +869,24 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix, Ix.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy, Iy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iyy, Iyy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy0, Iy0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixy, Ixy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
Texture texIx(kLevelHeight, kLevelWidth, Ix.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
Texture texIxx(kLevelHeight, kLevelWidth, Ixx.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
Texture texIx0(kLevelHeight, kLevelWidth, Ix0.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
Texture texIy(kLevelHeight, kLevelWidth, Iy.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
Texture texIyy(kLevelHeight, kLevelWidth, Iyy.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
Texture texIy0(kLevelHeight, kLevelWidth, Iy0.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
Texture texIxy(kLevelHeight, kLevelWidth, Ixy.ptr(), kPitchTex, true, cudaFilterModeLinear, cudaAddressModeMirror);
Texture texDiffX(1, kLevelSizeInBytes / sizeof(float), diffusivity_x.ptr(), kLevelSizeInBytes);
Texture texDiffY(1, kLevelSizeInBytes / sizeof(float), diffusivity_y.ptr(), kLevelSizeInBytes);
// flow
ncvAssertCUDAReturn(cudaBindTexture(0, tex_u, ptrU->ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_v, ptrV->ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
Texture texU(1, kLevelSizeInBytes / sizeof(float), ptrU->ptr(), kLevelSizeInBytes);
Texture texV(1, kLevelSizeInBytes / sizeof(float), ptrV->ptr(), kLevelSizeInBytes);
// flow increments
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
Texture texDu(1, kLevelSizeInBytes / sizeof(float), du.ptr(), kLevelSizeInBytes);
Texture texDv(1, kLevelSizeInBytes / sizeof(float), dv.ptr(), kLevelSizeInBytes);
Texture texDuNew(1, kLevelSizeInBytes / sizeof(float), du_new.ptr(), kLevelSizeInBytes);
Texture texDvNew(1, kLevelSizeInBytes / sizeof(float), dv_new.ptr(), kLevelSizeInBytes);
dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT));
dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT);
@ -1018,89 +900,30 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
for (Ncv32u current_inner_iteration = 0; current_inner_iteration < desc.number_of_inner_iterations; ++current_inner_iteration)
{
//compute coefficients
prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>>
(diffusivity_x.ptr(),
diffusivity_y.ptr(),
denom_u.ptr(),
denom_v.ptr(),
num_dudv.ptr(),
num_u.ptr(),
num_v.ptr(),
kLevelWidth,
kLevelHeight,
kLevelStride,
alpha,
gamma);
prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>> (texU, texV, texDu, texDv, texI0, texI1, texIx, texIxx, texIx0, texIy, texIyy, texIy0, texIxy,
diffusivity_x.ptr(), diffusivity_y.ptr(), denom_u.ptr(), denom_v.ptr(), num_dudv.ptr(), num_u.ptr(), num_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride, alpha, gamma);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);
prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(texDiffX, texDiffY, denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
// linear system coefficients
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
//solve linear system
for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
{
float omega = 1.99f;
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>
(du_new.ptr(),
dv_new.ptr(),
denom_u.ptr(),
denom_v.ptr(),
num_u.ptr(),
num_v.ptr(),
num_dudv.ptr(),
omega,
kLevelWidth,
kLevelHeight,
kLevelStride);
sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>(texU, texV, texDu, texDv, texDiffX, texDiffY, du_new.ptr(), dv_new.ptr(), denom_u.ptr(), denom_v.ptr(),
num_u.ptr(), num_v.ptr(), num_dudv.ptr(), omega, kLevelWidth, kLevelHeight, kLevelStride);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>
(du.ptr(),
dv.ptr(),
denom_u.ptr(),
denom_v.ptr(),
num_u.ptr(),
num_v.ptr(),
num_dudv.ptr(),
omega,
kLevelWidth,
kLevelHeight,
kLevelStride);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>(texU, texV, texDuNew, texDvNew, texDiffX, texDiffY, du.ptr(), dv.ptr(), denom_u.ptr(), denom_v.ptr(), num_u.ptr(),
num_v.ptr(),num_dudv.ptr(), omega, kLevelWidth, kLevelHeight, kLevelStride);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
}//end of solver loop
}// end of inner loop

@ -72,6 +72,7 @@
#include "opencv2/cudalegacy/NCV.hpp"
#include "opencv2/cudalegacy/NPP_staging.hpp"
#include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
#include "NCVRuntimeTemplates.hpp"
#include "NCVAlg.hpp"
@ -94,24 +95,6 @@ const Ncv32u NUM_THREADS_ANCHORSPARALLEL = 64;
#define NUM_THREADS_CLASSIFIERPARALLEL (1 << NUM_THREADS_CLASSIFIERPARALLEL_LOG2)
/** \internal
* Haar features solid array.
*/
texture<uint2, 1, cudaReadModeElementType> texHaarFeatures;
/** \internal
* Haar classifiers flattened trees container.
* Two parts: first contains root nodes, second - nodes that are referred by root nodes.
* Drawback: breaks tree locality (might cause more cache misses)
*/
texture<uint4, 1, cudaReadModeElementType> texHaarClassifierNodes;
texture<Ncv32u, 1, cudaReadModeElementType> texIImage;
__device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages)
{
return d_Stages[iStage];
@ -119,51 +102,37 @@ __device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages)
template <NcvBool tbCacheTextureCascade>
__device__ HaarClassifierNode128 getClassifierNode(Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes)
__device__ HaarClassifierNode128 getClassifierNode(cv::cudev::TexturePtr<uint4> texHaarClassifierNodes, Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes)
{
HaarClassifierNode128 tmpNode;
if (tbCacheTextureCascade)
    tmpNode._ui4 = texHaarClassifierNodes(iNode); // was: tex1Dfetch(texHaarClassifierNodes, iNode)
else
{
tmpNode = d_ClassifierNodes[iNode];
}
return tmpNode;
}
template <NcvBool tbCacheTextureCascade>
__device__ void getFeature(Ncv32u iFeature, HaarFeature64 *d_Features,
Ncv32f *weight,
Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight)
__device__ void getFeature(cv::cudev::TexturePtr<uint2> texHaarFeatures, Ncv32u iFeature, HaarFeature64* d_Features, Ncv32f* weight, Ncv32u* rectX, Ncv32u* rectY, Ncv32u* rectWidth, Ncv32u* rectHeight)
{
HaarFeature64 feature;
if (tbCacheTextureCascade)
    feature._ui2 = texHaarFeatures(iFeature); // was: tex1Dfetch(texHaarFeatures, iFeature)
else
{
feature = d_Features[iFeature];
}
feature.getRect(rectX, rectY, rectWidth, rectHeight);
*weight = feature.getWeight();
}
template <NcvBool tbCacheTextureIImg>
__device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg)
__device__ Ncv32u getElemIImg(cv::cudev::TexturePtr<Ncv32u> texImg, Ncv32u x, Ncv32u *d_IImg)
{
if (tbCacheTextureIImg)
    return texImg(x); // was: tex1Dfetch(texIImage, x)
else
{
return d_IImg[x];
}
}
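Because tbCacheTextureIImg is a template parameter, each instantiation of getElemIImg compiles down to a single load path with no runtime branch. A hypothetical standalone illustration of the same pattern (names invented for the sketch):

// Hypothetical sketch: the branch below is folded away per instantiation.
template <bool useTex>
__device__ float loadElem(cv::cudev::TexturePtr<float> tex, const float* g, int i)
{
    if (useTex)
        return tex(i); // texture-cache path, kept only when useTex == true
    else
        return g[i];   // plain global-memory path otherwise
}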
@ -203,17 +172,10 @@ __device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u
}
template <NcvBool tbInitMaskPositively,
NcvBool tbCacheTextureIImg,
NcvBool tbCacheTextureCascade,
NcvBool tbReadPixelIndexFromVector,
NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
Ncv32f *d_weights, Ncv32u weightsStride,
HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
template <NcvBool tbInitMaskPositively, NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbReadPixelIndexFromVector, NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierAnchorParallel(cv::cudev::TexturePtr<Ncv32u> texImg, cv::cudev::TexturePtr<uint2> texHaarFeatures, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes,
Ncv32u *d_IImg, Ncv32u IImgStride, Ncv32f *d_weights, Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask,
Ncv32u *d_outMask, Ncv32u mask1Dlen, Ncv32u mask2Dstride, NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
Ncv32u y_offs;
Ncv32u x_offs;
@ -299,7 +261,7 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
{
while (bMoreNodesToTraverse)
{
HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(texHaarClassifierNodes, iNode, d_ClassifierNodes);
HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
Ncv32u iFeature = featuresDesc.getFeaturesOffset();
@ -310,19 +272,17 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
{
Ncv32f rectWeight;
Ncv32u rectX, rectY, rectWidth, rectHeight;
getFeature<tbCacheTextureCascade>
(iFeature + iRect, d_Features,
&rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
getFeature<tbCacheTextureCascade> (texHaarFeatures, iFeature + iRect, d_Features, &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
Ncv32u iioffsTR = iioffsTL + rectWidth;
Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
Ncv32u iioffsBR = iioffsBL + rectWidth;
Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);
Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(texImg, iioffsBR, d_IImg) -
getElemIImg<tbCacheTextureIImg>(texImg, iioffsBL, d_IImg) +
getElemIImg<tbCacheTextureIImg>(texImg, iioffsTL, d_IImg) -
getElemIImg<tbCacheTextureIImg>(texImg, iioffsTR, d_IImg);
#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
@ -393,15 +353,10 @@ __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStr
}
template <NcvBool tbCacheTextureIImg,
NcvBool tbCacheTextureCascade,
NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
Ncv32f *d_weights, Ncv32u weightsStride,
HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
template <NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierClassifierParallel(cv::cudev::TexturePtr<Ncv32u> texImg, cv::cudev::TexturePtr<uint2> texHaarFeatures, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes, Ncv32u *d_IImg,
Ncv32u IImgStride, Ncv32f *d_weights, Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride, NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
Ncv32u maskOffset = MAX_GRID_DIM * blockIdx.y + blockIdx.x;
@ -439,7 +394,7 @@ __global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IIm
while (bMoreNodesToTraverse)
{
HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(texHaarClassifierNodes, iNode, d_ClassifierNodes);
HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
Ncv32u iFeature = featuresDesc.getFeaturesOffset();
@ -450,19 +405,17 @@ __global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IIm
{
Ncv32f rectWeight;
Ncv32u rectX, rectY, rectWidth, rectHeight;
getFeature<tbCacheTextureCascade>
(iFeature + iRect, d_Features,
&rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
getFeature<tbCacheTextureCascade> (texHaarFeatures, iFeature + iRect, d_Features, &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
Ncv32u iioffsTR = iioffsTL + rectWidth;
Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
Ncv32u iioffsBR = iioffsBL + rectWidth;
Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);
Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(texImg, iioffsBR, d_IImg) -
getElemIImg<tbCacheTextureIImg>(texImg, iioffsBL, d_IImg) +
getElemIImg<tbCacheTextureIImg>(texImg, iioffsTL, d_IImg) -
getElemIImg<tbCacheTextureIImg>(texImg, iioffsTR, d_IImg);
#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
@ -578,8 +531,9 @@ struct applyHaarClassifierAnchorParallelFunctor
{
dim3 gridConf, blockConf;
cudaStream_t cuStream;
//Kernel arguments are stored as members;
cv::cudev::TexturePtr<Ncv32u> texImg;
cv::cudev::TexturePtr<uint2> texHaarFeatures;
cv::cudev::TexturePtr<uint4> texHaarClassifierNodes;
Ncv32u *d_IImg;
Ncv32u IImgStride;
Ncv32f *d_weights;
@ -597,32 +551,12 @@ struct applyHaarClassifierAnchorParallelFunctor
Ncv32f scaleArea;
//Arguments are passed through the constructor
applyHaarClassifierAnchorParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
Ncv32u *_d_IImg, Ncv32u _IImgStride,
Ncv32f *_d_weights, Ncv32u _weightsStride,
HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,
Ncv32u *_d_inMask, Ncv32u *_d_outMask,
Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
Ncv32u _endStageExc, Ncv32f _scaleArea) :
gridConf(_gridConf),
blockConf(_blockConf),
cuStream(_cuStream),
d_IImg(_d_IImg),
IImgStride(_IImgStride),
d_weights(_d_weights),
weightsStride(_weightsStride),
d_Features(_d_Features),
d_ClassifierNodes(_d_ClassifierNodes),
d_Stages(_d_Stages),
d_inMask(_d_inMask),
d_outMask(_d_outMask),
mask1Dlen(_mask1Dlen),
mask2Dstride(_mask2Dstride),
anchorsRoi(_anchorsRoi),
startStageInc(_startStageInc),
endStageExc(_endStageExc),
scaleArea(_scaleArea)
applyHaarClassifierAnchorParallelFunctor(cv::cudev::TexturePtr<Ncv32u> texImg_, cv::cudev::TexturePtr<uint2> texHaarFeatures_, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes_, dim3 _gridConf,
dim3 _blockConf, cudaStream_t _cuStream, Ncv32u *_d_IImg, Ncv32u _IImgStride, Ncv32f *_d_weights, Ncv32u _weightsStride, HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes,
HaarStage64 *_d_Stages, Ncv32u *_d_inMask, Ncv32u *_d_outMask, Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, NcvSize32u _anchorsRoi, Ncv32u _startStageInc, Ncv32u _endStageExc, Ncv32f _scaleArea) :
gridConf(_gridConf), blockConf(_blockConf), cuStream(_cuStream), texImg(texImg_), texHaarFeatures(texHaarFeatures_), texHaarClassifierNodes(texHaarClassifierNodes_), d_IImg(_d_IImg), IImgStride(_IImgStride),
d_weights(_d_weights), weightsStride(_weightsStride), d_Features(_d_Features), d_ClassifierNodes(_d_ClassifierNodes), d_Stages(_d_Stages), d_inMask(_d_inMask), d_outMask(_d_outMask), mask1Dlen(_mask1Dlen),
mask2Dstride(_mask2Dstride), anchorsRoi(_anchorsRoi), startStageInc(_startStageInc), endStageExc(_endStageExc), scaleArea(_scaleArea)
{}
template<class TList>
@ -635,43 +569,19 @@ struct applyHaarClassifierAnchorParallelFunctor
Loki::TL::TypeAt<TList, 2>::Result::value,
Loki::TL::TypeAt<TList, 3>::Result::value,
Loki::TL::TypeAt<TList, 4>::Result::value >
<<<gridConf, blockConf, 0, cuStream>>>
(d_IImg, IImgStride,
d_weights, weightsStride,
d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, startStageInc,
endStageExc, scaleArea);
<<<gridConf, blockConf, 0, cuStream>>> (texImg, texHaarFeatures, texHaarClassifierNodes, d_IImg, IImgStride, d_weights, weightsStride, d_Features, d_ClassifierNodes, d_Stages, d_inMask,
d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea);
}
};
void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively,
NcvBool tbCacheTextureIImg,
NcvBool tbCacheTextureCascade,
NcvBool tbReadPixelIndexFromVector,
NcvBool tbDoAtomicCompaction,
dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,
Ncv32u *d_IImg, Ncv32u IImgStride,
Ncv32f *d_weights, Ncv32u weightsStride,
HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc,
Ncv32u endStageExc, Ncv32f scaleArea)
void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively, NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbReadPixelIndexFromVector, NcvBool tbDoAtomicCompaction,
dim3 gridConf, dim3 blockConf, cudaStream_t cuStream, cv::cudev::TexturePtr<Ncv32u> texImg, cv::cudev::TexturePtr<uint2> texHaarFeatures, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes, Ncv32u *d_IImg,
Ncv32u IImgStride, Ncv32f *d_weights, Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride, NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
applyHaarClassifierAnchorParallelFunctor functor(gridConf, blockConf, cuStream,
d_IImg, IImgStride,
d_weights, weightsStride,
d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, startStageInc,
endStageExc, scaleArea);
applyHaarClassifierAnchorParallelFunctor functor(texImg, texHaarFeatures, texHaarClassifierNodes, gridConf, blockConf, cuStream, d_IImg, IImgStride, d_weights, weightsStride, d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea);
//Second parameter is the number of "dynamic" template parameters
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor>
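KernelCaller is what turns the five runtime NcvBool flags into the five compile-time bools of applyHaarClassifierAnchorParallel: it branches on each flag in turn, so all 2^5 = 32 kernel instantiations exist and the matching one is launched. A hand-rolled sketch of the same dispatch for a single flag (a hypothetical demo, far simpler than the Loki machinery):

// Hypothetical sketch: one runtime bool selecting a template instantiation.
template <bool kFast>
__global__ void demoKernel(float* out) { out[0] = kFast ? 1.0f : 2.0f; }

void launchDemo(bool fast, float* d_out)
{
    if (fast) demoKernel<true><<<1, 1>>>(d_out);  // one instantiation per flag value;
    else      demoKernel<false><<<1, 1>>>(d_out); // KernelCaller automates this per flag
}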
@ -688,8 +598,9 @@ struct applyHaarClassifierClassifierParallelFunctor
{
dim3 gridConf, blockConf;
cudaStream_t cuStream;
//Kernel arguments are stored as members;
cv::cudev::TexturePtr<Ncv32u> texImg;
cv::cudev::TexturePtr<uint2> texHaarFeatures;
cv::cudev::TexturePtr<uint4> texHaarClassifierNodes;
Ncv32u *d_IImg;
Ncv32u IImgStride;
Ncv32f *d_weights;
@ -707,32 +618,13 @@ struct applyHaarClassifierClassifierParallelFunctor
Ncv32f scaleArea;
//Arguments are passed through the constructor
applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
Ncv32u *_d_IImg, Ncv32u _IImgStride,
Ncv32f *_d_weights, Ncv32u _weightsStride,
HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,
Ncv32u *_d_inMask, Ncv32u *_d_outMask,
Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
Ncv32u _endStageExc, Ncv32f _scaleArea) :
gridConf(_gridConf),
blockConf(_blockConf),
cuStream(_cuStream),
d_IImg(_d_IImg),
IImgStride(_IImgStride),
d_weights(_d_weights),
weightsStride(_weightsStride),
d_Features(_d_Features),
d_ClassifierNodes(_d_ClassifierNodes),
d_Stages(_d_Stages),
d_inMask(_d_inMask),
d_outMask(_d_outMask),
mask1Dlen(_mask1Dlen),
mask2Dstride(_mask2Dstride),
anchorsRoi(_anchorsRoi),
startStageInc(_startStageInc),
endStageExc(_endStageExc),
scaleArea(_scaleArea)
applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream, cv::cudev::TexturePtr<Ncv32u> texImg_, cv::cudev::TexturePtr<uint2> texHaarFeatures_,
cv::cudev::TexturePtr<uint4> texHaarClassifierNodes_, Ncv32u *_d_IImg, Ncv32u _IImgStride, Ncv32f *_d_weights, Ncv32u _weightsStride, HaarFeature64 *_d_Features,
HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages, Ncv32u *_d_inMask, Ncv32u *_d_outMask, Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, NcvSize32u _anchorsRoi,
Ncv32u _startStageInc, Ncv32u _endStageExc, Ncv32f _scaleArea) : gridConf(_gridConf), blockConf(_blockConf), cuStream(_cuStream), texImg(texImg_), texHaarFeatures(texHaarFeatures_),
texHaarClassifierNodes(texHaarClassifierNodes_), d_IImg(_d_IImg), IImgStride(_IImgStride), d_weights(_d_weights), weightsStride(_weightsStride), d_Features(_d_Features),
d_ClassifierNodes(_d_ClassifierNodes), d_Stages(_d_Stages), d_inMask(_d_inMask), d_outMask(_d_outMask), mask1Dlen(_mask1Dlen), mask2Dstride(_mask2Dstride), anchorsRoi(_anchorsRoi),
startStageInc(_startStageInc), endStageExc(_endStageExc), scaleArea(_scaleArea)
{}
template<class TList>
@ -743,40 +635,19 @@ struct applyHaarClassifierClassifierParallelFunctor
Loki::TL::TypeAt<TList, 0>::Result::value,
Loki::TL::TypeAt<TList, 1>::Result::value,
Loki::TL::TypeAt<TList, 2>::Result::value >
<<<gridConf, blockConf, 0, cuStream>>>
(d_IImg, IImgStride,
d_weights, weightsStride,
d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, startStageInc,
endStageExc, scaleArea);
<<<gridConf, blockConf, 0, cuStream>>> (texImg, texHaarFeatures, texHaarClassifierNodes, d_IImg, IImgStride, d_weights, weightsStride, d_Features, d_ClassifierNodes, d_Stages, d_inMask,
d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea);
}
};
void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg,
NcvBool tbCacheTextureCascade,
NcvBool tbDoAtomicCompaction,
dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,
Ncv32u *d_IImg, Ncv32u IImgStride,
Ncv32f *d_weights, Ncv32u weightsStride,
HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc,
Ncv32u endStageExc, Ncv32f scaleArea)
void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg, NcvBool tbCacheTextureCascade, NcvBool tbDoAtomicCompaction, dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,
cv::cudev::TexturePtr<Ncv32u> texImg, cv::cudev::TexturePtr<uint2> texHaarFeatures, cv::cudev::TexturePtr<uint4> texHaarClassifierNodes, Ncv32u *d_IImg, Ncv32u IImgStride, Ncv32f *d_weights,
Ncv32u weightsStride, HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, Ncv32u *d_inMask, Ncv32u *d_outMask, Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
applyHaarClassifierClassifierParallelFunctor functor(gridConf, blockConf, cuStream,
d_IImg, IImgStride,
d_weights, weightsStride,
d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, startStageInc,
endStageExc, scaleArea);
applyHaarClassifierClassifierParallelFunctor functor(gridConf, blockConf, cuStream, texImg, texHaarFeatures, texHaarClassifierNodes, d_IImg, IImgStride, d_weights, weightsStride, d_Features,
d_ClassifierNodes, d_Stages, d_inMask, d_outMask, mask1Dlen, mask2Dstride, anchorsRoi, startStageInc, endStageExc, scaleArea);
//Second parameter is the number of "dynamic" template parameters
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor>
@ -1015,31 +886,15 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
NCV_SKIP_COND_BEGIN
cv::cudev::Texture<Ncv32u> texImg;
if (bTexCacheIImg)
{
    texImg = cv::cudev::Texture<Ncv32u>((anchorsRoi.height + haar.ClassifierSize.height) * integral.pitch(), integral.ptr());
    // (old path: cudaCreateChannelDesc + cudaBindTexture of the integral to texIImage, removed)
}
cv::cudev::Texture<uint2> texHaarFeatures;
cv::cudev::Texture<uint4> texHaarClassifierNodes;
if (bTexCacheCascade) {
    texHaarFeatures = cv::cudev::Texture<uint2>(sizeof(HaarFeature64) * haar.NumFeatures, reinterpret_cast<uint2*>(d_HaarFeatures.ptr()));
    texHaarClassifierNodes = cv::cudev::Texture<uint4>(sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes, reinterpret_cast<uint4*>(d_HaarNodes.ptr()));
    // (old path: cudaCreateChannelDesc + cudaBindTexture of the cascade buffers, removed)
}
Ncv32u stageStartAnchorParallel = 0;
@ -1130,26 +985,10 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
dim3 grid1(((d_pixelMask.stride() + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),
anchorsRoi.height);
dim3 block1(NUM_THREADS_ANCHORSPARALLEL);
applyHaarClassifierAnchorParallelDynTemplate(
true, //tbInitMaskPositively
bTexCacheIImg, //tbCacheTextureIImg
bTexCacheCascade, //tbCacheTextureCascade
pixParallelStageStops[pixParallelStageStopsIndex] != 0,//tbReadPixelIndexFromVector
bDoAtomicCompaction, //tbDoAtomicCompaction
grid1,
block1,
cuStream,
integral.ptr(), integral.stride(),
d_weights.ptr(), d_weights.stride(),
d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
d_ptrNowData->ptr(),
bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
0,
d_pixelMask.stride(),
anchorsRoi,
pixParallelStageStops[pixParallelStageStopsIndex],
pixParallelStageStops[pixParallelStageStopsIndex+1],
scaleAreaPixels);
applyHaarClassifierAnchorParallelDynTemplate(true, bTexCacheIImg, bTexCacheCascade, pixParallelStageStops[pixParallelStageStopsIndex] != 0, bDoAtomicCompaction, grid1, block1, cuStream,
texImg, texHaarFeatures, texHaarClassifierNodes, integral.ptr(), integral.stride(), d_weights.ptr(), d_weights.stride(), d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
d_ptrNowData->ptr(), bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), 0, d_pixelMask.stride(), anchorsRoi, pixParallelStageStops[pixParallelStageStopsIndex],
pixParallelStageStops[pixParallelStageStopsIndex+1], scaleAreaPixels);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
if (bDoAtomicCompaction)
@ -1200,26 +1039,10 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
}
dim3 block2(NUM_THREADS_ANCHORSPARALLEL);
applyHaarClassifierAnchorParallelDynTemplate(
false, //tbInitMaskPositively
bTexCacheIImg, //tbCacheTextureIImg
bTexCacheCascade, //tbCacheTextureCascade
pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements,//tbReadPixelIndexFromVector
bDoAtomicCompaction, //tbDoAtomicCompaction
grid2,
block2,
cuStream,
integral.ptr(), integral.stride(),
d_weights.ptr(), d_weights.stride(),
d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
d_ptrNowData->ptr(),
bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
numDetections,
d_pixelMask.stride(),
anchorsRoi,
pixParallelStageStops[pixParallelStageStopsIndex],
pixParallelStageStops[pixParallelStageStopsIndex+1],
scaleAreaPixels);
applyHaarClassifierAnchorParallelDynTemplate(false, bTexCacheIImg, bTexCacheCascade, pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements, bDoAtomicCompaction,
grid2, block2, cuStream, texImg, texHaarFeatures, texHaarClassifierNodes, integral.ptr(), integral.stride(), d_weights.ptr(), d_weights.stride(), d_HaarFeatures.ptr(), d_HaarNodes.ptr(),
d_HaarStages.ptr(), d_ptrNowData->ptr(), bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), numDetections, d_pixelMask.stride(), anchorsRoi,
pixParallelStageStops[pixParallelStageStopsIndex], pixParallelStageStops[pixParallelStageStopsIndex+1], scaleAreaPixels);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
if (bDoAtomicCompaction)
@ -1263,24 +1086,9 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
}
dim3 block3(NUM_THREADS_CLASSIFIERPARALLEL);
applyHaarClassifierClassifierParallelDynTemplate(
bTexCacheIImg, //tbCacheTextureIImg
bTexCacheCascade, //tbCacheTextureCascade
bDoAtomicCompaction, //tbDoAtomicCompaction
grid3,
block3,
cuStream,
integral.ptr(), integral.stride(),
d_weights.ptr(), d_weights.stride(),
d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
d_ptrNowData->ptr(),
bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
numDetections,
d_pixelMask.stride(),
anchorsRoi,
stageMiddleSwitch,
stageEndClassifierParallel,
scaleAreaPixels);
applyHaarClassifierClassifierParallelDynTemplate(bTexCacheIImg, bTexCacheCascade, bDoAtomicCompaction, grid3, block3, cuStream, texImg, texHaarFeatures, texHaarClassifierNodes, integral.ptr(), integral.stride(),
d_weights.ptr(), d_weights.stride(), d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(), d_ptrNowData->ptr(), bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), numDetections,
d_pixelMask.stride(), anchorsRoi, stageMiddleSwitch, stageEndClassifierParallel, scaleAreaPixels);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
if (bDoAtomicCompaction)

@ -48,12 +48,7 @@
#include "opencv2/cudev.hpp"
#include "opencv2/cudalegacy/NPP_staging.hpp"
texture<Ncv8u, 1, cudaReadModeElementType> tex8u;
texture<Ncv32u, 1, cudaReadModeElementType> tex32u;
texture<uint2, 1, cudaReadModeElementType> tex64u;
#include <opencv2/cudev/ptr2d/texture.hpp>
//==============================================================================
//
@ -71,7 +66,6 @@ cudaStream_t nppStGetActiveCUDAstream(void)
}
cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream)
{
cudaStream_t tmp = nppStream;
@ -117,25 +111,25 @@ private:
template<class T>
inline __device__ T readElem(T *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs);
inline __device__ T readElem(cv::cudev::TexturePtr<Ncv8u> tex8u, T *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs);
template<>
inline __device__ Ncv8u readElem<Ncv8u>(Ncv8u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
inline __device__ Ncv8u readElem<Ncv8u>(cv::cudev::TexturePtr<Ncv8u> tex8u, Ncv8u* d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
{
return tex1Dfetch(tex8u, texOffs + srcStride * blockIdx.x + curElemOffs);
return tex8u(texOffs + srcStride * blockIdx.x + curElemOffs);
}
template<>
inline __device__ Ncv32u readElem<Ncv32u>(Ncv32u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
inline __device__ Ncv32u readElem<Ncv32u>(cv::cudev::TexturePtr<Ncv8u> tex8u, Ncv32u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
{
return d_src[curElemOffs];
}
template<>
inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
inline __device__ Ncv32f readElem<Ncv32f>(cv::cudev::TexturePtr<Ncv8u> tex8u, Ncv32f *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
{
return d_src[curElemOffs];
}
@ -160,8 +154,7 @@ inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u texOffs, Ncv32u
* \return None
*/
template <class T_in, class T_out, bool tbDoSqr>
__global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u srcStride,
T_out *d_II, Ncv32u IIstride)
__global__ void scanRows(cv::cudev::TexturePtr<Ncv8u> tex8u, T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u srcStride, T_out *d_II, Ncv32u IIstride)
{
//advance pointers to the current line
if (sizeof(T_in) != 1)
@ -190,7 +183,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u sr
if (curElemOffs < srcWidth)
{
//load elements
curElem = readElem<T_in>(d_src, texOffs, srcStride, curElemOffs);
curElem = readElem<T_in>(tex8u, d_src, texOffs, srcStride, curElemOffs);
}
curElemMod = _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(curElem);
@ -224,25 +217,9 @@ template <bool tbDoSqr, class T_in, class T_out>
NCVStatus scanRowsWrapperDevice(T_in *d_src, Ncv32u srcStride,
T_out *d_dst, Ncv32u dstStride, NcvSize32u roi)
{
cudaChannelFormatDesc cfdTex;
size_t alignmentOffset = 0;
if (sizeof(T_in) == 1)
{
cfdTex = cudaCreateChannelDesc<Ncv8u>();
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
if (alignmentOffset > 0)
{
ncvAssertCUDAReturn(cudaUnbindTexture(tex8u), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, alignmentOffset + roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
}
}
scanRows
<T_in, T_out, tbDoSqr>
<<<roi.height, NUM_SCAN_THREADS, 0, nppStGetActiveCUDAstream()>>>
(d_src, (Ncv32u)alignmentOffset, roi.width, srcStride, d_dst, dstStride);
cv::cudev::Texture<Ncv8u> tex8u(static_cast<size_t>(roi.height * srcStride), (Ncv8u*)d_src);
scanRows <T_in, T_out, tbDoSqr> <<<roi.height, NUM_SCAN_THREADS, 0, nppStGetActiveCUDAstream()>>> (tex8u, d_src, 0, roi.width, srcStride, d_dst, dstStride);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
return NPPST_SUCCESS;
}
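Note that the texture object is now created unconditionally over the raw source bytes and the alignment-offset dance is gone: the new wrapper assumes the buffer starts at a suitably aligned address (true for cudaMalloc allocations), so texOffs is always passed as 0, and the 32-bit and float instantiations of readElem simply ignore the texture and read d_src directly. A minimal usage sketch of this linear-memory constructor (size in bytes, device pointer), assuming d_src points at roi.height * srcStride valid bytes:

// Wrap an existing linear device buffer in a 1D texture object (sketch):
cv::cudev::Texture<Ncv8u> tex8u(roi.height * srcStride, (Ncv8u*)d_src);
// Texture converts implicitly to cv::cudev::TexturePtr<Ncv8u> at launch:
// scanRows<T_in, T_out, tbDoSqr><<<grid, block, 0, stream>>>(tex8u, ...);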
@ -585,59 +562,25 @@ NCVStatus nppiStSqrIntegral_8u64u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_X = 32;
const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_Y = 8;
template<class T, NcvBool tbCacheTexture>
__device__ T getElem_Decimate(Ncv32u x, T *d_src);
template<>
__device__ Ncv32u getElem_Decimate<Ncv32u, true>(Ncv32u x, Ncv32u *d_src)
{
return tex1Dfetch(tex32u, x);
}
template<>
__device__ Ncv32u getElem_Decimate<Ncv32u, false>(Ncv32u x, Ncv32u *d_src)
{
return d_src[x];
}
template<>
__device__ Ncv64u getElem_Decimate<Ncv64u, true>(Ncv32u x, Ncv64u *d_src)
{
uint2 tmp = tex1Dfetch(tex64u, x);
Ncv64u res = (Ncv64u)tmp.y;
res <<= 32;
res |= tmp.x;
return res;
}
template<>
__device__ Ncv64u getElem_Decimate<Ncv64u, false>(Ncv32u x, Ncv64u *d_src)
{
    return d_src[x];
}

template <class T>
__global__ void decimate_C1R(T* d_src, Ncv32u srcStep, T* d_dst, Ncv32u dstStep, NcvSize32u dstRoi, Ncv32u scale)
{
    int curX = blockIdx.x * blockDim.x + threadIdx.x;
    int curY = blockIdx.y * blockDim.y + threadIdx.y;
    if (curX >= dstRoi.width || curY >= dstRoi.height) return;
    d_dst[curY * dstStep + curX] = d_src[(curY * srcStep + curX) * scale];
}
template <class T, NcvBool tbCacheTexture>
__global__ void decimate_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep,
NcvSize32u dstRoi, Ncv32u scale)
template <class T>
__global__ void decimate_C1R(cv::cudev::TexturePtr<T> texSrc, Ncv32u srcStep, T* d_dst, Ncv32u dstStep,
NcvSize32u dstRoi, Ncv32u scale)
{
int curX = blockIdx.x * blockDim.x + threadIdx.x;
int curY = blockIdx.y * blockDim.y + threadIdx.y;
if (curX >= dstRoi.width || curY >= dstRoi.height) return;
d_dst[curY * dstStep + curX] = texSrc((curY * srcStep + curX) * scale); // was: getElem_Decimate<T, tbCacheTexture>(..., d_src)
}
template <class T>
static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep,
T *d_dst, Ncv32u dstStep,
@ -659,39 +602,12 @@ static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep,
dim3 grid((dstRoi.width + NUM_DOWNSAMPLE_NEAREST_THREADS_X - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_X,
(dstRoi.height + NUM_DOWNSAMPLE_NEAREST_THREADS_Y - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_Y);
dim3 block(NUM_DOWNSAMPLE_NEAREST_THREADS_X, NUM_DOWNSAMPLE_NEAREST_THREADS_Y);
if (!readThruTexture) {
    decimate_C1R<T><<<grid, block, 0, nppStGetActiveCUDAstream()>>>(d_src, srcStep, d_dst, dstStep, dstRoi, scale);
}
else {
    cv::cudev::Texture<T> texSrc(srcRoi.height * srcStep * sizeof(T), d_src);
    decimate_C1R<T><<<grid, block, 0, nppStGetActiveCUDAstream()>>>(texSrc, srcStep, d_dst, dstStep, dstRoi, scale);
    // (old path: cudaCreateChannelDesc + cudaBindTexture of d_src to tex32u/tex64u, removed)
}
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
@ -753,11 +669,7 @@ static NCVStatus decimateWrapperHost(T *h_src, Ncv32u srcStep,
implementNppDecimate(32, u)
implementNppDecimate(32, s)
implementNppDecimate(32, f)
implementNppDecimate(64, u)
implementNppDecimate(64, s)
implementNppDecimate(64, f)
implementNppDecimateHost(32, u)
implementNppDecimateHost(32, s)
implementNppDecimateHost(32, f)
@ -776,43 +688,29 @@ implementNppDecimateHost(64, f)
const Ncv32u NUM_RECTSTDDEV_THREADS = 128;
template <NcvBool tbCacheTexture>
__device__ Ncv32u getElemSum(Ncv32u x, Ncv32u *d_sum)
template <NcvBool tbCacheTexture, class Ptr2D>
__device__ Ncv32u getElemSum(Ptr2D tex, Ncv32u x, Ncv32u *d_sum)
{
if (tbCacheTexture)
    return tex(x); // was: tex1Dfetch(tex32u, x)
else
{
return d_sum[x];
}
}
template <NcvBool tbCacheTexture>
__device__ Ncv64u getElemSqSum(Ncv32u x, Ncv64u *d_sqsum)
template <NcvBool tbCacheTexture, class Ptr2D>
__device__ Ncv64u getElemSqSum(Ptr2D tex, Ncv32u x, Ncv64u *d_sqsum)
{
if (tbCacheTexture)
    return tex(x); // was: a uint2 fetch from tex64u, reassembled into Ncv64u
else
{
return d_sqsum[x];
}
}
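Texture references could not return 64-bit integers, which is why the removed path fetched the squared sums as uint2 and spliced the halves together; cv::cudev::Texture<Ncv64u> performs that repacking internally. For reference, the removed reassembly was equivalent to this helper:

// Equivalent of the removed uint2 -> Ncv64u reassembly (sketch):
__device__ Ncv64u unpack64(uint2 tmp)
{
    Ncv64u res = (Ncv64u)tmp.y; // high 32 bits
    res <<= 32;
    res |= tmp.x;               // low 32 bits
    return res;
}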
template <NcvBool tbCacheTexture>
__global__ void rectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
Ncv64u *d_sqsum, Ncv32u sqsumStep,
Ncv32f *d_norm, Ncv32u normStep,
NcvSize32u roi, NcvRect32u rect, Ncv32f invRectArea)
__global__ void rectStdDev_32f_C1R(cv::cudev::TexturePtr<Ncv32u> texSum, cv::cudev::TexturePtr<Ncv64u> texSumSq, Ncv32u *d_sum, Ncv32u sumStep, Ncv64u *d_sqsum, Ncv32u sqsumStep,
Ncv32f *d_norm, Ncv32u normStep, NcvSize32u roi, NcvRect32u rect, Ncv32f invRectArea)
{
Ncv32u x_offs = blockIdx.x * NUM_RECTSTDDEV_THREADS + threadIdx.x;
if (x_offs >= roi.width)
@ -824,17 +722,17 @@ __global__ void rectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
Ncv32u sqsum_offset = blockIdx.y * sqsumStep + x_offs;
//OPT: try swapping order (could change cache hit/miss ratio)
Ncv32u sum_tl = getElemSum<tbCacheTexture>(sum_offset + rect.y * sumStep + rect.x, d_sum);
Ncv32u sum_bl = getElemSum<tbCacheTexture>(sum_offset + (rect.y + rect.height) * sumStep + rect.x, d_sum);
Ncv32u sum_tr = getElemSum<tbCacheTexture>(sum_offset + rect.y * sumStep + rect.x + rect.width, d_sum);
Ncv32u sum_br = getElemSum<tbCacheTexture>(sum_offset + (rect.y + rect.height) * sumStep + rect.x + rect.width, d_sum);
Ncv32u sum_tl = getElemSum<tbCacheTexture>(texSum, sum_offset + rect.y * sumStep + rect.x, d_sum);
Ncv32u sum_bl = getElemSum<tbCacheTexture>(texSum, sum_offset + (rect.y + rect.height) * sumStep + rect.x, d_sum);
Ncv32u sum_tr = getElemSum<tbCacheTexture>(texSum, sum_offset + rect.y * sumStep + rect.x + rect.width, d_sum);
Ncv32u sum_br = getElemSum<tbCacheTexture>(texSum, sum_offset + (rect.y + rect.height) * sumStep + rect.x + rect.width, d_sum);
Ncv32u sum_val = sum_br + sum_tl - sum_tr - sum_bl;
Ncv64u sqsum_tl, sqsum_bl, sqsum_tr, sqsum_br;
sqsum_tl = getElemSqSum<tbCacheTexture>(sqsum_offset + rect.y * sqsumStep + rect.x, d_sqsum);
sqsum_bl = getElemSqSum<tbCacheTexture>(sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x, d_sqsum);
sqsum_tr = getElemSqSum<tbCacheTexture>(sqsum_offset + rect.y * sqsumStep + rect.x + rect.width, d_sqsum);
sqsum_br = getElemSqSum<tbCacheTexture>(sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x + rect.width, d_sqsum);
sqsum_tl = getElemSqSum<tbCacheTexture>(texSumSq, sqsum_offset + rect.y * sqsumStep + rect.x, d_sqsum);
sqsum_bl = getElemSqSum<tbCacheTexture>(texSumSq, sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x, d_sqsum);
sqsum_tr = getElemSqSum<tbCacheTexture>(texSumSq, sqsum_offset + rect.y * sqsumStep + rect.x + rect.width, d_sqsum);
sqsum_br = getElemSqSum<tbCacheTexture>(texSumSq, sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x + rect.width, d_sqsum);
Ncv64u sqsum_val = sqsum_br + sqsum_tl - sqsum_tr - sqsum_bl;
Ncv32f mean = sum_val * invRectArea;
@ -897,31 +795,12 @@ NCVStatus nppiStRectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
dim3 grid(((roi.width + NUM_RECTSTDDEV_THREADS - 1) / NUM_RECTSTDDEV_THREADS), roi.height);
dim3 block(NUM_RECTSTDDEV_THREADS);
cv::cudev::Texture<Ncv32u> texSum((roi.height + rect.y + rect.height) * sumStep * sizeof(Ncv32u), d_sum);
cv::cudev::Texture<Ncv64u> texSumSq((roi.height + rect.y + rect.height) * sqsumStep * sizeof(Ncv64u), d_sqsum);
// (old path: cudaCreateChannelDesc + cudaBindTexture of d_sum/d_sqsum to tex32u/tex64u, removed)
if (!readThruTexture)
    rectStdDev_32f_C1R<false><<<grid, block, 0, nppStGetActiveCUDAstream()>>>(texSum, texSumSq, d_sum, sumStep, d_sqsum, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
else
    rectStdDev_32f_C1R<true><<<grid, block, 0, nppStGetActiveCUDAstream()>>>(texSum, texSumSq, NULL, sumStep, NULL, sqsumStep, d_norm, normStep, roi, rect, invRectArea);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
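For reference, the kernel computes the window statistics with the classic four-corner identity on the integral images. With S the sum image, Q the squared-sum image and A = rect.width * rect.height (so invRectArea = 1/A), the elided tail of the kernel evaluates, in effect:

\Sigma = S_{br} + S_{tl} - S_{tr} - S_{bl}, \qquad
\mu = \frac{\Sigma}{A}, \qquad
\sigma = \sqrt{\frac{Q_{br} + Q_{tl} - Q_{tr} - Q_{bl}}{A} - \mu^2}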
@ -1553,40 +1432,24 @@ NCVStatus nppsStCompact_32f_host(Ncv32f *h_src, Ncv32u srcLen,
//
//==============================================================================
texture <float, 1, cudaReadModeElementType> texSrc;
texture <float, 1, cudaReadModeElementType> texKernel;
__forceinline__ __device__ float getValueMirrorRow(const int rowOffset,
int i,
int w)
__forceinline__ __device__ float getValueMirrorRow(cv::cudev::TexturePtr<Ncv32f> tex, const int rowOffset, int i, int w)
{
if (i < 0) i = 1 - i;
if (i >= w) i = w + w - i - 1;
return tex1Dfetch (texSrc, rowOffset + i);
return tex(rowOffset + i);
}
__forceinline__ __device__ float getValueMirrorColumn(const int offset,
const int rowStep,
int j,
int h)
__forceinline__ __device__ float getValueMirrorColumn(cv::cudev::TexturePtr<Ncv32f> tex, const int offset, const int rowStep, int j, int h)
{
if (j < 0) j = 1 - j;
if (j >= h) j = h + h - j - 1;
return tex1Dfetch (texSrc, offset + j * rowStep);
return tex(offset + j * rowStep);
}
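Both helpers handle out-of-range taps by mirroring; evaluating the arithmetic shown above for w = 5 gives i = -1 -> 2 and i = 5 -> 4. A host-side transcription of the same mapping for quick checking (sketch):

// Host-side transcription of the mirror index mapping used above:
int mirrorIndex(int i, int w)
{
    if (i < 0)  i = 1 - i;          // e.g. w = 5: i = -1 -> 2
    if (i >= w) i = w + w - i - 1;  // e.g. w = 5: i =  5 -> 4
    return i;
}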
__global__ void FilterRowBorderMirror_32f_C1R(Ncv32u srcStep,
Ncv32f *pDst,
NcvSize32u dstSize,
Ncv32u dstStep,
NcvRect32u roi,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier)
__global__ void FilterRowBorderMirror_32f_C1R(cv::cudev::TexturePtr<Ncv32f> texSrc, cv::cudev::TexturePtr<Ncv32f> texKernel1, Ncv32u srcStep, Ncv32f *pDst, NcvSize32u dstSize, Ncv32u dstStep,
NcvRect32u roi, Ncv32s nKernelSize, Ncv32s nAnchor, Ncv32f multiplier)
{
// position within ROI
const int ix = blockDim.x * blockIdx.x + threadIdx.x;
@ -1606,22 +1469,16 @@ __global__ void FilterRowBorderMirror_32f_C1R(Ncv32u srcStep,
float sum = 0.0f;
for (int m = 0; m < nKernelSize; ++m)
{
sum += getValueMirrorRow (rowOffset, ix + m - p, roi.width)
* tex1Dfetch (texKernel, m);
sum += getValueMirrorRow(texSrc, rowOffset, ix + m - p, roi.width)
* texKernel1(m);
}
pDst[iy * dstStep + ix] = sum * multiplier;
}
__global__ void FilterColumnBorderMirror_32f_C1R(Ncv32u srcStep,
Ncv32f *pDst,
NcvSize32u dstSize,
Ncv32u dstStep,
NcvRect32u roi,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier)
__global__ void FilterColumnBorderMirror_32f_C1R(cv::cudev::TexturePtr<Ncv32f> texSrc, cv::cudev::TexturePtr<Ncv32f> texKernel, Ncv32u srcStep, Ncv32f *pDst, NcvSize32u dstSize, Ncv32u dstStep,
NcvRect32u roi, Ncv32s nKernelSize, Ncv32s nAnchor, Ncv32f multiplier)
{
const int ix = blockDim.x * blockIdx.x + threadIdx.x;
const int iy = blockDim.y * blockIdx.y + threadIdx.y;
@ -1638,15 +1495,15 @@ __global__ void FilterColumnBorderMirror_32f_C1R(Ncv32u srcStep,
float sum = 0.0f;
for (int m = 0; m < nKernelSize; ++m)
{
sum += getValueMirrorColumn (offset, srcStep, iy + m - p, roi.height)
* tex1Dfetch (texKernel, m);
sum += getValueMirrorColumn(texSrc, offset, srcStep, iy + m - p, roi.height)
* texKernel(m);
}
pDst[ix + iy * dstStep] = sum * multiplier;
}
NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
NCVStatus nppiStFilterRowBorder_32f_C1R(Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
Ncv32f *pDst,
@ -1654,7 +1511,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
Ncv32u nDstStep,
NcvRect32u oROI,
NppStBorderType borderType,
const Ncv32f *pKernel,
Ncv32f *pKernel,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier)
@ -1686,12 +1543,8 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
oROI.height = srcSize.height - oROI.y;
}
cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc <float> ();
texSrc.normalized = false;
texKernel.normalized = false;
cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep);
cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f));
cv::cudev::Texture<Ncv32f> texSrc(srcSize.height * nSrcStep, pSrc);
cv::cudev::Texture<Ncv32f> texKernel(nKernelSize * sizeof(Ncv32f), pKernel);
dim3 ctaSize (32, 6);
dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x,
@ -1706,8 +1559,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
case nppStBorderWrap:
return NPPST_ERROR;
case nppStBorderMirror:
FilterRowBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
FilterRowBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>(texSrc, texKernel, srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
break;
default:
@ -1718,7 +1570,7 @@ NCVStatus nppiStFilterRowBorder_32f_C1R(const Ncv32f *pSrc,
}
NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
NCVStatus nppiStFilterColumnBorder_32f_C1R(Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
Ncv32f *pDst,
@ -1726,7 +1578,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
Ncv32u nDstStep,
NcvRect32u oROI,
NppStBorderType borderType,
const Ncv32f *pKernel,
Ncv32f *pKernel,
Ncv32s nKernelSize,
Ncv32s nAnchor,
Ncv32f multiplier)
@ -1758,12 +1610,8 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
oROI.height = srcSize.height - oROI.y;
}
cudaChannelFormatDesc floatChannel = cudaCreateChannelDesc <float> ();
texSrc.normalized = false;
texKernel.normalized = false;
cudaBindTexture (0, texSrc, pSrc, floatChannel, srcSize.height * nSrcStep);
cudaBindTexture (0, texKernel, pKernel, floatChannel, nKernelSize * sizeof (Ncv32f));
cv::cudev::Texture<Ncv32f> texSrc(srcSize.height * nSrcStep, pSrc);
cv::cudev::Texture<Ncv32f> texKernel(nKernelSize * sizeof(Ncv32f), pKernel);
dim3 ctaSize (32, 6);
dim3 gridSize ((oROI.width + ctaSize.x - 1) / ctaSize.x,
@ -1776,8 +1624,7 @@ NCVStatus nppiStFilterColumnBorder_32f_C1R(const Ncv32f *pSrc,
case nppStBorderWrap:
return NPPST_ERROR;
case nppStBorderMirror:
FilterColumnBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
FilterColumnBorderMirror_32f_C1R <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>(texSrc, texKernel, srcStep, pDst, dstSize, dstStep, oROI, nKernelSize, nAnchor, multiplier);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
break;
default:
@ -1800,16 +1647,11 @@ inline Ncv32u iDivUp(Ncv32u num, Ncv32u denom)
return (num + denom - 1)/denom;
}
texture<float, 2, cudaReadModeElementType> tex_src1;
texture<float, 2, cudaReadModeElementType> tex_src0;
__global__ void BlendFramesKernel(const float *u, const float *v, // forward flow
const float *ur, const float *vr, // backward flow
const float *o0, const float *o1, // coverage masks
int w, int h, int s,
float theta, float *out)
__global__ void BlendFramesKernel(cv::cudev::TexturePtr<Ncv32f> texSrc0, cv::cudev::TexturePtr<Ncv32f> texSrc1,
const float *u, const float *v, // forward flow
const float *ur, const float *vr, // backward flow
const float *o0, const float *o1, // coverage masks
int w, int h, int s, float theta, float *out)
{
const int ix = threadIdx.x + blockDim.x * blockIdx.x;
const int iy = threadIdx.y + blockDim.y * blockIdx.y;
@ -1829,27 +1671,17 @@ __global__ void BlendFramesKernel(const float *u, const float *v, // forward f
bool b0 = o0[pos] > 1e-4f;
bool b1 = o1[pos] > 1e-4f;
if (b0 && b1)
{
// pixel is visible on both frames
out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta) * (1.0f - theta) +
tex2D(tex_src1, x + _u * (1.0f - theta), y + _v * (1.0f - theta)) * theta;
}
else if (b0)
{
// visible on the first frame only
out[pos] = tex2D(tex_src0, x - _u * theta, y - _v * theta);
}
else
{
// visible on the second frame only
out[pos] = tex2D(tex_src1, x - _ur * (1.0f - theta), y - _vr * (1.0f - theta));
}
if (b0 && b1) // pixel is visible on both frames
out[pos] = texSrc0(y - _v * theta, x - _u * theta) * (1.0f - theta) + texSrc1(y + _v * (1.0f - theta), x + _u * (1.0f - theta)) * theta;
else if (b0) // visible on the first frame only
out[pos] = texSrc0(y - _v * theta, x - _u * theta);
else // visible on the second frame only
out[pos] = texSrc1(y - _vr * (1.0f - theta), x - _ur * (1.0f - theta));
}
NCVStatus BlendFrames(const Ncv32f *src0,
const Ncv32f *src1,
NCVStatus BlendFrames(Ncv32f *src0,
Ncv32f *src1,
const Ncv32f *ufi,
const Ncv32f *vfi,
const Ncv32f *ubi,
@ -1862,29 +1694,13 @@ NCVStatus BlendFrames(const Ncv32f *src0,
Ncv32f theta,
Ncv32f *out)
{
tex_src1.addressMode[0] = cudaAddressModeClamp;
tex_src1.addressMode[1] = cudaAddressModeClamp;
tex_src1.filterMode = cudaFilterModeLinear;
tex_src1.normalized = false;
tex_src0.addressMode[0] = cudaAddressModeClamp;
tex_src0.addressMode[1] = cudaAddressModeClamp;
tex_src0.filterMode = cudaFilterModeLinear;
tex_src0.normalized = false;
cudaChannelFormatDesc desc = cudaCreateChannelDesc <float> ();
const Ncv32u pitch = stride * sizeof (float);
ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src1, src1, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR);
ncvAssertCUDAReturn (cudaBindTexture2D (0, tex_src0, src0, desc, width, height, pitch), NPPST_TEXTURE_BIND_ERROR);
cv::cudev::Texture<Ncv32f> texSrc0(height, width, src0, pitch, false, cudaFilterModeLinear);
cv::cudev::Texture<Ncv32f> texSrc1(height, width, src1, pitch, false, cudaFilterModeLinear);
dim3 threads (32, 4);
dim3 blocks (iDivUp (width, threads.x), iDivUp (height, threads.y));
BlendFramesKernel<<<blocks, threads, 0, nppStGetActiveCUDAstream ()>>>
(ufi, vfi, ubi, vbi, o1, o2, width, height, stride, theta, out);
BlendFramesKernel<<<blocks, threads, 0, nppStGetActiveCUDAstream ()>>>(texSrc0, texSrc1, ufi, vfi, ubi, vbi, o1, o2, width, height, stride, theta, out);
ncvAssertCUDALastErrorReturn(NPPST_CUDA_KERNEL_EXECUTION_ERROR);
return NPPST_SUCCESS;
}
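For reviewers new to the cv::cudev::Texture wrapper, here is a minimal sketch of the migration pattern applied throughout this PR (the names sampleKernel and launch are illustrative only, not part of the change). The wrapper is an RAII replacement for the old bind/unbind pairs, it converts implicitly to the device-side TexturePtr, and its operator() takes (row, column) where tex2D took (x, y) — which is why every call site above swaps its argument order:

#include <opencv2/cudev/ptr2d/texture.hpp>

// Illustrative kernel: TexturePtr is the device-side view of a Texture.
__global__ void sampleKernel(cv::cudev::TexturePtr<float> tex, float* out, int w, int h)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < w && y < h)
        out[y * w + x] = tex(y, x); // (row, column) -- tex2D used (x, y)
}

void launch(float* src, float* out, int w, int h, size_t pitchBytes)
{
    // One RAII object instead of cudaBindTexture2D/cudaUnbindTexture;
    // constructor arguments follow the usage in the hunks above.
    cv::cudev::Texture<float> tex(h, w, src, pitchBytes, false, cudaFilterModeLinear);
    dim3 block(32, 8);
    dim3 grid((w + block.x - 1) / block.x, (h + block.y - 1) / block.y);
    sampleKernel<<<grid, block>>>(tex, out, w, h); // implicit Texture -> TexturePtr
    cudaDeviceSynchronize();
}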
@ -2255,44 +2071,27 @@ NCVStatus nppiStVectorWarp_PSF2x2_32f_C1(const Ncv32f *pSrc,
//
//==============================================================================
texture <float, 2, cudaReadModeElementType> texSrc2D;
__forceinline__
__device__ float processLine(int spos,
float xmin,
float xmax,
int ixmin,
int ixmax,
float fxmin,
float cxmax)
__device__ float processLine(cv::cudev::TexturePtr<Ncv32f> tex, int spos, float xmin, float xmax, int ixmin, int ixmax, float fxmin, float cxmax)
{
// first element
float wsum = 1.0f - xmin + fxmin;
float sum = tex1Dfetch(texSrc, spos) * (1.0f - xmin + fxmin);
float sum = tex(spos) * (1.0f - xmin + fxmin);
spos++;
for (int ix = ixmin + 1; ix < ixmax; ++ix)
{
sum += tex1Dfetch(texSrc, spos);
sum += tex(spos);
spos++;
wsum += 1.0f;
}
sum += tex1Dfetch(texSrc, spos) * (cxmax - xmax);
sum += tex(spos) * (cxmax - xmax);
wsum += cxmax - xmax;
return sum / wsum;
}
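In words: processLine forms the weighted average of one source scanline over the fractional interval [xmin, xmax], with partial weights at the two boundary texels (fxmin = floor(xmin), cxmax = ceil(xmax)):

$$\text{processLine}=\frac{(1-x_{\min}+\lfloor x_{\min}\rfloor)\,s_{i_{\min}}+\sum_{i=i_{\min}+1}^{i_{\max}-1}s_i+(\lceil x_{\max}\rceil-x_{\max})\,s_{i_{\max}}}{(1-x_{\min}+\lfloor x_{\min}\rfloor)+(i_{\max}-i_{\min}-1)+(\lceil x_{\max}\rceil-x_{\max})}$$

where $s_i$ is the texel fetched through tex at linear position spos + (i - ixmin).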
__global__ void resizeSuperSample_32f(NcvSize32u srcSize,
Ncv32u srcStep,
NcvRect32u srcROI,
Ncv32f *dst,
NcvSize32u dstSize,
Ncv32u dstStep,
NcvRect32u dstROI,
Ncv32f scaleX,
Ncv32f scaleY)
__global__ void resizeSuperSample_32f(cv::cudev::TexturePtr<Ncv32f> texSrc, NcvSize32u srcSize, Ncv32u srcStep, NcvRect32u srcROI, Ncv32f *dst, NcvSize32u dstSize, Ncv32u dstStep,
NcvRect32u dstROI, Ncv32f scaleX, Ncv32f scaleY)
{
// position within dst ROI
const int ix = blockIdx.x * blockDim.x + threadIdx.x;
@ -2332,18 +2131,18 @@ __global__ void resizeSuperSample_32f(NcvSize32u srcSize,
float wsum = 1.0f - yBegin + floorYBegin;
float sum = processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
float sum = processLine (texSrc, pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
ceilXEnd) * (1.0f - yBegin + floorYBegin);
pos += srcStep;
for (int iy = iYBegin + 1; iy < iYEnd; ++iy)
{
sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
sum += processLine (texSrc, pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
ceilXEnd);
pos += srcStep;
wsum += 1.0f;
}
sum += processLine (pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
sum += processLine (texSrc, pos, xBegin, xEnd, iXBegin, iXEnd, floorXBegin,
ceilXEnd) * (ceilYEnd - yEnd);
wsum += ceilYEnd - yEnd;
sum /= wsum;
@ -2372,14 +2171,7 @@ __device__ float bicubicCoeff(float x_)
}
__global__ void resizeBicubic(NcvSize32u srcSize,
NcvRect32u srcROI,
NcvSize32u dstSize,
Ncv32u dstStep,
Ncv32f *dst,
NcvRect32u dstROI,
Ncv32f scaleX,
Ncv32f scaleY)
__global__ void resizeBicubic(cv::cudev::TexturePtr<Ncv32f> texSrc, NcvSize32u srcSize, NcvRect32u srcROI, NcvSize32u dstSize, Ncv32u dstStep, Ncv32f *dst, NcvRect32u dstROI, Ncv32f scaleX, Ncv32f scaleY)
{
const int ix = blockIdx.x * blockDim.x + threadIdx.x;
const int iy = blockIdx.y * blockDim.y + threadIdx.y;
@ -2433,7 +2225,7 @@ __global__ void resizeBicubic(NcvSize32u srcSize,
float wx = bicubicCoeff (xDist);
float wy = bicubicCoeff (yDist);
wx *= wy;
sum += wx * tex2D (texSrc2D, cx * dx, cy * dy);
sum += wx * texSrc(cy * dy, cx * dx);
wsum += wx;
}
}
@ -2441,7 +2233,7 @@ __global__ void resizeBicubic(NcvSize32u srcSize,
}
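Note the sampling convention the bicubic path relies on: the replacement texture below is created with normalizedCoords = true and cudaAddressModeMirror, so a fetch at (u, v) in [0,1]² reads the pixel at (u·W, v·H) with out-of-range coordinates reflected back into the image; dx and dy are presumably the reciprocal source dimensions (their definitions fall outside this hunk):

$$\texttt{texSrc}(v,u)=I\big(\operatorname{mirror}(v\cdot H),\ \operatorname{mirror}(u\cdot W)\big),\qquad (u,v)\in[0,1]^2$$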
NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
NCVStatus nppiStResize_32f_C1R(Ncv32f *pSrc,
NcvSize32u srcSize,
Ncv32u nSrcStep,
NcvRect32u srcROI,
@ -2469,33 +2261,17 @@ NCVStatus nppiStResize_32f_C1R(const Ncv32f *pSrc,
if (interpolation == nppStSupersample)
{
// bind texture
cudaBindTexture (0, texSrc, pSrc, srcSize.height * nSrcStep);
// invoke kernel
cv::cudev::Texture<Ncv32f> texSrc(srcSize.height * nSrcStep, pSrc);
dim3 ctaSize (32, 6);
dim3 gridSize ((dstROI.width + ctaSize.x - 1) / ctaSize.x,
(dstROI.height + ctaSize.y - 1) / ctaSize.y);
resizeSuperSample_32f <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcSize, srcStep, srcROI, pDst, dstSize, dstStep, dstROI, 1.0f / xFactor, 1.0f / yFactor);
dim3 gridSize ((dstROI.width + ctaSize.x - 1) / ctaSize.x, (dstROI.height + ctaSize.y - 1) / ctaSize.y);
resizeSuperSample_32f <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>> (texSrc, srcSize, srcStep, srcROI, pDst, dstSize, dstStep, dstROI, 1.0f / xFactor, 1.0f / yFactor);
}
else if (interpolation == nppStBicubic)
{
texSrc2D.addressMode[0] = cudaAddressModeMirror;
texSrc2D.addressMode[1] = cudaAddressModeMirror;
texSrc2D.normalized = true;
cudaChannelFormatDesc desc = cudaCreateChannelDesc <float> ();
cudaBindTexture2D (0, texSrc2D, pSrc, desc, srcSize.width, srcSize.height,
nSrcStep);
cv::cudev::Texture<float> texSrc(srcSize.height, srcSize.width, pSrc, nSrcStep, true, cudaFilterModePoint, cudaAddressModeMirror);
dim3 ctaSize (32, 6);
dim3 gridSize ((dstSize.width + ctaSize.x - 1) / ctaSize.x,
(dstSize.height + ctaSize.y - 1) / ctaSize.y);
resizeBicubic <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>>
(srcSize, srcROI, dstSize, dstStep, pDst, dstROI, 1.0f / xFactor, 1.0f / yFactor);
dim3 gridSize ((dstSize.width + ctaSize.x - 1) / ctaSize.x, (dstSize.height + ctaSize.y - 1) / ctaSize.y);
resizeBicubic <<<gridSize, ctaSize, 0, nppStGetActiveCUDAstream ()>>> (texSrc, srcSize, srcROI, dstSize, dstStep, pDst, dstROI, 1.0f / xFactor, 1.0f / yFactor);
}
else
{

@ -46,29 +46,27 @@
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
using namespace cv::cuda;
using namespace cv::cuda::device;
namespace optflowbm
{
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_prev(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_curr(false, cudaFilterModePoint, cudaAddressModeClamp);
__device__ int cmpBlocks(int X1, int Y1, int X2, int Y2, int2 blockSize)
__device__ int cmpBlocks(cv::cudev::TexturePtr<uchar> texCurr, cv::cudev::TexturePtr<uchar> texPrev, int X1, int Y1, int X2, int Y2, int2 blockSize)
{
int s = 0;
for (int y = 0; y < blockSize.y; ++y)
{
for (int x = 0; x < blockSize.x; ++x)
s += ::abs(tex2D(tex_prev, X1 + x, Y1 + y) - tex2D(tex_curr, X2 + x, Y2 + y));
s += ::abs(texPrev(Y1 + y, X1 + x) - texCurr(Y2 + y, X2 + x));
}
return s;
}
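cmpBlocks is a plain sum-of-absolute-differences block comparison between the previous and current frames; with (b_x, b_y) = blockSize:

$$\mathrm{SAD}=\sum_{y=0}^{b_y-1}\sum_{x=0}^{b_x-1}\big|\,I_{\mathrm{prev}}(Y_1+y,\,X_1+x)-I_{\mathrm{curr}}(Y_2+y,\,X_2+x)\,\big|$$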
__global__ void calcOptFlowBM(PtrStepSzf velx, PtrStepf vely, const int2 blockSize, const int2 shiftSize, const bool usePrevious,
__global__ void calcOptFlowBM(cv::cudev::TexturePtr<uchar> texPrev, cv::cudev::TexturePtr<uchar> texCurr, PtrStepSzf velx, PtrStepf vely, const int2 blockSize, const int2 shiftSize, const bool usePrevious,
const int maxX, const int maxY, const int acceptLevel, const int escapeLevel,
const short2* ss, const int ssCount)
{
@ -90,7 +88,7 @@ namespace optflowbm
int dist = numeric_limits<int>::max();
if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
dist = cmpBlocks(X1, Y1, X2, Y2, blockSize);
dist = cmpBlocks(texPrev, texCurr, X1, Y1, X2, Y2, blockSize);
int countMin = 1;
int sumx = offX;
@ -111,7 +109,7 @@ namespace optflowbm
if (0 <= X2 && X2 <= maxX && 0 <= Y2 && Y2 <= maxY)
{
const int tmpDist = cmpBlocks(X1, Y1, X2, Y2, blockSize);
const int tmpDist = cmpBlocks(texPrev, texCurr, X1, Y1, X2, Y2, blockSize);
if (tmpDist < acceptLevel)
{
sumx = dx;
@ -151,16 +149,12 @@ namespace optflowbm
void calc(PtrStepSzb prev, PtrStepSzb curr, PtrStepSzf velx, PtrStepSzf vely, int2 blockSize, int2 shiftSize, bool usePrevious,
int maxX, int maxY, int acceptLevel, int escapeLevel, const short2* ss, int ssCount, cudaStream_t stream)
{
bindTexture(&tex_prev, prev);
bindTexture(&tex_curr, curr);
cv::cudev::Texture<uchar> texPrev(prev);
cv::cudev::Texture<uchar> texCurr(curr);
const dim3 block(32, 8);
const dim3 grid(divUp(velx.cols, block.x), divUp(vely.rows, block.y));
calcOptFlowBM<<<grid, block, 0, stream>>>(velx, vely, blockSize, shiftSize, usePrevious,
maxX, maxY, acceptLevel, escapeLevel, ss, ssCount);
calcOptFlowBM<<<grid, block, 0, stream>>>(texPrev, texCurr, velx, vely, blockSize, shiftSize, usePrevious, maxX, maxY, acceptLevel, escapeLevel, ss, ssCount);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}

@ -100,7 +100,8 @@ bool TestHypothesesGrow::process()
NCV_SKIP_COND_BEGIN
ncvAssertReturn(this->src.fill(h_vecSrc), false);
memset(h_vecDst.ptr(), 0, h_vecDst.length() * sizeof(NcvRect32u));
*h_vecDst.ptr() = {};
NCVVectorReuse<Ncv32u> h_vecDst_as32u(h_vecDst.getSegment(), lenDst * sizeof(NcvRect32u) / sizeof(Ncv32u));
ncvAssertReturn(h_vecDst_as32u.isMemReused(), false);
ncvAssertReturn(this->src.fill(h_vecDst_as32u), false);

@ -46,6 +46,7 @@
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/warp_shuffle.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
namespace cv { namespace cuda { namespace device
{
@ -825,64 +826,57 @@ namespace cv { namespace cuda { namespace device
//-------------------------------------------------------------------
// Resize
texture<uchar4, 2, cudaReadModeNormalizedFloat> resize8UC4_tex;
texture<uchar, 2, cudaReadModeNormalizedFloat> resize8UC1_tex;
__global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar> dst, int colOfs)
__global__ void resize_for_hog_kernel(cv::cudev::TexturePtr<uchar, float> src, float sx, float sy, PtrStepSz<uchar> dst)
{
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
dst.ptr(y)[x] = tex2D(resize8UC1_tex, x * sx + colOfs, y * sy) * 255;
dst.ptr(y)[x] = src(y * sy, x * sx) * 255;
}
__global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar4> dst, int colOfs)
__global__ void resize_for_hog_kernel(cv::cudev::TexturePtr<uchar4, float4> src, float sx, float sy, PtrStepSz<uchar4> dst)
{
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < dst.cols && y < dst.rows)
{
float4 val = tex2D(resize8UC4_tex, x * sx + colOfs, y * sy);
float4 val = src(y * sy, x * sx);
dst.ptr(y)[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255);
}
}
template<class T, class TEX>
static void resize_for_hog(const PtrStepSzb& src, PtrStepSzb dst, TEX& tex)
static void resize_for_hog_8UC1(const PtrStepSzb& src, PtrStepSzb dst)
{
tex.filterMode = cudaFilterModeLinear;
size_t texOfs = 0;
int colOfs = 0;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
if (texOfs != 0)
{
colOfs = static_cast<int>( texOfs/sizeof(T) );
cudaSafeCall( cudaUnbindTexture(tex) );
cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
}
cv::cudev::Texture<uchar,float> tex(src.rows, src.cols, src.data, src.step, false, cudaFilterModeLinear, cudaAddressModeClamp, cudaReadModeNormalizedFloat);
dim3 threads(32, 8);
dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
float sx = static_cast<float>(src.cols) / dst.cols;
float sy = static_cast<float>(src.rows) / dst.rows;
resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (PtrStepSz<T>)dst, colOfs);
resize_for_hog_kernel<<<grid, threads>>>(tex, sx, sy, (PtrStepSz<uchar>)dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
static void resize_for_hog_8UC4(const PtrStepSzb& src, PtrStepSzb dst)
{
cv::cudev::Texture<uchar4, float4> tex(src.rows, src.cols, reinterpret_cast<uchar4*>(src.data), src.step, false, cudaFilterModeLinear, cudaAddressModeClamp, cudaReadModeNormalizedFloat);
dim3 threads(32, 8);
dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
float sx = static_cast<float>(src.cols) / dst.cols;
float sy = static_cast<float>(src.rows) / dst.rows;
resize_for_hog_kernel<<<grid, threads>>>(tex, sx, sy, (PtrStepSz<uchar4>)dst);
cudaSafeCall(cudaGetLastError());
cudaSafeCall(cudaDeviceSynchronize());
}
void resize_8UC1(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
void resize_8UC4(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }
void resize_8UC1(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog_8UC1(src, dst); }
void resize_8UC4(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog_8UC4(src, dst); }
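A note on the * 255 in both kernels above: the textures are created with cudaReadModeNormalizedFloat, so an 8-bit texel p is returned (after bilinear filtering) as p/255 in [0, 1], and the kernel rescales before the 8-bit store:

$$\mathrm{dst}(y,x)=255\cdot\mathrm{tex}(y\cdot s_y,\ x\cdot s_x),\qquad s_x=\tfrac{W_{\mathrm{src}}}{W_{\mathrm{dst}}},\ \ s_y=\tfrac{H_{\mathrm{src}}}{H_{\mathrm{dst}}}$$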
} // namespace hog
}}} // namespace cv { namespace cuda { namespace cudev

@ -222,7 +222,7 @@ INSTANTIATE_TEST_CASE_P(CUDA_ObjDetect, HOG, ALL_DEVICES);
*/
//============== caltech hog tests =====================//
struct CalTech : public ::testing::TestWithParam<tuple<cv::cuda::DeviceInfo, std::string> >
struct CalTech : public ::testing::TestWithParam<tuple<cv::cuda::DeviceInfo, std::string, bool>>
{
cv::cuda::DeviceInfo devInfo;
cv::Mat img;
@ -232,7 +232,13 @@ struct CalTech : public ::testing::TestWithParam<tuple<cv::cuda::DeviceInfo, std
devInfo = GET_PARAM(0);
cv::cuda::setDevice(devInfo.deviceID());
img = readImage(GET_PARAM(1), cv::IMREAD_GRAYSCALE);
const bool grayScale = GET_PARAM(2);
if(grayScale)
img = readImage(GET_PARAM(1), IMREAD_GRAYSCALE);
else {
Mat imgBgr = readImage(GET_PARAM(1));
cv::cvtColor(imgBgr, img, COLOR_BGR2BGRA);
}
ASSERT_FALSE(img.empty());
}
};
@ -263,10 +269,11 @@ CUDA_TEST_P(CalTech, HOG)
#endif
}
#define GREYSCALE true, false
INSTANTIATE_TEST_CASE_P(detect, CalTech, testing::Combine(ALL_DEVICES,
::testing::Values<std::string>("caltech/image_00000009_0.png", "caltech/image_00000032_0.png",
"caltech/image_00000165_0.png", "caltech/image_00000261_0.png", "caltech/image_00000469_0.png",
"caltech/image_00000527_0.png", "caltech/image_00000574_0.png")));
"caltech/image_00000527_0.png", "caltech/image_00000574_0.png"), testing::Values(GREYSCALE)));
//------------------------variable GPU HOG Tests------------------------//

@ -50,8 +50,7 @@
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/filters.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
#include <iostream>
#include <opencv2/cudev/ptr2d/texture.hpp>
using namespace cv::cuda;
using namespace cv::cuda::device;
@ -64,224 +63,6 @@ namespace pyrlk
__constant__ int c_halfWin_y;
__constant__ int c_iters;
texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I8U(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I8UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<ushort4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I16UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J8U(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J8UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<ushort4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J16UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp);
template <int cn, typename T> struct Tex_I
{
static __host__ __forceinline__ void bindTexture_(PtrStepSz<typename TypeVec<T, cn>::vec_type> I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<1, uchar>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_I8U, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar>& I)
{
bindTexture(&tex_I8U, I);
}
};
template <> struct Tex_I<1, ushort>
{
static __device__ __forceinline__ float read(float x, float y)
{
return 0.0;
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort>& I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<1, int>
{
static __device__ __forceinline__ float read(float x, float y)
{
return 0.0;
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<int>& I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<1, float>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_If, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float>& I)
{
bindTexture(&tex_If, I);
}
};
// ****************** 3 channel specializations ************************
template <> struct Tex_I<3, uchar>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0,0,0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar3> I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<3, ushort>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0, 0, 0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort3> I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<3, int>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0, 0, 0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<int3> I)
{
CV_UNUSED(I);
}
};
template <> struct Tex_I<3, float>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0, 0, 0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float3> I)
{
CV_UNUSED(I);
}
};
// ****************** 4 channel specializations ************************
template <> struct Tex_I<4, uchar>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_I8UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar4>& I)
{
bindTexture(&tex_I8UC4, I);
}
};
template <> struct Tex_I<4, ushort>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_I16UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort4>& I)
{
bindTexture(&tex_I16UC4, I);
}
};
template <> struct Tex_I<4, float>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_If4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float4>& I)
{
bindTexture(&tex_If4, I);
}
};
// ************* J ***************
template <int cn, typename T> struct Tex_J
{
static __host__ __forceinline__ void bindTexture_(PtrStepSz<typename TypeVec<T,cn>::vec_type>& J)
{
CV_UNUSED(J);
}
};
template <> struct Tex_J<1, uchar>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_J8U, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar>& J)
{
bindTexture(&tex_J8U, J);
}
};
template <> struct Tex_J<1, float>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_Jf, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float>& J)
{
bindTexture(&tex_Jf, J);
}
};
// ************* 4 channel specializations ***************
template <> struct Tex_J<4, uchar>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_J8UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar4>& J)
{
bindTexture(&tex_J8UC4, J);
}
};
template <> struct Tex_J<4, ushort>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_J16UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort4>& J)
{
bindTexture(&tex_J16UC4, J);
}
};
template <> struct Tex_J<4, float>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_Jf4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float4>& J)
{
bindTexture(&tex_Jf4, J);
}
};
__device__ __forceinline__ void accum(float& dst, const float& val)
{
dst += val;
@ -364,8 +145,8 @@ namespace pyrlk
}
};
template <int cn, int PATCH_X, int PATCH_Y, bool calcErr, typename T>
__global__ void sparseKernel(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
template <int cn, int PATCH_X, int PATCH_Y, bool calcErr, typename T, class Ptr2D>
__global__ void sparseKernel(const Ptr2D texI, const Ptr2D texJ, const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
{
#if __CUDA_ARCH__ <= 110
const int BLOCK_SIZE = 128;
@ -413,15 +194,14 @@ namespace pyrlk
float x = prevPt.x + xBase + 0.5f;
float y = prevPt.y + yBase + 0.5f;
I_patch[i][j] = Tex_I<cn, T>::read(x, y);
I_patch[i][j] = texI(y, x);
// Scharr Deriv
work_type dIdx = 3.0f * Tex_I<cn,T>::read(x+1, y-1) + 10.0f * Tex_I<cn, T>::read(x+1, y) + 3.0f * Tex_I<cn,T>::read(x+1, y+1) -
(3.0f * Tex_I<cn,T>::read(x-1, y-1) + 10.0f * Tex_I<cn, T>::read(x-1, y) + 3.0f * Tex_I<cn,T>::read(x-1, y+1));
work_type dIdy = 3.0f * Tex_I<cn,T>::read(x-1, y+1) + 10.0f * Tex_I<cn, T>::read(x, y+1) + 3.0f * Tex_I<cn,T>::read(x+1, y+1) -
(3.0f * Tex_I<cn,T>::read(x-1, y-1) + 10.0f * Tex_I<cn, T>::read(x, y-1) + 3.0f * Tex_I<cn,T>::read(x+1, y-1));
work_type dIdx = 3.0f * texI(y - 1, x + 1) + 10.0f * texI(y, x + 1) + 3.0f * texI(y + 1, x + 1) -
(3.0f * texI(y - 1, x - 1) + 10.0f * texI(y, x - 1) + 3.0f * texI(y + 1, x - 1));
work_type dIdy = 3.0f * texI(y + 1, x - 1) + 10.0f * texI(y + 1, x) + 3.0f * texI(y + 1, x + 1) -
(3.0f * texI(y - 1, x - 1) + 10.0f * texI(y - 1, x) + 3.0f * texI(y - 1, x + 1));
dIdx_patch[i][j] = dIdx;
dIdy_patch[i][j] = dIdy;
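For readers checking the coordinate swap above: both the removed and the added lines implement the (unnormalized) 3×3 Scharr operator, with texI(y - 1, x + 1) being the tap at row offset −1, column offset +1:

$$G_x=\begin{bmatrix}-3&0&3\\-10&0&10\\-3&0&3\end{bmatrix},\qquad G_y=G_x^{\top}=\begin{bmatrix}-3&-10&-3\\0&0&0\\3&10&3\end{bmatrix}$$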
@ -490,7 +270,8 @@ namespace pyrlk
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn, T>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type J_val = texJ(nextPt.y + y + 0.5f, nextPt.x + x + 0.5f);
work_type diff = (J_val - I_val) * 32.0f;
@ -533,7 +314,8 @@ namespace pyrlk
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn, T>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type J_val = texJ(nextPt.y + y + 0.5f, nextPt.x + x + 0.5f);
work_type diff = J_val - I_val;
@ -749,6 +531,27 @@ namespace pyrlk
}
} // __global__ void sparseKernel_
// Specialization for non-float data: for integer channel types, cudaFilterModeLinear is only compatible with cudaReadModeNormalizedFloat.
template<int cn, class T> class TextureLinear : public cv::cudev::Texture<typename TypeVec<T, cn>::vec_type, typename TypeVec<float, cn>::vec_type> {
public:
typedef typename TypeVec<T, cn>::vec_type elem_type;
typedef typename TypeVec<float, cn>::vec_type ret_type;
__host__ TextureLinear(PtrStepSz<elem_type> src, const bool normalizedCoords = false, const cudaTextureAddressMode addressMode = cudaAddressModeClamp) :
cv::cudev::Texture<elem_type, ret_type>(src, normalizedCoords, cudaFilterModeLinear, addressMode, cudaReadModeNormalizedFloat)
{
}
};
// Specialization for float data: float channels do not support cudaReadModeNormalizedFloat, so cudaFilterModeLinear is paired with cudaReadModeElementType.
template<int cn> class TextureLinear<cn, float> : public cv::cudev::Texture<typename TypeVec<float, cn>::vec_type, typename TypeVec<float, cn>::vec_type>
{
public:
typedef typename TypeVec<float, cn>::vec_type float_type;
__host__ TextureLinear(PtrStepSz<float_type> src, const bool normalizedCoords = false, const cudaTextureAddressMode addressMode = cudaAddressModeClamp) :
cv::cudev::Texture <float_type, float_type>(src, normalizedCoords, cudaFilterModeLinear, addressMode, cudaReadModeElementType)
{
}
};
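A sketch of how the two TextureLinear variants above are meant to be instantiated (the function and argument names here are illustrative only): integer channel types get cudaReadModeNormalizedFloat fetches returning floats in [0, 1], while float channels keep cudaReadModeElementType; both can then use hardware cudaFilterModeLinear.

void makeTextures(PtrStepSz<uchar> I8u, PtrStepSzf I32f)
{
    TextureLinear<1, uchar> texU8(I8u);   // Texture<uchar, float>: fetches return [0,1] floats
    TextureLinear<1, float> texF32(I32f); // Texture<float, float>: fetches return the raw floats
}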
template <int cn, int PATCH_X, int PATCH_Y, typename T> class sparse_caller
{
@ -756,16 +559,16 @@ namespace pyrlk
static void call(PtrStepSz<typename TypeVec<T, cn>::vec_type> I, PtrStepSz<typename TypeVec<T, cn>::vec_type> J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream)
{
typedef typename TypeVec<T, cn>::vec_type dType;
typedef typename TypeVec<float, cn>::vec_type rType;
TextureLinear<cn,T> texI(I);
TextureLinear<cn,T> texJ(J);
dim3 grid(ptcount);
CV_UNUSED(I);
CV_UNUSED(J);
if (level == 0 && err)
sparseKernel<cn, PATCH_X, PATCH_Y, true, T> <<<grid, block, 0, stream >>>(prevPts, nextPts, status, err, level, rows, cols);
sparseKernel<cn, PATCH_X, PATCH_Y, true, T, cv::cudev::TexturePtr<dType,rType>><<<grid, block, 0, stream>>>(texI, texJ, prevPts, nextPts, status, err, level, rows, cols);
else
sparseKernel<cn, PATCH_X, PATCH_Y, false, T> <<<grid, block, 0, stream >>>(prevPts, nextPts, status, err, level, rows, cols);
sparseKernel<cn, PATCH_X, PATCH_Y, false, T, cv::cudev::TexturePtr<dType, rType>><<<grid, block, 0, stream>>>(texI, texJ, prevPts, nextPts, status, err, level, rows, cols);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
@ -903,8 +706,8 @@ namespace pyrlk
};
template <bool calcErr>
__global__ void denseKernel(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
template <bool calcErr, class Ptr2D>
__global__ void denseKernel(const Ptr2D texI, const Ptr2D texJ, PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
{
extern __shared__ int smem[];
@ -925,15 +728,15 @@ namespace pyrlk
float x = xBase - c_halfWin_x + j + 0.5f;
float y = yBase - c_halfWin_y + i + 0.5f;
I_patch[i * patchWidth + j] = tex2D(tex_If, x, y);
I_patch[i * patchWidth + j] = texI(y, x);
// Scharr Deriv
dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_If, x+1, y-1) + 10 * tex2D(tex_If, x+1, y) + 3 * tex2D(tex_If, x+1, y+1) -
(3 * tex2D(tex_If, x-1, y-1) + 10 * tex2D(tex_If, x-1, y) + 3 * tex2D(tex_If, x-1, y+1));
dIdx_patch[i * patchWidth + j] = 3 * texI(y - 1, x + 1) + 10 * texI(y, x + 1) + 3 * texI(y + 1, x + 1) -
(3 * texI(y - 1, x - 1) + 10 * texI(y, x - 1) + 3 * texI(y + 1, x - 1));
dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_If, x-1, y+1) + 10 * tex2D(tex_If, x, y+1) + 3 * tex2D(tex_If, x+1, y+1) -
(3 * tex2D(tex_If, x-1, y-1) + 10 * tex2D(tex_If, x, y-1) + 3 * tex2D(tex_If, x+1, y-1));
dIdy_patch[i * patchWidth + j] = 3 * texI(y + 1, x - 1) + 10 * texI(y + 1, x) + 3 * texI(y + 1, x + 1) -
(3 * texI(y - 1, x - 1) + 10 * texI(y - 1, x) + 3 * texI(y - 1, x + 1));
}
}
@ -1004,7 +807,7 @@ namespace pyrlk
for (int j = 0; j < c_winSize_x; ++j)
{
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
int J = texJ(nextPt.y - c_halfWin_y + i + 0.5f, nextPt.x - c_halfWin_x + j + 0.5f);
int diff = (J - I) * 32;
@ -1040,7 +843,8 @@ namespace pyrlk
for (int j = 0; j < c_winSize_x; ++j)
{
int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
int J = texJ(nextPt.y - c_halfWin_y + i + 0.5f, nextPt.x - c_halfWin_x + j + 0.5f);
errval += ::abs(J - I);
}
@ -1109,9 +913,6 @@ namespace pyrlk
{ sparse_caller<cn, 1, 5,T>::call, sparse_caller<cn, 2, 5,T>::call, sparse_caller<cn, 3, 5,T>::call, sparse_caller<cn, 4, 5,T>::call, sparse_caller<cn, 5, 5,T>::call }
};
Tex_I<cn, T>::bindTexture_(I);
Tex_J<cn, T>::bindTexture_(J);
funcs[patch.y - 1][patch.x - 1](I, J, I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
level, block, stream);
}
@ -1119,9 +920,8 @@ namespace pyrlk
{
dim3 block(16, 16);
dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));
Tex_I<1, T>::bindTexture_(I);
Tex_J<1, T>::bindTexture_(J);
TextureLinear<1, T> texI(I);
TextureLinear<1, T> texJ(J);
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
const int patchWidth = block.x + 2 * halfWin.x;
const int patchHeight = block.y + 2 * halfWin.y;
@ -1129,12 +929,12 @@ namespace pyrlk
if (err.data)
{
denseKernel<true> << <grid, block, smem_size, stream >> >(u, v, prevU, prevV, err, I.rows, I.cols);
denseKernel<true, cv::cudev::TexturePtr<T,float>><<<grid, block, smem_size, stream>>>(texI, texJ, u, v, prevU, prevV, err, I.rows, I.cols);
cudaSafeCall(cudaGetLastError());
}
else
{
denseKernel<false> << <grid, block, smem_size, stream >> >(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
denseKernel<false, cv::cudev::TexturePtr<T, float>><<<grid, block, smem_size, stream>>>(texI, texJ, u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
cudaSafeCall(cudaGetLastError());
}

@ -46,6 +46,7 @@
#include "opencv2/core/cuda/border_interpolate.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
using namespace cv::cuda;
using namespace cv::cuda::device;
@ -102,63 +103,8 @@ namespace tvl1flow
}
}
struct SrcTex
{
virtual ~SrcTex() {}
__device__ __forceinline__ virtual float I1(float x, float y) const = 0;
__device__ __forceinline__ virtual float I1x(float x, float y) const = 0;
__device__ __forceinline__ virtual float I1y(float x, float y) const = 0;
};
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1 (false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1x(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_I1y(false, cudaFilterModePoint, cudaAddressModeClamp);
struct SrcTexRef : SrcTex
{
__device__ __forceinline__ float I1(float x, float y) const CV_OVERRIDE
{
return tex2D(tex_I1, x, y);
}
__device__ __forceinline__ float I1x(float x, float y) const CV_OVERRIDE
{
return tex2D(tex_I1x, x, y);
}
__device__ __forceinline__ float I1y(float x, float y) const CV_OVERRIDE
{
return tex2D(tex_I1y, x, y);
}
};
struct SrcTexObj : SrcTex
{
__host__ SrcTexObj(cudaTextureObject_t tex_obj_I1_, cudaTextureObject_t tex_obj_I1x_, cudaTextureObject_t tex_obj_I1y_)
: tex_obj_I1(tex_obj_I1_), tex_obj_I1x(tex_obj_I1x_), tex_obj_I1y(tex_obj_I1y_) {}
__device__ __forceinline__ float I1(float x, float y) const CV_OVERRIDE
{
return tex2D<float>(tex_obj_I1, x, y);
}
__device__ __forceinline__ float I1x(float x, float y) const CV_OVERRIDE
{
return tex2D<float>(tex_obj_I1x, x, y);
}
__device__ __forceinline__ float I1y(float x, float y) const CV_OVERRIDE
{
return tex2D<float>(tex_obj_I1y, x, y);
}
cudaTextureObject_t tex_obj_I1;
cudaTextureObject_t tex_obj_I1x;
cudaTextureObject_t tex_obj_I1y;
};
template <
typename T,
typename = typename std::enable_if<std::is_base_of<SrcTex, T>::value>::type
>
__global__ void warpBackwardKernel(
const PtrStepSzf I0, const T src, const PtrStepf u1, const PtrStepf u2,
const PtrStepSzf I0, const cv::cudev::TexturePtr<float> I1, const cv::cudev::TexturePtr<float> I1x, const cv::cudev::TexturePtr<float> I1y, const PtrStepf u1, const PtrStepf u2,
PtrStepf I1w, PtrStepf I1wx, PtrStepf I1wy, PtrStepf grad, PtrStepf rho)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
@ -189,11 +135,9 @@ namespace tvl1flow
for (int cx = xmin; cx <= xmax; ++cx)
{
const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);
sum += w * src.I1(cx, cy);
sumx += w * src.I1x(cx, cy);
sumy += w * src.I1y(cx, cy);
sum += w * I1(cy, cx);
sumx += w * I1x(cy, cx);
sumy += w * I1y(cy, cx);
wsum += w;
}
}
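The loop above is a normalized bicubic gather at the forward-warped position; with k = bicubicCoeff and (w_x, w_y) = (x + u_1, y + u_2):

$$I_1^w(x,y)=\frac{\sum_{c_y}\sum_{c_x}k(w_x-c_x)\,k(w_y-c_y)\,I_1(c_y,c_x)}{\sum_{c_y}\sum_{c_x}k(w_x-c_x)\,k(w_y-c_y)}$$

and likewise for the warped derivatives I1x, I1y accumulated in sumx/sumy.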
@ -224,49 +168,14 @@ namespace tvl1flow
PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho,
cudaStream_t stream)
{
cv::cudev::Texture<float> texI1(I1);
cv::cudev::Texture<float> texI1x(I1x);
cv::cudev::Texture<float> texI1y(I1y);
const dim3 block(32, 8);
const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
if (cc30)
{
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.addressMode[2] = cudaAddressModeClamp;
cudaTextureObject_t texObj_I1 = 0, texObj_I1x = 0, texObj_I1y = 0;
createTextureObjectPitch2D(&texObj_I1, I1, texDesc);
createTextureObjectPitch2D(&texObj_I1x, I1x, texDesc);
createTextureObjectPitch2D(&texObj_I1y, I1y, texDesc);
warpBackwardKernel << <grid, block, 0, stream >> > (I0, SrcTexObj(texObj_I1, texObj_I1x, texObj_I1y), u1, u2, I1w, I1wx, I1wy, grad, rho);
cudaSafeCall(cudaGetLastError());
if (!stream)
cudaSafeCall(cudaDeviceSynchronize());
else
cudaSafeCall(cudaStreamSynchronize(stream));
cudaSafeCall(cudaDestroyTextureObject(texObj_I1));
cudaSafeCall(cudaDestroyTextureObject(texObj_I1x));
cudaSafeCall(cudaDestroyTextureObject(texObj_I1y));
}
else
{
bindTexture(&tex_I1, I1);
bindTexture(&tex_I1x, I1x);
bindTexture(&tex_I1y, I1y);
warpBackwardKernel << <grid, block, 0, stream >> > (I0, SrcTexRef(), u1, u2, I1w, I1wx, I1wy, grad, rho);
cudaSafeCall(cudaGetLastError());
if (!stream)
cudaSafeCall(cudaDeviceSynchronize());
}
warpBackwardKernel<<<grid, block, 0, stream>>>(I0, texI1, texI1x, texI1y, u1, u2, I1w, I1wx, I1wy, grad, rho);
cudaSafeCall(cudaGetLastError());
if (!stream)
cudaSafeCall(cudaDeviceSynchronize());
}
}

@ -43,8 +43,10 @@
#if !defined CUDA_DISABLER
#include "opencv2/core/cuda/common.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
#include <limits.h>
namespace cv { namespace cuda { namespace device
{
namespace stereobm
@ -601,13 +603,12 @@ namespace cv { namespace cuda { namespace device
/////////////////////////////////// Textureness filtering ////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////
texture<unsigned char, 2, cudaReadModeNormalizedFloat> texForTF;
__device__ __forceinline__ float sobel(int x, int y)
__device__ __forceinline__ float sobel(cv::cudev::TexturePtr<uchar, float> texSrc, int x, int y)
{
float conv = tex2D(texForTF, x - 1, y - 1) * (-1) + tex2D(texForTF, x + 1, y - 1) * (1) +
tex2D(texForTF, x - 1, y ) * (-2) + tex2D(texForTF, x + 1, y ) * (2) +
tex2D(texForTF, x - 1, y + 1) * (-1) + tex2D(texForTF, x + 1, y + 1) * (1);
float conv = texSrc(y - 1, x - 1) * (-1) + texSrc(y - 1, x + 1) * (1) +
texSrc(y, x - 1) * (-2) + texSrc(y, x + 1) * (2) +
texSrc(y + 1, x - 1) * (-1) + texSrc(y + 1, x + 1) * (1);
return fabs(conv);
}
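sobel evaluates the magnitude of the horizontal Sobel response through the normalized-float texture:

$$\mathrm{sobel}(x,y)=\big|\,(G_x*I)(x,y)\,\big|,\qquad G_x=\begin{bmatrix}-1&0&1\\-2&0&2\\-1&0&1\end{bmatrix}$$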
@ -635,7 +636,7 @@ namespace cv { namespace cuda { namespace device
#define RpT (2 * ROWSperTHREAD) // got experimentally
__global__ void textureness_kernel(PtrStepSzb disp, int winsz, float threshold)
__global__ void textureness_kernel(cv::cudev::TexturePtr<uchar,float> texSrc, PtrStepSzb disp, int winsz, float threshold)
{
int winsz2 = winsz/2;
int n_dirty_pixels = (winsz2) * 2;
@ -657,9 +658,9 @@ namespace cv { namespace cuda { namespace device
for(int i = y - winsz2; i <= y + winsz2; ++i)
{
sum += sobel(x - winsz2, i);
sum += sobel(texSrc, x - winsz2, i);
if (cols_extra)
sum_extra += sobel(x + blockDim.x - winsz2, i);
sum_extra += sobel(texSrc, x + blockDim.x - winsz2, i);
}
*cols = sum;
if (cols_extra)
@ -675,12 +676,12 @@ namespace cv { namespace cuda { namespace device
for(int y = beg_row + 1; y < end_row; ++y)
{
sum = sum - sobel(x - winsz2, y - winsz2 - 1) + sobel(x - winsz2, y + winsz2);
sum = sum - sobel(texSrc, x - winsz2, y - winsz2 - 1) + sobel(texSrc, x - winsz2, y + winsz2);
*cols = sum;
if (cols_extra)
{
sum_extra = sum_extra - sobel(x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(x + blockDim.x - winsz2, y + winsz2);
sum_extra = sum_extra - sobel(texSrc, x + blockDim.x - winsz2, y - winsz2 - 1) + sobel(texSrc, x + blockDim.x - winsz2, y + winsz2);
*cols_extra = sum_extra;
}
@ -697,28 +698,16 @@ namespace cv { namespace cuda { namespace device
void postfilter_textureness(const PtrStepSzb& input, int winsz, float avgTexturenessThreshold, const PtrStepSzb& disp, cudaStream_t & stream)
{
avgTexturenessThreshold *= winsz * winsz;
texForTF.filterMode = cudaFilterModeLinear;
texForTF.addressMode[0] = cudaAddressModeWrap;
texForTF.addressMode[1] = cudaAddressModeWrap;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D( 0, texForTF, input.data, desc, input.cols, input.rows, input.step ) );
cv::cudev::Texture<unsigned char, float> tex(input, false, cudaFilterModeLinear, cudaAddressModeWrap, cudaReadModeNormalizedFloat);
dim3 threads(128, 1, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(input.cols, threads.x);
grid.y = divUp(input.rows, RpT);
size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
textureness_kernel<<<grid, threads, smem_size, stream>>>(tex, disp, winsz, avgTexturenessThreshold);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
cudaSafeCall( cudaUnbindTexture (texForTF) );
}
} // namespace stereobm
}}} // namespace cv { namespace cuda { namespace cudev

@ -48,6 +48,7 @@
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
namespace cv { namespace cuda { namespace device
{
@ -77,8 +78,8 @@ namespace cv { namespace cuda { namespace device
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
BorderReader<PtrStep<T>, B<work_type>> brdSrc(src, brd);
Filter<BorderReader<PtrStep<T>, B<work_type>>> filter_src(brdSrc);
remap<<<grid, block, 0, stream>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
@ -98,8 +99,8 @@ namespace cv { namespace cuda { namespace device
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
BorderReader<PtrStep<T>, B<work_type>> brdSrc(src, brd);
Filter<BorderReader<PtrStep<T>, B<work_type>>> filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
cudaSafeCall( cudaGetLastError() );
@ -108,88 +109,96 @@ namespace cv { namespace cuda { namespace device
}
};
#define OPENCV_CUDA_IMPLEMENT_REMAP_TEX(type) \
texture< type , cudaTextureType2D> tex_remap_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_remap_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
int xoff, yoff; \
tex_remap_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_remap_ ## type , x + xoff, y + yoff); \
} \
}; \
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float* borderValue, bool cc20) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_remap_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy, \
PtrStepSz< type > dst, const float*, bool) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_remap_ ## type , srcWhole); \
tex_remap_ ## type ##_reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter< tex_remap_ ## type ##_reader > filter_src(texSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
} \
else \
{ \
BrdReplicate<type> brd(src.rows, src.cols); \
BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_remap_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
remap<<<grid, block>>>(filter_src, mapx, mapy, dst); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(uchar)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(uchar2)
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(uchar4)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(schar)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(char2)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(char4)
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(ushort)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(ushort2)
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(ushort4)
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(short)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(short2)
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(short4)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(int)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(int2)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(int4)
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(float)
//OPENCV_CUDA_IMPLEMENT_REMAP_TEX(float2)
OPENCV_CUDA_IMPLEMENT_REMAP_TEX(float4)
#undef OPENCV_CUDA_IMPLEMENT_REMAP_TEX
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcherNonStreamTex
{
static void call(PtrStepSz< T > src, PtrStepSz< T > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
PtrStepSz< T > dst, const float* borderValue, bool cc20)
{
typedef typename TypeVec<float, VecTraits< T >::cn>::vec_type work_type;
dim3 block(32, cc20 ? 8 : 4);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows)
{
cudev::Texture<T> texSrcWhole(srcWhole);
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader<cudev::TexturePtr<T>, B<work_type>> brdSrc(texSrcWhole, brd);
Filter<BorderReader<cudev::TexturePtr<T>, B<work_type>>> filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
}
else
{
cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader<cudev::TextureOffPtr<T>, B<work_type>> brdSrc(texSrcWhole, brd);
Filter<BorderReader<cudev::TextureOffPtr<T>, B<work_type>>> filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <template <typename> class Filter, typename T> struct RemapDispatcherNonStreamTex<Filter, BrdReplicate, T>
{
static void call(PtrStepSz< T > src, PtrStepSz< T > srcWhole, int xoff, int yoff, PtrStepSzf mapx, PtrStepSzf mapy,
PtrStepSz< T > dst, const float*, bool)
{
dim3 block(32, 8);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows)
{
cudev::Texture<T> texSrcWhole(srcWhole);
Filter<cudev::TexturePtr<T>> filter_src(texSrcWhole);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
}
else
{
cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>> brdSrc(texSrcWhole, brd);
Filter<BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>>> filter_src(brdSrc);
remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
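The dispatch rule in both structs above deserves a word: a plain cudev::Texture is used only when the ROI covers the whole allocation; otherwise cudev::TextureOff(srcWhole, yoff, xoff) bakes the ROI origin into every fetch. A hypothetical kernel spelling out the offset semantics assumed here (roiCopy is illustrative, not part of the change):

__global__ void roiCopy(cv::cudev::TextureOffPtr<float> src, PtrStepSzf dst)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < dst.cols && y < dst.rows)
        dst(y, x) = src(y, x); // fetches srcWhole at (y + yoff, x + xoff)
}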
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, uchar> :
RemapDispatcherNonStreamTex<Filter, B, uchar> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, uchar4> :
RemapDispatcherNonStreamTex<Filter, B, uchar4> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, ushort> :
RemapDispatcherNonStreamTex<Filter, B, ushort> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, ushort4> :
RemapDispatcherNonStreamTex<Filter, B, ushort4> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, short> :
RemapDispatcherNonStreamTex<Filter, B, short> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, short4> :
RemapDispatcherNonStreamTex<Filter, B, short4> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, float> :
RemapDispatcherNonStreamTex<Filter, B, float> {};
template <template <typename> class Filter, template <typename> class B> struct RemapDispatcherNonStream<Filter, B, float4> :
RemapDispatcherNonStreamTex<Filter, B, float4> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, uchar> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, uchar> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, uchar4> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, uchar4> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, ushort> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, ushort> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, ushort4> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, ushort4> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, short> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, short> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, short4> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, short4> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, float> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, float> {};
template <template <typename> class Filter> struct RemapDispatcherNonStream<Filter, BrdReplicate, float4> :
RemapDispatcherNonStreamTex<Filter, BrdReplicate, float4> {};
template <template <typename> class Filter, template <typename> class B, typename T> struct RemapDispatcher
{
@ -234,37 +243,23 @@ namespace cv { namespace cuda { namespace device
}
};
callers[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff, xmap, ymap,
static_cast< PtrStepSz<T> >(dst), borderValue, stream, cc20);
callers[interpolation][borderMode](static_cast<PtrStepSz<T>>(src), static_cast<PtrStepSz<T>>(srcWhole), xoff, yoff, xmap, ymap,
static_cast<PtrStepSz<T>>(dst), borderValue, stream, cc20);
}
template void remap_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void remap_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void remap_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzf xmap, PtrStepSzf ymap, PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
} // namespace imgproc

@ -49,6 +49,7 @@
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
namespace cv { namespace cuda { namespace device
{
@ -105,7 +106,7 @@ namespace cv { namespace cuda { namespace device
}
}
template <class Ptr2D, typename T> __global__ void resize(const Ptr2D src, PtrStepSz<T> dst, const float fy, const float fx)
template <class Ptr2D, typename T> __global__ void resize(Ptr2D src, PtrStepSz<T> dst, const float fy, const float fx)
{
const int dst_x = blockDim.x * blockIdx.x + threadIdx.x;
const int dst_y = blockDim.y * blockIdx.y + threadIdx.y;
@ -130,54 +131,6 @@ namespace cv { namespace cuda { namespace device
}
}
// textures
template <typename T> struct TextureAccessor;
#define OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(type) \
texture<type, cudaTextureType2D, cudaReadModeElementType> tex_resize_##type (0, cudaFilterModePoint, cudaAddressModeClamp); \
template <> struct TextureAccessor<type> \
{ \
typedef type elem_type; \
typedef int index_type; \
int xoff; \
int yoff; \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_resize_##type, x + xoff, y + yoff); \
} \
__host__ static void bind(const PtrStepSz<type>& mat) \
{ \
bindTexture(&tex_resize_##type, mat); \
} \
};
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(uchar)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(uchar4)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(ushort)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(ushort4)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(short)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(short4)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(float)
OPENCV_CUDA_IMPLEMENT_RESIZE_TEX(float4)
#undef OPENCV_CUDA_IMPLEMENT_RESIZE_TEX
template <typename T>
TextureAccessor<T> texAccessor(const PtrStepSz<T>& mat, int yoff, int xoff)
{
TextureAccessor<T>::bind(mat);
TextureAccessor<T> t;
t.xoff = xoff;
t.yoff = yoff;
return t;
}
// callers for nearest interpolation
template <typename T>
@ -194,14 +147,19 @@ namespace cv { namespace cuda { namespace device
}
template <typename T>
void call_resize_nearest_tex(const PtrStepSz<T>& /*src*/, const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx)
void call_resize_nearest_tex(const PtrStepSz<T>& srcWhole, int yoff, int xoff, const PtrStepSz<T>& dst, float fy, float fx)
{
const dim3 block(32, 8);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
resize<<<grid, block>>>(texAccessor(srcWhole, yoff, xoff), dst, fy, fx);
if (xoff || yoff) {
cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
resize<cudev::TextureOffPtr<T>><<<grid, block>>>(texSrcWhole, dst, fy, fx);
}
else {
cudev::Texture<T> texSrcWhole(srcWhole);
resize<cudev::TexturePtr<T>><<<grid, block>>>(texSrcWhole, dst, fy, fx);
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
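A minimal standalone sketch of the pattern introduced here, with hypothetical kernel and helper names (copyThroughTexture, copyViaTexture): the host-side cv::cudev::Texture owns the cudaTextureObject_t, and its implicit conversion to the lightweight TexturePtr is what crosses the kernel boundary by value.
__global__ void copyThroughTexture(cv::cudev::TexturePtr<uchar> src, cv::cuda::PtrStepSzb dst)
{
    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;
    if (x < dst.cols && y < dst.rows)
        dst(y, x) = src(y, x); // tex2D fetch, clamped at the borders by default
}
void copyViaTexture(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst) // both assumed CV_8UC1, same size
{
    cv::cuda::PtrStepSz<uchar> srcView = src;
    cv::cudev::Texture<uchar> tex(srcView); // texture object created here
    const dim3 block(32, 8);
    const dim3 grid(cv::cudev::divUp(dst.cols, block.x), cv::cudev::divUp(dst.rows, block.y));
    copyThroughTexture<<<grid, block>>>(tex, dst); // Texture converts to TexturePtr
    CV_CUDEV_SAFE_CALL( cudaGetLastError() );
    CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() ); // texture object destroyed when tex leaves scope
}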
@ -225,27 +183,21 @@ namespace cv { namespace cuda { namespace device
{
const dim3 block(32, 8);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
if (srcWhole.data == src.data)
{
TextureAccessor<T> texSrc = texAccessor(src, 0, 0);
LinearFilter< TextureAccessor<T> > filteredSrc(texSrc);
cudev::Texture<T> texSrc(src);
LinearFilter<cudev::TexturePtr<T>> filteredSrc(texSrc);
resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
}
else
{
TextureAccessor<T> texSrc = texAccessor(srcWhole, yoff, xoff);
cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader<TextureAccessor<T>, BrdReplicate<T> > brdSrc(texSrc, brd);
LinearFilter< BorderReader<TextureAccessor<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>> brdSrc(texSrcWhole, brd);
LinearFilter<BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>>> filteredSrc(brdSrc);
resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
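// Note on the branch above: when the ROI spans the whole image, the texture's
// cudaAddressModeClamp already replicates the border, so the filter can sample
// the TexturePtr directly. For a true sub-image, hardware clamping would fetch
// neighbouring pixels of the full image instead of the ROI border, hence the
// explicit BrdReplicate over the offset texture.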
@ -258,8 +210,8 @@ namespace cv { namespace cuda { namespace device
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdReplicate<T> > brdSrc(src, brd);
CubicFilter< BorderReader< PtrStep<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
BorderReader<PtrStep<T>, BrdReplicate<T>> brdSrc(src, brd);
CubicFilter<BorderReader<PtrStep<T>, BrdReplicate<T>>> filteredSrc(brdSrc);
resize<<<grid, block, 0, stream>>>(filteredSrc, dst, fy, fx);
cudaSafeCall( cudaGetLastError() );
@ -273,27 +225,21 @@ namespace cv { namespace cuda { namespace device
{
const dim3 block(32, 8);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
if (srcWhole.data == src.data)
{
TextureAccessor<T> texSrc = texAccessor(src, 0, 0);
CubicFilter< TextureAccessor<T> > filteredSrc(texSrc);
cudev::Texture<T> texSrc(src);
CubicFilter<cudev::TexturePtr<T>> filteredSrc(texSrc);
resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
}
else
{
TextureAccessor<T> texSrc = texAccessor(srcWhole, yoff, xoff);
cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
BrdReplicate<T> brd(src.rows, src.cols);
BorderReader<TextureAccessor<T>, BrdReplicate<T> > brdSrc(texSrc, brd);
CubicFilter< BorderReader<TextureAccessor<T>, BrdReplicate<T> > > filteredSrc(brdSrc);
BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>> brdSrc(texSrcWhole, brd);
CubicFilter<BorderReader<cudev::TextureOffPtr<T>, BrdReplicate<T>>> filteredSrc(brdSrc);
resize<<<grid, block>>>(filteredSrc, dst, fy, fx);
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
@ -318,7 +264,7 @@ namespace cv { namespace cuda { namespace device
if (fx > 1 || fy > 1)
call_resize_nearest_glob(src, dst, fy, fx, 0);
else
call_resize_nearest_tex(src, srcWhole, yoff, xoff, dst, fy, fx);
call_resize_nearest_tex(srcWhole, yoff, xoff, dst, fy, fx);
}
}
};
@ -389,7 +335,7 @@ namespace cv { namespace cuda { namespace device
{
if (stream)
call_resize_cubic_glob(src, dst, fy, fx, stream);
else
else
call_resize_cubic_tex(src, srcWhole, yoff, xoff, dst, fy, fx);
}
};
@ -421,16 +367,16 @@ namespace cv { namespace cuda { namespace device
if (std::abs(fx - iscale_x) < FLT_MIN && std::abs(fy - iscale_y) < FLT_MIN)
{
BrdConstant<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
IntegerAreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
BorderReader<PtrStep<T>, BrdConstant<T>> brdSrc(src, brd);
IntegerAreaFilter<BorderReader<PtrStep<T>, BrdConstant<T>>> filteredSrc(brdSrc, fx, fy);
resize_area<<<grid, block, 0, stream>>>(filteredSrc, dst);
}
else
{
BrdConstant<T> brd(src.rows, src.cols);
BorderReader< PtrStep<T>, BrdConstant<T> > brdSrc(src, brd);
AreaFilter< BorderReader< PtrStep<T>, BrdConstant<T> > > filteredSrc(brdSrc, fx, fy);
BorderReader<PtrStep<T>, BrdConstant<T>> brdSrc(src, brd);
AreaFilter<BorderReader<PtrStep<T>, BrdConstant<T>>> filteredSrc(brdSrc, fx, fy);
resize_area<<<grid, block, 0, stream>>>(filteredSrc, dst);
}

@ -48,6 +48,7 @@
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"
#include "opencv2/core/cuda/filters.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
namespace cv { namespace cuda { namespace device
{
@ -164,8 +165,8 @@ namespace cv { namespace cuda { namespace device
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
BorderReader<PtrStep<T>, B<work_type>> brdSrc(src, brd);
Filter<BorderReader<PtrStep<T>, B<work_type>>> filter_src(brdSrc);
warp<Transform><<<grid, block, 0, stream>>>(filter_src, dst, warpMat);
cudaSafeCall( cudaGetLastError() );
@ -186,8 +187,8 @@ namespace cv { namespace cuda { namespace device
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader< PtrStep<T>, B<work_type> > brdSrc(src, brd);
Filter< BorderReader< PtrStep<T>, B<work_type> > > filter_src(brdSrc);
BorderReader<PtrStep<T>, B<work_type>> brdSrc(src, brd);
Filter<BorderReader<PtrStep<T>, B<work_type>>> filter_src(brdSrc);
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat);
cudaSafeCall( cudaGetLastError() );
@ -196,86 +197,48 @@ namespace cv { namespace cuda { namespace device
}
};
#define OPENCV_CUDA_IMPLEMENT_WARP_TEX(type) \
texture< type , cudaTextureType2D > tex_warp_ ## type (0, cudaFilterModePoint, cudaAddressModeClamp); \
struct tex_warp_ ## type ## _reader \
{ \
typedef type elem_type; \
typedef int index_type; \
int xoff, yoff; \
tex_warp_ ## type ## _reader (int xoff_, int yoff_) : xoff(xoff_), yoff(yoff_) {} \
__device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const \
{ \
return tex2D(tex_warp_ ## type , x + xoff, y + yoff); \
} \
}; \
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float* borderValue, const float warpMat[Transform::rows*3], bool cc20) \
{ \
typedef typename TypeVec<float, VecTraits< type >::cn>::vec_type work_type; \
dim3 block(32, cc20 ? 8 : 4); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_warp_ ## type , srcWhole); \
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue)); \
BorderReader< tex_warp_ ## type ##_reader, B<work_type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_warp_ ## type ##_reader, B<work_type> > > filter_src(brdSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
}; \
template <class Transform, template <typename> class Filter> struct WarpDispatcherNonStream<Transform, Filter, BrdReplicate, type> \
{ \
static void call(PtrStepSz< type > src, PtrStepSz< type > srcWhole, int xoff, int yoff, PtrStepSz< type > dst, const float*, const float warpMat[Transform::rows*3], bool) \
{ \
dim3 block(32, 8); \
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
bindTexture(&tex_warp_ ## type , srcWhole); \
tex_warp_ ## type ##_reader texSrc(xoff, yoff); \
if (srcWhole.cols == src.cols && srcWhole.rows == src.rows) \
{ \
Filter< tex_warp_ ## type ##_reader > filter_src(texSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
} \
else \
{ \
BrdReplicate<type> brd(src.rows, src.cols); \
BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > brdSrc(texSrc, brd); \
Filter< BorderReader< tex_warp_ ## type ##_reader, BrdReplicate<type> > > filter_src(brdSrc); \
warp<Transform><<<grid, block>>>(filter_src, dst, warpMat); \
} \
cudaSafeCall( cudaGetLastError() ); \
cudaSafeCall( cudaDeviceSynchronize() ); \
} \
};
OPENCV_CUDA_IMPLEMENT_WARP_TEX(uchar)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(uchar2)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(uchar4)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(schar)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(char2)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(char4)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(ushort)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(ushort2)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(ushort4)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(short)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(short2)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(short4)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(int)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(int2)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(int4)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(float)
//OPENCV_CUDA_IMPLEMENT_WARP_TEX(float2)
OPENCV_CUDA_IMPLEMENT_WARP_TEX(float4)
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcherNonStreamTex
{
static void call(PtrStepSz<T> src, PtrStepSz<T> srcWhole, int xoff, int yoff, PtrStepSz<T> dst, const float* borderValue, const float warpMat[Transform::rows*3], bool cc20)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_type;
dim3 block(32, cc20 ? 8 : 4);
dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
if (xoff || yoff) {
cudev::TextureOff<T> texSrcWhole(srcWhole, yoff, xoff);
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader<cudev::TextureOffPtr<T>, B<work_type>> brdSrc(texSrcWhole, brd);
Filter<BorderReader<cudev::TextureOffPtr<T>, B<work_type>>> filter_src(brdSrc);
warp<Transform><<<grid, block>>> (filter_src, dst, warpMat);
}
else {
cudev::Texture<T> texSrcWhole(srcWhole);
B<work_type> brd(src.rows, src.cols, VecTraits<work_type>::make(borderValue));
BorderReader<cudev::TexturePtr<T>, B<work_type>> brdSrc(texSrcWhole, brd);
Filter<BorderReader<cudev::TexturePtr<T>, B<work_type>>> filter_src(brdSrc);
warp<Transform><<<grid, block>>> (filter_src, dst, warpMat);
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
#undef OPENCV_CUDA_IMPLEMENT_WARP_TEX
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, uchar> :
WarpDispatcherNonStreamTex<Transform, Filter, B, uchar> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, uchar4> :
WarpDispatcherNonStreamTex<Transform, Filter, B, uchar4> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, ushort> :
WarpDispatcherNonStreamTex<Transform, Filter, B, ushort> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, ushort4> :
WarpDispatcherNonStreamTex<Transform, Filter, B, ushort4> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, short> :
WarpDispatcherNonStreamTex<Transform, Filter, B, short> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, short4> :
WarpDispatcherNonStreamTex<Transform, Filter, B, short4> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, float> :
WarpDispatcherNonStreamTex<Transform, Filter, B, float> {};
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, float4> :
WarpDispatcherNonStreamTex<Transform, Filter, B, float4> {};
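Routing a further texture-capable element type through the texture path takes only one more forwarding specialization. A hypothetical sketch for uchar2, a 2-channel fetch the hardware supports but which is left unwired here, matching the commented-out instantiations below:
template <class Transform, template <typename> class Filter, template <typename> class B> struct WarpDispatcherNonStream<Transform, Filter, B, uchar2> :
    WarpDispatcherNonStreamTex<Transform, Filter, B, uchar2> {};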
template <class Transform, template <typename> class Filter, template <typename> class B, typename T> struct WarpDispatcher
{
@ -319,8 +282,8 @@ namespace cv { namespace cuda { namespace device
}
};
funcs[interpolation][borderMode](static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(srcWhole), xoff, yoff,
static_cast< PtrStepSz<T> >(dst), borderValue, warpMat, stream, cc20);
funcs[interpolation][borderMode](static_cast<PtrStepSz<T>>(src), static_cast<PtrStepSz<T>>(srcWhole), xoff, yoff,
static_cast<PtrStepSz<T>>(dst), borderValue, warpMat, stream, cc20);
}
template <typename T> void warpAffine_gpu(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation,
@ -330,32 +293,18 @@ namespace cv { namespace cuda { namespace device
}
template void warpAffine_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpAffine_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpAffine_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[2 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
@ -366,32 +315,18 @@ namespace cv { namespace cuda { namespace device
}
template void warpPerspective_gpu<uchar >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<uchar2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<uchar3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<uchar4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<schar>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<char2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<char3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<char4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<ushort2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<ushort4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<short2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<short4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<int4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float >(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
//template void warpPerspective_gpu<float2>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float3>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
template void warpPerspective_gpu<float4>(PtrStepSzb src, PtrStepSzb srcWhole, int xoff, int yoff, float coeffs[3 * 3], PtrStepSzb dst, int interpolation, int borderMode, const float* borderValue, cudaStream_t stream, bool cc20);
} // namespace imgproc

@ -42,6 +42,8 @@
#ifndef __OPENCV_TEST_PRECOMP_HPP__
#define __OPENCV_TEST_PRECOMP_HPP__
#include <thread>
#include "opencv2/ts.hpp"
#include "opencv2/ts/cuda_test.hpp"

@ -206,6 +206,60 @@ INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeSameAsHost, testing::Combine(
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_AREA)),
WHOLE_SUBMAT));
PARAM_TEST_CASE(ResizeTextures, cv::cuda::DeviceInfo, Interpolation)
{
cv::cuda::DeviceInfo devInfo;
Interpolation interpolation;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
interpolation = GET_PARAM(1);
cv::cuda::setDevice(devInfo.deviceID());
}
};
void ResizeThread(const Interpolation interp, const GpuMat& imgIn, const std::vector<GpuMat>& imgsOut, Stream& stream) {
for (auto& imgOut : imgsOut)
cv::cuda::resize(imgIn, imgOut, imgOut.size(), 0, 0, interp, stream);
}
CUDA_TEST_P(ResizeTextures, Accuracy)
{
constexpr int nThreads = 5;
constexpr int nIters = 5;
const Size szIn(100, 100);
const Size szOut(200, 200);
vector<Stream> streams(nThreads, cv::cuda::Stream::Null());
vector<GpuMat> imgsIn;
vector<vector<GpuMat>> imgsOut;
for (int i = 0; i < nThreads; i++) {
imgsIn.push_back(GpuMat(szIn, CV_8UC1, i));
vector<GpuMat> imgsOutPerThread;
for (int j = 0; j < nIters; j++)
imgsOutPerThread.push_back(GpuMat(szOut, CV_8UC1));
imgsOut.push_back(imgsOutPerThread);
}
vector<std::thread> thread(nThreads);
for (int i = 0; i < nThreads; i++) thread.at(i) = std::thread(ResizeThread, interpolation, std::ref(imgsIn.at(i)), std::ref(imgsOut.at(i)), std::ref(streams.at(i)));
for (int i = 0; i < nThreads; i++) thread.at(i).join();
for (int i = 0; i < nThreads; i++) {
GpuMat imgOutGs;
cv::cuda::resize(imgsIn.at(i), imgOutGs, szOut, 0, 0, interpolation, streams.at(i));
Mat imgOutGsHost; imgOutGs.download(imgOutGsHost);
for (const auto& imgOut : imgsOut.at(i)) {
Mat imgOutHost; imgOut.download(imgOutHost);
ASSERT_TRUE(cv::norm(imgOutHost, imgOutGsHost, NORM_INF) == 0);
}
}
}
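// The per-thread streams and output batches above exercise the central change of
// this PR: every resize call now creates its own cudaTextureObject_t, so
// concurrent calls from several host threads no longer race on one global
// texture reference binding.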
INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeTextures, testing::Combine(
ALL_DEVICES,
testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC))));
}} // namespace
#endif // HAVE_CUDA

@ -1,147 +1,159 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#pragma once
#ifndef OPENCV_CUDEV_PTR2D_TEXTURE_HPP
#define OPENCV_CUDEV_PTR2D_TEXTURE_HPP
#include <cstring>
#include "../common.hpp"
#include "glob.hpp"
#include "gpumat.hpp"
#include "traits.hpp"
#if CUDART_VERSION >= 5050
namespace
{
template <typename T> struct CvCudevTextureRef
{
typedef texture<T, cudaTextureType2D, cudaReadModeElementType> TexRef;
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
static TexRef ref;
#ifndef OPENCV_CUDEV_PTR2D_TEXTURE_OBJECT_HPP
#define OPENCV_CUDEV_PTR2D_TEXTURE_OBJECT_HPP
__host__ static void bind(const cv::cudev::GlobPtrSz<T>& mat,
bool normalizedCoords = false,
cudaTextureFilterMode filterMode = cudaFilterModePoint,
cudaTextureAddressMode addressMode = cudaAddressModeClamp)
{
ref.normalized = normalizedCoords;
ref.filterMode = filterMode;
ref.addressMode[0] = addressMode;
ref.addressMode[1] = addressMode;
ref.addressMode[2] = addressMode;
#include <opencv2/core.hpp>
#include <opencv2/core/utils/logger.hpp>
#include <opencv2/core/cuda_types.hpp>
#include <opencv2/cudev/common.hpp>
#include <opencv2/cudev/ptr2d/traits.hpp>
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
/** \file texture.hpp
*/
CV_CUDEV_SAFE_CALL( cudaBindTexture2D(0, &ref, mat.data, &desc, mat.cols, mat.rows, mat.step) );
namespace cv { namespace cudev {
//! @addtogroup cudev
//! @{
/** @brief Simple lightweight structures that encapsulate information about an image texture on the device.
* They are intended to be passed to nvcc-compiled code.
*/
template<class T, class R = T>
struct TexturePtr {
typedef R elem_type, value_type;
typedef float index_type;
__host__ TexturePtr() {};
__host__ TexturePtr(const cudaTextureObject_t tex_) : tex(tex_) {};
__device__ __forceinline__ R operator ()(index_type y, index_type x) const {
return tex2D<R>(tex, x, y);
}
__device__ __forceinline__ R operator ()(index_type x) const {
return tex1Dfetch<R>(tex, x);
}
private:
cudaTextureObject_t tex;
};
__host__ static void unbind()
{
cudaUnbindTexture(ref);
// texture fetches are at most 32 bits wide, so 64 bit types are read as two 32 bit values
template <class R>
struct TexturePtr<uint64, R> {
typedef float index_type;
__host__ TexturePtr() {};
__host__ TexturePtr(const cudaTextureObject_t tex_) : tex(tex_) {};
__device__ __forceinline__ R operator ()(index_type y, index_type x) const {
const uint2 retVal = tex2D<uint2>(tex, x, y);
return *(reinterpret_cast<const R*>(&retVal));
}
__device__ __forceinline__ R operator ()(index_type x) const {
const uint2 retVal = tex1Dfetch<uint2>(tex, x);
return *(reinterpret_cast<const R*>(&retVal));
}
private:
cudaTextureObject_t tex;
};
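A small sketch, with a hypothetical kernel name, of the single-argument operator: a TexturePtr bound to a linear buffer is indexed like an array and the fetch goes through tex1Dfetch.
__global__ void scaleBuffer(cv::cudev::TexturePtr<float> src, float* dst, int n, float s)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
        dst[i] = s * src(i); // tex1Dfetch<float>(tex, i)
}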
template <typename T>
typename CvCudevTextureRef<T>::TexRef CvCudevTextureRef<T>::ref;
}
template<class T, class R = T>
struct TextureOffPtr {
typedef R elem_type;
typedef float index_type;
__host__ TextureOffPtr(const cudaTextureObject_t tex_, const int yoff_, const int xoff_) : tex(tex_), yoff(yoff_), xoff(xoff_) {};
__device__ __forceinline__ R operator ()(index_type y, index_type x) const {
return tex2D<R>(tex, x + xoff, y + yoff);
}
private:
cudaTextureObject_t tex;
int xoff = 0;
int yoff = 0;
};
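The offset variant folds (yoff, xoff) into every fetch, so a kernel reading a ROI inside a larger bound image keeps using ROI-relative coordinates. A hypothetical sketch:
__global__ void copyRoi(cv::cudev::TextureOffPtr<float> src, cv::cuda::PtrStepSzf dst)
{
    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;
    if (x < dst.cols && y < dst.rows)
        dst(y, x) = src(y, x); // actually fetches (y + yoff, x + xoff)
}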
#endif
/** @brief non-copyable smart CUDA texture object
*
* UniqueTexture is a smart non-sharable wrapper for a cudaTextureObject_t handle which ensures that the handle is destroyed after use.
*/
template<class T, class R = T>
class UniqueTexture {
public:
__host__ UniqueTexture() noexcept { }
__host__ UniqueTexture(const UniqueTexture&) = delete;
__host__ UniqueTexture(UniqueTexture&& other) noexcept {
tex = other.tex;
other.tex = 0;
}
namespace cv { namespace cudev {
__host__ UniqueTexture(const int rows, const int cols, T* data, const size_t step, const bool normalizedCoords = false,
const cudaTextureFilterMode filterMode = cudaFilterModePoint, const cudaTextureAddressMode addressMode = cudaAddressModeClamp,
const cudaTextureReadMode readMode = cudaReadModeElementType)
{
create(rows, cols, data, step, normalizedCoords, filterMode, addressMode, readMode);
}
//! @addtogroup cudev
//! @{
__host__ UniqueTexture(const size_t sizeInBytes, T* data, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType)
{
create(1, static_cast<int>(sizeInBytes/sizeof(T)), data, sizeInBytes, normalizedCoords, filterMode, addressMode, readMode);
}
#if CUDART_VERSION >= 5050
__host__ ~UniqueTexture() {
if (tex != cudaTextureObject_t()) {
try {
CV_CUDEV_SAFE_CALL(cudaDestroyTextureObject(tex));
}
catch (const cv::Exception& ex) {
std::ostringstream os;
os << "Exception caught during CUDA texture object destruction.\n";
os << ex.what();
os << "Exception will be ignored.\n";
CV_LOG_WARNING(0, os.str().c_str());
}
}
template <typename T> struct TexturePtr
{
typedef T value_type;
typedef float index_type;
}
cudaTextureObject_t texObj;
__host__ UniqueTexture& operator=(const UniqueTexture&) = delete;
__host__ UniqueTexture& operator=(UniqueTexture&& other) noexcept {
CV_Assert(other);
if (&other != this) {
UniqueTexture(std::move(*this)); /* destroy current texture object */
tex = other.tex;
other.tex = cudaTextureObject_t();
}
return *this;
}
__device__ __forceinline__ T operator ()(float y, float x) const
{
#if CV_CUDEV_ARCH < 300
// Use the texture reference
return tex2D(CvCudevTextureRef<T>::ref, x, y);
#else
// Use the texture object
return tex2D<T>(texObj, x, y);
#endif
}
};
template <typename T> struct Texture : TexturePtr<T>
{
int rows, cols;
bool cc30;
__host__ explicit Texture(const GlobPtrSz<T>& mat,
bool normalizedCoords = false,
cudaTextureFilterMode filterMode = cudaFilterModePoint,
cudaTextureAddressMode addressMode = cudaAddressModeClamp)
{
cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);
__host__ cudaTextureObject_t get() const noexcept {
CV_Assert(tex);
return tex;
}
__host__ explicit operator bool() const noexcept { return tex != cudaTextureObject_t(); }
rows = mat.rows;
cols = mat.cols;
private:
if (cc30)
template <class T1>
__host__ void create(const int rows, const int cols, T1* data, const size_t step, const bool normalizedCoords, const cudaTextureFilterMode filterMode,
const cudaTextureAddressMode addressMode, const cudaTextureReadMode readMode)
{
// Use the texture object
cudaResourceDesc texRes;
std::memset(&texRes, 0, sizeof(texRes));
texRes.resType = cudaResourceTypePitch2D;
texRes.res.pitch2D.devPtr = mat.data;
texRes.res.pitch2D.height = mat.rows;
texRes.res.pitch2D.width = mat.cols;
texRes.res.pitch2D.pitchInBytes = mat.step;
texRes.res.pitch2D.desc = cudaCreateChannelDesc<T>();
if (rows == 1) {
CV_Assert(rows == 1 && cols*sizeof(T) == step);
texRes.resType = cudaResourceTypeLinear;
texRes.res.linear.devPtr = data;
texRes.res.linear.sizeInBytes = step;
texRes.res.linear.desc = cudaCreateChannelDesc<T1>();
}
else {
texRes.resType = cudaResourceTypePitch2D;
texRes.res.pitch2D.devPtr = data;
texRes.res.pitch2D.height = rows;
texRes.res.pitch2D.width = cols;
texRes.res.pitch2D.pitchInBytes = step;
texRes.res.pitch2D.desc = cudaCreateChannelDesc<T1>();
}
cudaTextureDesc texDescr;
std::memset(&texDescr, 0, sizeof(texDescr));
@ -150,109 +162,112 @@ template <typename T> struct Texture : TexturePtr<T>
texDescr.addressMode[0] = addressMode;
texDescr.addressMode[1] = addressMode;
texDescr.addressMode[2] = addressMode;
texDescr.readMode = cudaReadModeElementType;
texDescr.readMode = readMode;
CV_CUDEV_SAFE_CALL( cudaCreateTextureObject(&this->texObj, &texRes, &texDescr, 0) );
CV_CUDEV_SAFE_CALL(cudaCreateTextureObject(&tex, &texRes, &texDescr, 0));
}
else
__host__ void create(const int rows, const int cols, uint64* data, const size_t step, const bool normalizedCoords, const cudaTextureFilterMode filterMode,
const cudaTextureAddressMode addressMode, const cudaTextureReadMode readMode)
{
// Use the texture reference
CvCudevTextureRef<T>::bind(mat, normalizedCoords, filterMode, addressMode);
create<uint2>(rows, cols, (uint2*)data, step, normalizedCoords, filterMode, addressMode, readMode);
}
}
__host__ ~Texture()
{
if (cc30)
private:
cudaTextureObject_t tex;
};
/** @brief sharable smart CUDA texture object
*
* Texture is a smart sharable wrapper for a cudaTextureObject_t handle which ensures that the handle is destroyed after use.
*/
template<class T, class R = T>
class Texture {
public:
Texture() = default;
Texture(const Texture&) = default;
Texture(Texture&&) = default;
__host__ Texture(const int rows_, const int cols_, T* data, const size_t step, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType) :
rows(rows_), cols(cols_), texture(std::make_shared<UniqueTexture<T,R>>(rows, cols, data, step, normalizedCoords, filterMode, addressMode, readMode))
{
// Use the texture object
cudaDestroyTextureObject(this->texObj);
}
else
__host__ Texture(const size_t sizeInBytes, T* data, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType) :
rows(1), cols(static_cast<int>(sizeInBytes/sizeof(T))), texture(std::make_shared<UniqueTexture<T, R>>(sizeInBytes, data, normalizedCoords, filterMode, addressMode, readMode))
{
// Use the texture reference
CvCudevTextureRef<T>::unbind();
}
}
};
template <typename T> struct PtrTraits< Texture<T> > : PtrTraitsBase<Texture<T>, TexturePtr<T> >
{
};
__host__ Texture(PtrStepSz<T> src, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType) :
Texture(src.rows, src.cols, src.data, src.step, normalizedCoords, filterMode, addressMode, readMode)
{
}
#else
Texture& operator=(const Texture&) = default;
Texture& operator=(Texture&&) = default;
template <typename T> struct TexturePtr
{
typedef T value_type;
typedef float index_type;
__host__ explicit operator bool() const noexcept {
if (!texture)
return false;
return texture->operator bool();
}
cudaTextureObject_t texObj;
__host__ operator TexturePtr<T, R>() const {
if (texture)
return TexturePtr<T, R>(texture->get());
else
return TexturePtr<T, R>(cudaTextureObject_t());
}
__device__ __forceinline__ T operator ()(float y, float x) const
{
#if CV_CUDEV_ARCH >= 300
// Use the texture object
return tex2D<T>(texObj, x, y);
#else
CV_UNUSED(y);
CV_UNUSED(x);
return T();
#endif
}
};
template <typename T> struct Texture : TexturePtr<T>
{
int rows, cols;
__host__ explicit Texture(const GlobPtrSz<T>& mat,
bool normalizedCoords = false,
cudaTextureFilterMode filterMode = cudaFilterModePoint,
cudaTextureAddressMode addressMode = cudaAddressModeClamp)
{
CV_Assert( deviceSupports(FEATURE_SET_COMPUTE_30) );
rows = mat.rows;
cols = mat.cols;
// Use the texture object
cudaResourceDesc texRes;
std::memset(&texRes, 0, sizeof(texRes));
texRes.resType = cudaResourceTypePitch2D;
texRes.res.pitch2D.devPtr = mat.data;
texRes.res.pitch2D.height = mat.rows;
texRes.res.pitch2D.width = mat.cols;
texRes.res.pitch2D.pitchInBytes = mat.step;
texRes.res.pitch2D.desc = cudaCreateChannelDesc<T>();
cudaTextureDesc texDescr;
std::memset(&texDescr, 0, sizeof(texDescr));
texDescr.normalizedCoords = normalizedCoords;
texDescr.filterMode = filterMode;
texDescr.addressMode[0] = addressMode;
texDescr.addressMode[1] = addressMode;
texDescr.addressMode[2] = addressMode;
texDescr.readMode = cudaReadModeElementType;
CV_CUDEV_SAFE_CALL( cudaCreateTextureObject(&this->texObj, &texRes, &texDescr, 0) );
}
__host__ ~Texture()
int rows = 0;
int cols = 0;
protected:
std::shared_ptr<UniqueTexture<T, R>> texture = nullptr;
};
template <typename T, typename R> struct PtrTraits<Texture<T, R>> : PtrTraitsBase<Texture<T, R>, TexturePtr<T, R>>
{
// Use the texture object
cudaDestroyTextureObject(this->texObj);
}
};
};
template <typename T> struct PtrTraits< Texture<T> > : PtrTraitsBase<Texture<T>, TexturePtr<T> >
{
};
#endif
/** @brief sharable smart CUDA texture object with offset
*
* TextureOff is a smart sharable wrapper for a cudaTextureObject_t handle which ensures that the handle is destroyed after use.
*/
template<class T, class R = T>
class TextureOff {
public:
TextureOff(const TextureOff&) = default;
TextureOff(TextureOff&&) = default;
__host__ TextureOff(const int rows, const int cols, T* data, const size_t step, const int yoff_ = 0, const int xoff_ = 0, const bool normalizedCoords = false,
const cudaTextureFilterMode filterMode = cudaFilterModePoint, const cudaTextureAddressMode addressMode = cudaAddressModeClamp,
const cudaTextureReadMode readMode = cudaReadModeElementType) :
texture(std::make_shared<UniqueTexture<T, R>>(rows, cols, data, step, normalizedCoords, filterMode, addressMode, readMode)), xoff(xoff_), yoff(yoff_)
{
}
//! @}
__host__ TextureOff(PtrStepSz<T> src, const int yoff = 0, const int xoff = 0, const bool normalizedCoords = false, const cudaTextureFilterMode filterMode = cudaFilterModePoint,
const cudaTextureAddressMode addressMode = cudaAddressModeClamp, const cudaTextureReadMode readMode = cudaReadModeElementType) :
TextureOff(src.rows, src.cols, src.data, src.step, yoff, xoff, normalizedCoords, filterMode, addressMode, readMode)
{
}
TextureOff& operator=(const TextureOff&) = default;
TextureOff& operator=(TextureOff&&) = default;
__host__ operator TextureOffPtr<T, R>() const {
return TextureOffPtr<T, R>(texture->get(), yoff, xoff);
}
private:
int xoff = 0;
int yoff = 0;
std::shared_ptr<UniqueTexture<T, R>> texture = nullptr;
};
}}
#endif
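Putting the pieces together, a host-side usage sketch (assuming a CV_32FC1 input; bindAndShare is a hypothetical name): the shared_ptr inside Texture lets several owners share one texture object, and the implicit conversion yields the TexturePtr view that kernels take by value.
void bindAndShare(const cv::cuda::GpuMat& img)
{
    cv::cuda::PtrStepSz<float> p = img;
    // hardware bilinear filtering is valid here because the element type is float
    cv::cudev::Texture<float> tex(p, false, cudaFilterModeLinear, cudaAddressModeClamp);
    cv::cudev::Texture<float> alias = tex;     // shares the same cudaTextureObject_t
    cv::cudev::TexturePtr<float> view = alias; // lightweight view for kernel arguments
    // ... launch kernels taking cv::cudev::TexturePtr<float>; the texture object is
    // destroyed when the last Texture copy goes out of scope.
    CV_UNUSED(view);
}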

@ -213,7 +213,7 @@ __device__ double shfl_up(double val, uint delta, int width = warpSize)
return __hiloint2double(hi, lo);
}
__device__ __forceinline__ unsigned long long shfl_up(unsigned long long val, uint delta, int width = warpSize)
__device__ __forceinline__ uint64 shfl_up(uint64 val, uint delta, int width = warpSize)
{
return __shfl_up(val, delta, width);
}

@ -51,6 +51,7 @@
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/filters.hpp"
#include <opencv2/cudev/ptr2d/texture.hpp>
namespace cv { namespace cuda { namespace device
{
@ -59,23 +60,19 @@ namespace cv { namespace cuda { namespace device
void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
void bindImgTex(PtrStepSzb img);
size_t bindSumTex(PtrStepSz<unsigned int> sum);
size_t bindMaskSumTex(PtrStepSz<unsigned int> maskSum);
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
void icvCalcLayerDetAndTrace_gpu(const PtrStepSz<unsigned int>& sum, const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
int octave, int nOctaveLayer);
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
void icvFindMaximaInLayer_gpu(const PtrStepSz<unsigned int>& maskSum, const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
unsigned int* featureCounter);
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
void icvCalcOrientation_gpu(const PtrStepSz<unsigned int>& sum, const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
void compute_descriptors_gpu(const PtrStepSzb& img, PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
}
}}}
@ -121,34 +118,8 @@ namespace cv { namespace cuda { namespace device
cudaSafeCall( cudaMemcpyToSymbol(c_layer_cols, &layer_cols, sizeof(layer_cols)) );
}
////////////////////////////////////////////////////////////////////////
// Integral image texture
texture<unsigned char, 2, cudaReadModeElementType> imgTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<unsigned int, 2, cudaReadModeElementType> sumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
texture<unsigned int, 2, cudaReadModeElementType> maskSumTex(0, cudaFilterModePoint, cudaAddressModeClamp);
void bindImgTex(PtrStepSzb img)
{
bindTexture(&imgTex, img);
}
size_t bindSumTex(PtrStepSz<uint> sum)
{
size_t offset;
cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
cudaSafeCall( cudaBindTexture2D(&offset, sumTex, sum.data, desc_sum, sum.cols, sum.rows, sum.step));
return offset / sizeof(uint);
}
size_t bindMaskSumTex(PtrStepSz<uint> maskSum)
{
size_t offset;
cudaChannelFormatDesc desc_sum = cudaCreateChannelDesc<uint>();
cudaSafeCall( cudaBindTexture2D(&offset, maskSumTex, maskSum.data, desc_sum, maskSum.cols, maskSum.rows, maskSum.step));
return offset / sizeof(uint);
}
template <int N> __device__ float icvCalcHaarPatternSum(const float src[][5], int oldSize, int newSize, int y, int x)
template <int N> __device__ float icvCalcHaarPatternSum(cudev::TexturePtr<unsigned int> texSum, const float src[][5], int oldSize, int newSize, int y, int x)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200
typedef double real_t;
@ -169,10 +140,10 @@ namespace cv { namespace cuda { namespace device
int dy2 = __float2int_rn(ratio * src[k][3]);
real_t t = 0;
t += tex2D(sumTex, x + dx1, y + dy1);
t -= tex2D(sumTex, x + dx1, y + dy2);
t -= tex2D(sumTex, x + dx2, y + dy1);
t += tex2D(sumTex, x + dx2, y + dy2);
t += texSum(y + dy1, x + dx1);
t -= texSum(y + dy2, x + dx1);
t -= texSum(y + dy1, x + dx2);
t += texSum(y + dy2, x + dx2);
d += t * src[k][4] / ((dx2 - dx1) * (dy2 - dy1));
}
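For reference, the four fetches implement the summed-area-table identity. Writing $S(y,x)=\sum_{r<y}\sum_{c<x} I(r,c)$ for the integral image bound to texSum, the box sum over $[x_1,x_2)\times[y_1,y_2)$ is
$$\sum_{y_1\le r<y_2}\;\sum_{x_1\le c<x_2} I(r,c) = S(y_1,x_1) - S(y_2,x_1) - S(y_1,x_2) + S(y_2,x_2),$$
and the division by $(dx_2-dx_1)(dy_2-dy_1)$ turns each weighted box sum into a mean over the Haar box.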
@ -201,7 +172,7 @@ namespace cv { namespace cuda { namespace device
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
}
__global__ void icvCalcLayerDetAndTrace(PtrStepf det, PtrStepf trace)
__global__ void icvCalcLayerDetAndTrace(cudev::TexturePtr<unsigned int> texSum, PtrStepf det, PtrStepf trace)
{
// Determine the indices
const int gridDim_y = gridDim.y / (c_nOctaveLayers + 2);
@ -222,29 +193,29 @@ namespace cv { namespace cuda { namespace device
if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
{
const float dx = icvCalcHaarPatternSum<3>(c_DX , 9, size, (i << c_octave), (j << c_octave));
const float dy = icvCalcHaarPatternSum<3>(c_DY , 9, size, (i << c_octave), (j << c_octave));
const float dxy = icvCalcHaarPatternSum<4>(c_DXY, 9, size, (i << c_octave), (j << c_octave));
const float dx = icvCalcHaarPatternSum<3>(texSum, c_DX , 9, size, (i << c_octave), (j << c_octave));
const float dy = icvCalcHaarPatternSum<3>(texSum, c_DY , 9, size, (i << c_octave), (j << c_octave));
const float dxy = icvCalcHaarPatternSum<4>(texSum, c_DXY, 9, size, (i << c_octave), (j << c_octave));
det.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx * dy - 0.81f * dxy * dxy;
trace.ptr(layer * c_layer_rows + i + margin)[j + margin] = dx + dy;
}
}
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
void icvCalcLayerDetAndTrace_gpu(const PtrStepSz<unsigned int>& sum, const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
int octave, int nOctaveLayers)
{
const int min_size = calcSize(octave, 0);
const int max_samples_i = 1 + ((img_rows - min_size) >> octave);
const int max_samples_j = 1 + ((img_cols - min_size) >> octave);
cudev::Texture<unsigned int> texSum(sum);
dim3 threads(16, 16);
dim3 grid;
grid.x = divUp(max_samples_j, threads.x);
grid.y = divUp(max_samples_i, threads.y) * (nOctaveLayers + 2);
icvCalcLayerDetAndTrace<<<grid, threads>>>(det, trace);
icvCalcLayerDetAndTrace<<<grid, threads>>>(texSum, det, trace);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
@ -255,10 +226,14 @@ namespace cv { namespace cuda { namespace device
__constant__ float c_DM[5] = {0, 0, 9, 9, 1};
struct WithMask
template<bool useMask = true>
struct Mask
{
static __device__ bool check(int sum_i, int sum_j, int size)
__host__ Mask(){};
__host__ Mask(cudev::TexturePtr<unsigned int> tex_): tex(tex_) {};
__device__ bool check(int sum_i, int sum_j, int size)
{
if (!useMask) return true;
float ratio = (float)size / 9.0f;
float d = 0;
@ -269,19 +244,20 @@ namespace cv { namespace cuda { namespace device
int dy2 = __float2int_rn(ratio * c_DM[3]);
float t = 0;
t += tex2D(maskSumTex, sum_j + dx1, sum_i + dy1);
t -= tex2D(maskSumTex, sum_j + dx1, sum_i + dy2);
t -= tex2D(maskSumTex, sum_j + dx2, sum_i + dy1);
t += tex2D(maskSumTex, sum_j + dx2, sum_i + dy2);
t += tex(sum_i + dy1, sum_j + dx1);
t -= tex(sum_i + dy2, sum_j + dx1);
t -= tex(sum_i + dy1, sum_j + dx2);
t += tex(sum_i + dy2, sum_j + dx2);
d += t * c_DM[4] / ((dx2 - dx1) * (dy2 - dy1));
return (d >= 0.5f);
}
cudev::TexturePtr<unsigned int> tex;
};
template <typename Mask>
__global__ void icvFindMaximaInLayer(const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer,
template<class T>
__global__ void icvFindMaximaInLayer(T mask, const PtrStepf det, const PtrStepf trace, int4* maxPosBuffer,
unsigned int* maxCounter)
{
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
@ -323,7 +299,7 @@ namespace cv { namespace cuda { namespace device
const int sum_i = (i - ((size >> 1) >> c_octave)) << c_octave;
const int sum_j = (j - ((size >> 1) >> c_octave)) << c_octave;
if (Mask::check(sum_i, sum_j, size))
if (mask.check(sum_i, sum_j, size))
{
// Check to see if we have a max (in its 26 neighbours)
const bool condmax = val0 > N9[localLin - 1 - blockDim.x - zoff]
@ -374,7 +350,7 @@ namespace cv { namespace cuda { namespace device
#endif
}
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
void icvFindMaximaInLayer_gpu(const PtrStepSz<unsigned int>& maskSum, const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
int img_rows, int img_cols, int octave, bool use_mask, int nOctaveLayers)
{
const int layer_rows = img_rows >> octave;
@ -390,10 +366,15 @@ namespace cv { namespace cuda { namespace device
const size_t smem_size = threads.x * threads.y * 3 * sizeof(float);
if (use_mask)
icvFindMaximaInLayer<WithMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
else
icvFindMaximaInLayer<WithOutMask><<<grid, threads, smem_size>>>(det, trace, maxPosBuffer, maxCounter);
if (use_mask) {
cudev::Texture<unsigned int> texMaskSum(maskSum);
Mask<true> mask(texMaskSum);
icvFindMaximaInLayer<<<grid, threads, smem_size>>>(mask, det, trace, maxPosBuffer, maxCounter);
}
else {
Mask<false> mask;
icvFindMaximaInLayer<<<grid, threads, smem_size>>>(mask, det, trace, maxPosBuffer, maxCounter);
}
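// Since useMask is a compile-time template parameter, the early return in
// Mask<false>::check() lets the compiler drop the mask arithmetic and the
// texture fetches from that instantiation entirely.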
cudaSafeCall( cudaGetLastError() );
@ -539,7 +520,7 @@ namespace cv { namespace cuda { namespace device
__constant__ float c_NX[2][5] = {{0, 0, 2, 4, -1}, {2, 0, 4, 4, 1}};
__constant__ float c_NY[2][5] = {{0, 0, 4, 2, 1}, {0, 2, 4, 4, -1}};
__global__ void icvCalcOrientation(const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
__global__ void icvCalcOrientation(cudev::TexturePtr<unsigned int> texSum, const float* featureX, const float* featureY, const float* featureSize, float* featureDir)
{
__shared__ float s_X[128];
__shared__ float s_Y[128];
@ -576,8 +557,8 @@ namespace cv { namespace cuda { namespace device
if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
{
X = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NX, 4, grad_wav_size, y, x);
Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(c_NY, 4, grad_wav_size, y, x);
X = c_aptW[tid] * icvCalcHaarPatternSum<2>(texSum, c_NX, 4, grad_wav_size, y, x);
Y = c_aptW[tid] * icvCalcHaarPatternSum<2>(texSum, c_NY, 4, grad_wav_size, y, x);
angle = atan2f(Y, X);
if (angle < 0)
@ -676,8 +657,9 @@ namespace cv { namespace cuda { namespace device
#undef ORI_WIN
#undef ORI_SAMPLES
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
void icvCalcOrientation_gpu(const PtrStepSz<unsigned int>& sum, const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures)
{
cudev::Texture<unsigned int> texSum(sum);
dim3 threads;
threads.x = 32;
threads.y = 4;
@ -685,7 +667,7 @@ namespace cv { namespace cuda { namespace device
dim3 grid;
grid.x = nFeatures;
icvCalcOrientation<<<grid, threads>>>(featureX, featureY, featureSize, featureDir);
icvCalcOrientation<<<grid, threads>>>(texSum, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
@ -724,12 +706,14 @@ namespace cv { namespace cuda { namespace device
{
typedef uchar elem_type;
__device__ WinReader(cudev::TexturePtr<uchar> tex_) : tex(tex_) {};
__device__ __forceinline__ uchar operator ()(int i, int j) const
{
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
return tex2D(imgTex, pixel_x, pixel_y);
return tex(pixel_y, pixel_x);
}
float centerX;
@ -739,19 +723,17 @@ namespace cv { namespace cuda { namespace device
float sin_dir;
int width;
int height;
cudev::TexturePtr<uchar> tex;
};
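// WinReader now carries the image texture explicitly instead of reading the
// former file-scope imgTex reference, so the descriptor kernels no longer
// depend on a global texture binding.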
__device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
float& dx, float& dy);
__device__ void calc_dx_dy(const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
__device__ void calc_dx_dy(cudev::TexturePtr<uchar> tex, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir,
float& dx, float& dy)
{
__shared__ float s_PATCH[PATCH_SZ + 1][PATCH_SZ + 1];
dx = dy = 0.0f;
WinReader win;
WinReader win(tex);
win.centerX = featureX[blockIdx.x];
win.centerY = featureY[blockIdx.x];
@ -813,14 +795,14 @@ namespace cv { namespace cuda { namespace device
}
}
__global__ void compute_descriptors_64(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
__global__ void compute_descriptors_64(cudev::TexturePtr<uchar> texImg, PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{
__shared__ float smem[32 * 16];
float* sRow = smem + threadIdx.y * 32;
float dx, dy;
calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
calc_dx_dy(texImg, featureX, featureY, featureSize, featureDir, dx, dy);
float dxabs = ::fabsf(dx);
float dyabs = ::fabsf(dy);
@ -839,14 +821,14 @@ namespace cv { namespace cuda { namespace device
*descriptors_block = make_float4(dx, dy, dxabs, dyabs);
}
__global__ void compute_descriptors_128(PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
__global__ void compute_descriptors_128(cudev::TexturePtr<uchar> texImg, PtrStep<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{
__shared__ float smem[32 * 16];
float* sRow = smem + threadIdx.y * 32;
float dx, dy;
calc_dx_dy(featureX, featureY, featureSize, featureDir, dx, dy);
calc_dx_dy(texImg, featureX, featureY, featureSize, featureDir, dx, dy);
float4* descriptors_block = descriptors.ptr(blockIdx.x) + threadIdx.y * 2;
@ -925,13 +907,13 @@ namespace cv { namespace cuda { namespace device
descriptor_base[threadIdx.x] = val / s_len;
}
void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
void compute_descriptors_gpu(const PtrStepSzb& img, PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures)
{
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
cudev::Texture<unsigned char> texImg(img);
if (descriptors.cols == 64)
{
compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
compute_descriptors_64<<<nFeatures, dim3(32, 16)>>>(texImg, descriptors, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
@ -943,7 +925,7 @@ namespace cv { namespace cuda { namespace device
}
else
{
compute_descriptors_128<<<nFeatures, dim3(32, 16)>>>(descriptors, featureX, featureY, featureSize, featureDir);
compute_descriptors_128<<<nFeatures, dim3(32, 16)>>>(texImg, descriptors, featureX, featureY, featureSize, featureDir);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
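On the two launches above (one block per feature, dim3(32, 16) threads): each descriptor is stored as float4s, so a 64-float descriptor is 16 float4s, one per threadIdx.y row, while a 128-float descriptor is 32 float4s, two per row, which is what the "descriptors.ptr(blockIdx.x) + threadIdx.y * 2" indexing earlier encodes. The arithmetic, as compile-time checks:

    #include <vector_types.h>   // float4

    static_assert(64  * sizeof(float) == 16 * sizeof(float4), "64 floats  == 16 x float4");
    static_assert(128 * sizeof(float) == 32 * sizeof(float4), "128 floats == 32 x float4");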

@ -94,23 +94,19 @@ namespace cv { namespace cuda { namespace device
void loadGlobalConstants(int maxCandidates, int maxFeatures, int img_rows, int img_cols, int nOctaveLayers, float hessianThreshold);
void loadOctaveConstants(int octave, int layer_rows, int layer_cols);
void bindImgTex(PtrStepSzb img);
size_t bindSumTex(PtrStepSz<unsigned int> sum);
size_t bindMaskSumTex(PtrStepSz<unsigned int> maskSum);
void icvCalcLayerDetAndTrace_gpu(const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
void icvCalcLayerDetAndTrace_gpu(const PtrStepSz<unsigned int>& sum, const PtrStepf& det, const PtrStepf& trace, int img_rows, int img_cols,
int octave, int nOctaveLayer);
void icvFindMaximaInLayer_gpu(const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
void icvFindMaximaInLayer_gpu(const PtrStepSz<unsigned int>& maskSum, const PtrStepf& det, const PtrStepf& trace, int4* maxPosBuffer, unsigned int* maxCounter,
int img_rows, int img_cols, int octave, bool use_mask, int nLayers);
void icvInterpolateKeypoint_gpu(const PtrStepf& det, const int4* maxPosBuffer, unsigned int maxCounter,
float* featureX, float* featureY, int* featureLaplacian, int* featureOctave, float* featureSize, float* featureHessian,
unsigned int* featureCounter);
void icvCalcOrientation_gpu(const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
void icvCalcOrientation_gpu(const PtrStepSz<unsigned int>& sum, const float* featureX, const float* featureY, const float* featureSize, float* featureDir, int nFeatures);
void compute_descriptors_gpu(PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
void compute_descriptors_gpu(const PtrStepSzb& img, PtrStepSz<float4> descriptors, const float* featureX, const float* featureY, const float* featureSize, const float* featureDir, int nFeatures);
}
}}}
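Seen from a call site, the declaration changes above remove the bind-then-call two-step. A hedged sketch: the row indices use the real SURF_CUDA enum, but orientationNewStyle and its arguments are placeholders, and it assumes the declarations above are reachable at cv::cuda::device scope:

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/xfeatures2d/cuda.hpp>

    void orientationNewStyle(const cv::cuda::GpuMat& sum, cv::cuda::GpuMat& keypoints)
    {
        using cv::cuda::SURF_CUDA;
        // Before: bindImgTex(img); sumOffset = bindSumTex(sum); then a call that
        // read the bound globals. After: the integral image is an explicit
        // argument, and GpuMat converts implicitly to PtrStepSz<unsigned int>.
        cv::cuda::device::icvCalcOrientation_gpu(sum,
            keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
            keypoints.ptr<float>(SURF_CUDA::SIZE_ROW), keypoints.ptr<float>(SURF_CUDA::ANGLE_ROW),
            keypoints.cols);
    }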
@ -138,10 +134,7 @@ namespace
class SURF_CUDA_Invoker
{
public:
SURF_CUDA_Invoker(cv::cuda::SURF_CUDA& surf, const GpuMat& img, const GpuMat& mask) :
surf_(surf),
img_cols(img.cols), img_rows(img.rows),
use_mask(!mask.empty())
SURF_CUDA_Invoker(cv::cuda::SURF_CUDA& surf, const GpuMat& img_, const GpuMat& mask) : surf_(surf), img(img_), img_cols(img_.cols), img_rows(img_.rows), use_mask(!mask.empty())
{
CV_Assert(!img.empty() && img.type() == CV_8UC1);
CV_Assert(mask.empty() || (mask.size() == img.size() && mask.type() == CV_8UC1));
@ -167,16 +160,12 @@ namespace
loadGlobalConstants(maxCandidates, maxFeatures, img_rows, img_cols, surf_.nOctaveLayers, static_cast<float>(surf_.hessianThreshold));
bindImgTex(img);
cuda::integral(img, surf_.sum);
sumOffset = bindSumTex(surf_.sum);
if (use_mask)
{
cuda::min(mask, 1.0, surf_.mask1);
cuda::integral(surf_.mask1, surf_.maskSum);
maskOffset = bindMaskSumTex(surf_.maskSum);
}
}
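A note on the types in this hunk: cuda::integral writes a CV_32SC1 sum image one row and one column larger than the input, and the same GpuMat is then handed to the *_gpu functions through GpuMat's implicit conversion to PtrStepSz<unsigned int>, so the 32-bit values are simply reinterpreted as unsigned. A small sketch; buildSumView is an illustrative name:

    #include <opencv2/cudaarithm.hpp>

    void buildSumView(const cv::cuda::GpuMat& img)        // img: CV_8UC1
    {
        cv::cuda::GpuMat sum;
        cv::cuda::integral(img, sum);                     // CV_32SC1, (rows+1) x (cols+1)
        cv::cuda::PtrStepSz<unsigned int> sumView = sum;  // same memory, viewed as unsigned
        (void)sumView;  // this is what icvCalcOrientation_gpu & co. receive
    }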
@ -195,9 +184,9 @@ namespace
const int layer_cols = img_cols >> octave;
loadOctaveConstants(octave, layer_rows, layer_cols);
icvCalcLayerDetAndTrace_gpu(surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers);
icvCalcLayerDetAndTrace_gpu(surf_.sum, surf_.det, surf_.trace, img_rows, img_cols, octave, surf_.nOctaveLayers);
icvFindMaximaInLayer_gpu(surf_.det, surf_.trace, surf_.maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave,
icvFindMaximaInLayer_gpu(surf_.maskSum, surf_.det, surf_.trace, surf_.maxPosBuffer.ptr<int4>(), counters.ptr<unsigned int>() + 1 + octave,
img_rows, img_cols, octave, use_mask, surf_.nOctaveLayers);
unsigned int maxCounter;
@ -230,7 +219,7 @@ namespace
const int nFeatures = keypoints.cols;
if (nFeatures > 0)
{
icvCalcOrientation_gpu(keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
icvCalcOrientation_gpu(surf_.sum, keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
keypoints.ptr<float>(SURF_CUDA::SIZE_ROW), keypoints.ptr<float>(SURF_CUDA::ANGLE_ROW), nFeatures);
}
}
@ -241,7 +230,7 @@ namespace
if (nFeatures > 0)
{
ensureSizeIsEnough(nFeatures, descriptorSize, CV_32F, descriptors);
compute_descriptors_gpu(descriptors, keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
compute_descriptors_gpu(img, descriptors, keypoints.ptr<float>(SURF_CUDA::X_ROW), keypoints.ptr<float>(SURF_CUDA::Y_ROW),
keypoints.ptr<float>(SURF_CUDA::SIZE_ROW), keypoints.ptr<float>(SURF_CUDA::ANGLE_ROW), nFeatures);
}
}
@ -252,6 +241,8 @@ namespace
SURF_CUDA& surf_;
GpuMat img;
int img_cols, img_rows;
bool use_mask;
@ -259,9 +250,6 @@ namespace
int maxCandidates;
int maxFeatures;
size_t maskOffset;
size_t sumOffset;
GpuMat counters;
};
}
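One consequence of the new GpuMat img member above: GpuMat copies are shallow and reference counted, so the invoker keeps the input image's device memory alive for its whole lifetime, which the descriptor stage (and the texture object built over img) relies on. For example:

    #include <opencv2/core/cuda.hpp>

    void shallowCopyDemo()
    {
        cv::cuda::GpuMat a(100, 100, CV_8UC1);
        cv::cuda::GpuMat b = a;        // shallow copy: same device pointer, refcount + 1
        CV_Assert(a.data == b.data);   // memory is released only after both are gone
    }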
