diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8efc48e1c6..0d38deb1e6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -759,7 +759,7 @@ if(HAVE_CUDA)
   status("    Use CUFFT:" HAVE_CUFFT THEN YES ELSE NO)
   status("    Use CUBLAS:" HAVE_CUBLAS THEN YES ELSE NO)
   status("    NVIDIA GPU arch:" ${OPENCV_CUDA_ARCH_BIN})
-  status("    NVIDIA PTX archs:" ${OPENCV_CUDA_ARCH_BIN})
+  status("    NVIDIA PTX archs:" ${OPENCV_CUDA_ARCH_PTX})
 endif()
 
 # ========================== python ==========================
diff --git a/modules/gpu/src/cuda/hog.cu b/modules/gpu/src/cuda/hog.cu
index 8cdbc79df3..3a3abd47a7 100644
--- a/modules/gpu/src/cuda/hog.cu
+++ b/modules/gpu/src/cuda/hog.cu
@@ -149,8 +149,8 @@ namespace cv { namespace gpu { namespace device
                 float2 vote = *(const float2*)grad_ptr;
                 uchar2 bin = *(const uchar2*)qangle_ptr;
 
-                grad_ptr += grad.step/grad.elemSize();
-                qangle_ptr += qangle.step/qangle.elemSize();
+                grad_ptr += grad.step/sizeof(float);
+                qangle_ptr += qangle.step;
 
                 int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
                 int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
diff --git a/modules/gpu/src/cuda/stereobp.cu b/modules/gpu/src/cuda/stereobp.cu
index e45d0d4bbb..cf9a743632 100644
--- a/modules/gpu/src/cuda/stereobp.cu
+++ b/modules/gpu/src/cuda/stereobp.cu
@@ -140,7 +140,7 @@ namespace cv { namespace gpu { namespace device
             const uchar* rs = right.ptr(y) + x * cn;
             D* ds = data.ptr(y) + x;
 
-            const size_t disp_step = data.step * left.rows / PtrStep<D>::elem_size;
+            const size_t disp_step = data.step * left.rows / sizeof(D);
 
             for (int disp = 0; disp < cndisp; disp++)
             {
@@ -303,8 +303,8 @@ namespace cv { namespace gpu { namespace device
             if (x < dst_cols && y < dst_rows)
             {
-                const size_t dst_disp_step = dst.step * dst_rows / PtrStep<T>::elem_size;
-                const size_t src_disp_step = src.step * src_rows / PtrStep<T>::elem_size;
+                const size_t dst_disp_step = dst.step * dst_rows / sizeof(T);
+                const size_t src_disp_step = src.step * src_rows / sizeof(T);
 
                 T* dstr = dst.ptr(y ) + x;
                 const T* srcr = src.ptr(y/2) + x/2;
 
@@ -419,26 +419,26 @@ namespace cv { namespace gpu { namespace device
        }
 
        template <typename T>
-       __global__ void one_iteration(int t, PtrStep<T> u, T* d, T* l, T* r, const PtrStep<T> data, int cols, int rows)
+       __global__ void one_iteration(int t, int elem_step, T* u, T* d, T* l, T* r, const PtrStep<T> data, int cols, int rows)
        {
            const int y = blockIdx.y * blockDim.y + threadIdx.y;
            const int x = ((blockIdx.x * blockDim.x + threadIdx.x) << 1) + ((y + t) & 1);
 
            if ((y > 0) && (y < rows - 1) && (x > 0) && (x < cols - 1))
            {
-               T* us = u.ptr(y) + x;
-               T* ds = d + y * u.step/PtrStep<T>::elem_size + x;
-               T* ls = l + y * u.step/PtrStep<T>::elem_size + x;
-               T* rs = r + y * u.step/PtrStep<T>::elem_size + x;
+               T* us = u + y * elem_step + x;
+               T* ds = d + y * elem_step + x;
+               T* ls = l + y * elem_step + x;
+               T* rs = r + y * elem_step + x;
                const T* dt = data.ptr(y) + x;
 
-               size_t msg_disp_step = u.step * rows;
-               size_t data_disp_step = data.step * rows / PtrStep<T>::elem_size;
+               size_t msg_disp_step = elem_step * rows;
+               size_t data_disp_step = data.step * rows / sizeof(T);
 
-               message(us + u.step, ls + 1, rs - 1, dt, us, msg_disp_step, data_disp_step);
-               message(ds - u.step, ls + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step);
-               message(us + u.step, ds - u.step, rs - 1, dt, rs, msg_disp_step, data_disp_step);
-               message(us + u.step, ds - u.step, ls + 1, dt, ls, msg_disp_step, data_disp_step);
+               message(us + elem_step, ls + 1, rs - 1, dt, us, msg_disp_step, data_disp_step);
+               message(ds - elem_step, ls + 1, rs - 1, dt, ds, msg_disp_step, data_disp_step);
+               message(us + elem_step, ds - elem_step, rs - 1, dt, rs, msg_disp_step, data_disp_step);
+               message(us + elem_step, ds - elem_step, ls + 1, dt, ls, msg_disp_step, data_disp_step);
            }
        }
 
@@ -452,9 +452,11 @@ namespace cv { namespace gpu { namespace device
            grid.x = divUp(cols, threads.x << 1);
            grid.y = divUp(rows, threads.y);
 
+           int elem_step = u.step/sizeof(T);
+
            for(int t = 0; t < iters; ++t)
            {
-               one_iteration<T><<<grid, threads, 0, stream>>>(t, (PtrStepSz<T>)u, (T*)d.data, (T*)l.data, (T*)r.data, (PtrStepSz<T>)data, cols, rows);
+               one_iteration<T><<<grid, threads, 0, stream>>>(t, elem_step, (T*)u.data, (T*)d.data, (T*)l.data, (T*)r.data, (PtrStepSz<T>)data, cols, rows);
                cudaSafeCall( cudaGetLastError() );
 
                if (stream == 0)
@@ -470,7 +472,7 @@ namespace cv { namespace gpu { namespace device
        ///////////////////////////////////////////////////////////////
 
        template <typename T>
-       __global__ void output(const PtrStep<T> u, const T* d, const T* l, const T* r, const T* data,
+       __global__ void output(const int elem_step, const T* u, const T* d, const T* l, const T* r, const T* data,
                               PtrStepSz<short> disp)
        {
            const int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -478,13 +480,13 @@ namespace cv { namespace gpu { namespace device
 
            if (y > 0 && y < disp.rows - 1 && x > 0 && x < disp.cols - 1)
            {
-               const T* us = u.ptr(y + 1) + x;
-               const T* ds = d + (y - 1) * u.step/PtrStep<T>::elem_size + x;
-               const T* ls = l + y * u.step/PtrStep<T>::elem_size + (x + 1);
-               const T* rs = r + y * u.step/PtrStep<T>::elem_size + (x - 1);
-               const T* dt = data + y * u.step/PtrStep<T>::elem_size + x;
+               const T* us = u + (y + 1) * elem_step + x;
+               const T* ds = d + (y - 1) * elem_step + x;
+               const T* ls = l + y * elem_step + (x + 1);
+               const T* rs = r + y * elem_step+ (x - 1);
+               const T* dt = data + y * elem_step + x;
 
-               size_t disp_step = disp.rows * u.step/PtrStep<T>::elem_size;
+               size_t disp_step = disp.rows * elem_step;
 
                int best = 0;
                float best_val = numeric_limits<float>::max();
@@ -517,7 +519,9 @@ namespace cv { namespace gpu { namespace device
            grid.x = divUp(disp.cols, threads.x);
            grid.y = divUp(disp.rows, threads.y);
 
-           output<T><<<grid, threads, 0, stream>>>((PtrStepSz<T>)u, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);
+           int elem_step = static_cast<int>(u.step/sizeof(T));
+
+           output<T><<<grid, threads, 0, stream>>>(elem_step, (const T*)u.data, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);
            cudaSafeCall( cudaGetLastError() );
 
            if (stream == 0)