diff --git a/modules/gpu/src/cuda/stereobp.cu b/modules/gpu/src/cuda/stereobp.cu
index 05a19b4199..d011c7f4fe 100644
--- a/modules/gpu/src/cuda/stereobp.cu
+++ b/modules/gpu/src/cuda/stereobp.cu
@@ -255,7 +255,7 @@ namespace cv { namespace gpu { namespace device
         ///////////////////////////////////////////////////////////////
 
         template <typename T>
-        __global__ void data_step_down(int dst_cols, int dst_rows, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
+        __global__ void data_step_down(int dst_cols, int dst_rows, int src_cols, int src_rows, const PtrStep<T> src, PtrStep<T> dst)
         {
             const int x = blockIdx.x * blockDim.x + threadIdx.x;
             const int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -264,10 +264,10 @@ namespace cv { namespace gpu { namespace device
             {
                 for (int d = 0; d < cndisp; ++d)
                 {
-                    float dst_reg  = src.ptr(d * src_rows + (2*y+0))[(2*x+0)];
-                          dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+0)];
-                          dst_reg += src.ptr(d * src_rows + (2*y+0))[(2*x+1)];
-                          dst_reg += src.ptr(d * src_rows + (2*y+1))[(2*x+1)];
+                    float dst_reg  = src.ptr(d * src_rows + ::min(2*y+0, src_rows-1))[::min(2*x+0, src_cols-1)];
+                          dst_reg += src.ptr(d * src_rows + ::min(2*y+1, src_rows-1))[::min(2*x+0, src_cols-1)];
+                          dst_reg += src.ptr(d * src_rows + ::min(2*y+0, src_rows-1))[::min(2*x+1, src_cols-1)];
+                          dst_reg += src.ptr(d * src_rows + ::min(2*y+1, src_rows-1))[::min(2*x+1, src_cols-1)];
 
                     dst.ptr(d * dst_rows + y)[x] = saturate_cast<T>(dst_reg);
                 }
@@ -275,7 +275,7 @@ namespace cv { namespace gpu { namespace device
         }
 
         template<typename T>
-        void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream)
+        void data_step_down_gpu(int dst_cols, int dst_rows, int src_cols, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream)
         {
             dim3 threads(32, 8, 1);
             dim3 grid(1, 1, 1);
@@ -283,15 +283,15 @@ namespace cv { namespace gpu { namespace device
             grid.x = divUp(dst_cols, threads.x);
             grid.y = divUp(dst_rows, threads.y);
 
-            data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (PtrStepSz<T>)src, (PtrStepSz<T>)dst);
+            data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_cols, src_rows, (PtrStepSz<T>)src, (PtrStepSz<T>)dst);
             cudaSafeCall( cudaGetLastError() );
 
             if (stream == 0)
                 cudaSafeCall( cudaDeviceSynchronize() );
         }
 
-        template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
-        template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
+        template void data_step_down_gpu<short>(int dst_cols, int dst_rows, int src_cols, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
+        template void data_step_down_gpu<float>(int dst_cols, int dst_rows, int src_cols, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
 
         ///////////////////////////////////////////////////////////////
         /////////////////// level up messages  ////////////////////////
diff --git a/modules/gpu/src/stereobp.cpp b/modules/gpu/src/stereobp.cpp
index 3b827a3129..2bcefe3770 100644
--- a/modules/gpu/src/stereobp.cpp
+++ b/modules/gpu/src/stereobp.cpp
@@ -67,7 +67,7 @@ namespace cv { namespace gpu { namespace device
         template<typename T, typename D>
         void comp_data_gpu(const PtrStepSzb& left, const PtrStepSzb& right, const PtrStepSzb& data, cudaStream_t stream);
         template<typename T>
-        void data_step_down_gpu(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
+        void data_step_down_gpu(int dst_cols, int dst_rows, int src_cols, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
         template <typename T>
         void level_up_messages_gpu(int dst_idx, int dst_cols, int dst_rows, int src_rows, PtrStepSzb* mus, PtrStepSzb* mds, PtrStepSzb* mls, PtrStepSzb* mrs, cudaStream_t stream);
         template <typename T>
@@ -158,7 +158,7 @@ namespace
 
             init(stream);
 
-            datas[0].create(rows * rthis.ndisp, cols, rthis.msg_type);
+            createContinuous(rows * rthis.ndisp, cols, rthis.msg_type, datas[0]);
 
             comp_data_callers[rthis.msg_type == CV_32F][left.channels()](left, right, datas[0], StreamAccessor::getStream(stream));
 
@@ -187,10 +187,10 @@ namespace
     private:
         void init(Stream& stream)
         {
-            u.create(rows * rthis.ndisp, cols, rthis.msg_type);
-            d.create(rows * rthis.ndisp, cols, rthis.msg_type);
-            l.create(rows * rthis.ndisp, cols, rthis.msg_type);
-            r.create(rows * rthis.ndisp, cols, rthis.msg_type);
+            createContinuous(rows * rthis.ndisp, cols, rthis.msg_type, u);
+            createContinuous(rows * rthis.ndisp, cols, rthis.msg_type, d);
+            createContinuous(rows * rthis.ndisp, cols, rthis.msg_type, l);
+            createContinuous(rows * rthis.ndisp, cols, rthis.msg_type, r);
 
             if (rthis.levels & 1)
             {
@@ -213,13 +213,13 @@ namespace
 
             if (rthis.levels > 1)
             {
-                int less_rows = rows / 2;
-                int less_cols = cols / 2;
+                int less_rows = (rows + 1) / 2;
+                int less_cols = (cols + 1) / 2;
 
-                u2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
-                d2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
-                l2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
-                r2.create(less_rows * rthis.ndisp, less_cols, rthis.msg_type);
+                createContinuous(less_rows * rthis.ndisp, less_cols, rthis.msg_type, u2);
+                createContinuous(less_rows * rthis.ndisp, less_cols, rthis.msg_type, d2);
+                createContinuous(less_rows * rthis.ndisp, less_cols, rthis.msg_type, l2);
+                createContinuous(less_rows * rthis.ndisp, less_cols, rthis.msg_type, r2);
 
                 if ((rthis.levels & 1) == 0)
                 {
@@ -253,7 +253,7 @@ namespace
 
         void calcBP(GpuMat& disp, Stream& stream)
         {
-            typedef void (*data_step_down_t)(int dst_cols, int dst_rows, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
+            typedef void (*data_step_down_t)(int dst_cols, int dst_rows, int src_cols, int src_rows, const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
             static const data_step_down_t data_step_down_callers[2] =
             {
                 data_step_down_gpu<short>, data_step_down_gpu<float>
@@ -283,12 +283,12 @@ namespace
 
             for (int i = 1; i < rthis.levels; ++i)
             {
-                cols_all[i] = cols_all[i-1] / 2;
-                rows_all[i] = rows_all[i-1] / 2;
+                cols_all[i] = (cols_all[i-1] + 1) / 2;
+                rows_all[i] = (rows_all[i-1] + 1) / 2;
 
-                datas[i].create(rows_all[i] * rthis.ndisp, cols_all[i], rthis.msg_type);
+                createContinuous(rows_all[i] * rthis.ndisp, cols_all[i], rthis.msg_type, datas[i]);
 
-                data_step_down_callers[funcIdx](cols_all[i], rows_all[i], rows_all[i-1], datas[i-1], datas[i], cudaStream);
+                data_step_down_callers[funcIdx](cols_all[i], rows_all[i], cols_all[i-1], rows_all[i-1], datas[i-1], datas[i], cudaStream);
             }
 
             PtrStepSzb mus[] = {u, u2};