diff --git a/modules/gpu/src/color.cpp b/modules/gpu/src/color.cpp
index dc022f437e..51a71dd184 100644
--- a/modules/gpu/src/color.cpp
+++ b/modules/gpu/src/color.cpp
@@ -243,7 +243,7 @@ namespace
                     if (dcn <= 0) dcn = 3;
                     CV_Assert((scn == 3 || scn == 4) && (dcn == 3 || dcn == 4));
 
-                    bidx = code == CV_BGR2YCrCb || code == CV_RGB2YUV ? 0 : 2;
+                    bidx = code == CV_BGR2YCrCb || code == CV_BGR2YUV ? 0 : 2;
 
                     static const float yuv_f[] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };
                     static const int yuv_i[] = { B2Y, G2Y, R2Y, 8061, 14369 };
@@ -281,7 +281,7 @@ namespace
 
                     CV_Assert((scn == 3 || scn == 4) && (dcn == 3 || dcn == 4));
 
-                    bidx = code == CV_YCrCb2BGR || code == CV_YUV2RGB ? 0 : 2;
+                    bidx = code == CV_YCrCb2BGR || code == CV_YUV2BGR ? 0 : 2;
 
                     static const float yuv_f[] = { 2.032f, -0.395f, -0.581f, 1.140f };
                     static const int yuv_i[] = { 33292, -6472, -9519, 18678 }; 
@@ -391,9 +391,9 @@ namespace
                         
                     dst.create(sz, CV_MAKETYPE(depth, dcn));
                     
-                    //const void* coeffs = depth == CV_32F ? (void*)coeffs_f : (void*)coeffs_i;
+                    const void* coeffs = depth == CV_32F ? (void*)coeffs_f : (void*)coeffs_i;
 
-                    funcs[depth](src, scn, dst, dcn, coeffs_i, stream);
+                    funcs[depth](src, scn, dst, dcn, coeffs, stream);
                     break;
                 }
 
diff --git a/modules/gpu/src/matrix_operations.cpp b/modules/gpu/src/matrix_operations.cpp
index 52821f9c7a..2a79167e32 100644
--- a/modules/gpu/src/matrix_operations.cpp
+++ b/modules/gpu/src/matrix_operations.cpp
@@ -590,10 +590,21 @@ void cv::gpu::ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m)
 bool cv::gpu::CudaMem::canMapHostMemory()
 {
     cudaDeviceProp prop;
-    cudaGetDeviceProperties(&prop, 0);
+    cudaGetDeviceProperties(&prop, getDevice()); // query the device index returned by getDevice() instead of hard-coded device 0
     return (prop.canMapHostMemory != 0) ? true : false;
 }
 
+namespace
+{
+    // Rounds `what` up to the next multiple of `alignment`; the mask trick is
+    // only valid when `alignment` is a power of two. Uses size_t (not int) so
+    // cudaDeviceProp::textureAlignment and large row steps are not narrowed.
+    size_t alignUp(size_t what, size_t alignment)
+    {
+        return (what + (alignment - 1)) & ~(alignment - 1);
+    }
+}
+
 void cv::gpu::CudaMem::create(int _rows, int _cols, int _type, int _alloc_type)
 {
     if (_alloc_type == ALLOC_ZEROCOPY && !canMapHostMemory())
@@ -611,6 +622,12 @@ void cv::gpu::CudaMem::create(int _rows, int _cols, int _type, int _alloc_type)
         rows = _rows;
         cols = _cols;
         step = elemSize()*cols;
+        if (_alloc_type == ALLOC_ZEROCOPY)
+        {
+            cudaDeviceProp prop;
+            cudaGetDeviceProperties(&prop, getDevice());
+            step = alignUp(step, prop.textureAlignment);
+        }
         int64 _nettosize = (int64)step*rows;
         size_t nettosize = (size_t)_nettosize;
         if( _nettosize != (int64)nettosize )