diff --git a/modules/gpu/src/cuda/bf_radius_match.cu b/modules/gpu/src/cuda/bf_radius_match.cu
index 0aa71913b2..c6cba928c1 100644
--- a/modules/gpu/src/cuda/bf_radius_match.cu
+++ b/modules/gpu/src/cuda/bf_radius_match.cu
@@ -56,7 +56,7 @@ namespace cv { namespace gpu { namespace device
         __global__ void matchUnrolled(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
             PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
         {
-            #if __CUDA_ARCH__ >= 110
+            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)

             extern __shared__ int smem[];

@@ -168,7 +168,7 @@ namespace cv { namespace gpu { namespace device
         __global__ void match(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
             PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
         {
-            #if __CUDA_ARCH__ >= 110
+            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)

             extern __shared__ int smem[];

diff --git a/modules/gpu/src/cuda/canny.cu b/modules/gpu/src/cuda/canny.cu
index f9dd490bf6..95028c2eaf 100644
--- a/modules/gpu/src/cuda/canny.cu
+++ b/modules/gpu/src/cuda/canny.cu
@@ -261,7 +261,7 @@ namespace cv { namespace gpu { namespace device
         __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
         {
-            #if __CUDA_ARCH__ >= 120
+            #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120)

             __shared__ int smem[18][18];

@@ -358,7 +358,7 @@ namespace cv { namespace gpu { namespace device
         __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
         {
-            #if __CUDA_ARCH__ >= 120
+            #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 120

             const int stack_size = 512;

diff --git a/modules/gpu/src/cuda/column_filter.cu b/modules/gpu/src/cuda/column_filter.cu
index 624bd3fbc0..fb52d6e989 100644
--- a/modules/gpu/src/cuda/column_filter.cu
+++ b/modules/gpu/src/cuda/column_filter.cu
@@ -64,7 +64,7 @@ namespace cv { namespace gpu { namespace device
         template <int KSIZE, typename T, typename D, typename B>
         __global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, const int anchor, const B brd)
         {
-            #if __CUDA_ARCH__ >= 200
+            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
             const int BLOCK_DIM_X = 16;
             const int BLOCK_DIM_Y = 16;
             const int PATCH_PER_BLOCK = 4;
diff --git a/modules/gpu/src/cuda/fast.cu b/modules/gpu/src/cuda/fast.cu
index 8f904cd985..a511e8f35f 100644
--- a/modules/gpu/src/cuda/fast.cu
+++ b/modules/gpu/src/cuda/fast.cu
@@ -223,7 +223,7 @@ namespace cv { namespace gpu { namespace device
         template <bool calcScore, class Mask>
         __global__ void calcKeypoints(const DevMem2Db img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold)
         {
-            #if __CUDA_ARCH__ >= 110
+            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)

             const int j = threadIdx.x + blockIdx.x * blockDim.x + 3;
             const int i = threadIdx.y + blockIdx.y * blockDim.y + 3;
@@ -325,7 +325,7 @@ namespace cv { namespace gpu { namespace device
         __global__ void nonmaxSupression(const short2* kpLoc, int count, const DevMem2Di scoreMat, short2* locFinal, float* responseFinal)
         {
-            #if __CUDA_ARCH__ >= 110
+            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)

             const int kpIdx = threadIdx.x + blockIdx.x * blockDim.x;

diff --git a/modules/gpu/src/cuda/hist.cu b/modules/gpu/src/cuda/hist.cu
index a7f45719ea..0a08f82f41 100644
--- a/modules/gpu/src/cuda/hist.cu
+++ b/modules/gpu/src/cuda/hist.cu
@@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device
     #define MERGE_THREADBLOCK_SIZE 256

-    #define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120)
+    #define USE_SMEM_ATOMICS (defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 120))

     namespace hist
     {
diff --git a/modules/gpu/src/cuda/integral_image.cu b/modules/gpu/src/cuda/integral_image.cu
index ead0ddefec..5cb777dd92 100644
--- a/modules/gpu/src/cuda/integral_image.cu
+++ b/modules/gpu/src/cuda/integral_image.cu
@@ -59,7 +59,7 @@ namespace cv { namespace gpu { namespace device
         __global__ void shfl_integral_horizontal(const PtrStep_<uint4> img, PtrStep_<uint4> integral)
         {
-            #if __CUDA_ARCH__ >= 300
+            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
             __shared__ int sums[128];

             const int id = threadIdx.x;
@@ -299,7 +299,7 @@ namespace cv { namespace gpu { namespace device
         // block sums.
         __global__ void shfl_integral_vertical(DevMem2D_<unsigned int> integral)
         {
-            #if __CUDA_ARCH__ >= 300
+            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
             __shared__ unsigned int sums[32][9];

             const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/modules/gpu/src/cuda/matrix_reductions.cu b/modules/gpu/src/cuda/matrix_reductions.cu
index a0be65c3cd..d9b1b5cba4 100644
--- a/modules/gpu/src/cuda/matrix_reductions.cu
+++ b/modules/gpu/src/cuda/matrix_reductions.cu
@@ -215,7 +215,7 @@ namespace cv { namespace gpu { namespace device
                 maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
             }

-        #if __CUDA_ARCH__ >= 110
+        #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
             __shared__ bool is_last;

             if (tid == 0)
@@ -535,7 +535,7 @@ namespace cv { namespace gpu { namespace device

             findMinMaxLocInSmem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);

-        #if __CUDA_ARCH__ >= 110
+        #if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
             __shared__ bool is_last;

             if (tid == 0)
@@ -841,7 +841,7 @@ namespace cv { namespace gpu { namespace device

             sumInSmem<nthreads, uint>(scount, tid);

-        #if __CUDA_ARCH__ >= 110
+        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
             __shared__ bool is_last;

             if (tid == 0)
@@ -1034,7 +1034,7 @@ namespace cv { namespace gpu { namespace device

             sumInSmem<nthreads, R>(smem, tid);

-        #if __CUDA_ARCH__ >= 110
+        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
             __shared__ bool is_last;

             if (tid == 0)
@@ -1115,7 +1115,7 @@ namespace cv { namespace gpu { namespace device
             sumInSmem<nthreads, R>(smem, tid);
             sumInSmem<nthreads, R>(smem + nthreads, tid);

-        #if __CUDA_ARCH__ >= 110
+        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
             __shared__ bool is_last;

             if (tid == 0)
@@ -1222,7 +1222,7 @@ namespace cv { namespace gpu { namespace device
             sumInSmem<nthreads, R>(smem + nthreads, tid);
             sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);

-        #if __CUDA_ARCH__ >= 110
+        #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
             __shared__ bool is_last;

             if (tid == 0)
@@ -1339,7 +1339,7 @@ namespace cv { namespace gpu { namespace device
             sumInSmem<nthreads, R>(smem + 2 * nthreads, tid);
             sumInSmem<nthreads, R>(smem + 3 * nthreads, tid);

-        #if __CUDA_ARCH__ >= 110
+        #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
             __shared__ bool is_last;

             if (tid == 0)
@@ -1975,7 +1975,7 @@ namespace cv { namespace gpu { namespace device
             for (int c = 0; c < cn; ++c)
                 myVal[c] = op.startValue();

-        #if __CUDA_ARCH__ >= 200
+        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 200

             // For cc >= 2.0 prefer L1 cache
             for (int x = threadIdx.x; x < src.cols; x += 256)
diff --git a/modules/gpu/src/cuda/pyrlk.cu b/modules/gpu/src/cuda/pyrlk.cu
index b06d607687..271660515b 100644
--- a/modules/gpu/src/cuda/pyrlk.cu
+++ b/modules/gpu/src/cuda/pyrlk.cu
@@ -82,7 +82,7 @@ namespace cv { namespace gpu { namespace device
         smem3[tid] = val3;
         __syncthreads();

-#if __CUDA_ARCH__ > 110
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110)
         if (tid < 128)
         {
             smem1[tid] = val1 += smem1[tid + 128];
@@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device
         smem2[tid] = val2;
         __syncthreads();

-#if __CUDA_ARCH__ > 110
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110)
         if (tid < 128)
         {
             smem1[tid] = val1 += smem1[tid + 128];
@@ -184,7 +184,7 @@ namespace cv { namespace gpu { namespace device
         smem1[tid] = val1;
         __syncthreads();

-#if __CUDA_ARCH__ > 110
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110)
         if (tid < 128)
         {
             smem1[tid] = val1 += smem1[tid + 128];
@@ -271,7 +271,7 @@ namespace cv { namespace gpu { namespace device
     template <int cn, int PATCH_X, int PATCH_Y, bool calcErr>
     __global__ void lkSparse(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
     {
-#if __CUDA_ARCH__ <= 110
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ <= 110)
         __shared__ float smem1[128];
         __shared__ float smem2[128];
         __shared__ float smem3[128];
diff --git a/modules/gpu/src/cuda/row_filter.cu b/modules/gpu/src/cuda/row_filter.cu
index 8963212a23..d40af71496 100644
--- a/modules/gpu/src/cuda/row_filter.cu
+++ b/modules/gpu/src/cuda/row_filter.cu
@@ -64,7 +64,7 @@ namespace cv { namespace gpu { namespace device
         template <int KSIZE, typename T, typename D, typename B>
         __global__ void linearRowFilter(const DevMem2D_<T> src, PtrStep<D> dst, const int anchor, const B brd)
         {
-            #if __CUDA_ARCH__ >= 200
+            #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
             const int BLOCK_DIM_X = 32;
             const int BLOCK_DIM_Y = 8;
             const int PATCH_PER_BLOCK = 4;
diff --git a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
index f75453930b..0e6aca0889 100644
--- a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
+++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu
@@ -2070,7 +2070,7 @@ NCVStatus nppiStInterpolateFrames(const NppStInterpolationState *pState)
//==============================================================================


-#if __CUDA_ARCH__ < 200
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)

// FP32 atomic add
static __forceinline__ __device__ float _atomicAdd(float *addr, float val)
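
Every hunk above applies the same fix. nvcc compiles each .cu translation unit once for the host and once per target device architecture, and __CUDA_ARCH__ is defined only during the device passes. In a bare "#if __CUDA_ARCH__ >= 110", the undefined macro silently evaluates to 0 on the host pass; that happens to give the intended result for ">=" tests, but it trips undefined-macro warnings (e.g. GCC's -Wundef) and inverts the intent of tests like the "#if __CUDA_ARCH__ < 200" in NPP_staging.cu, where the FP32 _atomicAdd fallback would otherwise be compiled on the host pass as well. Adding defined(__CUDA_ARCH__) makes the host-pass behavior explicit. Below is a minimal standalone sketch of the pattern; the file name, kernel name, and build line are illustrative, not part of the patch.

// arch_guard_demo.cu -- illustrative sketch only, not from the patch.
// Build (with a toolkit of this era): nvcc -arch=sm_11 arch_guard_demo.cu
#include <cstdio>

__global__ void countThreads(int* out)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
    // Taken only on device passes targeting compute capability 1.1+,
    // where global-memory integer atomics are available.
    atomicAdd(out, 1);
#endif
    // On the host pass (and on an sm_10 device pass) the guard is false,
    // so the body compiles to a no-op instead of an error.
}

int main()
{
    int* d_out = 0;
    cudaMalloc(&d_out, sizeof(int));
    cudaMemset(d_out, 0, sizeof(int));

    countThreads<<<1, 32>>>(d_out);

    int h_out = 0;
    cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("threads counted: %d\n", h_out); // 32 when the guarded branch was compiled in

    cudaFree(d_out);
    return 0;
}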