diff --git a/modules/ocl/src/arithm.cpp b/modules/ocl/src/arithm.cpp
index c1147cb413..8d502ea56c 100644
--- a/modules/ocl/src/arithm.cpp
+++ b/modules/ocl/src/arithm.cpp
@@ -341,7 +341,7 @@ static void arithmetic_sum_buffer_run(const oclMat &src, cl_mem &dst, int vlen ,
     args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
     args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst ));
     size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1};
-    if(src.oclchannels() != 3)
+    if (src.oclchannels() != 3)
         openCLExecuteKernel(src.clCxt, &arithm_sum, "arithm_op_sum", gt, lt, args, -1, -1, build_options);
     else
         openCLExecuteKernel(src.clCxt, &arithm_sum_3, "arithm_op_sum_3", gt, lt, args, -1, -1, build_options);
@@ -365,9 +365,9 @@ Scalar arithmetic_sum(const oclMat &src, int type = 0)
 
     memset(p, 0, dbsize * sizeof(T));
     openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize * sizeof(T));
-    for(int i = 0; i < dbsize;)
+    for (int i = 0; i < dbsize;)
     {
-        for(int j = 0; j < src.oclchannels(); j++, i++)
+        for (int j = 0; j < src.oclchannels(); j++, i++)
             s.val[j] += p[i];
     }
     delete[] p;
@@ -378,9 +378,9 @@ Scalar arithmetic_sum(const oclMat &src, int type = 0)
 typedef Scalar (*sumFunc)(const oclMat &src, int type);
 Scalar cv::ocl::sum(const oclMat &src)
 {
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
     {
-        CV_Error(CV_GpuNotSupported, "select device don't support double");
+        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
     }
     static sumFunc functab[2] =
     {
@@ -395,9 +395,9 @@ Scalar cv::ocl::sum(const oclMat &src)
 
 Scalar cv::ocl::absSum(const oclMat &src)
 {
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
     {
-        CV_Error(CV_GpuNotSupported, "select device don't support double");
+        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
     }
     static sumFunc functab[2] =
     {
@@ -412,9 +412,9 @@ Scalar cv::ocl::absSum(const oclMat &src)
 
 Scalar cv::ocl::sqrSum(const oclMat &src)
 {
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
     {
-        CV_Error(CV_GpuNotSupported, "select device don't support double");
+        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
     }
     static sumFunc functab[2] =
     {
@@ -446,7 +446,7 @@ void cv::ocl::meanStdDev(const oclMat &src, Scalar &mean, Scalar &stddev)
     m1 = (Mat)dst1;
     m2 = (Mat)dst2;
     int i = 0, *p = (int *)m1.data, *q = (int *)m2.data;
-    for(; i < channels; i++)
+    for (; i < channels; i++)
     {
         mean.val[i] = (double)p[i] / (src.cols * src.rows);
         stddev.val[i] = std::sqrt(std::max((double) q[i] / (src.cols * src.rows) - mean.val[i] * mean.val[i] , 0.));
@@ -476,7 +476,7 @@ static void arithmetic_minMax_run(const oclMat &src, const oclMat &mask, cl_mem
     args.push_back( make_pair( sizeof(cl_int) , (void *)&elemnum));
     args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum));
     args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data));
-    if(!mask.empty())
+    if (!mask.empty())
     {
         int mall_cols = mask.step / (vlen * mask.elemSize1());
         int mpre_cols = (mask.offset % mask.step) / (vlen * mask.elemSize1());
@@ -499,7 +499,7 @@ static void arithmetic_minMax_mask_run(const oclMat &src, const oclMat &mask, cl
     vector<pair<size_t , const void *> > args;
     size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1};
     char build_options[50];
-    if(src.oclchannels() == 1)
+    if (src.oclchannels() == 1)
     {
         int cols = (src.cols - 1) / vlen + 1;
         int invalid_cols = src.step / (vlen * src.elemSize1()) - cols;
@@ -519,8 +519,6 @@ static void arithmetic_minMax_mask_run(const oclMat &src, const oclMat &mask, cl
         args.push_back( make_pair( sizeof(cl_int) , (void *)&moffset ));
         args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data ));
         args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst ));
-        //        printf("elemnum:%d,cols:%d,invalid_cols:%d,offset:%d,minvalid_cols:%d,moffset:%d,repeat_e:%d\r\n",
-        //               elemnum,cols,invalid_cols,offset,minvalid_cols,moffset,repeat_me);
         openCLExecuteKernel(src.clCxt, &arithm_minMax_mask, kernelName, gt, lt, args, -1, -1, build_options);
     }
 }
@@ -549,18 +547,18 @@ template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal,
 
     Mat matbuf = Mat(buf);
     T *p = matbuf.ptr<T>();
-    if(minVal != NULL)
+    if (minVal != NULL)
     {
         *minVal = std::numeric_limits<double>::max();
-        for(int i = 0; i < vlen * (int)groupnum; i++)
+        for (int i = 0; i < vlen * (int)groupnum; i++)
         {
             *minVal = *minVal < p[i] ? *minVal : p[i];
         }
     }
-    if(maxVal != NULL)
+    if (maxVal != NULL)
     {
         *maxVal = -std::numeric_limits<double>::max();
-        for(int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
+        for (int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
         {
             *maxVal = *maxVal > p[i] ? *maxVal : p[i];
         }
@@ -577,9 +575,9 @@ void cv::ocl::minMax(const oclMat &src, double *minVal, double *maxVal, const oc
 void cv::ocl::minMax_buf(const oclMat &src, double *minVal, double *maxVal, const oclMat &mask, oclMat &buf)
 {
     CV_Assert(src.oclchannels() == 1);
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
     {
-        CV_Error(CV_GpuNotSupported, "select device don't support double");
+        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
     }
     static minMaxFunc functab[8] =
     {
@@ -625,7 +623,7 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType)
         m = (gm2);
         p = (int *)m.data;
         r = -std::numeric_limits<double>::max();
-        for(i = 0; i < channels; i++)
+        for (i = 0; i < channels; i++)
         {
             r = std::max(r, (double)p[i]);
         }
@@ -635,7 +633,7 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType)
         //arithmetic_sum_run(gm1, gm2,"arithm_op_sum");
         m = (gm2);
         p = (int *)m.data;
-        for(i = 0; i < channels; i++)
+        for (i = 0; i < channels; i++)
         {
             r = r + (double)p[i];
         }
@@ -645,14 +643,14 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType)
         //arithmetic_sum_run(gm1, gm2,"arithm_op_squares_sum");
         m = (gm2);
         p = (int *)m.data;
-        for(i = 0; i < channels; i++)
+        for (i = 0; i < channels; i++)
         {
             r = r + (double)p[i];
         }
         r = std::sqrt(r);
         break;
     }
-    if(isRelative)
+    if (isRelative)
         r = r / norm(src2, normType);
     return r;
 }
@@ -663,9 +661,9 @@ double cv::ocl::norm(const oclMat &src1, const oclMat &src2, int normType)
 
 static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, string kernelName)
 {
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
+    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
     {
-        CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
+        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
     }
 
@@ -710,9 +708,9 @@ static void arithmetic_flip_rows_run(const oclMat &src, oclMat &dst, string kern
 
 static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, string kernelName, bool isVertical)
 {
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
+    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.type() == CV_64F)
     {
-        CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
+        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
     }
 
@@ -749,7 +747,7 @@ static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, string kern
     args.push_back( make_pair( sizeof(cl_int), (void *)&dst.rows ));
     args.push_back( make_pair( sizeof(cl_int), (void *)&dst.cols ));
 
-    if(isVertical)
+    if (isVertical)
         args.push_back( make_pair( sizeof(cl_int), (void *)&rows ));
     else
         args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
@@ -764,11 +762,11 @@ static void arithmetic_flip_cols_run(const oclMat &src, oclMat &dst, string kern
 void cv::ocl::flip(const oclMat &src, oclMat &dst, int flipCode)
 {
     dst.create(src.size(), src.type());
-    if(flipCode == 0)
+    if (flipCode == 0)
     {
         arithmetic_flip_rows_run(src, dst, "arithm_flip_rows");
     }
-    else if(flipCode > 0)
+    else if (flipCode > 0)
         arithmetic_flip_cols_run(src, dst, "arithm_flip_cols", false);
     else
         arithmetic_flip_cols_run(src, dst, "arithm_flip_rc", true);
@@ -877,9 +875,9 @@ void cv::ocl::log(const oclMat &src, oclMat &dst)
 
 static void arithmetic_magnitude_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName)
 {
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
+    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
     {
-        CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
+        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
     }
 
@@ -921,9 +919,9 @@ void cv::ocl::magnitude(const oclMat &src1, const oclMat &src2, oclMat &dst)
 
 static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string kernelName, const char **kernelString)
 {
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
+    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
     {
-        CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
+        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
     }
 
@@ -964,7 +962,7 @@ void cv::ocl::phase(const oclMat &x, const oclMat &y, oclMat &Angle , bool angle
     CV_Assert(x.type() == y.type() && x.size() == y.size() && (x.depth() == CV_32F || x.depth() == CV_64F));
     Angle.create(x.size(), x.type());
     string kernelName = angleInDegrees ? "arithm_phase_indegrees" : "arithm_phase_inradians";
-    if(angleInDegrees)
+    if (angleInDegrees)
         arithmetic_phase_run(x, y, Angle, kernelName, &arithm_phase);
     else
         arithmetic_phase_run(x, y, Angle, kernelName, &arithm_phase);
@@ -977,9 +975,9 @@ void cv::ocl::phase(const oclMat &x, const oclMat &y, oclMat &Angle , bool angle
 static void arithmetic_cartToPolar_run(const oclMat &src1, const oclMat &src2, oclMat &dst_mag, oclMat &dst_cart,
                                 string kernelName, bool angleInDegrees)
 {
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
+    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
     {
-        CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
+        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
     }
 
@@ -1030,9 +1028,9 @@ void cv::ocl::cartToPolar(const oclMat &x, const oclMat &y, oclMat &mag, oclMat
 static void arithmetic_ptc_run(const oclMat &src1, const oclMat &src2, oclMat &dst1, oclMat &dst2, bool angleInDegrees,
                         string kernelName)
 {
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
+    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE) && src1.type() == CV_64F)
     {
-        CV_Error(CV_GpuNotSupported, "Selected device don't support double\r\n");
+        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double\r\n");
         return;
     }
 
@@ -1048,7 +1046,7 @@ static void arithmetic_ptc_run(const oclMat &src1, const oclMat &src2, oclMat &d
 
     int tmp = angleInDegrees ? 1 : 0;
     vector<pair<size_t , const void *> > args;
-    if(src1.data)
+    if (src1.data)
     {
         args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
         args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step ));
@@ -1077,7 +1075,7 @@ void cv::ocl::polarToCart(const oclMat &magnitude, const oclMat &angle, oclMat &
     x.create(angle.size(), angle.type());
     y.create(angle.size(), angle.type());
 
-    if( magnitude.data )
+    if ( magnitude.data )
     {
         CV_Assert( magnitude.size() == angle.size() && magnitude.type() == angle.type() );
         arithmetic_ptc_run(magnitude, angle, x, y, angleInDegrees, "arithm_polarToCart_mag");
@@ -1119,7 +1117,7 @@ static void arithmetic_minMaxLoc_mask_run(const oclMat &src, const oclMat &mask,
     vector<pair<size_t , const void *> > args;
     size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1};
     char build_options[50];
-    if(src.oclchannels() == 1)
+    if (src.oclchannels() == 1)
     {
         int cols = (src.cols - 1) / vlen + 1;
         int invalid_cols = src.step / (vlen * src.elemSize1()) - cols;
@@ -1143,7 +1141,8 @@ static void arithmetic_minMaxLoc_mask_run(const oclMat &src, const oclMat &mask,
         openCLExecuteKernel(src.clCxt, &arithm_minMaxLoc_mask, "arithm_op_minMaxLoc_mask", gt, lt, args, -1, -1, build_options);
     }
 }
-template<typename T>
+
+template <typename T>
 void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
                           Point *minLoc, Point *maxLoc, const oclMat &mask)
 {
@@ -1164,12 +1163,12 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
     T *p = new T[groupnum * vlen * 4];
     memset(p, 0, dbsize);
     openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize);
-    for(int i = 0; i < vlen * (int)groupnum; i++)
+    for (int i = 0; i < vlen * (int)groupnum; i++)
     {
         *minVal = (*minVal < p[i] || p[i + 2 * vlen * groupnum] == -1) ? *minVal : p[i];
         minloc = (*minVal < p[i] || p[i + 2 * vlen * groupnum] == -1) ? minloc : cvRound(p[i + 2 * vlen * groupnum]);
     }
-    for(int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
+    for (int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
     {
         *maxVal = (*maxVal > p[i] || p[i + 2 * vlen * groupnum] == -1) ? *maxVal : p[i];
         maxloc = (*maxVal > p[i] || p[i + 2 * vlen * groupnum] == -1) ? maxloc : cvRound(p[i + 2 * vlen * groupnum]);
@@ -1178,9 +1177,9 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
     int pre_rows = src.offset / src.step;
     int pre_cols = (src.offset % src.step) / src.elemSize1();
     int wholecols = src.step / src.elemSize1();
-    if( minLoc )
+    if ( minLoc )
     {
-        if( minloc >= 0 )
+        if ( minloc >= 0 )
         {
             minLoc->y = minloc / wholecols - pre_rows;
             minLoc->x = minloc % wholecols - pre_cols;
@@ -1188,9 +1187,9 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
         else
             minLoc->x = minLoc->y = -1;
     }
-    if( maxLoc )
+    if ( maxLoc )
     {
-        if( maxloc >= 0 )
+        if ( maxloc >= 0 )
         {
             maxLoc->y = maxloc / wholecols - pre_rows;
             maxLoc->x = maxloc % wholecols - pre_cols;
@@ -1209,9 +1208,9 @@ typedef void (*minMaxLocFunc)(const oclMat &src, double *minVal, double *maxVal,
 void cv::ocl::minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
                         Point *minLoc, Point *maxLoc, const oclMat &mask)
 {
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
     {
-        CV_Error(CV_GpuNotSupported, "select device don't support double");
+        CV_Error(CV_GpuNotSupported, "Selected device doesn't support double");
         return;
     }
 
@@ -1259,12 +1258,11 @@ static void arithmetic_countNonZero_run(const oclMat &src, cl_mem &dst, int vlen
 int cv::ocl::countNonZero(const oclMat &src)
 {
     size_t groupnum = src.clCxt->computeUnits();
-    if(!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
+    if (!src.clCxt->supportsFeature(Context::CL_DOUBLE) && src.depth() == CV_64F)
     {
-        CV_Error(CV_GpuNotSupported, "select device don't support double");
+        CV_Error(CV_GpuNotSupported, "selected device doesn't support double");
     }
     CV_Assert(groupnum != 0);
-//    groupnum = groupnum * 2;
     int vlen = 8 , dbsize = groupnum * vlen;
     Context *clCxt = src.clCxt;
     string kernelName = "arithm_op_nonzero";
@@ -1274,7 +1272,7 @@ int cv::ocl::countNonZero(const oclMat &src)
 
     memset(p, 0, dbsize * sizeof(int));
     openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize * sizeof(int));
-    for(int i = 0; i < dbsize; i++)
+    for (int i = 0; i < dbsize; i++)
         nonzero += p[i];
 
     delete[] p;
@@ -1677,7 +1675,7 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string
     args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
 
     float pf = static_cast<float>(p);
-    if(!src1.clCxt->supportsFeature(Context::CL_DOUBLE))
+    if (!src1.clCxt->supportsFeature(Context::CL_DOUBLE))
         args.push_back( make_pair( sizeof(cl_float), (void *)&pf ));
     else
         args.push_back( make_pair( sizeof(cl_double), (void *)&p ));
@@ -1687,7 +1685,7 @@ static void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string
 
 void cv::ocl::pow(const oclMat &x, double p, oclMat &y)
 {
-    if(!x.clCxt->supportsFeature(Context::CL_DOUBLE) && x.type() == CV_64F)
+    if (!x.clCxt->supportsFeature(Context::CL_DOUBLE) && x.type() == CV_64F)
     {
         cout << "Selected device do not support double" << endl;
         return;
@@ -1714,14 +1712,14 @@ void cv::ocl::setIdentity(oclMat& src, double scalar)
     size_t global_threads[] = {src.cols, src.rows, 1};
 
     string kernelName = "setIdentityKernel";
-    if(src.type() == CV_32FC1)
+    if (src.type() == CV_32FC1)
         kernelName += "_F1";
-    else if(src.type() == CV_32SC1)
+    else if (src.type() == CV_32SC1)
         kernelName += "_I1";
     else
     {
         kernelName += "_D1";
-        if(!(clCxt->supportsFeature(Context::CL_DOUBLE)))
+        if (!(clCxt->supportsFeature(Context::CL_DOUBLE)))
         {
             oclMat temp;
             src.convertTo(temp, CV_32FC1);
@@ -1738,9 +1736,9 @@ void cv::ocl::setIdentity(oclMat& src, double scalar)
 
     int scalar_i = 0;
     float scalar_f = 0.0f;
-    if(clCxt->supportsFeature(Context::CL_DOUBLE))
+    if (clCxt->supportsFeature(Context::CL_DOUBLE))
     {
-        if(src.type() == CV_32SC1)
+        if (src.type() == CV_32SC1)
         {
             scalar_i = (int)scalar;
             args.push_back(make_pair(sizeof(cl_int), (void*)&scalar_i));
@@ -1750,7 +1748,7 @@ void cv::ocl::setIdentity(oclMat& src, double scalar)
     }
     else
     {
-        if(src.type() == CV_32SC1)
+        if (src.type() == CV_32SC1)
         {
             scalar_i = (int)scalar;
             args.push_back(make_pair(sizeof(cl_int), (void*)&scalar_i));
diff --git a/modules/ocl/src/opencl/arithm_minMaxLoc.cl b/modules/ocl/src/opencl/arithm_minMaxLoc.cl
index 94cc14d258..848aac3197 100644
--- a/modules/ocl/src/opencl/arithm_minMaxLoc.cl
+++ b/modules/ocl/src/opencl/arithm_minMaxLoc.cl
@@ -142,29 +142,35 @@
 #pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
 
 /**************************************Array minMax**************************************/
-__kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elemnum,int groupnum,
+
+__kernel void arithm_op_minMaxLoc(int cols, int invalid_cols, int offset, int elemnum, int groupnum,
                                   __global VEC_TYPE *src, __global RES_TYPE *dst)
 {
    unsigned int lid = get_local_id(0);
    unsigned int gid = get_group_id(0);
    unsigned int  id = get_global_id(0);
    unsigned int idx = offset + id + (id / cols) * invalid_cols;
-   __local VEC_TYPE localmem_max[128],localmem_min[128];
-   VEC_TYPE minval,maxval,temp;
-   __local VEC_TYPE_LOC localmem_maxloc[128],localmem_minloc[128];
-   VEC_TYPE_LOC minloc,maxloc,temploc,negative = -1;
+
+   __local VEC_TYPE localmem_max[128], localmem_min[128];
+   VEC_TYPE minval, maxval, temp;
+
+   __local VEC_TYPE_LOC localmem_maxloc[128], localmem_minloc[128];
+   VEC_TYPE_LOC minloc, maxloc, temploc, negative = -1;
+
    int idx_c;
-   if(id < elemnum)
+
+   if (id < elemnum)
    {
        temp = src[idx];
        idx_c = idx << 2;
-       temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
-       if(id % cols == 0 )
+       temploc = (VEC_TYPE_LOC)(idx_c, idx_c + 1, idx_c + 2, idx_c + 3);
+
+       if (id % cols == 0 )
        {
            repeat_s(temp);
            repeat_s(temploc);
        }
-       if(id % cols == cols - 1)
+       if (id % cols == cols - 1)
        {
            repeat_e(temp);
            repeat_e(temploc);
@@ -181,164 +187,33 @@ __kernel void arithm_op_minMaxLoc (int cols,int invalid_cols,int offset,int elem
        minloc = negative;
        maxloc = negative;
    }
-   float4 aaa;
-   for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
+
+   int grainSize = (groupnum << 8);
+   for (id = id + grainSize; id < elemnum; id = id + grainSize)
    {
        idx = offset + id + (id / cols) * invalid_cols;
        temp = src[idx];
        idx_c = idx << 2;
-       temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
-       if(id % cols == 0 )
+       temploc = (VEC_TYPE_LOC)(idx_c, idx_c+1, idx_c+2, idx_c+3);
+
+       if (id % cols == 0 )
        {
                repeat_s(temp);
                repeat_s(temploc);
        }
-       if(id % cols == cols - 1)
+       if (id % cols == cols - 1)
        {
                repeat_e(temp);
                repeat_e(temploc);
        }
-       minval = min(minval,temp);
-       maxval = max(maxval,temp);
-       minloc = CONDITION_FUNC(minval == temp, temploc , minloc);
-       maxloc = CONDITION_FUNC(maxval == temp, temploc , maxloc);
-       aaa= convert_float4(maxval == temp);
-       maxloc = convert_int4(aaa) ? temploc : maxloc;
-   }
-   if(lid > 127)
-   {
-       localmem_min[lid - 128] = minval;
-       localmem_max[lid - 128] = maxval;
-       localmem_minloc[lid - 128] = minloc;
-       localmem_maxloc[lid - 128] = maxloc;
-   }
-   barrier(CLK_LOCAL_MEM_FENCE);
-   if(lid < 128)
-   {
-       localmem_min[lid] = min(minval,localmem_min[lid]);
-       localmem_max[lid] = max(maxval,localmem_max[lid]);
-       localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc , localmem_minloc[lid]);
-       localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc , localmem_maxloc[lid]);
-   }
-   barrier(CLK_LOCAL_MEM_FENCE);
-   for(int lsize = 64; lsize > 0; lsize >>= 1)
-   {
-       if(lid < lsize)
-       {
-           int lid2 = lsize + lid;
-           localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]);
-           localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]);
-           localmem_minloc[lid] =
-                   CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2] , localmem_minloc[lid]);
-           localmem_maxloc[lid] =
-                   CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2] , localmem_maxloc[lid]);
-       }
-       barrier(CLK_LOCAL_MEM_FENCE);
-   }
-   if( lid == 0)
-   {
-       dst[gid] = CONVERT_RES_TYPE(localmem_min[0]);
-       dst[gid + groupnum] = CONVERT_RES_TYPE(localmem_max[0]);
-       dst[gid + 2 * groupnum] = CONVERT_RES_TYPE(localmem_minloc[0]);
-       dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(localmem_maxloc[0]);
-   }
-}
 
-#if defined (REPEAT_S0)
-#define repeat_ms(a) a = a;
-#endif
-#if defined (REPEAT_S1)
-#define repeat_ms(a) a.s0 = 0;
-#endif
-#if defined (REPEAT_S2)
-#define repeat_ms(a) a.s0 = 0;a.s1 = 0;
-#endif
-#if defined (REPEAT_S3)
-#define repeat_ms(a) a.s0 = 0;a.s1 = 0;a.s2 = 0;
-#endif
-
-#if defined (REPEAT_E0)
-#define repeat_me(a) a = a;
-#endif
-#if defined (REPEAT_E1)
-#define repeat_me(a) a.s3 = 0;
-#endif
-#if defined (REPEAT_E2)
-#define repeat_me(a) a.s3 = 0;a.s2 = 0;
-#endif
-#if defined (REPEAT_E3)
-#define repeat_me(a) a.s3 = 0;a.s2 = 0;a.s1 = 0;
-#endif
-
-
-/**************************************Array minMaxLoc mask**************************************/
-/*
-__kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int elemnum,int groupnum,__global VEC_TYPE *src,
-                                        int minvalid_cols,int moffset,__global uchar4 *mask,__global RES_TYPE  *dst)
-{
-   unsigned int lid = get_local_id(0);
-   unsigned int gid = get_group_id(0);
-   unsigned int  id = get_global_id(0);
-   unsigned int idx = offset + id + (id / cols) * invalid_cols;
-   unsigned int midx = moffset + id + (id / cols) * minvalid_cols;
-   __local VEC_TYPE localmem_max[128],localmem_min[128];
-   VEC_TYPE minval,maxval,temp,max_val = MAX_VAL,min_val = MIN_VAL,zero = 0,m_temp;
-   __local VEC_TYPE_LOC localmem_maxloc[128],localmem_minloc[128];
-   VEC_TYPE_LOC minloc,maxloc,temploc,negative = -1;
-   if(id < elemnum)
-   {
-       temp = src[idx];
-       m_temp = CONVERT_TYPE(mask[midx]);
-       int idx_c = idx << 2;
-       temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
-       if(id % cols == 0 )
-       {
-           repeat_ms(m_temp);
-           repeat_s(temploc);
-       }
-       if(id % cols == cols - 1)
-       {
-           repeat_me(m_temp);
-           repeat_e(temploc);
-       }
-       minval = m_temp > zero ? temp : max_val;
-       maxval = m_temp > zero ? temp : min_val;
-       minloc = CONDITION_FUNC(m_temp > zero, temploc , negative);
-       maxloc = minloc;
+       minval = min(minval, temp);
+       maxval = max(maxval, temp);
+       minloc = CONDITION_FUNC(minval == temp, temploc, minloc);
+       maxloc = CONDITION_FUNC(maxval == temp, temploc, maxloc);
    }
-   else
-   {
-       minval = MAX_VAL;
-       maxval = MIN_VAL;
-       minloc = negative;
-       maxloc = negative;
-   }
-   for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
-   {
-       idx = offset + id + (id / cols) * invalid_cols;
-       midx = moffset + id + (id / cols) * minvalid_cols;
-       temp = src[idx];
-       m_temp = CONVERT_TYPE(mask[midx]);
-       int idx_c = idx << 2;
-       temploc = (VEC_TYPE_LOC)(idx_c,idx_c+1,idx_c+2,idx_c+3);
-       if(id % cols == 0 )
-       {
-           repeat_ms(m_temp);
-           repeat_s(temploc);
-       }
-       if(id % cols == cols - 1)
-       {
-           repeat_me(m_temp);
-           repeat_e(temploc);
-       }
-       minval = min(minval,m_temp > zero ? temp : max_val);
-       maxval = max(maxval,m_temp > zero ? temp : min_val);
 
-       temploc = CONDITION_FUNC(m_temp > zero, temploc , negative);
-       minloc = CONDITION_FUNC(minval == temp, temploc , minloc);
-       maxloc = CONDITION_FUNC(maxval == temp, temploc , maxloc);
-   }
-   if(lid > 127)
+   if (lid > 127)
    {
        localmem_min[lid - 128] = minval;
        localmem_max[lid - 128] = maxval;
@@ -346,29 +221,30 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
        localmem_maxloc[lid - 128] = maxloc;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
-   if(lid < 128)
+
+   if (lid < 128)
    {
        localmem_min[lid] = min(minval,localmem_min[lid]);
        localmem_max[lid] = max(maxval,localmem_max[lid]);
-       localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc , localmem_minloc[lid]);
-       localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc , localmem_maxloc[lid]);
+       localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == minval, minloc, localmem_minloc[lid]);
+       localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == maxval, maxloc, localmem_maxloc[lid]);
    }
    barrier(CLK_LOCAL_MEM_FENCE);
-   for(int lsize = 64; lsize > 0; lsize >>= 1)
+
+   for (int lsize = 64; lsize > 0; lsize >>= 1)
    {
-       if(lid < lsize)
+       if (lid < lsize)
        {
            int lid2 = lsize + lid;
-           localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]);
-           localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]);
-           localmem_minloc[lid] =
-                   CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2] , localmem_minloc[lid]);
-           localmem_maxloc[lid] =
-                   CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2] , localmem_maxloc[lid]);
+           localmem_min[lid] = min(localmem_min[lid], localmem_min[lid2]);
+           localmem_max[lid] = max(localmem_max[lid], localmem_max[lid2]);
+           localmem_minloc[lid] = CONDITION_FUNC(localmem_min[lid] == localmem_min[lid2], localmem_minloc[lid2], localmem_minloc[lid]);
+           localmem_maxloc[lid] = CONDITION_FUNC(localmem_max[lid] == localmem_max[lid2], localmem_maxloc[lid2], localmem_maxloc[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
-   if( lid == 0)
+
+   if ( lid == 0)
    {
        dst[gid] = CONVERT_RES_TYPE(localmem_min[0]);
        dst[gid + groupnum] = CONVERT_RES_TYPE(localmem_max[0]);
@@ -376,5 +252,3 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int
        dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(localmem_maxloc[0]);
    }
 }
-
-*/