diff --git a/apps/traincascade/boost.cpp b/apps/traincascade/boost.cpp
index 2d29f338b0..4f91d5a29d 100644
--- a/apps/traincascade/boost.cpp
+++ b/apps/traincascade/boost.cpp
@@ -766,7 +766,7 @@ float CvCascadeBoostTrainData::getVarValue( int vi, int si )
 }
 
 
-struct FeatureIdxOnlyPrecalc
+struct FeatureIdxOnlyPrecalc : ParallelLoopBody
 {
     FeatureIdxOnlyPrecalc( const CvFeatureEvaluator* _featureEvaluator, CvMat* _buf, int _sample_count, bool _is_buf_16u )
     {
@@ -776,11 +776,11 @@ struct FeatureIdxOnlyPrecalc
         idst = _buf->data.i;
         is_buf_16u = _is_buf_16u;
     }
-    void operator()( const BlockedRange& range ) const
+    void operator()( const Range& range ) const
     {
         cv::AutoBuffer<float> valCache(sample_count);
         float* valCachePtr = (float*)valCache;
-        for ( int fi = range.begin(); fi < range.end(); fi++)
+        for ( int fi = range.start; fi < range.end; fi++)
         {
             for( int si = 0; si < sample_count; si++ )
             {
@@ -803,7 +803,7 @@ struct FeatureIdxOnlyPrecalc
     bool is_buf_16u;
 };
 
-struct FeatureValAndIdxPrecalc
+struct FeatureValAndIdxPrecalc : ParallelLoopBody
 {
     FeatureValAndIdxPrecalc( const CvFeatureEvaluator* _featureEvaluator, CvMat* _buf, Mat* _valCache, int _sample_count, bool _is_buf_16u )
     {
@@ -814,9 +814,9 @@ struct FeatureValAndIdxPrecalc
         idst = _buf->data.i;
         is_buf_16u = _is_buf_16u;
     }
-    void operator()( const BlockedRange& range ) const
+    void operator()( const Range& range ) const
     {
-        for ( int fi = range.begin(); fi < range.end(); fi++)
+        for ( int fi = range.start; fi < range.end; fi++)
         {
             for( int si = 0; si < sample_count; si++ )
             {
@@ -840,7 +840,7 @@ struct FeatureValAndIdxPrecalc
     bool is_buf_16u;
 };
 
-struct FeatureValOnlyPrecalc
+struct FeatureValOnlyPrecalc : ParallelLoopBody
 {
     FeatureValOnlyPrecalc( const CvFeatureEvaluator* _featureEvaluator, Mat* _valCache, int _sample_count )
     {
@@ -848,9 +848,9 @@ struct FeatureValOnlyPrecalc
         valCache = _valCache;
         sample_count = _sample_count;
     }
-    void operator()( const BlockedRange& range ) const
+    void operator()( const Range& range ) const
     {
-        for ( int fi = range.begin(); fi < range.end(); fi++)
+        for ( int fi = range.start; fi < range.end; fi++)
             for( int si = 0; si < sample_count; si++ )
                 valCache->at<float>(fi,si) = (*featureEvaluator)( fi, si );
     }
@@ -864,12 +864,12 @@ void CvCascadeBoostTrainData::precalculate()
     int minNum = MIN( numPrecalcVal, numPrecalcIdx);
 
     double proctime = -TIME( 0 );
-    parallel_for( BlockedRange(numPrecalcVal, numPrecalcIdx),
-                  FeatureIdxOnlyPrecalc(featureEvaluator, buf, sample_count, is_buf_16u!=0) );
-    parallel_for( BlockedRange(0, minNum),
-                  FeatureValAndIdxPrecalc(featureEvaluator, buf, &valCache, sample_count, is_buf_16u!=0) );
-    parallel_for( BlockedRange(minNum, numPrecalcVal),
-                  FeatureValOnlyPrecalc(featureEvaluator, &valCache, sample_count) );
+    parallel_for_( Range(numPrecalcVal, numPrecalcIdx),
+                   FeatureIdxOnlyPrecalc(featureEvaluator, buf, sample_count, is_buf_16u!=0) );
+    parallel_for_( Range(0, minNum),
+                   FeatureValAndIdxPrecalc(featureEvaluator, buf, &valCache, sample_count, is_buf_16u!=0) );
+    parallel_for_( Range(minNum, numPrecalcVal),
+                   FeatureValOnlyPrecalc(featureEvaluator, &valCache, sample_count) );
     cout << "Precalculation time: " << (proctime + TIME( 0 )) << endl;
 }
 
diff --git a/modules/calib3d/src/solvepnp.cpp b/modules/calib3d/src/solvepnp.cpp
index 25988be48a..3d2c0c2c47 100644
--- a/modules/calib3d/src/solvepnp.cpp
+++ b/modules/calib3d/src/solvepnp.cpp
@@ -115,31 +115,6 @@ namespace cv
             transform(points, modif_points, transformation);
         }
 
-        class Mutex
-        {
-        public:
-            Mutex() {
-            }
-            void lock()
-            {
-#ifdef HAVE_TBB
-                resultsMutex.lock();
-#endif
-            }
-
-            void unlock()
-            {
-#ifdef HAVE_TBB
-                resultsMutex.unlock();
-#endif
-            }
-
-        private:
-#ifdef HAVE_TBB
-            tbb::mutex resultsMutex;
-#endif
-        };
-
         struct CameraParameters
         {
             void init(Mat _intrinsics, Mat _distCoeffs)
diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp
index 32514276b5..623883df74 100644
--- a/modules/calib3d/src/stereobm.cpp
+++ b/modules/calib3d/src/stereobm.cpp
@@ -699,7 +699,7 @@ struct PrefilterInvoker
 };
 
 
-struct FindStereoCorrespInvoker
+struct FindStereoCorrespInvoker : ParallelLoopBody
 {
     FindStereoCorrespInvoker( const Mat& _left, const Mat& _right,
                               Mat& _disp, CvStereoBMState* _state,
@@ -713,12 +713,12 @@ struct FindStereoCorrespInvoker
         validDisparityRect = _validDisparityRect;
     }
 
-    void operator()( const BlockedRange& range ) const
+    void operator()( const Range& range ) const
     {
         int cols = left->cols, rows = left->rows;
-        int _row0 = min(cvRound(range.begin() * rows / nstripes), rows);
-        int _row1 = min(cvRound(range.end() * rows / nstripes), rows);
-        uchar *ptr = state->slidingSumBuf->data.ptr + range.begin() * stripeBufSize;
+        int _row0 = min(cvRound(range.start * rows / nstripes), rows);
+        int _row1 = min(cvRound(range.end * rows / nstripes), rows);
+        uchar *ptr = state->slidingSumBuf->data.ptr + range.start * stripeBufSize;
         int FILTERED = (state->minDisparity - 1)*16;
 
         Rect roi = validDisparityRect & Rect(0, _row0, cols, _row1 - _row0);
@@ -871,14 +871,10 @@ static void findStereoCorrespondenceBM( const Mat& left0, const Mat& right0, Mat
     const bool useShorts = false;
 #endif
 
-#ifdef HAVE_TBB
     const double SAD_overhead_coeff = 10.0;
     double N0 = 8000000 / (useShorts ? 1 : 4);  // approx tbb's min number instructions reasonable for one thread
     double maxStripeSize = min(max(N0 / (width * ndisp), (wsz-1) * SAD_overhead_coeff), (double)height);
     int nstripes = cvCeil(height / maxStripeSize);
-#else
-    const int nstripes = 1;
-#endif
 
     int bufSize = max(bufSize0 * nstripes, max(bufSize1 * 2, bufSize2));
 
@@ -898,9 +894,9 @@ static void findStereoCorrespondenceBM( const Mat& left0, const Mat& right0, Mat
                                               state->minDisparity, state->numberOfDisparities,
                                               state->SADWindowSize);
 
-    parallel_for(BlockedRange(0, nstripes),
-                 FindStereoCorrespInvoker(left, right, disp, state, nstripes,
-                                          bufSize0, useShorts, validDisparityRect));
+    parallel_for_(Range(0, nstripes),
+                  FindStereoCorrespInvoker(left, right, disp, state, nstripes,
+                                           bufSize0, useShorts, validDisparityRect));
 
     if( state->speckleRange >= 0 && state->speckleWindowSize > 0 )
     {
diff --git a/modules/features2d/src/detectors.cpp b/modules/features2d/src/detectors.cpp
index 2efd5a652a..a1e389a435 100644
--- a/modules/features2d/src/detectors.cpp
+++ b/modules/features2d/src/detectors.cpp
@@ -214,7 +214,7 @@ static void keepStrongest( int N, vector<KeyPoint>& keypoints )
 }
 
 namespace {
-class GridAdaptedFeatureDetectorInvoker
+class GridAdaptedFeatureDetectorInvoker : public ParallelLoopBody
 {
 private:
     int gridRows_, gridCols_;
@@ -223,29 +223,24 @@ private:
     const Mat& image_;
     const Mat& mask_;
     const Ptr<FeatureDetector>& detector_;
-#ifdef HAVE_TBB
-    tbb::mutex* kptLock_;
-#endif
+    Mutex* kptLock_;
 
     GridAdaptedFeatureDetectorInvoker& operator=(const GridAdaptedFeatureDetectorInvoker&); // to quiet MSVC
 
 public:
 
-    GridAdaptedFeatureDetectorInvoker(const Ptr<FeatureDetector>& detector, const Mat& image, const Mat& mask, vector<KeyPoint>& keypoints, int maxPerCell, int gridRows, int gridCols
-#ifdef HAVE_TBB
-        , tbb::mutex* kptLock
-#endif
-        ) : gridRows_(gridRows), gridCols_(gridCols), maxPerCell_(maxPerCell),
-            keypoints_(keypoints), image_(image), mask_(mask), detector_(detector)
-#ifdef HAVE_TBB
-            , kptLock_(kptLock)
-#endif
+    GridAdaptedFeatureDetectorInvoker(const Ptr<FeatureDetector>& detector, const Mat& image, const Mat& mask,
+                                      vector<KeyPoint>& keypoints, int maxPerCell, int gridRows, int gridCols,
+                                      cv::Mutex* kptLock)
+        : gridRows_(gridRows), gridCols_(gridCols), maxPerCell_(maxPerCell),
+          keypoints_(keypoints), image_(image), mask_(mask), detector_(detector),
+          kptLock_(kptLock)
     {
     }
 
-    void operator() (const BlockedRange& range) const
+    void operator() (const Range& range) const
     {
-        for (int i = range.begin(); i < range.end(); ++i)
+        for (int i = range.start; i < range.end; ++i)
         {
             int celly = i / gridCols_;
             int cellx = i - celly * gridCols_;
@@ -270,9 +265,8 @@ public:
                 it->pt.x += col_range.start;
                 it->pt.y += row_range.start;
             }
-#ifdef HAVE_TBB
-            tbb::mutex::scoped_lock join_keypoints(*kptLock_);
-#endif
+
+            cv::AutoLock join_keypoints(*kptLock_);
             keypoints_.insert( keypoints_.end(), sub_keypoints.begin(), sub_keypoints.end() );
         }
     }
@@ -289,13 +283,9 @@ void GridAdaptedFeatureDetector::detectImpl( const Mat& image, vector<KeyPoint>&
     keypoints.reserve(maxTotalKeypoints);
     int maxPerCell = maxTotalKeypoints / (gridRows * gridCols);
 
-#ifdef HAVE_TBB
-    tbb::mutex kptLock;
-    cv::parallel_for(cv::BlockedRange(0, gridRows * gridCols),
+    cv::Mutex kptLock;
+    cv::parallel_for_(cv::Range(0, gridRows * gridCols),
         GridAdaptedFeatureDetectorInvoker(detector, image, mask, keypoints, maxPerCell, gridRows, gridCols, &kptLock));
-#else
-    GridAdaptedFeatureDetectorInvoker(detector, image, mask, keypoints, maxPerCell, gridRows, gridCols)(cv::BlockedRange(0, gridRows * gridCols));
-#endif
 }
 
 /*
diff --git a/modules/gpu/src/calib3d.cpp b/modules/gpu/src/calib3d.cpp
index e83213f90f..b84f09d0ab 100644
--- a/modules/gpu/src/calib3d.cpp
+++ b/modules/gpu/src/calib3d.cpp
@@ -151,7 +151,7 @@ namespace
     }
 
     // Computes rotation, translation pair for small subsets if the input data
-    class TransformHypothesesGenerator
+    class TransformHypothesesGenerator : public ParallelLoopBody
     {
     public:
         TransformHypothesesGenerator(const Mat& object_, const Mat& image_, const Mat& dist_coef_,
@@ -161,7 +161,7 @@ namespace
                   num_points(num_points_), subset_size(subset_size_), rot_matrices(rot_matrices_),
                   transl_vectors(transl_vectors_) {}
 
-        void operator()(const BlockedRange& range) const
+        void operator()(const Range& range) const
         {
             // Input data for generation of the current hypothesis
             vector<int> subset_indices(subset_size);
@@ -173,7 +173,7 @@ namespace
             Mat rot_mat(3, 3, CV_64F);
             Mat transl_vec(1, 3, CV_64F);
 
-            for (int iter = range.begin(); iter < range.end(); ++iter)
+            for (int iter = range.start; iter < range.end; ++iter)
             {
                 selectRandom(subset_size, num_points, subset_indices);
                 for (int i = 0; i < subset_size; ++i)
@@ -239,7 +239,7 @@ void cv::gpu::solvePnPRansac(const Mat& object, const Mat& image, const Mat& cam
     // Generate set of hypotheses using small subsets of the input data
     TransformHypothesesGenerator body(object, image_normalized, empty_dist_coef, eye_camera_mat,
                                       num_points, subset_size, rot_matrices, transl_vectors);
-    parallel_for(BlockedRange(0, num_iters), body);
+    parallel_for_(Range(0, num_iters), body);
 
     // Compute scores (i.e. number of inliers) for each hypothesis
     GpuMat d_object(object);
diff --git a/modules/imgproc/src/clahe.cpp b/modules/imgproc/src/clahe.cpp
new file mode 100644
index 0000000000..4ce479713e
--- /dev/null
+++ b/modules/imgproc/src/clahe.cpp
@@ -0,0 +1,334 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2013, NVIDIA Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the copyright holders or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "precomp.hpp"
+
+// ----------------------------------------------------------------------
+// CLAHE
+
+namespace
+{
+    class CLAHE_CalcLut_Body : public cv::ParallelLoopBody
+    {
+    public:
+        CLAHE_CalcLut_Body(const cv::Mat& src, cv::Mat& lut, cv::Size tileSize, int tilesX, int tilesY, int clipLimit, float lutScale) :
+            src_(src), lut_(lut), tileSize_(tileSize), tilesX_(tilesX), tilesY_(tilesY), clipLimit_(clipLimit), lutScale_(lutScale)
+        {
+        }
+
+        void operator ()(const cv::Range& range) const;
+
+    private:
+        cv::Mat src_;
+        mutable cv::Mat lut_;
+
+        cv::Size tileSize_;
+        int tilesX_;
+        int tilesY_;
+        int clipLimit_;
+        float lutScale_;
+    };
+
+    void CLAHE_CalcLut_Body::operator ()(const cv::Range& range) const
+    {
+        const int histSize = 256;
+
+        uchar* tileLut = lut_.ptr(range.start);
+        const size_t lut_step = lut_.step;
+
+        for (int k = range.start; k < range.end; ++k, tileLut += lut_step)
+        {
+            const int ty = k / tilesX_;
+            const int tx = k % tilesX_;
+
+            // retrieve tile submatrix
+
+            cv::Rect tileROI;
+            tileROI.x = tx * tileSize_.width;
+            tileROI.y = ty * tileSize_.height;
+            tileROI.width = tileSize_.width;
+            tileROI.height = tileSize_.height;
+
+            const cv::Mat tile = src_(tileROI);
+
+            // calc histogram
+
+            int tileHist[histSize] = {0, };
+
+            int height = tileROI.height;
+            const size_t sstep = tile.step;
+            for (const uchar* ptr = tile.ptr<uchar>(0); height--; ptr += sstep)
+            {
+                int x = 0;
+                for (; x <= tileROI.width - 4; x += 4)
+                {
+                    int t0 = ptr[x], t1 = ptr[x+1];
+                    tileHist[t0]++; tileHist[t1]++;
+                    t0 = ptr[x+2]; t1 = ptr[x+3];
+                    tileHist[t0]++; tileHist[t1]++;
+                }
+
+                for (; x < tileROI.width; ++x)
+                    tileHist[ptr[x]]++;
+            }
+
+            // clip histogram
+
+            if (clipLimit_ > 0)
+            {
+                // how many pixels were clipped
+                int clipped = 0;
+                for (int i = 0; i < histSize; ++i)
+                {
+                    if (tileHist[i] > clipLimit_)
+                    {
+                        clipped += tileHist[i] - clipLimit_;
+                        tileHist[i] = clipLimit_;
+                    }
+                }
+
+                // redistribute clipped pixels
+                int redistBatch = clipped / histSize;
+                int residual = clipped - redistBatch * histSize;
+
+                for (int i = 0; i < histSize; ++i)
+                    tileHist[i] += redistBatch;
+
+                for (int i = 0; i < residual; ++i)
+                    tileHist[i]++;
+            }
+
+            // calc Lut
+
+            int sum = 0;
+            for (int i = 0; i < histSize; ++i)
+            {
+                sum += tileHist[i];
+                tileLut[i] = cv::saturate_cast<uchar>(sum * lutScale_);
+            }
+        }
+    }
+
+    class CLAHE_Interpolation_Body : public cv::ParallelLoopBody
+    {
+    public:
+        CLAHE_Interpolation_Body(const cv::Mat& src, cv::Mat& dst, const cv::Mat& lut, cv::Size tileSize, int tilesX, int tilesY) :
+            src_(src), dst_(dst), lut_(lut), tileSize_(tileSize), tilesX_(tilesX), tilesY_(tilesY)
+        {
+        }
+
+        void operator ()(const cv::Range& range) const;
+
+    private:
+        cv::Mat src_;
+        mutable cv::Mat dst_;
+        cv::Mat lut_;
+
+        cv::Size tileSize_;
+        int tilesX_;
+        int tilesY_;
+    };
+
+    void CLAHE_Interpolation_Body::operator ()(const cv::Range& range) const
+    {
+        const size_t lut_step = lut_.step;
+
+        for (int y = range.start; y < range.end; ++y)
+        {
+            const uchar* srcRow = src_.ptr<uchar>(y);
+            uchar* dstRow = dst_.ptr<uchar>(y);
+
+            const float tyf = (static_cast<float>(y) / tileSize_.height) - 0.5f;
+
+            int ty1 = cvFloor(tyf);
+            int ty2 = ty1 + 1;
+
+            const float ya = tyf - ty1;
+
+            ty1 = std::max(ty1, 0);
+            ty2 = std::min(ty2, tilesY_ - 1);
+
+            const uchar* lutPlane1 = lut_.ptr(ty1 * tilesX_);
+            const uchar* lutPlane2 = lut_.ptr(ty2 * tilesX_);
+
+            for (int x = 0; x < src_.cols; ++x)
+            {
+                const float txf = (static_cast<float>(x) / tileSize_.width) - 0.5f;
+
+                int tx1 = cvFloor(txf);
+                int tx2 = tx1 + 1;
+
+                const float xa = txf - tx1;
+
+                tx1 = std::max(tx1, 0);
+                tx2 = std::min(tx2, tilesX_ - 1);
+
+                const int srcVal = srcRow[x];
+
+                const size_t ind1 = tx1 * lut_step + srcVal;
+                const size_t ind2 = tx2 * lut_step + srcVal;
+
+                float res = 0;
+
+                res += lutPlane1[ind1] * ((1.0f - xa) * (1.0f - ya));
+                res += lutPlane1[ind2] * ((xa) * (1.0f - ya));
+                res += lutPlane2[ind1] * ((1.0f - xa) * (ya));
+                res += lutPlane2[ind2] * ((xa) * (ya));
+
+                dstRow[x] = cv::saturate_cast<uchar>(res);
+            }
+        }
+    }
+
+    class CLAHE_Impl : public cv::CLAHE
+    {
+    public:
+        CLAHE_Impl(double clipLimit = 40.0, int tilesX = 8, int tilesY = 8);
+
+        cv::AlgorithmInfo* info() const;
+
+        void apply(cv::InputArray src, cv::OutputArray dst);
+
+        void setClipLimit(double clipLimit);
+        double getClipLimit() const;
+
+        void setTilesGridSize(cv::Size tileGridSize);
+        cv::Size getTilesGridSize() const;
+
+        void collectGarbage();
+
+    private:
+        double clipLimit_;
+        int tilesX_;
+        int tilesY_;
+
+        cv::Mat srcExt_;
+        cv::Mat lut_;
+    };
+
+    CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
+        clipLimit_(clipLimit), tilesX_(tilesX), tilesY_(tilesY)
+    {
+    }
+
+    CV_INIT_ALGORITHM(CLAHE_Impl, "CLAHE",
+        obj.info()->addParam(obj, "clipLimit", obj.clipLimit_);
+        obj.info()->addParam(obj, "tilesX", obj.tilesX_);
+        obj.info()->addParam(obj, "tilesY", obj.tilesY_))
+
+    void CLAHE_Impl::apply(cv::InputArray _src, cv::OutputArray _dst)
+    {
+        cv::Mat src = _src.getMat();
+
+        CV_Assert( src.type() == CV_8UC1 );
+
+        _dst.create( src.size(), src.type() );
+        cv::Mat dst = _dst.getMat();
+
+        const int histSize = 256;
+
+        lut_.create(tilesX_ * tilesY_, histSize, CV_8UC1);
+
+        cv::Size tileSize;
+        cv::Mat srcForLut;
+
+        if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
+        {
+            tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
+            srcForLut = src;
+        }
+        else
+        {
+            cv::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101);
+
+            tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
+            srcForLut = srcExt_;
+        }
+
+        const int tileSizeTotal = tileSize.area();
+        const float lutScale = static_cast<float>(histSize - 1) / tileSizeTotal;
+
+        int clipLimit = 0;
+        if (clipLimit_ > 0.0)
+        {
+            clipLimit = static_cast<int>(clipLimit_ * tileSizeTotal / histSize);
+            clipLimit = std::max(clipLimit, 1);
+        }
+
+        CLAHE_CalcLut_Body calcLutBody(srcForLut, lut_, tileSize, tilesX_, tilesY_, clipLimit, lutScale);
+        cv::parallel_for_(cv::Range(0, tilesX_ * tilesY_), calcLutBody);
+
+        CLAHE_Interpolation_Body interpolationBody(src, dst, lut_, tileSize, tilesX_, tilesY_);
+        cv::parallel_for_(cv::Range(0, src.rows), interpolationBody);
+    }
+
+    void CLAHE_Impl::setClipLimit(double clipLimit)
+    {
+        clipLimit_ = clipLimit;
+    }
+
+    double CLAHE_Impl::getClipLimit() const
+    {
+        return clipLimit_;
+    }
+
+    void CLAHE_Impl::setTilesGridSize(cv::Size tileGridSize)
+    {
+        tilesX_ = tileGridSize.width;
+        tilesY_ = tileGridSize.height;
+    }
+
+    cv::Size CLAHE_Impl::getTilesGridSize() const
+    {
+        return cv::Size(tilesX_, tilesY_);
+    }
+
+    void CLAHE_Impl::collectGarbage()
+    {
+        srcExt_.release();
+        lut_.release();
+    }
+}
+
+cv::Ptr<cv::CLAHE> cv::createCLAHE(double clipLimit, cv::Size tileGridSize)
+{
+    return new CLAHE_Impl(clipLimit, tileGridSize.width, tileGridSize.height);
+}
diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp
index 3799d435e3..41ca2db9c0 100644
--- a/modules/imgproc/src/color.cpp
+++ b/modules/imgproc/src/color.cpp
@@ -2755,7 +2755,7 @@ const int ITUR_BT_601_CGV = -385875;
 const int ITUR_BT_601_CBV = -74448;
 
 template<int bIdx, int uIdx>
-struct YUV420sp2RGB888Invoker
+struct YUV420sp2RGB888Invoker : ParallelLoopBody
 {
     Mat* dst;
     const uchar* my1, *muv;
@@ -2764,10 +2764,10 @@ struct YUV420sp2RGB888Invoker
     YUV420sp2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
         : dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}
 
-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
     {
-        int rangeBegin = range.begin() * 2;
-        int rangeEnd = range.end() * 2;
+        int rangeBegin = range.start * 2;
+        int rangeEnd = range.end * 2;
 
         //R = 1.164(Y - 16) + 1.596(V - 128)
         //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
@@ -2824,7 +2824,7 @@ struct YUV420sp2RGB888Invoker
 };
 
 template<int bIdx, int uIdx>
-struct YUV420sp2RGBA8888Invoker
+struct YUV420sp2RGBA8888Invoker : ParallelLoopBody
 {
     Mat* dst;
     const uchar* my1, *muv;
@@ -2833,10 +2833,10 @@ struct YUV420sp2RGBA8888Invoker
     YUV420sp2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
         : dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}
 
-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
     {
-        int rangeBegin = range.begin() * 2;
-        int rangeEnd = range.end() * 2;
+        int rangeBegin = range.start * 2;
+        int rangeEnd = range.end * 2;
 
         //R = 1.164(Y - 16) + 1.596(V - 128)
         //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
@@ -2897,7 +2897,7 @@ struct YUV420sp2RGBA8888Invoker
 };
 
 template<int bIdx>
-struct YUV420p2RGB888Invoker
+struct YUV420p2RGB888Invoker : ParallelLoopBody
 {
     Mat* dst;
     const uchar* my1, *mu, *mv;
@@ -2907,19 +2907,19 @@ struct YUV420p2RGB888Invoker
     YUV420p2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
         : dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
 
-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
     {
-        const int rangeBegin = range.begin() * 2;
-        const int rangeEnd = range.end() * 2;
+        const int rangeBegin = range.start * 2;
+        const int rangeEnd = range.end * 2;
 
         size_t uvsteps[2] = {width/2, stride - width/2};
         int usIdx = ustepIdx, vsIdx = vstepIdx;
 
         const uchar* y1 = my1 + rangeBegin * stride;
-        const uchar* u1 = mu + (range.begin() / 2) * stride;
-        const uchar* v1 = mv + (range.begin() / 2) * stride;
+        const uchar* u1 = mu + (range.start / 2) * stride;
+        const uchar* v1 = mv + (range.start / 2) * stride;
 
-        if(range.begin() % 2 == 1)
+        if(range.start % 2 == 1)
         {
             u1 += uvsteps[(usIdx++) & 1];
             v1 += uvsteps[(vsIdx++) & 1];
@@ -2965,7 +2965,7 @@ struct YUV420p2RGB888Invoker
 };
 
 template<int bIdx>
-struct YUV420p2RGBA8888Invoker
+struct YUV420p2RGBA8888Invoker : ParallelLoopBody
 {
     Mat* dst;
     const uchar* my1, *mu, *mv;
@@ -2975,19 +2975,19 @@ struct YUV420p2RGBA8888Invoker
     YUV420p2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
         : dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
 
-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
     {
-        int rangeBegin = range.begin() * 2;
-        int rangeEnd = range.end() * 2;
+        int rangeBegin = range.start * 2;
+        int rangeEnd = range.end * 2;
 
         size_t uvsteps[2] = {width/2, stride - width/2};
         int usIdx = ustepIdx, vsIdx = vstepIdx;
 
         const uchar* y1 = my1 + rangeBegin * stride;
-        const uchar* u1 = mu + (range.begin() / 2) * stride;
-        const uchar* v1 = mv + (range.begin() / 2) * stride;
+        const uchar* u1 = mu + (range.start / 2) * stride;
+        const uchar* v1 = mv + (range.start / 2) * stride;
 
-        if(range.begin() % 2 == 1)
+        if(range.start % 2 == 1)
         {
             u1 += uvsteps[(usIdx++) & 1];
             v1 += uvsteps[(vsIdx++) & 1];
@@ -3042,48 +3042,40 @@ template<int bIdx, int uIdx>
 inline void cvtYUV420sp2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
 {
     YUV420sp2RGB888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1,  _uv);
-#ifdef HAVE_TBB
     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
-        parallel_for(BlockedRange(0, _dst.rows/2), converter);
+        parallel_for_(Range(0, _dst.rows/2), converter);
     else
-#endif
-        converter(BlockedRange(0, _dst.rows/2));
+        converter(Range(0, _dst.rows/2));
 }
 
 template<int bIdx, int uIdx>
 inline void cvtYUV420sp2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
 {
     YUV420sp2RGBA8888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1,  _uv);
-#ifdef HAVE_TBB
     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
-        parallel_for(BlockedRange(0, _dst.rows/2), converter);
+        parallel_for_(Range(0, _dst.rows/2), converter);
     else
-#endif
-        converter(BlockedRange(0, _dst.rows/2));
+        converter(Range(0, _dst.rows/2));
 }
 
 template<int bIdx>
 inline void cvtYUV420p2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
 {
     YUV420p2RGB888Invoker<bIdx> converter(&_dst, _stride, _y1,  _u, _v, ustepIdx, vstepIdx);
-#ifdef HAVE_TBB
     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
-        parallel_for(BlockedRange(0, _dst.rows/2), converter);
+        parallel_for_(Range(0, _dst.rows/2), converter);
     else
-#endif
-        converter(BlockedRange(0, _dst.rows/2));
+        converter(Range(0, _dst.rows/2));
 }
 
 template<int bIdx>
 inline void cvtYUV420p2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
 {
     YUV420p2RGBA8888Invoker<bIdx> converter(&_dst, _stride, _y1,  _u, _v, ustepIdx, vstepIdx);
-#ifdef HAVE_TBB
     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
-        parallel_for(BlockedRange(0, _dst.rows/2), converter);
+        parallel_for_(Range(0, _dst.rows/2), converter);
     else
-#endif
-        converter(BlockedRange(0, _dst.rows/2));
+        converter(Range(0, _dst.rows/2));
 }
 
 ///////////////////////////////////// RGB -> YUV420p /////////////////////////////////////
@@ -3167,7 +3159,7 @@ static void cvtRGBtoYUV420p(const Mat& src, Mat& dst)
 ///////////////////////////////////// YUV422 -> RGB /////////////////////////////////////
 
 template<int bIdx, int uIdx, int yIdx>
-struct YUV422toRGB888Invoker
+struct YUV422toRGB888Invoker : ParallelLoopBody
 {
     Mat* dst;
     const uchar* src;
@@ -3176,10 +3168,10 @@ struct YUV422toRGB888Invoker
     YUV422toRGB888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
         : dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}
 
-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
     {
-        int rangeBegin = range.begin();
-        int rangeEnd = range.end();
+        int rangeBegin = range.start;
+        int rangeEnd = range.end;
 
         const int uidx = 1 - yIdx + uIdx * 2;
         const int vidx = (2 + uidx) % 4;
@@ -3213,7 +3205,7 @@ struct YUV422toRGB888Invoker
 };
 
 template<int bIdx, int uIdx, int yIdx>
-struct YUV422toRGBA8888Invoker
+struct YUV422toRGBA8888Invoker : ParallelLoopBody
 {
     Mat* dst;
     const uchar* src;
@@ -3222,10 +3214,10 @@ struct YUV422toRGBA8888Invoker
     YUV422toRGBA8888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
         : dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}
 
-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
     {
-        int rangeBegin = range.begin();
-        int rangeEnd = range.end();
+        int rangeBegin = range.start;
+        int rangeEnd = range.end;
 
         const int uidx = 1 - yIdx + uIdx * 2;
         const int vidx = (2 + uidx) % 4;
@@ -3266,24 +3258,20 @@ template<int bIdx, int uIdx, int yIdx>
 inline void cvtYUV422toRGB(Mat& _dst, int _stride, const uchar* _yuv)
 {
     YUV422toRGB888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
-#ifdef HAVE_TBB
     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
-        parallel_for(BlockedRange(0, _dst.rows), converter);
+        parallel_for_(Range(0, _dst.rows), converter);
     else
-#endif
-        converter(BlockedRange(0, _dst.rows));
+        converter(Range(0, _dst.rows));
 }
 
 template<int bIdx, int uIdx, int yIdx>
 inline void cvtYUV422toRGBA(Mat& _dst, int _stride, const uchar* _yuv)
 {
     YUV422toRGBA8888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
-#ifdef HAVE_TBB
     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
-        parallel_for(BlockedRange(0, _dst.rows), converter);
+        parallel_for_(Range(0, _dst.rows), converter);
     else
-#endif
-        converter(BlockedRange(0, _dst.rows));
+        converter(Range(0, _dst.rows));
 }
 
 /////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
diff --git a/modules/imgproc/src/distransform.cpp b/modules/imgproc/src/distransform.cpp
index 89d3a550f4..d3e6f90242 100644
--- a/modules/imgproc/src/distransform.cpp
+++ b/modules/imgproc/src/distransform.cpp
@@ -443,7 +443,7 @@ icvGetDistanceTransformMask( int maskType, float *metrics )
 namespace cv
 {
 
-struct DTColumnInvoker
+struct DTColumnInvoker : ParallelLoopBody
 {
     DTColumnInvoker( const CvMat* _src, CvMat* _dst, const int* _sat_tab, const float* _sqr_tab)
     {
@@ -453,9 +453,9 @@ struct DTColumnInvoker
         sqr_tab = _sqr_tab;
     }
 
-    void operator()( const BlockedRange& range ) const
+    void operator()( const Range& range ) const
     {
-        int i, i1 = range.begin(), i2 = range.end();
+        int i, i1 = range.start, i2 = range.end;
         int m = src->rows;
         size_t sstep = src->step, dstep = dst->step/sizeof(float);
         AutoBuffer<int> _d(m);
@@ -490,7 +490,7 @@ struct DTColumnInvoker
 };
 
 
-struct DTRowInvoker
+struct DTRowInvoker : ParallelLoopBody
 {
     DTRowInvoker( CvMat* _dst, const float* _sqr_tab, const float* _inv_tab )
     {
@@ -499,10 +499,10 @@ struct DTRowInvoker
         inv_tab = _inv_tab;
     }
 
-    void operator()( const BlockedRange& range ) const
+    void operator()( const Range& range ) const
     {
         const float inf = 1e15f;
-        int i, i1 = range.begin(), i2 = range.end();
+        int i, i1 = range.start, i2 = range.end;
         int n = dst->cols;
         AutoBuffer<uchar> _buf((n+2)*2*sizeof(float) + (n+2)*sizeof(int));
         float* f = (float*)(uchar*)_buf;
@@ -586,7 +586,7 @@ icvTrueDistTrans( const CvMat* src, CvMat* dst )
     for( ; i <= m*3; i++ )
         sat_tab[i] = i - shift;
 
-    cv::parallel_for(cv::BlockedRange(0, n), cv::DTColumnInvoker(src, dst, sat_tab, sqr_tab));
+    cv::parallel_for_(cv::Range(0, n), cv::DTColumnInvoker(src, dst, sat_tab, sqr_tab));
 
     // stage 2: compute modified distance transform for each row
     float* inv_tab = sqr_tab + n;
@@ -598,7 +598,7 @@ icvTrueDistTrans( const CvMat* src, CvMat* dst )
         sqr_tab[i] = (float)(i*i);
     }
 
-    cv::parallel_for(cv::BlockedRange(0, m), cv::DTRowInvoker(dst, sqr_tab, inv_tab));
+    cv::parallel_for_(cv::Range(0, m), cv::DTRowInvoker(dst, sqr_tab, inv_tab));
 }
 
 
diff --git a/modules/imgproc/src/histogram.cpp b/modules/imgproc/src/histogram.cpp
index 22dd9beb1f..bfcdee515f 100644
--- a/modules/imgproc/src/histogram.cpp
+++ b/modules/imgproc/src/histogram.cpp
@@ -2986,29 +2986,23 @@ cvCalcProbDensity( const CvHistogram* hist, const CvHistogram* hist_mask,
     }
 }
 
-class EqualizeHistCalcHist_Invoker
+class EqualizeHistCalcHist_Invoker : public cv::ParallelLoopBody
 {
 public:
     enum {HIST_SZ = 256};
 
-#ifdef HAVE_TBB
-    typedef tbb::mutex* MutextPtr;
-#else
-    typedef void* MutextPtr;
-#endif
-
-    EqualizeHistCalcHist_Invoker(cv::Mat& src, int* histogram, MutextPtr histogramLock)
+    EqualizeHistCalcHist_Invoker(cv::Mat& src, int* histogram, cv::Mutex* histogramLock)
         : src_(src), globalHistogram_(histogram), histogramLock_(histogramLock)
     { }
 
-    void operator()( const cv::BlockedRange& rowRange ) const
+    void operator()( const cv::Range& rowRange ) const
     {
         int localHistogram[HIST_SZ] = {0, };
 
         const size_t sstep = src_.step;
 
         int width = src_.cols;
-        int height = rowRange.end() - rowRange.begin();
+        int height = rowRange.end - rowRange.start;
 
         if (src_.isContinuous())
         {
@@ -3016,7 +3010,7 @@ public:
             height = 1;
         }
 
-        for (const uchar* ptr = src_.ptr<uchar>(rowRange.begin()); height--; ptr += sstep)
+        for (const uchar* ptr = src_.ptr<uchar>(rowRange.start); height--; ptr += sstep)
         {
             int x = 0;
             for (; x <= width - 4; x += 4)
@@ -3031,9 +3025,7 @@ public:
                 localHistogram[ptr[x]]++;
         }
 
-#ifdef HAVE_TBB
-        tbb::mutex::scoped_lock lock(*histogramLock_);
-#endif
+        cv::AutoLock lock(*histogramLock_);
 
         for( int i = 0; i < HIST_SZ; i++ )
             globalHistogram_[i] += localHistogram[i];
@@ -3041,12 +3033,7 @@ public:
 
     static bool isWorthParallel( const cv::Mat& src )
     {
-#ifdef HAVE_TBB
         return ( src.total() >= 640*480 );
-#else
-        (void)src;
-        return false;
-#endif
     }
 
 private:
@@ -3054,10 +3041,10 @@ private:
 
     cv::Mat& src_;
     int* globalHistogram_;
-    MutextPtr histogramLock_;
+    cv::Mutex* histogramLock_;
 };
 
-class EqualizeHistLut_Invoker
+class EqualizeHistLut_Invoker : public cv::ParallelLoopBody
 {
 public:
     EqualizeHistLut_Invoker( cv::Mat& src, cv::Mat& dst, int* lut )
@@ -3066,13 +3053,13 @@ public:
           lut_(lut)
     { }
 
-    void operator()( const cv::BlockedRange& rowRange ) const
+    void operator()( const cv::Range& rowRange ) const
     {
         const size_t sstep = src_.step;
         const size_t dstep = dst_.step;
 
         int width = src_.cols;
-        int height = rowRange.end() - rowRange.begin();
+        int height = rowRange.end - rowRange.start;
         int* lut = lut_;
 
         if (src_.isContinuous() && dst_.isContinuous())
@@ -3081,8 +3068,8 @@ public:
             height = 1;
         }
 
-        const uchar* sptr = src_.ptr<uchar>(rowRange.begin());
-        uchar* dptr = dst_.ptr<uchar>(rowRange.begin());
+        const uchar* sptr = src_.ptr<uchar>(rowRange.start);
+        uchar* dptr = dst_.ptr<uchar>(rowRange.start);
 
         for (; height--; sptr += sstep, dptr += dstep)
         {
@@ -3111,12 +3098,7 @@ public:
 
     static bool isWorthParallel( const cv::Mat& src )
     {
-#ifdef HAVE_TBB
         return ( src.total() >= 640*480 );
-#else
-        (void)src;
-        return false;
-#endif
     }
 
 private:
@@ -3143,23 +3125,18 @@ void cv::equalizeHist( InputArray _src, OutputArray _dst )
     if(src.empty())
         return;
 
-#ifdef HAVE_TBB
-    tbb::mutex histogramLockInstance;
-    EqualizeHistCalcHist_Invoker::MutextPtr histogramLock = &histogramLockInstance;
-#else
-    EqualizeHistCalcHist_Invoker::MutextPtr histogramLock = 0;
-#endif
+    Mutex histogramLockInstance;
 
     const int hist_sz = EqualizeHistCalcHist_Invoker::HIST_SZ;
     int hist[hist_sz] = {0,};
     int lut[hist_sz];
 
-    EqualizeHistCalcHist_Invoker calcBody(src, hist, histogramLock);
+    EqualizeHistCalcHist_Invoker calcBody(src, hist, &histogramLockInstance);
     EqualizeHistLut_Invoker      lutBody(src, dst, lut);
-    cv::BlockedRange heightRange(0, src.rows);
+    cv::Range heightRange(0, src.rows);
 
     if(EqualizeHistCalcHist_Invoker::isWorthParallel(src))
-        parallel_for(heightRange, calcBody);
+        parallel_for_(heightRange, calcBody);
     else
         calcBody(heightRange);
 
@@ -3183,303 +3160,11 @@ void cv::equalizeHist( InputArray _src, OutputArray _dst )
     }
 
     if(EqualizeHistLut_Invoker::isWorthParallel(src))
-        parallel_for(heightRange, lutBody);
+        parallel_for_(heightRange, lutBody);
     else
         lutBody(heightRange);
 }
 
-// ----------------------------------------------------------------------
-// CLAHE
-
-namespace
-{
-    class CLAHE_CalcLut_Body : public cv::ParallelLoopBody
-    {
-    public:
-        CLAHE_CalcLut_Body(const cv::Mat& src, cv::Mat& lut, cv::Size tileSize, int tilesX, int tilesY, int clipLimit, float lutScale) :
-            src_(src), lut_(lut), tileSize_(tileSize), tilesX_(tilesX), tilesY_(tilesY), clipLimit_(clipLimit), lutScale_(lutScale)
-        {
-        }
-
-        void operator ()(const cv::Range& range) const;
-
-    private:
-        cv::Mat src_;
-        mutable cv::Mat lut_;
-
-        cv::Size tileSize_;
-        int tilesX_;
-        int tilesY_;
-        int clipLimit_;
-        float lutScale_;
-    };
-
-    void CLAHE_CalcLut_Body::operator ()(const cv::Range& range) const
-    {
-        const int histSize = 256;
-
-        uchar* tileLut = lut_.ptr(range.start);
-        const size_t lut_step = lut_.step;
-
-        for (int k = range.start; k < range.end; ++k, tileLut += lut_step)
-        {
-            const int ty = k / tilesX_;
-            const int tx = k % tilesX_;
-
-            // retrieve tile submatrix
-
-            cv::Rect tileROI;
-            tileROI.x = tx * tileSize_.width;
-            tileROI.y = ty * tileSize_.height;
-            tileROI.width = tileSize_.width;
-            tileROI.height = tileSize_.height;
-
-            const cv::Mat tile = src_(tileROI);
-
-            // calc histogram
-
-            int tileHist[histSize] = {0, };
-
-            int height = tileROI.height;
-            const size_t sstep = tile.step;
-            for (const uchar* ptr = tile.ptr<uchar>(0); height--; ptr += sstep)
-            {
-                int x = 0;
-                for (; x <= tileROI.width - 4; x += 4)
-                {
-                    int t0 = ptr[x], t1 = ptr[x+1];
-                    tileHist[t0]++; tileHist[t1]++;
-                    t0 = ptr[x+2]; t1 = ptr[x+3];
-                    tileHist[t0]++; tileHist[t1]++;
-                }
-
-                for (; x < tileROI.width; ++x)
-                    tileHist[ptr[x]]++;
-            }
-
-            // clip histogram
-
-            if (clipLimit_ > 0)
-            {
-                // how many pixels were clipped
-                int clipped = 0;
-                for (int i = 0; i < histSize; ++i)
-                {
-                    if (tileHist[i] > clipLimit_)
-                    {
-                        clipped += tileHist[i] - clipLimit_;
-                        tileHist[i] = clipLimit_;
-                    }
-                }
-
-                // redistribute clipped pixels
-                int redistBatch = clipped / histSize;
-                int residual = clipped - redistBatch * histSize;
-
-                for (int i = 0; i < histSize; ++i)
-                    tileHist[i] += redistBatch;
-
-                for (int i = 0; i < residual; ++i)
-                    tileHist[i]++;
-            }
-
-            // calc Lut
-
-            int sum = 0;
-            for (int i = 0; i < histSize; ++i)
-            {
-                sum += tileHist[i];
-                tileLut[i] = cv::saturate_cast<uchar>(sum * lutScale_);
-            }
-        }
-    }
-
-    class CLAHE_Interpolation_Body : public cv::ParallelLoopBody
-    {
-    public:
-        CLAHE_Interpolation_Body(const cv::Mat& src, cv::Mat& dst, const cv::Mat& lut, cv::Size tileSize, int tilesX, int tilesY) :
-            src_(src), dst_(dst), lut_(lut), tileSize_(tileSize), tilesX_(tilesX), tilesY_(tilesY)
-        {
-        }
-
-        void operator ()(const cv::Range& range) const;
-
-    private:
-        cv::Mat src_;
-        mutable cv::Mat dst_;
-        cv::Mat lut_;
-
-        cv::Size tileSize_;
-        int tilesX_;
-        int tilesY_;
-    };
-
-    void CLAHE_Interpolation_Body::operator ()(const cv::Range& range) const
-    {
-        const size_t lut_step = lut_.step;
-
-        for (int y = range.start; y < range.end; ++y)
-        {
-            const uchar* srcRow = src_.ptr<uchar>(y);
-            uchar* dstRow = dst_.ptr<uchar>(y);
-
-            const float tyf = (static_cast<float>(y) / tileSize_.height) - 0.5f;
-
-            int ty1 = cvFloor(tyf);
-            int ty2 = ty1 + 1;
-
-            const float ya = tyf - ty1;
-
-            ty1 = std::max(ty1, 0);
-            ty2 = std::min(ty2, tilesY_ - 1);
-
-            const uchar* lutPlane1 = lut_.ptr(ty1 * tilesX_);
-            const uchar* lutPlane2 = lut_.ptr(ty2 * tilesX_);
-
-            for (int x = 0; x < src_.cols; ++x)
-            {
-                const float txf = (static_cast<float>(x) / tileSize_.width) - 0.5f;
-
-                int tx1 = cvFloor(txf);
-                int tx2 = tx1 + 1;
-
-                const float xa = txf - tx1;
-
-                tx1 = std::max(tx1, 0);
-                tx2 = std::min(tx2, tilesX_ - 1);
-
-                const int srcVal = srcRow[x];
-
-                const size_t ind1 = tx1 * lut_step + srcVal;
-                const size_t ind2 = tx2 * lut_step + srcVal;
-
-                float res = 0;
-
-                res += lutPlane1[ind1] * ((1.0f - xa) * (1.0f - ya));
-                res += lutPlane1[ind2] * ((xa) * (1.0f - ya));
-                res += lutPlane2[ind1] * ((1.0f - xa) * (ya));
-                res += lutPlane2[ind2] * ((xa) * (ya));
-
-                dstRow[x] = cv::saturate_cast<uchar>(res);
-            }
-        }
-    }
-
-    class CLAHE_Impl : public cv::CLAHE
-    {
-    public:
-        CLAHE_Impl(double clipLimit = 40.0, int tilesX = 8, int tilesY = 8);
-
-        cv::AlgorithmInfo* info() const;
-
-        void apply(cv::InputArray src, cv::OutputArray dst);
-
-        void setClipLimit(double clipLimit);
-        double getClipLimit() const;
-
-        void setTilesGridSize(cv::Size tileGridSize);
-        cv::Size getTilesGridSize() const;
-
-        void collectGarbage();
-
-    private:
-        double clipLimit_;
-        int tilesX_;
-        int tilesY_;
-
-        cv::Mat srcExt_;
-        cv::Mat lut_;
-    };
-
-    CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
-        clipLimit_(clipLimit), tilesX_(tilesX), tilesY_(tilesY)
-    {
-    }
-
-    CV_INIT_ALGORITHM(CLAHE_Impl, "CLAHE",
-        obj.info()->addParam(obj, "clipLimit", obj.clipLimit_);
-        obj.info()->addParam(obj, "tilesX", obj.tilesX_);
-        obj.info()->addParam(obj, "tilesY", obj.tilesY_))
-
-    void CLAHE_Impl::apply(cv::InputArray _src, cv::OutputArray _dst)
-    {
-        cv::Mat src = _src.getMat();
-
-        CV_Assert( src.type() == CV_8UC1 );
-
-        _dst.create( src.size(), src.type() );
-        cv::Mat dst = _dst.getMat();
-
-        const int histSize = 256;
-
-        lut_.create(tilesX_ * tilesY_, histSize, CV_8UC1);
-
-        cv::Size tileSize;
-        cv::Mat srcForLut;
-
-        if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
-        {
-            tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
-            srcForLut = src;
-        }
-        else
-        {
-            cv::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101);
-
-            tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
-            srcForLut = srcExt_;
-        }
-
-        const int tileSizeTotal = tileSize.area();
-        const float lutScale = static_cast<float>(histSize - 1) / tileSizeTotal;
-
-        int clipLimit = 0;
-        if (clipLimit_ > 0.0)
-        {
-            clipLimit = static_cast<int>(clipLimit_ * tileSizeTotal / histSize);
-            clipLimit = std::max(clipLimit, 1);
-        }
-
-        CLAHE_CalcLut_Body calcLutBody(srcForLut, lut_, tileSize, tilesX_, tilesY_, clipLimit, lutScale);
-        cv::parallel_for_(cv::Range(0, tilesX_ * tilesY_), calcLutBody);
-
-        CLAHE_Interpolation_Body interpolationBody(src, dst, lut_, tileSize, tilesX_, tilesY_);
-        cv::parallel_for_(cv::Range(0, src.rows), interpolationBody);
-    }
-
-    void CLAHE_Impl::setClipLimit(double clipLimit)
-    {
-        clipLimit_ = clipLimit;
-    }
-
-    double CLAHE_Impl::getClipLimit() const
-    {
-        return clipLimit_;
-    }
-
-    void CLAHE_Impl::setTilesGridSize(cv::Size tileGridSize)
-    {
-        tilesX_ = tileGridSize.width;
-        tilesY_ = tileGridSize.height;
-    }
-
-    cv::Size CLAHE_Impl::getTilesGridSize() const
-    {
-        return cv::Size(tilesX_, tilesY_);
-    }
-
-    void CLAHE_Impl::collectGarbage()
-    {
-        srcExt_.release();
-        lut_.release();
-    }
-}
-
-cv::Ptr<cv::CLAHE> cv::createCLAHE(double clipLimit, cv::Size tileGridSize)
-{
-    return new CLAHE_Impl(clipLimit, tileGridSize.width, tileGridSize.height);
-}
-
 // ----------------------------------------------------------------------
 
 /* Implementation of RTTI and Generic Functions for CvHistogram */
diff --git a/modules/imgproc/src/morph.cpp b/modules/imgproc/src/morph.cpp
index a63e08ff01..53d2347ec4 100644
--- a/modules/imgproc/src/morph.cpp
+++ b/modules/imgproc/src/morph.cpp
@@ -1081,7 +1081,7 @@ cv::Mat cv::getStructuringElement(int shape, Size ksize, Point anchor)
 namespace cv
 {
 
-class MorphologyRunner
+class MorphologyRunner : public ParallelLoopBody
 {
 public:
     MorphologyRunner(Mat _src, Mat _dst, int _nStripes, int _iterations,
@@ -1102,14 +1102,14 @@ public:
         columnBorderType = _columnBorderType;
     }
 
-    void operator () ( const BlockedRange& range ) const
+    void operator () ( const Range& range ) const
     {
-        int row0 = min(cvRound(range.begin() * src.rows / nStripes), src.rows);
-        int row1 = min(cvRound(range.end() * src.rows / nStripes), src.rows);
+        int row0 = min(cvRound(range.start * src.rows / nStripes), src.rows);
+        int row1 = min(cvRound(range.end * src.rows / nStripes), src.rows);
 
         /*if(0)
             printf("Size = (%d, %d), range[%d,%d), row0 = %d, row1 = %d\n",
-                   src.rows, src.cols, range.begin(), range.end(), row0, row1);*/
+                   src.rows, src.cols, range.start, range.end, row0, row1);*/
 
         Mat srcStripe = src.rowRange(row0, row1);
         Mat dstStripe = dst.rowRange(row0, row1);
@@ -1173,15 +1173,15 @@ static void morphOp( int op, InputArray _src, OutputArray _dst,
     }
 
     int nStripes = 1;
-#if defined HAVE_TBB && defined HAVE_TEGRA_OPTIMIZATION
+#if defined HAVE_TEGRA_OPTIMIZATION
     if (src.data != dst.data && iterations == 1 &&  //NOTE: threads are not used for inplace processing
         (borderType & BORDER_ISOLATED) == 0 && //TODO: check border types
         src.rows >= 64 ) //NOTE: just heuristics
         nStripes = 4;
 #endif
 
-    parallel_for(BlockedRange(0, nStripes),
-                 MorphologyRunner(src, dst, nStripes, iterations, op, kernel, anchor, borderType, borderType, borderValue));
+    parallel_for_(Range(0, nStripes),
+                  MorphologyRunner(src, dst, nStripes, iterations, op, kernel, anchor, borderType, borderType, borderValue));
 
     //Ptr<FilterEngine> f = createMorphologyFilter(op, src.type(),
     //                                             kernel, anchor, borderType, borderType, borderValue );
diff --git a/modules/ml/src/ann_mlp.cpp b/modules/ml/src/ann_mlp.cpp
index 438872ae8c..bf85425b9c 100644
--- a/modules/ml/src/ann_mlp.cpp
+++ b/modules/ml/src/ann_mlp.cpp
@@ -40,10 +40,6 @@
 
 #include "precomp.hpp"
 
-#ifdef HAVE_TBB
-#include <tbb/tbb.h>
-#endif
-
 CvANN_MLP_TrainParams::CvANN_MLP_TrainParams()
 {
     term_crit = cvTermCriteria( CV_TERMCRIT_ITER + CV_TERMCRIT_EPS, 1000, 0.01 );
@@ -1022,7 +1018,7 @@ int CvANN_MLP::train_backprop( CvVectors x0, CvVectors u, const double* sw )
     return iter;
 }
 
-struct rprop_loop {
+struct rprop_loop : cv::ParallelLoopBody {
   rprop_loop(const CvANN_MLP* _point, double**& _weights, int& _count, int& _ivcount, CvVectors* _x0,
      int& _l_count, CvMat*& _layer_sizes, int& _ovcount, int& _max_count,
      CvVectors* _u, const double*& _sw, double& _inv_count, CvMat*& _dEdw, int& _dcount0, double* _E, int _buf_sz)
@@ -1063,7 +1059,7 @@ struct rprop_loop {
   int buf_sz;
 
 
-  void operator()( const cv::BlockedRange& range ) const
+  void operator()( const cv::Range& range ) const
   {
     double* buf_ptr;
     double** x = 0;
@@ -1084,7 +1080,7 @@ struct rprop_loop {
         buf_ptr += (df[i] - x[i])*2;
     }
 
-    for(int si = range.begin(); si < range.end(); si++ )
+    for(int si = range.start; si < range.end; si++ )
     {
         if (si % dcount0 != 0) continue;
         int n1, n2, k;
@@ -1170,36 +1166,33 @@ struct rprop_loop {
             }
 
         // backward pass, update dEdw
-        #ifdef HAVE_TBB
-        static tbb::spin_mutex mutex;
-        tbb::spin_mutex::scoped_lock lock;
-        #endif
+        static cv::Mutex mutex;
+
         for(int i = l_count-1; i > 0; i-- )
         {
             n1 = layer_sizes->data.i[i-1]; n2 = layer_sizes->data.i[i];
             cvInitMatHeader( &_df, dcount, n2, CV_64F, df[i] );
             cvMul( grad1, &_df, grad1 );
-            #ifdef HAVE_TBB
-            lock.acquire(mutex);
-            #endif
-            cvInitMatHeader( &_dEdw, n1, n2, CV_64F, dEdw->data.db+(weights[i]-weights[0]) );
-            cvInitMatHeader( x1, dcount, n1, CV_64F, x[i-1] );
-            cvGEMM( x1, grad1, 1, &_dEdw, 1, &_dEdw, CV_GEMM_A_T );
-
-            // update bias part of dEdw
-           for( k = 0; k < dcount; k++ )
-           {
-               double* dst = _dEdw.data.db + n1*n2;
-               const double* src = grad1->data.db + k*n2;
-               for(int j = 0; j < n2; j++ )
-                   dst[j] += src[j];
+
+            {
+                cv::AutoLock lock(mutex);
+                cvInitMatHeader( &_dEdw, n1, n2, CV_64F, dEdw->data.db+(weights[i]-weights[0]) );
+                cvInitMatHeader( x1, dcount, n1, CV_64F, x[i-1] );
+                cvGEMM( x1, grad1, 1, &_dEdw, 1, &_dEdw, CV_GEMM_A_T );
+
+                // update bias part of dEdw
+                for( k = 0; k < dcount; k++ )
+                {
+                    double* dst = _dEdw.data.db + n1*n2;
+                    const double* src = grad1->data.db + k*n2;
+                    for(int j = 0; j < n2; j++ )
+                        dst[j] += src[j];
+                }
+
+                if (i > 1)
+                    cvInitMatHeader( &_w, n1, n2, CV_64F, weights[i] );
            }
 
-           if (i > 1)
-               cvInitMatHeader( &_w, n1, n2, CV_64F, weights[i] );
-           #ifdef HAVE_TBB
-           lock.release();
-           #endif
            cvInitMatHeader( grad2, dcount, n1, CV_64F, grad2->data.db );
            if( i > 1 )
                cvGEMM( grad1, &_w, 1, 0, 0, grad2, CV_GEMM_B_T );
@@ -1297,7 +1290,7 @@ int CvANN_MLP::train_rprop( CvVectors x0, CvVectors u, const double* sw )
         double E = 0;
 
         // first, iterate through all the samples and compute dEdw
-        cv::parallel_for(cv::BlockedRange(0, count),
+        cv::parallel_for_(cv::Range(0, count),
             rprop_loop(this, weights, count, ivcount, &x0, l_count, layer_sizes,
                        ovcount, max_count, &u, sw, inv_count, dEdw, dcount0, &E, buf_sz)
         );
diff --git a/modules/ml/src/gbt.cpp b/modules/ml/src/gbt.cpp
index 6671a3495b..b52ffbe5a3 100644
--- a/modules/ml/src/gbt.cpp
+++ b/modules/ml/src/gbt.cpp
@@ -900,7 +900,7 @@ float CvGBTrees::predict_serial( const CvMat* _sample, const CvMat* _missing,
 }
 
 
-class Tree_predictor
+class Tree_predictor : public cv::ParallelLoopBody
 {
 private:
     pCvSeq* weak;
@@ -910,9 +910,7 @@ private:
     const CvMat* missing;
     const float shrinkage;
 
-#ifdef HAVE_TBB
-    static tbb::spin_mutex SumMutex;
-#endif
+    static cv::Mutex SumMutex;
 
 
 public:
@@ -931,14 +929,11 @@ public:
     Tree_predictor& operator=( const Tree_predictor& )
     { return *this; }
 
-    virtual void operator()(const cv::BlockedRange& range) const
+    virtual void operator()(const cv::Range& range) const
     {
-#ifdef HAVE_TBB
-        tbb::spin_mutex::scoped_lock lock;
-#endif
         CvSeqReader reader;
-        int begin = range.begin();
-        int end = range.end();
+        int begin = range.start;
+        int end = range.end;
 
         int weak_count = end - begin;
         CvDTree* tree;
@@ -956,13 +951,11 @@ public:
                     tmp_sum += shrinkage*(float)(tree->predict(sample, missing)->value);
                 }
             }
-#ifdef HAVE_TBB
-            lock.acquire(SumMutex);
-            sum[i] += tmp_sum;
-            lock.release();
-#else
-            sum[i] += tmp_sum;
-#endif
+
+            {
+                cv::AutoLock lock(SumMutex);
+                sum[i] += tmp_sum;
+            }
         }
     } // Tree_predictor::operator()
 
@@ -970,11 +963,7 @@ public:
 
 }; // class Tree_predictor
 
-
-#ifdef HAVE_TBB
-tbb::spin_mutex Tree_predictor::SumMutex;
-#endif
-
+cv::Mutex Tree_predictor::SumMutex;
 
 
 float CvGBTrees::predict( const CvMat* _sample, const CvMat* _missing,
@@ -992,12 +981,7 @@ float CvGBTrees::predict( const CvMat* _sample, const CvMat* _missing,
         Tree_predictor predictor = Tree_predictor(weak_seq, class_count,
                                     params.shrinkage, _sample, _missing, sum);
 
-//#ifdef HAVE_TBB
-//      tbb::parallel_for(cv::BlockedRange(begin, end), predictor,
-//                          tbb::auto_partitioner());
-//#else
-        cv::parallel_for(cv::BlockedRange(begin, end), predictor);
-//#endif
+        cv::parallel_for_(cv::Range(begin, end), predictor);
 
         for (int i=0; i<class_count; ++i)
             sum[i] = sum[i] /** params.shrinkage*/ + base_value;
@@ -1228,7 +1212,7 @@ void CvGBTrees::read( CvFileStorage* fs, CvFileNode* node )
 
 //===========================================================================
 
-class Sample_predictor
+class Sample_predictor : public cv::ParallelLoopBody
 {
 private:
     const CvGBTrees* gbt;
@@ -1258,10 +1242,10 @@ public:
     {}
 
 
-    virtual void operator()(const cv::BlockedRange& range) const
+    virtual void operator()(const cv::Range& range) const
     {
-        int begin = range.begin();
-        int end = range.end();
+        int begin = range.start;
+        int end = range.end;
 
         CvMat x;
         CvMat miss;
@@ -1317,11 +1301,7 @@ CvGBTrees::calc_error( CvMLData* _data, int type, std::vector<float> *resp )
     Sample_predictor predictor = Sample_predictor(this, pred_resp, _data->get_values(),
             _data->get_missing(), _sample_idx);
 
-//#ifdef HAVE_TBB
-//    tbb::parallel_for(cv::BlockedRange(0,n), predictor, tbb::auto_partitioner());
-//#else
-    cv::parallel_for(cv::BlockedRange(0,n), predictor);
-//#endif
+    cv::parallel_for_(cv::Range(0,n), predictor);
 
     int* sidx = _sample_idx ? _sample_idx->data.i : 0;
     int r_step = CV_IS_MAT_CONT(response->type) ?
diff --git a/modules/ml/src/knearest.cpp b/modules/ml/src/knearest.cpp
index 3c2f9ebada..6b6f5e6afa 100644
--- a/modules/ml/src/knearest.cpp
+++ b/modules/ml/src/knearest.cpp
@@ -306,7 +306,7 @@ float CvKNearest::write_results( int k, int k1, int start, int end,
     return result;
 }
 
-struct P1 {
+struct P1 : cv::ParallelLoopBody {
   P1(const CvKNearest* _pointer, int _buf_sz, int _k, const CvMat* __samples, const float** __neighbors,
      int _k1, CvMat* __results, CvMat* __neighbor_responses, CvMat* __dist, float* _result)
   {
@@ -333,10 +333,10 @@ struct P1 {
   float* result;
   int buf_sz;
 
-  void operator()( const cv::BlockedRange& range ) const
+  void operator()( const cv::Range& range ) const
   {
     cv::AutoBuffer<float> buf(buf_sz);
-    for(int i = range.begin(); i < range.end(); i += 1 )
+    for(int i = range.start; i < range.end; i += 1 )
     {
         float* neighbor_responses = &buf[0];
         float* dist = neighbor_responses + 1*k;
@@ -410,8 +410,8 @@ float CvKNearest::find_nearest( const CvMat* _samples, int k, CvMat* _results,
     int k1 = get_sample_count();
     k1 = MIN( k1, k );
 
-    cv::parallel_for(cv::BlockedRange(0, count), P1(this, buf_sz, k, _samples, _neighbors, k1,
-                                                    _results, _neighbor_responses, _dist, &result)
+    cv::parallel_for_(cv::Range(0, count), P1(this, buf_sz, k, _samples, _neighbors, k1,
+                                             _results, _neighbor_responses, _dist, &result)
     );
 
     return result;
diff --git a/modules/ml/src/nbayes.cpp b/modules/ml/src/nbayes.cpp
index 15146d6f4e..f1f7a24ec0 100644
--- a/modules/ml/src/nbayes.cpp
+++ b/modules/ml/src/nbayes.cpp
@@ -277,7 +277,7 @@ bool CvNormalBayesClassifier::train( const CvMat* _train_data, const CvMat* _res
     return result;
 }
 
-struct predict_body {
+struct predict_body : cv::ParallelLoopBody {
   predict_body(CvMat* _c, CvMat** _cov_rotate_mats, CvMat** _inv_eigen_values, CvMat** _avg,
      const CvMat* _samples, const int* _vidx, CvMat* _cls_labels,
      CvMat* _results, float* _value, int _var_count1
@@ -307,7 +307,7 @@ struct predict_body {
   float* value;
   int var_count1;
 
-  void operator()( const cv::BlockedRange& range ) const
+  void operator()( const cv::Range& range ) const
   {
 
     int cls = -1;
@@ -324,7 +324,7 @@ struct predict_body {
     cv::AutoBuffer<double> buffer(nclasses + var_count1);
     CvMat diff = cvMat( 1, var_count1, CV_64FC1, &buffer[0] );
 
-    for(int k = range.begin(); k < range.end(); k += 1 )
+    for(int k = range.start; k < range.end; k += 1 )
     {
         int ival;
         double opt = FLT_MAX;
@@ -397,9 +397,9 @@ float CvNormalBayesClassifier::predict( const CvMat* samples, CvMat* results ) c
 
     const int* vidx = var_idx ? var_idx->data.i : 0;
 
-    cv::parallel_for(cv::BlockedRange(0, samples->rows), predict_body(c, cov_rotate_mats, inv_eigen_values, avg, samples,
-                                                                      vidx, cls_labels, results, &value, var_count
-    ));
+    cv::parallel_for_(cv::Range(0, samples->rows),
+                      predict_body(c, cov_rotate_mats, inv_eigen_values, avg, samples,
+                                   vidx, cls_labels, results, &value, var_count));
 
     return value;
 }
diff --git a/modules/ml/src/svm.cpp b/modules/ml/src/svm.cpp
index 9752848b9a..2e1b2e3565 100644
--- a/modules/ml/src/svm.cpp
+++ b/modules/ml/src/svm.cpp
@@ -2143,7 +2143,7 @@ float CvSVM::predict( const CvMat* sample, bool returnDFVal ) const
     return result;
 }
 
-struct predict_body_svm {
+struct predict_body_svm : ParallelLoopBody {
     predict_body_svm(const CvSVM* _pointer, float* _result, const CvMat* _samples, CvMat* _results)
     {
         pointer = _pointer;
@@ -2157,9 +2157,9 @@ struct predict_body_svm {
     const CvMat* samples;
     CvMat* results;
 
-    void operator()( const cv::BlockedRange& range ) const
+    void operator()( const cv::Range& range ) const
     {
-        for(int i = range.begin(); i < range.end(); i++ )
+        for(int i = range.start; i < range.end; i++ )
         {
             CvMat sample;
             cvGetRow( samples, &sample, i );
@@ -2175,7 +2175,7 @@ struct predict_body_svm {
 float CvSVM::predict(const CvMat* samples, CV_OUT CvMat* results) const
 {
     float result = 0;
-    cv::parallel_for(cv::BlockedRange(0, samples->rows),
+    cv::parallel_for_(cv::Range(0, samples->rows),
              predict_body_svm(this, &result, samples, results)
     );
     return result;
diff --git a/modules/nonfree/src/surf.cpp b/modules/nonfree/src/surf.cpp
index bb6d53e4b9..2fc459fb61 100644
--- a/modules/nonfree/src/surf.cpp
+++ b/modules/nonfree/src/surf.cpp
@@ -258,7 +258,7 @@ interpolateKeypoint( float N9[3][9], int dx, int dy, int ds, KeyPoint& kpt )
 }
 
 // Multi-threaded construction of the scale-space pyramid
-struct SURFBuildInvoker
+struct SURFBuildInvoker : ParallelLoopBody
 {
     SURFBuildInvoker( const Mat& _sum, const vector<int>& _sizes,
                       const vector<int>& _sampleSteps,
@@ -271,9 +271,9 @@ struct SURFBuildInvoker
         traces = &_traces;
     }
 
-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
     {
-        for( int i=range.begin(); i<range.end(); i++ )
+        for( int i=range.start; i<range.end; i++ )
             calcLayerDetAndTrace( *sum, (*sizes)[i], (*sampleSteps)[i], (*dets)[i], (*traces)[i] );
     }
 
@@ -285,7 +285,7 @@ struct SURFBuildInvoker
 };
 
 // Multi-threaded search of the scale-space pyramid for keypoints
-struct SURFFindInvoker
+struct SURFFindInvoker : ParallelLoopBody
 {
     SURFFindInvoker( const Mat& _sum, const Mat& _mask_sum,
                      const vector<Mat>& _dets, const vector<Mat>& _traces,
@@ -310,9 +310,9 @@ struct SURFFindInvoker
                    const vector<int>& sizes, vector<KeyPoint>& keypoints,
                    int octave, int layer, float hessianThreshold, int sampleStep );
 
-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
     {
-        for( int i=range.begin(); i<range.end(); i++ )
+        for( int i=range.start; i<range.end; i++ )
         {
             int layer = (*middleIndices)[i];
             int octave = i / nOctaveLayers;
@@ -333,14 +333,10 @@ struct SURFFindInvoker
     int nOctaveLayers;
     float hessianThreshold;
 
-#ifdef HAVE_TBB
-    static tbb::mutex findMaximaInLayer_m;
-#endif
+    static Mutex findMaximaInLayer_m;
 };
 
-#ifdef HAVE_TBB
-tbb::mutex SURFFindInvoker::findMaximaInLayer_m;
-#endif
+Mutex SURFFindInvoker::findMaximaInLayer_m;
 
 
 /*
@@ -437,9 +433,7 @@ void SURFFindInvoker::findMaximaInLayer( const Mat& sum, const Mat& mask_sum,
                     if( interp_ok  )
                     {
                         /*printf( "KeyPoint %f %f %d\n", point.pt.x, point.pt.y, point.size );*/
-#ifdef HAVE_TBB
-                        tbb::mutex::scoped_lock lock(findMaximaInLayer_m);
-#endif
+                        cv::AutoLock lock(findMaximaInLayer_m);
                         keypoints.push_back(kpt);
                     }
                 }
@@ -505,20 +499,20 @@ static void fastHessianDetector( const Mat& sum, const Mat& mask_sum, vector<Key
     }
 
     // Calculate hessian determinant and trace samples in each layer
-    parallel_for( BlockedRange(0, nTotalLayers),
-                      SURFBuildInvoker(sum, sizes, sampleSteps, dets, traces) );
+    parallel_for_( Range(0, nTotalLayers),
+                   SURFBuildInvoker(sum, sizes, sampleSteps, dets, traces) );
 
     // Find maxima in the determinant of the hessian
-    parallel_for( BlockedRange(0, nMiddleLayers),
-                      SURFFindInvoker(sum, mask_sum, dets, traces, sizes,
-                                      sampleSteps, middleIndices, keypoints,
-                                      nOctaveLayers, hessianThreshold) );
+    parallel_for_( Range(0, nMiddleLayers),
+                   SURFFindInvoker(sum, mask_sum, dets, traces, sizes,
+                                   sampleSteps, middleIndices, keypoints,
+                                   nOctaveLayers, hessianThreshold) );
 
     std::sort(keypoints.begin(), keypoints.end(), KeypointGreater());
 }
 
 
-struct SURFInvoker
+struct SURFInvoker : ParallelLoopBody
 {
     enum { ORI_RADIUS = 6, ORI_WIN = 60, PATCH_SZ = 20 };
 
@@ -566,7 +560,7 @@ struct SURFInvoker
         }
     }
 
-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
     {
         /* X and Y gradient wavelet data */
         const int NX=2, NY=2;
@@ -587,7 +581,7 @@ struct SURFInvoker
 
         int dsize = extended ? 128 : 64;
 
-        int k, k1 = range.begin(), k2 = range.end();
+        int k, k1 = range.start, k2 = range.end;
         float maxSize = 0;
         for( k = k1; k < k2; k++ )
         {
@@ -954,7 +948,7 @@ void SURF::operator()(InputArray _img, InputArray _mask,
 
         // we call SURFInvoker in any case, even if we do not need descriptors,
         // since it computes orientation of each feature.
-        parallel_for(BlockedRange(0, N), SURFInvoker(img, sum, keypoints, descriptors, extended, upright) );
+        parallel_for_(Range(0, N), SURFInvoker(img, sum, keypoints, descriptors, extended, upright) );
 
         // remove keypoints that were marked for deletion
         for( i = j = 0; i < N; i++ )
diff --git a/modules/objdetect/src/cascadedetect.cpp b/modules/objdetect/src/cascadedetect.cpp
index 69f2d64181..9e78dce243 100644
--- a/modules/objdetect/src/cascadedetect.cpp
+++ b/modules/objdetect/src/cascadedetect.cpp
@@ -1165,15 +1165,10 @@ void CascadeClassifier::detectMultiScale( const Mat& image, vector<Rect>& object
 
         int stripCount, stripSize;
 
-    #ifdef HAVE_TBB
         const int PTS_PER_THREAD = 1000;
         stripCount = ((processingRectSize.width/yStep)*(processingRectSize.height + yStep-1)/yStep + PTS_PER_THREAD/2)/PTS_PER_THREAD;
         stripCount = std::min(std::max(stripCount, 1), 100);
         stripSize = (((processingRectSize.height + stripCount - 1)/stripCount + yStep-1)/yStep)*yStep;
-    #else
-        stripCount = 1;
-        stripSize = processingRectSize.height;
-    #endif
 
         if( !detectSingleScale( scaledImage, stripCount, processingRectSize, stripSize, yStep, factor, candidates,
             rejectLevels, levelWeights, outputRejectLevels ) )
diff --git a/modules/objdetect/src/latentsvm.cpp b/modules/objdetect/src/latentsvm.cpp
index 521f0fdf56..5a45965e77 100644
--- a/modules/objdetect/src/latentsvm.cpp
+++ b/modules/objdetect/src/latentsvm.cpp
@@ -582,7 +582,6 @@ int searchObjectThresholdSomeComponents(const CvLSVMFeaturePyramid *H,
     // For each component perform searching
     for (i = 0; i < kComponents; i++)
     {
-#ifdef HAVE_TBB
         int error = searchObjectThreshold(H, &(filters[componentIndex]), kPartFilters[i],
             b[i], maxXBorder, maxYBorder, scoreThreshold,
             &(pointsArr[i]), &(levelsArr[i]), &(kPointsArr[i]),
@@ -598,13 +597,6 @@ int searchObjectThresholdSomeComponents(const CvLSVMFeaturePyramid *H,
             free(partsDisplacementArr);
             return LATENT_SVM_SEARCH_OBJECT_FAILED;
         }
-#else
-    (void)numThreads;
-        searchObjectThreshold(H, &(filters[componentIndex]), kPartFilters[i],
-            b[i], maxXBorder, maxYBorder, scoreThreshold,
-            &(pointsArr[i]), &(levelsArr[i]), &(kPointsArr[i]),
-            &(scoreArr[i]), &(partsDisplacementArr[i]));
-#endif
         estimateBoxes(pointsArr[i], levelsArr[i], kPointsArr[i],
             filters[componentIndex]->sizeX, filters[componentIndex]->sizeY, &(oppPointsArr[i]));
         componentIndex += (kPartFilters[i] + 1);
diff --git a/modules/photo/src/denoising.cpp b/modules/photo/src/denoising.cpp
index 02d7a6f620..191926ccb7 100644
--- a/modules/photo/src/denoising.cpp
+++ b/modules/photo/src/denoising.cpp
@@ -59,17 +59,17 @@ void cv::fastNlMeansDenoising( InputArray _src, OutputArray _dst, float h,
 
     switch (src.type()) {
         case CV_8U:
-            parallel_for(cv::BlockedRange(0, src.rows),
+            parallel_for_(cv::Range(0, src.rows),
                 FastNlMeansDenoisingInvoker<uchar>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC2:
-            parallel_for(cv::BlockedRange(0, src.rows),
+            parallel_for_(cv::Range(0, src.rows),
                 FastNlMeansDenoisingInvoker<cv::Vec2b>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC3:
-            parallel_for(cv::BlockedRange(0, src.rows),
+            parallel_for_(cv::Range(0, src.rows),
                 FastNlMeansDenoisingInvoker<cv::Vec3b>(
                     src, dst, templateWindowSize, searchWindowSize, h));
             break;
@@ -159,19 +159,19 @@ void cv::fastNlMeansDenoisingMulti( InputArrayOfArrays _srcImgs, OutputArray _ds
 
     switch (srcImgs[0].type()) {
         case CV_8U:
-            parallel_for(cv::BlockedRange(0, srcImgs[0].rows),
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
                 FastNlMeansMultiDenoisingInvoker<uchar>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC2:
-            parallel_for(cv::BlockedRange(0, srcImgs[0].rows),
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
                 FastNlMeansMultiDenoisingInvoker<cv::Vec2b>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
             break;
         case CV_8UC3:
-            parallel_for(cv::BlockedRange(0, srcImgs[0].rows),
+            parallel_for_(cv::Range(0, srcImgs[0].rows),
                 FastNlMeansMultiDenoisingInvoker<cv::Vec3b>(
                     srcImgs, imgToDenoiseIndex, temporalWindowSize,
                     dst, templateWindowSize, searchWindowSize, h));
diff --git a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
index c4f13826d2..8824f17c0d 100644
--- a/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_denoising_invoker.hpp
@@ -55,12 +55,12 @@ using namespace std;
 using namespace cv;
 
 template <typename T>
-struct FastNlMeansDenoisingInvoker {
+struct FastNlMeansDenoisingInvoker : ParallelLoopBody {
     public:
         FastNlMeansDenoisingInvoker(const Mat& src, Mat& dst,
             int template_window_size, int search_window_size, const float h);
 
-        void operator() (const BlockedRange& range) const;
+        void operator() (const Range& range) const;
 
     private:
         void operator= (const FastNlMeansDenoisingInvoker&);
@@ -156,9 +156,9 @@ FastNlMeansDenoisingInvoker<T>::FastNlMeansDenoisingInvoker(
 }
 
 template <class T>
-void FastNlMeansDenoisingInvoker<T>::operator() (const BlockedRange& range) const {
-    int row_from = range.begin();
-    int row_to = range.end() - 1;
+void FastNlMeansDenoisingInvoker<T>::operator() (const Range& range) const {
+    int row_from = range.start;
+    int row_to = range.end - 1;
 
     Array2d<int> dist_sums(search_window_size_, search_window_size_);
 
diff --git a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
index 2ae5054e00..8b32eded18 100644
--- a/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
+++ b/modules/photo/src/fast_nlmeans_multi_denoising_invoker.hpp
@@ -55,13 +55,13 @@ using namespace std;
 using namespace cv;
 
 template <typename T>
-struct FastNlMeansMultiDenoisingInvoker {
+struct FastNlMeansMultiDenoisingInvoker : ParallelLoopBody {
     public:
         FastNlMeansMultiDenoisingInvoker(
             const std::vector<Mat>& srcImgs, int imgToDenoiseIndex, int temporalWindowSize,
             Mat& dst, int template_window_size, int search_window_size, const float h);
 
-        void operator() (const BlockedRange& range) const;
+        void operator() (const Range& range) const;
 
     private:
         void operator= (const FastNlMeansMultiDenoisingInvoker&);
@@ -175,9 +175,9 @@ FastNlMeansMultiDenoisingInvoker<T>::FastNlMeansMultiDenoisingInvoker(
 }
 
 template <class T>
-void FastNlMeansMultiDenoisingInvoker<T>::operator() (const BlockedRange& range) const {
-    int row_from = range.begin();
-    int row_to = range.end() - 1;
+void FastNlMeansMultiDenoisingInvoker<T>::operator() (const Range& range) const {
+    int row_from = range.start;
+    int row_to = range.end - 1;
 
     Array3d<int> dist_sums(temporal_window_size_, search_window_size_, search_window_size_);
 
diff --git a/modules/stitching/src/matchers.cpp b/modules/stitching/src/matchers.cpp
index 9bab58c52f..b5bd8ad4d1 100644
--- a/modules/stitching/src/matchers.cpp
+++ b/modules/stitching/src/matchers.cpp
@@ -66,7 +66,7 @@ struct DistIdxPair
 };
 
 
-struct MatchPairsBody
+struct MatchPairsBody : ParallelLoopBody
 {
     MatchPairsBody(const MatchPairsBody& other)
             : matcher(other.matcher), features(other.features),
@@ -77,10 +77,10 @@ struct MatchPairsBody
             : matcher(_matcher), features(_features),
               pairwise_matches(_pairwise_matches), near_pairs(_near_pairs) {}
 
-    void operator ()(const BlockedRange &r) const
+    void operator ()(const Range &r) const
     {
         const int num_images = static_cast<int>(features.size());
-        for (int i = r.begin(); i < r.end(); ++i)
+        for (int i = r.start; i < r.end; ++i)
         {
             int from = near_pairs[i].first;
             int to = near_pairs[i].second;
@@ -526,9 +526,9 @@ void FeaturesMatcher::operator ()(const vector<ImageFeatures> &features, vector<
     MatchPairsBody body(*this, features, pairwise_matches, near_pairs);
 
     if (is_thread_safe_)
-        parallel_for(BlockedRange(0, static_cast<int>(near_pairs.size())), body);
+        parallel_for_(Range(0, static_cast<int>(near_pairs.size())), body);
     else
-        body(BlockedRange(0, static_cast<int>(near_pairs.size())));
+        body(Range(0, static_cast<int>(near_pairs.size())));
     LOGLN_CHAT("");
 }
 
diff --git a/modules/video/src/bgfg_gaussmix2.cpp b/modules/video/src/bgfg_gaussmix2.cpp
index e532af2ae6..6bbb960482 100644
--- a/modules/video/src/bgfg_gaussmix2.cpp
+++ b/modules/video/src/bgfg_gaussmix2.cpp
@@ -248,7 +248,7 @@ detectShadowGMM(const float* data, int nchannels, int nmodes,
 //IEEE Trans. on Pattern Analysis and Machine Intelligence, vol.26, no.5, pages 651-656, 2004
 //http://www.zoranz.net/Publications/zivkovic2004PAMI.pdf
 
-struct MOG2Invoker
+struct MOG2Invoker : ParallelLoopBody
 {
     MOG2Invoker(const Mat& _src, Mat& _dst,
                 GMM* _gmm, float* _mean,
@@ -280,9 +280,9 @@ struct MOG2Invoker
         cvtfunc = src->depth() != CV_32F ? getConvertFunc(src->depth(), CV_32F) : 0;
     }
 
-    void operator()(const BlockedRange& range) const
+    void operator()(const Range& range) const
     {
-        int y0 = range.begin(), y1 = range.end();
+        int y0 = range.start, y1 = range.end;
         int ncols = src->cols, nchannels = src->channels();
         AutoBuffer<float> buf(src->cols*nchannels);
         float alpha1 = 1.f - alphaT;
@@ -562,15 +562,15 @@ void BackgroundSubtractorMOG2::operator()(InputArray _image, OutputArray _fgmask
     learningRate = learningRate >= 0 && nframes > 1 ? learningRate : 1./min( 2*nframes, history );
     CV_Assert(learningRate >= 0);
 
-    parallel_for(BlockedRange(0, image.rows),
-                 MOG2Invoker(image, fgmask,
-                             (GMM*)bgmodel.data,
-                             (float*)(bgmodel.data + sizeof(GMM)*nmixtures*image.rows*image.cols),
-                             bgmodelUsedModes.data, nmixtures, (float)learningRate,
-                             (float)varThreshold,
-                             backgroundRatio, varThresholdGen,
-                             fVarInit, fVarMin, fVarMax, float(-learningRate*fCT), fTau,
-                             bShadowDetection, nShadowDetection));
+    parallel_for_(Range(0, image.rows),
+                  MOG2Invoker(image, fgmask,
+                              (GMM*)bgmodel.data,
+                              (float*)(bgmodel.data + sizeof(GMM)*nmixtures*image.rows*image.cols),
+                              bgmodelUsedModes.data, nmixtures, (float)learningRate,
+                              (float)varThreshold,
+                              backgroundRatio, varThresholdGen,
+                              fVarInit, fVarMin, fVarMax, float(-learningRate*fCT), fTau,
+                              bShadowDetection, nShadowDetection));
 }
 
 void BackgroundSubtractorMOG2::getBackgroundImage(OutputArray backgroundImage) const
diff --git a/modules/video/src/lkpyramid.cpp b/modules/video/src/lkpyramid.cpp
index 9e47eb8029..291cb86a26 100644
--- a/modules/video/src/lkpyramid.cpp
+++ b/modules/video/src/lkpyramid.cpp
@@ -156,7 +156,7 @@ cv::detail::LKTrackerInvoker::LKTrackerInvoker(
     minEigThreshold = _minEigThreshold;
 }
 
-void cv::detail::LKTrackerInvoker::operator()(const BlockedRange& range) const
+void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
 {
     Point2f halfWin((winSize.width-1)*0.5f, (winSize.height-1)*0.5f);
     const Mat& I = *prevImg;
@@ -170,7 +170,7 @@ void cv::detail::LKTrackerInvoker::operator()(const BlockedRange& range) const
     Mat IWinBuf(winSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf);
     Mat derivIWinBuf(winSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf + winSize.area()*cn);
 
-    for( int ptidx = range.begin(); ptidx < range.end(); ptidx++ )
+    for( int ptidx = range.start; ptidx < range.end; ptidx++ )
     {
         Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level));
         Point2f nextPt;
@@ -733,11 +733,11 @@ void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg,
         typedef cv::detail::LKTrackerInvoker LKTrackerInvoker;
 #endif
 
-        parallel_for(BlockedRange(0, npoints), LKTrackerInvoker(prevPyr[level * lvlStep1], derivI,
-                                                                nextPyr[level * lvlStep2], prevPts, nextPts,
-                                                                status, err,
-                                                                winSize, criteria, level, maxLevel,
-                                                                flags, (float)minEigThreshold));
+        parallel_for_(Range(0, npoints), LKTrackerInvoker(prevPyr[level * lvlStep1], derivI,
+                                                          nextPyr[level * lvlStep2], prevPts, nextPts,
+                                                          status, err,
+                                                          winSize, criteria, level, maxLevel,
+                                                          flags, (float)minEigThreshold));
     }
 }
 
diff --git a/modules/video/src/lkpyramid.hpp b/modules/video/src/lkpyramid.hpp
index 390e46bf99..4aff37ef84 100644
--- a/modules/video/src/lkpyramid.hpp
+++ b/modules/video/src/lkpyramid.hpp
@@ -7,7 +7,7 @@ namespace detail
 
     typedef short deriv_type;
 
-    struct LKTrackerInvoker
+    struct LKTrackerInvoker : ParallelLoopBody
     {
         LKTrackerInvoker( const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,
                           const Point2f* _prevPts, Point2f* _nextPts,
@@ -15,7 +15,7 @@ namespace detail
                           Size _winSize, TermCriteria _criteria,
                           int _level, int _maxLevel, int _flags, float _minEigThreshold );
 
-        void operator()(const BlockedRange& range) const;
+        void operator()(const Range& range) const;
 
         const Mat* prevImg;
         const Mat* nextImg;