Merge pull request #2073 from KonstantinMatskevich:ocl_tapi_clahe

11 years ago · 8a456c5a6b
parent 8151571dd6 55634c1f52
commit 8a456c5a6b
4 changed files with 375 additions and 15 deletions
--- a/modules/core/include/opencv2/core/ocl.hpp
+++ b/modules/core/include/opencv2/core/ocl.hpp
@ -489,6 +489,7 @@ public:
    bool runTask(bool sync, const Queue& q=Queue());
    size_t workGroupSize() const;
    size_t preferedWorkGroupSizeMultiple() const;
    bool compileWorkGroupSize(size_t wsz[]) const;
    size_t localMemSize() const;
--- a/modules/core/src/ocl.cpp
+++ b/modules/core/src/ocl.cpp
@ -2817,6 +2817,16 @@ size_t Kernel::workGroupSize() const
                                    sizeof(val), &val, &retsz) >= 0 ? val : 0;
 }
 size_t Kernel::preferedWorkGroupSizeMultiple() const
 {
    if(!p)
        return 0;
    size_t val = 0, retsz = 0;
    cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
    return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
                                    sizeof(val), &val, &retsz) >= 0 ? val : 0;
 }
 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
 {
    if(!p || !wsz)
--- a/modules/imgproc/src/clahe.cpp
+++ b/modules/imgproc/src/clahe.cpp
@ -40,10 +40,90 @@
 //M*/
 #include "precomp.hpp"
 #include "opencl_kernels.hpp"
 // ----------------------------------------------------------------------
 // CLAHE
 namespace clahe
 {
    static bool calcLut(cv::InputArray _src, cv::OutputArray _dst,
        const int tilesX, const int tilesY, const cv::Size tileSize,
        const int clipLimit, const float lutScale)
    {
        cv::ocl::Kernel _k("calcLut", cv::ocl::imgproc::clahe_oclsrc);
        bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
        cv::String opts;
        if(is_cpu)
            opts = "-D CPU ";
        else
            opts = cv::format("-D WAVE_SIZE=%d", _k.preferedWorkGroupSizeMultiple());
        cv::ocl::Kernel k("calcLut", cv::ocl::imgproc::clahe_oclsrc, opts);
        if(k.empty())
            return false;
        cv::UMat src = _src.getUMat();
        _dst.create(tilesX * tilesY, 256, CV_8UC1);
        cv::UMat dst = _dst.getUMat();
        int tile_size[2];
        tile_size[0] = tileSize.width;
        tile_size[1] = tileSize.height;
        size_t localThreads[3]  = { 32, 8, 1 };
        size_t globalThreads[3] = { tilesX * localThreads[0], tilesY * localThreads[1], 1 };
        int idx = 0;
        idx = k.set(idx, cv::ocl::KernelArg::ReadOnlyNoSize(src));
        idx = k.set(idx, cv::ocl::KernelArg::WriteOnlyNoSize(dst));
        idx = k.set(idx, tile_size);
        idx = k.set(idx, tilesX);
        idx = k.set(idx, clipLimit);
        idx = k.set(idx, lutScale);
        if (!k.run(2, globalThreads, localThreads, false))
            return false;
        return true;
    }
    static bool transform(const cv::InputArray _src, cv::OutputArray _dst, const cv::InputArray _lut,
        const int tilesX, const int tilesY, const cv::Size & tileSize)
    {
        cv::ocl::Kernel k("transform", cv::ocl::imgproc::clahe_oclsrc);
        if(k.empty())
            return false;
        int tile_size[2];
        tile_size[0] = tileSize.width;
        tile_size[1] = tileSize.height;
        cv::UMat src = _src.getUMat();
        _dst.create(src.size(), src.type());
        cv::UMat dst = _dst.getUMat();
        cv::UMat lut = _lut.getUMat();
        size_t localThreads[3]  = { 32, 8, 1 };
        size_t globalThreads[3] = { src.cols, src.rows, 1 };
        int idx = 0;
        idx = k.set(idx, cv::ocl::KernelArg::ReadOnlyNoSize(src));
        idx = k.set(idx, cv::ocl::KernelArg::WriteOnlyNoSize(dst));
        idx = k.set(idx, cv::ocl::KernelArg::ReadOnlyNoSize(lut));
        idx = k.set(idx, src.cols);
        idx = k.set(idx, src.rows);
        idx = k.set(idx, tile_size);
        idx = k.set(idx, tilesX);
        idx = k.set(idx, tilesY);
        if (!k.run(2, globalThreads, localThreads, false))
            return false;
        return true;
    }
 }
 namespace
 {
    class CLAHE_CalcLut_Body : public cv::ParallelLoopBody
@ -241,7 +321,9 @@ namespace
        int tilesY_;
        cv::Mat srcExt_;
        cv::UMat usrcExt_;
        cv::Mat lut_;
        cv::UMat ulut_;
    };
    CLAHE_Impl::CLAHE_Impl(double clipLimit, int tilesX, int tilesY) :
@ -256,31 +338,34 @@ namespace
    void CLAHE_Impl::apply(cv::InputArray _src, cv::OutputArray _dst)
    {
-        cv::Mat src = _src.getMat();
+        CV_Assert( _src.type() == CV_8UC1 );
-        CV_Assert( src.type() == CV_8UC1 );
+        bool useOpenCL = cv::ocl::useOpenCL() && _src.isUMat() && _src.dims()<=2;
        _dst.create( src.size(), src.type() );
        cv::Mat dst = _dst.getMat();
        const int histSize = 256;
        lut_.create(tilesX_ * tilesY_, histSize, CV_8UC1);
        cv::Size tileSize;
-        cv::Mat srcForLut;
+        cv::_InputArray _srcForLut;
-        if (src.cols % tilesX_ == 0 && src.rows % tilesY_ == 0)
+        if (_src.size().width % tilesX_ == 0 && _src.size().height % tilesY_ == 0)
        {
-            tileSize = cv::Size(src.cols / tilesX_, src.rows / tilesY_);
+            tileSize = cv::Size(_src.size().width / tilesX_, _src.size().height / tilesY_);
-            srcForLut = src;
+            _srcForLut = _src;
        }
        else
        {
-            cv::copyMakeBorder(src, srcExt_, 0, tilesY_ - (src.rows % tilesY_), 0, tilesX_ - (src.cols % tilesX_), cv::BORDER_REFLECT_101);
+            if(useOpenCL)
-
+            {
-            tileSize = cv::Size(srcExt_.cols / tilesX_, srcExt_.rows / tilesY_);
+                cv::copyMakeBorder(_src, usrcExt_, 0, tilesY_ - (_src.size().height % tilesY_), 0, tilesX_ - (_src.size().width % tilesX_), cv::BORDER_REFLECT_101);
-            srcForLut = srcExt_;
+                tileSize = cv::Size(usrcExt_.size().width / tilesX_, usrcExt_.size().height / tilesY_);
                _srcForLut = usrcExt_;
            }
            else
            {
                cv::copyMakeBorder(_src, srcExt_, 0, tilesY_ - (_src.size().height % tilesY_), 0, tilesX_ - (_src.size().width % tilesX_), cv::BORDER_REFLECT_101);
                tileSize = cv::Size(srcExt_.size().width / tilesX_, srcExt_.size().height / tilesY_);
                _srcForLut = srcExt_;
            }
        }
        const int tileSizeTotal = tileSize.area();
@ -293,6 +378,16 @@ namespace
            clipLimit = std::max(clipLimit, 1);
        }
        if(useOpenCL && clahe::calcLut(_srcForLut, ulut_, tilesX_, tilesY_, tileSize, clipLimit, lutScale) )
            if( clahe::transform(_src, _dst, ulut_, tilesX_, tilesY_, tileSize) )
                return;
        cv::Mat src = _src.getMat();
        _dst.create( src.size(), src.type() );
        cv::Mat dst = _dst.getMat();
        cv::Mat srcForLut = _srcForLut.getMat();
        lut_.create(tilesX_ * tilesY_, histSize, CV_8UC1);
        CLAHE_CalcLut_Body calcLutBody(srcForLut, lut_, tileSize, tilesX_, tilesY_, clipLimit, lutScale);
        cv::parallel_for_(cv::Range(0, tilesX_ * tilesY_), calcLutBody);
@ -325,6 +420,8 @@ namespace
    {
        srcExt_.release();
        lut_.release();
        usrcExt_.release();
        ulut_.release();
    }
 }
--- a/modules/imgproc/src/opencl/clahe.cl
+++ b/modules/imgproc/src/opencl/clahe.cl
@ -0,0 +1,252 @@
 /*M///////////////////////////////////////////////////////////////////////////////////////
 //
 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
 //
 //  By downloading, copying, installing or using the software you agree to this license.
 //  If you do not agree to this license, do not download, install,
 //  copy or use the software.
 //
 //
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
 //
 // @Authors
 //    Sen Liu, swjtuls1987@126.com
 //
 // Redistribution and use in source and binary forms, with or without modification,
 // are permitted provided that the following conditions are met:
 //
 //   * Redistribution's of source code must retain the above copyright notice,
 //     this list of conditions and the following disclaimer.
 //
 //   * Redistribution's in binary form must reproduce the above copyright notice,
 //     this list of conditions and the following disclaimer in the documentation
 //     and/or other materials provided with the distribution.
 //
 //   * The name of the copyright holders may not be used to endorse or promote products
 //     derived from this software without specific prior written permission.
 //
 // This software is provided by the copyright holders and contributors as is and
 // any express or implied warranties, including, but not limited to, the implied
 // warranties of merchantability and fitness for a particular purpose are disclaimed.
 // In no event shall the Intel Corporation or contributors be liable for any direct,
 // indirect, incidental, special, exemplary, or consequential damages
 // (including, but not limited to, procurement of substitute goods or services;
 // loss of use, data, or profits; or business interruption) however caused
 // and on any theory of liability, whether in contract, strict liability,
 // or tort (including negligence or otherwise) arising in any way out of
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
 #ifndef WAVE_SIZE
 #define WAVE_SIZE 1
 #endif
 inline int calc_lut(__local int* smem, int val, int tid)
 {
    smem[tid] = val;
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid == 0)
        for (int i = 1; i < 256; ++i)
            smem[i] += smem[i - 1];
    barrier(CLK_LOCAL_MEM_FENCE);
    return smem[tid];
 }
 #ifdef CPU
 inline void reduce(volatile __local int* smem, int val, int tid)
 {
    smem[tid] = val;
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 128)
        smem[tid] = val += smem[tid + 128];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 64)
        smem[tid] = val += smem[tid + 64];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 32)
        smem[tid] += smem[tid + 32];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 16)
        smem[tid] += smem[tid + 16];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 8)
        smem[tid] += smem[tid + 8];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 4)
        smem[tid] += smem[tid + 4];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 2)
        smem[tid] += smem[tid + 2];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 1)
        smem[256] = smem[tid] + smem[tid + 1];
    barrier(CLK_LOCAL_MEM_FENCE);
 }
 #else
 inline void reduce(__local volatile int* smem, int val, int tid)
 {
    smem[tid] = val;
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 128)
        smem[tid] = val += smem[tid + 128];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 64)
        smem[tid] = val += smem[tid + 64];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 32)
    {
        smem[tid] += smem[tid + 32];
 #if WAVE_SIZE < 32
    } barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 16)
    {
 #endif
        smem[tid] += smem[tid + 16];
 #if WAVE_SIZE < 16
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if (tid < 8)
    {
 #endif
        smem[tid] += smem[tid + 8];
        smem[tid] += smem[tid + 4];
        smem[tid] += smem[tid + 2];
        smem[tid] += smem[tid + 1];
    }
 }
 #endif
 __kernel void calcLut(__global __const uchar * src, const int srcStep,
                      const int src_offset, __global uchar * lut,
                      const int dstStep, const int dst_offset,
                      const int2 tileSize, const int tilesX,
                      const int clipLimit, const float lutScale)
 {
    __local int smem[512];
    int tx = get_group_id(0);
    int ty = get_group_id(1);
    int tid = get_local_id(1) * get_local_size(0)
                             + get_local_id(0);
    smem[tid] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);
    for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1))
    {
        __global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset);
        for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0))
        {
            const int data = srcPtr[j];
            atomic_inc(&smem[data]);
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    int tHistVal = smem[tid];
    barrier(CLK_LOCAL_MEM_FENCE);
    if (clipLimit > 0)
    {
        // clip histogram bar
        int clipped = 0;
        if (tHistVal > clipLimit)
        {
            clipped = tHistVal - clipLimit;
            tHistVal = clipLimit;
        }
        // find number of overall clipped samples
        reduce(smem, clipped, tid);
        barrier(CLK_LOCAL_MEM_FENCE);
 #ifdef CPU
        clipped = smem[256];
 #else
        clipped = smem[0];
 #endif
        // broadcast evaluated value
        __local int totalClipped;
        if (tid == 0)
            totalClipped = clipped;
        barrier(CLK_LOCAL_MEM_FENCE);
        // redistribute clipped samples evenly
        int redistBatch = totalClipped / 256;
        tHistVal += redistBatch;
        int residual = totalClipped - redistBatch * 256;
        if (tid < residual)
            ++tHistVal;
    }
    const int lutVal = calc_lut(smem, tHistVal, tid);
    uint ires = (uint)convert_int_rte(lutScale * lutVal);
    lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] =
        convert_uchar(clamp(ires, (uint)0, (uint)255));
 }
 __kernel void transform(__global __const uchar * src, const int srcStep, const int src_offset,
                        __global uchar * dst, const int dstStep, const int dst_offset,
                        __global uchar * lut, const int lutStep, int lut_offset,
                        const int cols, const int rows,
                        const int2 tileSize,
                        const int tilesX, const int tilesY)
 {
    const int x = get_global_id(0);
    const int y = get_global_id(1);
    if (x >= cols || y >= rows)
        return;
    const float tyf = (convert_float(y) / tileSize.y) - 0.5f;
    int ty1 = convert_int_rtn(tyf);
    int ty2 = ty1 + 1;
    const float ya = tyf - ty1;
    ty1 = max(ty1, 0);
    ty2 = min(ty2, tilesY - 1);
    const float txf = (convert_float(x) / tileSize.x) - 0.5f;
    int tx1 = convert_int_rtn(txf);
    int tx2 = tx1 + 1;
    const float xa = txf - tx1;
    tx1 = max(tx1, 0);
    tx2 = min(tx2, tilesX - 1);
    const int srcVal = src[mad24(y, srcStep, x + src_offset)];
    float res = 0;
    res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (1.0f - ya));
    res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (1.0f - ya));
    res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (ya));
    res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (ya));
    uint ires = (uint)convert_int_rte(res);
    dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255));
 }