opencv/modules/gpu/src/cuda/canny.cu

/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include <utility>
#include "internal_shared.hpp"
#include "opencv2/gpu/device/utility.hpp"

using namespace cv::gpu;
using namespace cv::gpu::device;

namespace cv { namespace gpu { namespace canny
{
    __global__ void calcSobelRowPass(PtrStep src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
    {
        __shared__ int smem[16][18];

        const int j = blockIdx.x * blockDim.x + threadIdx.x;
        const int i = blockIdx.y * blockDim.y + threadIdx.y;

        if (i < rows)
        {
            smem[threadIdx.y][threadIdx.x + 1] = src.ptr(i)[j];
            if (threadIdx.x == 0)
            {
                smem[threadIdx.y][0] = src.ptr(i)[max(j - 1, 0)];
                smem[threadIdx.y][17] = src.ptr(i)[min(j + 16, cols - 1)];
            }
            __syncthreads();

            if (j < cols)
            {
                dx_buf.ptr(i)[j] = -smem[threadIdx.y][threadIdx.x] + smem[threadIdx.y][threadIdx.x + 2];
                dy_buf.ptr(i)[j] = smem[threadIdx.y][threadIdx.x] + 2 * smem[threadIdx.y][threadIdx.x + 1] + smem[threadIdx.y][threadIdx.x + 2];
            }
        }
    }

    void calcSobelRowPass_gpu(PtrStep src, PtrStepi dx_buf, PtrStepi dy_buf, int rows, int cols)
    {
        dim3 block(16, 16, 1);
        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

        calcSobelRowPass<<<grid, block>>>(src, dx_buf, dy_buf, rows, cols);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
    }

    struct L1
    {
        static __device__ __forceinline__ float calc(int x, int y)
        {
            return abs(x) + abs(y);
        }
    };
    struct L2
    {
        static __device__ __forceinline__ float calc(int x, int y)
        {
            return sqrtf(x * x + y * y);
        }
    };

    template <typename Norm> __global__ void calcMagnitude(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
    {
        __shared__ int sdx[18][16];
        __shared__ int sdy[18][16];

        const int j = blockIdx.x * blockDim.x + threadIdx.x;
        const int i = blockIdx.y * blockDim.y + threadIdx.y;

        if (j < cols)
        {
            sdx[threadIdx.y + 1][threadIdx.x] = dx_buf.ptr(i)[j];
            sdy[threadIdx.y + 1][threadIdx.x] = dy_buf.ptr(i)[j];
            if (threadIdx.y == 0)
            {
                sdx[0][threadIdx.x] = dx_buf.ptr(max(i - 1, 0))[j];
                sdx[17][threadIdx.x] = dx_buf.ptr(min(i + 16, rows - 1))[j];

                sdy[0][threadIdx.x] = dy_buf.ptr(max(i - 1, 0))[j];
                sdy[17][threadIdx.x] = dy_buf.ptr(min(i + 16, rows - 1))[j];
            }
            __syncthreads();

            if (i < rows)
            {
                int x = sdx[threadIdx.y][threadIdx.x] + 2 * sdx[threadIdx.y + 1][threadIdx.x] + sdx[threadIdx.y + 2][threadIdx.x];
                int y = -sdy[threadIdx.y][threadIdx.x] + sdy[threadIdx.y + 2][threadIdx.x];

                dx.ptr(i)[j] = x;
                dy.ptr(i)[j] = y;

                mag.ptr(i + 1)[j + 1] = Norm::calc(x, y);
            }
        }
    }

    void calcMagnitude_gpu(PtrStepi dx_buf, PtrStepi dy_buf, PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
    {
        dim3 block(16, 16, 1);
        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

        if (L2Grad)
            calcMagnitude<L2><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);
        else
            calcMagnitude<L1><<<grid, block>>>(dx_buf, dy_buf, dx, dy, mag, rows, cols);

        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
    }

    template <typename Norm> __global__ void calcMagnitude(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols)
    {
        const int j = blockIdx.x * blockDim.x + threadIdx.x;
        const int i = blockIdx.y * blockDim.y + threadIdx.y;

        if (i < rows && j < cols)
            mag.ptr(i + 1)[j + 1] = Norm::calc(dx.ptr(i)[j], dy.ptr(i)[j]);
    }

    void calcMagnitude_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, int rows, int cols, bool L2Grad)
    {
        dim3 block(16, 16, 1);
        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

        if (L2Grad)
            calcMagnitude<L2><<<grid, block>>>(dx, dy, mag, rows, cols);
        else
            calcMagnitude<L1><<<grid, block>>>(dx, dy, mag, rows, cols);

        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
    }

//////////////////////////////////////////////////////////////////////////////////////////

#define CANNY_SHIFT 15
#define TG22        (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5)

    __global__ void calcMap(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
    {
        __shared__ float smem[18][18];

        const int j = blockIdx.x * 16 + threadIdx.x;
        const int i = blockIdx.y * 16 + threadIdx.y;

        const int tid = threadIdx.y * 16 + threadIdx.x;
        const int lx = tid % 18;
        const int ly = tid / 18;

        if (ly < 14)
            smem[ly][lx] = mag.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];

        if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
            smem[ly + 14][lx] = mag.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];

        __syncthreads();

        if (i < rows && j < cols)
        {
            int x = dx.ptr(i)[j];
            int y = dy.ptr(i)[j];
            const int s = (x ^ y) < 0 ? -1 : 1;
            const float m = smem[threadIdx.y + 1][threadIdx.x + 1];

            x = abs(x);
            y = abs(y);

            // 0 - the pixel can not belong to an edge
            // 1 - the pixel might belong to an edge
            // 2 - the pixel does belong to an edge
            int edge_type = 0;

            if (m > low_thresh)
            {
                const int tg22x = x * TG22;
                const int tg67x = tg22x + ((x + x) << CANNY_SHIFT);

                y <<= CANNY_SHIFT;

                if (y < tg22x)
                {
                    if (m > smem[threadIdx.y + 1][threadIdx.x] && m >= smem[threadIdx.y + 1][threadIdx.x + 2])
                        edge_type = 1 + (int)(m > high_thresh);
                }
                else if( y > tg67x )
                {
                    if (m > smem[threadIdx.y][threadIdx.x + 1] && m >= smem[threadIdx.y + 2][threadIdx.x + 1])
                        edge_type = 1 + (int)(m > high_thresh);
                }
                else
                {
                    if (m > smem[threadIdx.y][threadIdx.x + 1 - s] && m > smem[threadIdx.y + 2][threadIdx.x + 1 + s])
                        edge_type = 1 + (int)(m > high_thresh);
                }
            }

            map.ptr(i + 1)[j + 1] = edge_type;
        }
    }

#undef CANNY_SHIFT
#undef TG22

    void calcMap_gpu(PtrStepi dx, PtrStepi dy, PtrStepf mag, PtrStepi map, int rows, int cols, float low_thresh, float high_thresh)
    {
        dim3 block(16, 16, 1);
        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

        calcMap<<<grid, block>>>(dx, dy, mag, map, rows, cols, low_thresh, high_thresh);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
    }

//////////////////////////////////////////////////////////////////////////////////////////

    __device__ unsigned int counter = 0;

    __global__ void edgesHysteresisLocal(PtrStepi map, ushort2* st, int rows, int cols)
    {
        #if __CUDA_ARCH__ >= 120

        __shared__ int smem[18][18];

        const int j = blockIdx.x * 16 + threadIdx.x;
        const int i = blockIdx.y * 16 + threadIdx.y;

        const int tid = threadIdx.y * 16 + threadIdx.x;
        const int lx = tid % 18;
        const int ly = tid / 18;

        if (ly < 14)
            smem[ly][lx] = map.ptr(blockIdx.y * 16 + ly)[blockIdx.x * 16 + lx];

        if (ly < 4 && blockIdx.y * 16 + ly + 14 <= rows && blockIdx.x * 16 + lx <= cols)
            smem[ly + 14][lx] = map.ptr(blockIdx.y * 16 + ly + 14)[blockIdx.x * 16 + lx];

        __syncthreads();

        if (i < rows && j < cols)
        {
            int n;

            #pragma unroll
            for (int k = 0; k < 16; ++k)
            {
                n = 0;

                if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
                {
                    n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
                    n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
                    n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;

                    n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
                    n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;

                    n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
                    n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
                    n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
                }

                if (n > 0)
                    smem[threadIdx.y + 1][threadIdx.x + 1] = 2;
            }

            const int e = smem[threadIdx.y + 1][threadIdx.x + 1];

            map.ptr(i + 1)[j + 1] = e;

            n = 0;

            if (e == 2)
            {
                n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
                n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
                n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;

                n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;

                n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
            }

            if (n > 0)
            {
                const unsigned int ind = atomicInc(&counter, (unsigned int)(-1));
                st[ind] = make_ushort2(j + 1, i + 1);
            }
        }

        #endif
    }

    void edgesHysteresisLocal_gpu(PtrStepi map, ushort2* st1, int rows, int cols)
    {
        dim3 block(16, 16, 1);
        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

        edgesHysteresisLocal<<<grid, block>>>(map, st1, rows, cols);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
    }

    __constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
    __constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};

    __global__ void edgesHysteresisGlobal(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols, int count)
    {
        #if __CUDA_ARCH__ >= 120

        const int stack_size = 512;

        __shared__ unsigned int s_counter;
        __shared__ unsigned int s_ind;
        __shared__ ushort2 s_st[stack_size];

        if (threadIdx.x == 0)
            s_counter = 0;
        __syncthreads();

        int ind = blockIdx.y * gridDim.x + blockIdx.x;

        if (ind < count)
        {
            ushort2 pos = st1[ind];

            if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
            {
                if (threadIdx.x < 8)
                {
                    pos.x += c_dx[threadIdx.x];
                    pos.y += c_dy[threadIdx.x];

                    if (map.ptr(pos.y)[pos.x] == 1)
                    {
                        map.ptr(pos.y)[pos.x] = 2;

                        ind = atomicInc(&s_counter, (unsigned int)(-1));

                        s_st[ind] = pos;
                    }
                }
                __syncthreads();

                while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
                {
                    const int subTaskIdx = threadIdx.x >> 3;
                    const int portion = min(s_counter, blockDim.x >> 3);

                    pos.x = pos.y = 0;

                    if (subTaskIdx < portion)
                        pos = s_st[s_counter - 1 - subTaskIdx];
                    __syncthreads();

                    if (threadIdx.x == 0)
                        s_counter -= portion;
                    __syncthreads();

                    if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows)
                    {
                        pos.x += c_dx[threadIdx.x & 7];
                        pos.y += c_dy[threadIdx.x & 7];

                        if (map.ptr(pos.y)[pos.x] == 1)
                        {
                            map.ptr(pos.y)[pos.x] = 2;

                            ind = atomicInc(&s_counter, (unsigned int)(-1));

                            s_st[ind] = pos;
                        }
                    }
                    __syncthreads();
                }

                if (s_counter > 0)
                {
                    if (threadIdx.x == 0)
                    {
                        ind = atomicAdd(&counter, s_counter);
                        s_ind = ind - s_counter;
                    }
                    __syncthreads();

                    ind = s_ind;

                    for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
                    {
                        st2[ind + i] = s_st[i];
                    }
                }
            }
        }

        #endif
    }

    void edgesHysteresisGlobal_gpu(PtrStepi map, ushort2* st1, ushort2* st2, int rows, int cols)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, "cv::gpu::canny::counter") );

        unsigned int count;
        cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );

        while (count > 0)
        {
            cudaSafeCall( cudaMemset(counter_ptr, 0, sizeof(unsigned int)) );

            dim3 block(128, 1, 1);
            dim3 grid(min(count, 65535u), divUp(count, 65535), 1);
            edgesHysteresisGlobal<<<grid, block>>>(map, st1, st2, rows, cols, count);
            cudaSafeCall( cudaGetLastError() );

            cudaSafeCall(cudaThreadSynchronize());

            cudaSafeCall( cudaMemcpy(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost) );

            std::swap(st1, st2);
        }
    }

    __global__ void getEdges(PtrStepi map, PtrStep dst, int rows, int cols)
    {
        const int j = blockIdx.x * 16 + threadIdx.x;
        const int i = blockIdx.y * 16 + threadIdx.y;

        if (i < rows && j < cols)
            dst.ptr(i)[j] = (uchar)(-(map.ptr(i + 1)[j + 1] >> 1));
    }

    void getEdges_gpu(PtrStepi map, PtrStep dst, int rows, int cols)
    {
        dim3 block(16, 16, 1);
        dim3 grid(divUp(cols, block.x), divUp(rows, block.y), 1);

        getEdges<<<grid, block>>>(map, dst, rows, cols);
        cudaSafeCall( cudaGetLastError() );

        cudaSafeCall(cudaThreadSynchronize());
    }
}}}