Merge pull request #3516

An error occurred

from cudawarped:cuda_moments

`cuda`: add `moments`
1 year ago · 0bcbc73bca
parent faa5468552 eca7f1c917
commit 0bcbc73bca
8 changed files with 548 additions and 5 deletions
--- a/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
+++ b/modules/cudaimgproc/include/opencv2/cudaimgproc.hpp
@ -57,6 +57,7 @@
    @{
      @defgroup cudaimgproc_color Color space processing
      @defgroup cudaimgproc_hist Histogram Calculation
      @defgroup cudaimgproc_shape Structural Analysis and Shape Descriptors
      @defgroup cudaimgproc_hough Hough Transform
      @defgroup cudaimgproc_feature Feature Detection
    @}
@ -779,9 +780,84 @@ CV_EXPORTS_AS(connectedComponentsWithAlgorithm) void connectedComponents(InputAr
 CV_EXPORTS_W void connectedComponents(InputArray image, OutputArray labels,
    int connectivity = 8, int ltype = CV_32S);
 //! @}
 //! @addtogroup cudaimgproc_shape
 //! @{
 /** @brief Order of image moments.
 * @param FIRST_ORDER_MOMENTS First order moments
 * @param SECOND_ORDER_MOMENTS Second order moments.
 * @param THIRD_ORDER_MOMENTS Third order moments.
 * */
 enum MomentsOrder {
    FIRST_ORDER_MOMENTS = 1,
    SECOND_ORDER_MOMENTS = 2,
    THIRD_ORDER_MOMENTS = 3
 };
 /** @brief Returns the number of image moments less than or equal to the largest image moments \a order.
@param order Order of largest moments to calculate with lower order moments requiring less computation.
@returns number of image moments.
@sa cuda::moments, cuda::spatialMoments, cuda::MomentsOrder
 */
 CV_EXPORTS_W int numMoments(const MomentsOrder order);
 /** @brief Calculates all of the spatial moments up to the 3rd order of a rasterized shape.
 Asynchronous version of cuda::moments() which only calculates the spatial (not centralized or normalized) moments, up to the 3rd order, of a rasterized shape.
 Each moment is returned as a column entry in the 1D \a moments array.
@param src Raster image (single-channel 2D array).
@param [out] moments 1D array with each column entry containing a spatial image moment.
@param binaryImage If it is true, all non-zero image pixels are treated as 1's.
@param order Order of largest moments to calculate with lower order moments requiring less computation.
@param momentsType Precision to use when calculating moments. Available types are `CV_32F` and `CV_64F` with the performance of `CV_32F` an order of magnitude greater than `CV_64F`. If the image is small the accuracy from `CV_32F` can be equal or very close to `CV_64F`.
@param stream Stream for the asynchronous version.
@note For maximum performance pre-allocate a 1D GpuMat for \a moments of the correct type and size large enough to store the all the image moments of up to the desired \a order. e.g. With \a order === MomentsOrder::SECOND_ORDER_MOMENTS and \a momentsType == `CV_32F` \a moments can be allocated as
 ```
 GpuMat momentsDevice(1,numMoments(MomentsOrder::SECOND_ORDER_MOMENTS),CV_32F)
 ```
 The central and normalized moments can easily be calculated on the host by downloading the \a moments array and using the cv::Moments constructor. e.g.
 ```
 HostMem momentsHostMem(1, numMoments(MomentsOrder::SECOND_ORDER_MOMENTS), CV_32F);
 momentsDevice.download(momentsHostMem, stream);
 stream.waitForCompletion();
 Mat momentsMat = momentsHostMem.createMatHeader();
 cv::Moments cvMoments(momentsMat.at<float>(0), momentsMat.at<float>(1), momentsMat.at<float>(2), momentsMat.at<float>(3), momentsMat.at<float>(4), momentsMat.at<float>(5), momentsMat.at<float>(6), momentsMat.at<float>(7), momentsMat.at<float>(8), momentsMat.at<float>(9));
 ```
 see the \a CUDA_TEST_P(Moments, Async) test inside opencv_contrib_source_code/modules/cudaimgproc/test/test_moments.cpp for an example.
@returns cv::Moments.
@sa cuda::moments
 */
 CV_EXPORTS_W void spatialMoments(InputArray src, OutputArray moments, const bool binaryImage = false, const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS, const int momentsType = CV_64F, Stream& stream = Stream::Null());
 /** @brief Calculates all of the moments up to the 3rd order of a rasterized shape.
 The function computes moments, up to the 3rd order, of a rasterized shape. The
 results are returned in the structure cv::Moments.
@param src Raster image (single-channel 2D array).
@param binaryImage If it is true, all non-zero image pixels are treated as 1's.
@param order Order of largest moments to calculate with lower order moments requiring less computation.
 @param momentsType Precision to use when calculating moments. Available types are `CV_32F` and `CV_64F` with the performance of `CV_32F` an order of magnitude greater than `CV_64F`. If the image is small the accuracy from `CV_32F` can be equal or very close to `CV_64F`.
@note For maximum performance use the asynchronous version cuda::spatialMoments() as this version interally allocates and deallocates both GpuMat and HostMem to respectively perform the calculation on the device and download the result to the host.
 The costly HostMem allocation cannot be avoided however the GpuMat device allocation can be by using BufferPool, e.g.
 ```
    setBufferPoolUsage(true);
    setBufferPoolConfig(getDevice(), numMoments(order) * ((momentsType == CV_64F) ? sizeof(double) : sizeof(float)), 1);
 ```
 see the \a CUDA_TEST_P(Moments, Accuracy) test inside opencv_contrib_source_code/modules/cudaimgproc/test/test_moments.cpp for an example.
@returns cv::Moments.
@sa cuda::spatialMoments
 */
 CV_EXPORTS_W Moments moments(InputArray src, const bool binaryImage = false, const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS, const int momentsType = CV_64F);
 //! @} cudaimgproc_shape
 }} // namespace cv { namespace cuda {
 #endif /* OPENCV_CUDAIMGPROC_HPP */
--- a/modules/cudaimgproc/misc/python/test/test_cudaimgproc.py
+++ b/modules/cudaimgproc/misc/python/test/test_cudaimgproc.py
@ -89,5 +89,30 @@ class cudaimgproc_test(NewOpenCVTests):
        self.assertTrue(np.allclose(cv.cuda.cvtColor(cuMat, cv.COLOR_BGR2HSV).download(),
                                         cv.cvtColor(npMat, cv.COLOR_BGR2HSV)))
    def test_moments(self):
        # setup
        src_host = (np.ones([10,10])).astype(np.uint8)*255
        cpu_moments = cv.moments(src_host, True)
        moments_order = cv.cuda.THIRD_ORDER_MOMENTS
        n_moments = cv.cuda.numMoments(cv.cuda.THIRD_ORDER_MOMENTS)
        src_device = cv.cuda.GpuMat(src_host)
        # synchronous
        cv.cuda.setBufferPoolUsage(True)
        cv.cuda.setBufferPoolConfig(cv.cuda.getDevice(), n_moments * np.dtype(float).itemsize, 1);
        gpu_moments = cv.cuda.moments(src_device, True, moments_order, cv.CV_64F)
        self.assertTrue(len([1 for moment_type in cpu_moments if moment_type in gpu_moments and cpu_moments[moment_type] == gpu_moments[moment_type]]) == 24)
        # asynchronous
        stream = cv.cuda.Stream()
        moments_array_host = np.empty([1, n_moments], np.float64)
        cv.cuda.registerPageLocked(moments_array_host)
        moments_array_device = cv.cuda.GpuMat(1, n_moments, cv.CV_64F)
        cv.cuda.spatialMoments(src_device, moments_array_device, True, moments_order, cv.CV_64F, stream)
        moments_array_device.download(stream, moments_array_host);
        stream.waitForCompletion()
        cv.cuda.unregisterPageLocked(moments_array_host)
        self.assertTrue(len([ 1 for moment_type,gpu_moment in zip(cpu_moments,moments_array_host[0]) if cpu_moments[moment_type] == gpu_moment]) == 10)
 if __name__ == '__main__':
    NewOpenCVTests.bootstrap()
--- a/modules/cudaimgproc/perf/perf_moments.cpp
+++ b/modules/cudaimgproc/perf/perf_moments.cpp
@ -0,0 +1,61 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 #include "perf_precomp.hpp"
 namespace opencv_test { namespace {
 static void drawCircle(cv::Mat& dst, const cv::Vec3i& circle, bool fill)
 {
    dst.setTo(Scalar::all(0));
    cv::circle(dst, Point2i(circle[0], circle[1]), circle[2], Scalar::all(255), fill ? -1 : 1, cv::LINE_AA);
 }
 DEF_PARAM_TEST(Sz_Depth, Size, MatDepth);
 PERF_TEST_P(Sz_Depth, SpatialMoments, Combine(CUDA_TYPICAL_MAT_SIZES, Values(MatDepth(CV_32F), MatDepth((CV_64F)))))
 {
    const cv::Size size = GET_PARAM(0);
    const int momentsType = GET_PARAM(1);
    Mat imgHost(size, CV_8U);
    const Vec3i circle(size.width / 2, size.height / 2, static_cast<int>(static_cast<float>(size.width / 2) * 0.9));
    drawCircle(imgHost, circle, true);
    if (PERF_RUN_CUDA()) {
        const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS;
        const int nMoments = numMoments(order);
        GpuMat momentsDevice(1, nMoments, momentsType);
        const GpuMat imgDevice(imgHost);
        TEST_CYCLE() cuda::spatialMoments(imgDevice, momentsDevice, false, order, momentsType);
        SANITY_CHECK_NOTHING();
    }
    else {
        cv::Moments momentsHost;
        TEST_CYCLE() momentsHost = cv::moments(imgHost, false);
        SANITY_CHECK_NOTHING();
    }
 }
 PERF_TEST_P(Sz_Depth, Moments, Combine(CUDA_TYPICAL_MAT_SIZES, Values(MatDepth(CV_32F), MatDepth(CV_64F))))
 {
    const cv::Size size = GET_PARAM(0);
    const int momentsType = GET_PARAM(1);
    Mat imgHost(size, CV_8U);
    const Vec3i circle(size.width / 2, size.height / 2, static_cast<int>(static_cast<float>(size.width / 2) * 0.9));
    drawCircle(imgHost, circle, true);
    if (PERF_RUN_CUDA()) {
        const MomentsOrder order = MomentsOrder::THIRD_ORDER_MOMENTS;
        const int nMoments = numMoments(order);
        setBufferPoolUsage(true);
        setBufferPoolConfig(getDevice(), nMoments * ((momentsType == CV_64F) ? sizeof(double) : sizeof(float)), 1);
        const GpuMat imgDevice(imgHost);
        cv::Moments momentsHost;
        TEST_CYCLE() momentsHost = cuda::moments(imgDevice, false, order, momentsType);
        SANITY_CHECK_NOTHING();
    }
    else {
        cv::Moments momentsHost;
        TEST_CYCLE() momentsHost = cv::moments(imgHost, false);
        SANITY_CHECK_NOTHING();
    }
 }
 }}
--- a/modules/cudaimgproc/src/cuda/moments.cu
+++ b/modules/cudaimgproc/src/cuda/moments.cu
@ -0,0 +1,186 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 #if !defined CUDA_DISABLER
 #include <opencv2/core/cuda/common.hpp>
 #include <opencv2/cudev/util/atomic.hpp>
 #include "moments.cuh"
 namespace cv { namespace cuda { namespace device { namespace imgproc {
 constexpr int blockSizeX = 32;
 constexpr int blockSizeY = 16;
 template <typename T>
 __device__ T butterflyWarpReduction(T value) {
    for (int i = 16; i >= 1; i /= 2)
        value += __shfl_xor_sync(0xffffffff, value, i, 32);
    return value;
 }
 template <typename T>
 __device__ T butterflyHalfWarpReduction(T value) {
    for (int i = 8; i >= 1; i /= 2)
        value += __shfl_xor_sync(0xffff, value, i, 32);
    return value;
 }
 template<typename T, int nMoments>
 __device__ void updateSums(const T val, const unsigned int x, T r[4]) {
    const T x2 = x * x;
    const T x3 = static_cast<T>(x) * x2;
    r[0] += val;
    r[1] += val * x;
    if (nMoments >= n12) r[2] += val * x2;
    if (nMoments >= n123) r[3] += val * x3;
 }
 template<typename TSrc, typename TMoments, int nMoments>
 __device__ void rowReductions(const PtrStepSz<TSrc> img, const bool binary, const unsigned int y, TMoments r[4], TMoments smem[][nMoments + 1]) {
    for (int x = threadIdx.x; x < img.cols; x += blockDim.x) {
        const TMoments val = (!binary || img(y, x) == 0) ? img(y, x) : 1;
        updateSums<TMoments,nMoments>(val, x, r);
    }
 }
 template<typename TSrc, typename TMoments, bool fourByteAligned, int nMoments>
 __device__ void rowReductionsCoalesced(const PtrStepSz<TSrc> img, const bool binary, const unsigned int y, TMoments r[4], const int offsetX, TMoments smem[][nMoments + 1]) {
    const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
    // load uncoalesced head
    if (!fourByteAligned && threadIdx.x == 0) {
        for (int x = 0; x < ::min(alignedOffset, static_cast<int>(img.cols)); x++) {
            const TMoments val = (!binary || img(y, x) == 0) ? img(y, x) : 1;
            updateSums<TMoments, nMoments>(val, x, r);
        }
    }
    // coalesced loads
    const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? img.ptr(y) : img.ptr(y) + alignedOffset);
    const int cols4 = fourByteAligned ? img.cols / 4 : (img.cols - alignedOffset) / 4;
    for (int x = threadIdx.x; x < cols4; x += blockDim.x) {
        const unsigned int data = rowPtrIntAligned[x];
 #pragma unroll 4
        for (int i = 0; i < 4; i++) {
            const int iX = alignedOffset + 4 * x + i;
            const uchar ucharVal = ((data >> i * 8) & 0xFFU);
            const TMoments val = (!binary || ucharVal == 0) ? ucharVal : 1;
            updateSums<TMoments, nMoments>(val, iX, r);
        }
    }
    // load uncoalesced tail
    if (threadIdx.x == 0) {
        const int iTailStart = fourByteAligned ? cols4 * 4 : cols4 * 4 + alignedOffset;
        for (int x = iTailStart; x < img.cols; x++) {
            const TMoments val = (!binary || img(y, x) == 0) ? img(y, x) : 1;
            updateSums<TMoments, nMoments>(val, x, r);
        }
    }
 }
 template <typename TSrc, typename TMoments, bool coalesced = false, bool fourByteAligned = false, int nMoments>
 __global__ void spatialMoments(const PtrStepSz<TSrc> img, const bool binary, TMoments* moments, const int offsetX = 0) {
    const unsigned int y = blockIdx.x * blockDim.y + threadIdx.y;
    __shared__ TMoments smem[blockSizeY][nMoments + 1];
    if (threadIdx.y < nMoments && threadIdx.x < blockSizeY)
        smem[threadIdx.x][threadIdx.y] = 0;
    __syncthreads();
    TMoments r[4] = { 0 };
    if (y < img.rows) {
        if (coalesced)
            rowReductionsCoalesced<TSrc, TMoments, fourByteAligned, nMoments>(img, binary, y, r, offsetX, smem);
        else
            rowReductions<TSrc, TMoments, nMoments>(img, binary, y, r, smem);
    }
    const unsigned long y2 = y * y;
    const TMoments y3 = static_cast<TMoments>(y2) * y;
    const TMoments res = butterflyWarpReduction<float>(r[0]);
    if (res) {
        smem[threadIdx.y][0] = res; //0th
        smem[threadIdx.y][1] = butterflyWarpReduction(r[1]); //1st
        smem[threadIdx.y][2] = y * res; //1st
        if (nMoments >= n12) {
            smem[threadIdx.y][3] = butterflyWarpReduction(r[2]); //2nd
            smem[threadIdx.y][4] = smem[threadIdx.y][1] * y; //2nd
            smem[threadIdx.y][5] = y2 * res; //2nd
        }
        if (nMoments >= n123) {
            smem[threadIdx.y][6] = butterflyWarpReduction(r[3]); //3rd
            smem[threadIdx.y][7] = smem[threadIdx.y][3] * y; //3rd
            smem[threadIdx.y][8] = smem[threadIdx.y][1] * y2; //3rd
            smem[threadIdx.y][9] = y3 * res; //3rd
        }
    }
    __syncthreads();
    if (threadIdx.x < blockSizeY && threadIdx.y < nMoments)
        smem[threadIdx.y][nMoments] = butterflyHalfWarpReduction(smem[threadIdx.x][threadIdx.y]);
    __syncthreads();
    if (threadIdx.y == 0 && threadIdx.x < nMoments) {
        if (smem[threadIdx.x][nMoments])
            cudev::atomicAdd(&moments[threadIdx.x], smem[threadIdx.x][nMoments]);
    }
 }
 template <typename TSrc, typename TMoments, int nMoments> struct momentsDispatcherNonChar {
    static void call(const PtrStepSz<TSrc> src, PtrStepSz<TMoments> moments, const bool binary, const int offsetX, const cudaStream_t stream) {
        dim3 blockSize(blockSizeX, blockSizeY);
        dim3 gridSize = dim3(divUp(src.rows, blockSizeY));
        spatialMoments<TSrc, TMoments, false, false, nMoments> << <gridSize, blockSize, 0, stream >> > (src, binary, moments.ptr());
        if (stream == 0)
            cudaSafeCall(cudaStreamSynchronize(stream));
    };
 };
 template <typename TSrc, int nMoments> struct momentsDispatcherChar {
    static void call(const PtrStepSz<TSrc> src, PtrStepSz<float> moments, const bool binary, const int offsetX, const cudaStream_t stream) {
        dim3 blockSize(blockSizeX, blockSizeY);
        dim3 gridSize = dim3(divUp(src.rows, blockSizeY));
        if (offsetX)
            spatialMoments<TSrc, float, true, false, nMoments> << <gridSize, blockSize, 0, stream >> > (src, binary, moments.ptr(), offsetX);
        else
            spatialMoments<TSrc, float, true, true, nMoments> << <gridSize, blockSize, 0, stream >> > (src, binary, moments.ptr());
        if (stream == 0)
            cudaSafeCall(cudaStreamSynchronize(stream));
    };
 };
 template <typename TSrc, typename TMoments, int nMoments> struct momentsDispatcher : momentsDispatcherNonChar<TSrc, TMoments, nMoments> {};
 template <int nMoments> struct momentsDispatcher<uchar, float, nMoments> : momentsDispatcherChar<uchar, nMoments> {};
 template <int nMoments> struct momentsDispatcher<schar, float, nMoments> : momentsDispatcherChar<schar, nMoments> {};
 template <typename TSrc, typename TMoments>
 void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream) {
    if (order == 1)
        momentsDispatcher<TSrc, TMoments, n1>::call(static_cast<PtrStepSz<TSrc>>(src), static_cast<PtrStepSz<TMoments>>(moments), binary, offsetX, stream);
    else if (order == 2)
        momentsDispatcher<TSrc, TMoments, n12>::call(static_cast<PtrStepSz<TSrc>>(src), static_cast<PtrStepSz<TMoments>>(moments), binary, offsetX, stream);
    else if (order == 3)
        momentsDispatcher<TSrc, TMoments, n123>::call(static_cast<PtrStepSz<TSrc>>(src), static_cast<PtrStepSz<TMoments>>(moments), binary, offsetX, stream);
 };
 template void moments<uchar, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
 template void moments<schar, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
 template void moments<ushort, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
 template void moments<short, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
 template void moments<int, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
 template void moments<float, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
 template void moments<double, float>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
 template void moments<uchar, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
 template void moments<schar, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
 template void moments<ushort, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
 template void moments<short, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
 template void moments<int, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
 template void moments<float, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
 template void moments<double, double>(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
 }}}}
 #endif /* CUDA_DISABLER */
--- a/modules/cudaimgproc/src/cuda/moments.cuh
+++ b/modules/cudaimgproc/src/cuda/moments.cuh
@ -0,0 +1,6 @@
 #pragma once
 namespace cv { namespace cuda { namespace device { namespace imgproc {
    constexpr int n1 = 3;
    constexpr int n12 = 6;
    constexpr int n123 = 10;
 }}}}
--- a/modules/cudaimgproc/src/moments.cpp
+++ b/modules/cudaimgproc/src/moments.cpp
@ -0,0 +1,67 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 #include "precomp.hpp"
 #include "cuda/moments.cuh"
 using namespace cv;
 using namespace cv::cuda;
 int cv::cuda::numMoments(const MomentsOrder order) {
    return order == MomentsOrder::FIRST_ORDER_MOMENTS ? device::imgproc::n1 : order == MomentsOrder::SECOND_ORDER_MOMENTS ? device::imgproc::n12 : device::imgproc::n123;
 }
 #if !defined (HAVE_CUDA) || defined (CUDA_DISABLER)
    Moments cv::cuda::moments(InputArray src, const bool binary, const MomentsOrder order, const int momentsType) { throw_no_cuda(); }
    void spatialMoments(InputArray src, OutputArray moments, const bool binary, const MomentsOrder order, const int momentsType, Stream& stream) { throw_no_cuda(); }
 #else /* !defined (HAVE_CUDA) */
 namespace cv { namespace cuda { namespace device { namespace imgproc {
        template <typename TSrc, typename TMoments>
        void moments(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
 }}}}
 void cv::cuda::spatialMoments(InputArray src, OutputArray moments, const bool binary, const MomentsOrder order, const int momentsType, Stream& stream) {
    CV_Assert(src.depth() <= CV_64F);
    const GpuMat srcDevice = getInputMat(src, stream);
    CV_Assert(momentsType == CV_32F || momentsType == CV_64F);
    const int nMoments = numMoments(order);
    const int momentsCols = nMoments < moments.cols() ? moments.cols() : nMoments;
    GpuMat momentsDevice = getOutputMat(moments, 1, momentsCols, momentsType, stream);
    momentsDevice.setTo(0);
    Point ofs; Size wholeSize;
    srcDevice.locateROI(wholeSize, ofs);
    typedef void (*func_t)(const PtrStepSzb src, PtrStepSzb moments, const bool binary, const int order, const int offsetX, const cudaStream_t stream);
    static const func_t funcs[7][2] =
    {
        {device::imgproc::moments<uchar, float>,  device::imgproc::moments<uchar, double> },
        {device::imgproc::moments<schar, float>,  device::imgproc::moments<schar, double> },
        {device::imgproc::moments<ushort, float>, device::imgproc::moments<ushort, double>},
        {device::imgproc::moments<short, float>,  device::imgproc::moments<short, double> },
        {device::imgproc::moments<int, float>,    device::imgproc::moments<int, double> },
        {device::imgproc::moments<float, float>,  device::imgproc::moments<float, double> },
        {device::imgproc::moments<double, float>, device::imgproc::moments<double, double> }
    };
    const func_t func = funcs[srcDevice.depth()][momentsType == CV_64F];
    func(srcDevice, momentsDevice, binary, static_cast<int>(order), ofs.x, StreamAccessor::getStream(stream));
    syncOutput(momentsDevice, moments, stream);
 }
 Moments cv::cuda::moments(InputArray src, const bool binary, const MomentsOrder order, const int momentsType) {
    Stream& stream = Stream::Null();
    HostMem dst;
    spatialMoments(src, dst, binary, order, momentsType, stream);
    stream.waitForCompletion();
    Mat moments = dst.createMatHeader();
    if(momentsType == CV_32F)
        return Moments(moments.at<float>(0), moments.at<float>(1), moments.at<float>(2), moments.at<float>(3), moments.at<float>(4), moments.at<float>(5), moments.at<float>(6), moments.at<float>(7), moments.at<float>(8), moments.at<float>(9));
    else
        return Moments(moments.at<double>(0), moments.at<double>(1), moments.at<double>(2), moments.at<double>(3), moments.at<double>(4), moments.at<double>(5), moments.at<double>(6), moments.at<double>(7), moments.at<double>(8), moments.at<double>(9));
 }
 #endif /* !defined (HAVE_CUDA) */
--- a/modules/cudaimgproc/test/test_moments.cpp
+++ b/modules/cudaimgproc/test/test_moments.cpp
@ -0,0 +1,124 @@
 // This file is part of OpenCV project.
 // It is subject to the license terms in the LICENSE file found in the top-level directory
 // of this distribution and at http://opencv.org/license.html.
 #include "test_precomp.hpp"
 #ifdef HAVE_CUDA
 namespace opencv_test { namespace {
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // Moments
 CV_ENUM(MaxMomentsOrder, MomentsOrder::FIRST_ORDER_MOMENTS, MomentsOrder::SECOND_ORDER_MOMENTS, MomentsOrder::THIRD_ORDER_MOMENTS)
 PARAM_TEST_CASE(Moments, cv::cuda::DeviceInfo, cv::Size, bool, MatDepth, MatDepth, UseRoi, MaxMomentsOrder)
 {
    DeviceInfo devInfo;
    Size size;
    bool isBinary;
    float pcWidth = 0.6f;
    int momentsType;
    int imgType;
    bool useRoi;
    MomentsOrder order;
    virtual void SetUp()
    {
        devInfo = GET_PARAM(0);
        size = GET_PARAM(1);
        isBinary = GET_PARAM(2);
        momentsType = GET_PARAM(3);
        imgType = GET_PARAM(4);
        useRoi = GET_PARAM(5);
        order = static_cast<MomentsOrder>(static_cast<int>(GET_PARAM(6)));
        cv::cuda::setDevice(devInfo.deviceID());
    }
    static void drawCircle(cv::Mat& dst, const cv::Vec3i& circle, bool fill)
    {
        dst.setTo(Scalar::all(0));
        cv::circle(dst, Point2i(circle[0], circle[1]), circle[2], Scalar::all(255), fill ? -1 : 1, cv::LINE_AA);
    }
 };
 bool Equal(const double m0, const double m1, const double absPcErr) {
    if (absPcErr == 0) return m0 == m1;
    if (m0 == 0) {
        if (m1 < absPcErr) return true;
        else return false;
    }
    const double pcDiff = abs(m0 - m1) / m1;
    return pcDiff < absPcErr;
 }
 void CheckMoments(const cv::Moments m0, const cv::Moments m1, const MomentsOrder order, const int momentsType) {
    double absPcErr = momentsType == CV_64F ? 0 : 5e-7;
    ASSERT_TRUE(Equal(m0.m00, m1.m00, absPcErr)) << "m0.m00: " << m0.m00 << ", m1.m00: " << m1.m00 << ", absPcErr: " << absPcErr;
    ASSERT_TRUE(Equal(m0.m10, m1.m10, absPcErr)) << "m0.m10: " << m0.m10 << ", m1.m10: " << m1.m10 << ", absPcErr: " << absPcErr;
    ASSERT_TRUE(Equal(m0.m01, m1.m01, absPcErr)) << "m0.m01: " << m0.m01 << ", m1.m01: " << m1.m01 << ", absPcErr: " << absPcErr;
    if (static_cast<int>(order) >= static_cast<int>(MomentsOrder::SECOND_ORDER_MOMENTS)) {
        ASSERT_TRUE(Equal(m0.m20, m1.m20, absPcErr)) << "m0.m20: " << m0.m20 << ", m1.m20: " << m1.m20 << ", absPcErr: " << absPcErr;
        ASSERT_TRUE(Equal(m0.m11, m1.m11, absPcErr)) << "m0.m11: " << m0.m11 << ", m1.m11: " << m1.m11 << ", absPcErr: " << absPcErr;
        ASSERT_TRUE(Equal(m0.m02, m1.m02, absPcErr)) << "m0.m02: " << m0.m02 << ", m1.m02: " << m1.m02 << ", absPcErr: " << absPcErr;
    }
    if (static_cast<int>(order) >= static_cast<int>(MomentsOrder::THIRD_ORDER_MOMENTS)) {
        ASSERT_TRUE(Equal(m0.m30, m1.m30, absPcErr)) << "m0.m30: " << m0.m30 << ", m1.m30: " << m1.m30 << ", absPcErr: " << absPcErr;
        ASSERT_TRUE(Equal(m0.m21, m1.m21, absPcErr)) << "m0.m21: " << m0.m21 << ", m1.m21: " << m1.m21 << ", absPcErr: " << absPcErr;
        ASSERT_TRUE(Equal(m0.m12, m1.m12, absPcErr)) << "m0.m12: " << m0.m12 << ", m1.m12: " << m1.m12 << ", absPcErr: " << absPcErr;
        ASSERT_TRUE(Equal(m0.m03, m1.m03, absPcErr)) << "m0.m03: " << m0.m03 << ", m1.m03: " << m1.m03 << ", absPcErr: " << absPcErr;
    }
 }
 CUDA_TEST_P(Moments, Accuracy)
 {
    Mat imgHost(size, imgType);
    const Rect roi = useRoi ? Rect(1, 0, imgHost.cols - 2, imgHost.rows) : Rect(0, 0, imgHost.cols, imgHost.rows);
    const Vec3i circle(size.width / 2, size.height / 2, static_cast<int>(static_cast<float>(size.width/2) * pcWidth));
    drawCircle(imgHost, circle, true);
    const GpuMat imgDevice(imgHost);
    const int nMoments = numMoments(order);
    setBufferPoolUsage(true);
    setBufferPoolConfig(getDevice(), nMoments * ((momentsType == CV_64F) ? sizeof(double) : sizeof(float)), 1);
    const cv::Moments moments = cuda::moments(imgDevice(roi), isBinary, order, momentsType);
    Mat imgHostFloat; imgHost(roi).convertTo(imgHostFloat, CV_32F);
    const cv::Moments momentsGs = cv::moments(imgHostFloat, isBinary);
    CheckMoments(momentsGs, moments, order, momentsType);
 }
 CUDA_TEST_P(Moments, Async)
 {
    Stream stream;
    const int nMoments = numMoments(order);
    GpuMat momentsDevice(1, nMoments, momentsType);
    Mat imgHost(size, imgType);
    const Rect roi = useRoi ? Rect(1, 0, imgHost.cols - 2, imgHost.rows) : Rect(0, 0, imgHost.cols, imgHost.rows);
    const Vec3i circle(size.width / 2, size.height / 2, static_cast<int>(static_cast<float>(size.width/2) * pcWidth));
    drawCircle(imgHost, circle, true);
    const GpuMat imgDevice(imgHost);
    cuda::spatialMoments(imgDevice(roi), momentsDevice, isBinary, order, momentsType, stream);
    HostMem momentsHost(1, nMoments, momentsType);
    momentsDevice.download(momentsHost, stream);
    stream.waitForCompletion();
    Mat momentsHost64F = momentsHost.createMatHeader();
    if (momentsType == CV_32F)
        momentsHost.createMatHeader().convertTo(momentsHost64F, CV_64F);
    const cv::Moments moments = cv::Moments(momentsHost64F.at<double>(0), momentsHost64F.at<double>(1), momentsHost64F.at<double>(2), momentsHost64F.at<double>(3), momentsHost64F.at<double>(4), momentsHost64F.at<double>(5), momentsHost64F.at<double>(6), momentsHost64F.at<double>(7), momentsHost64F.at<double>(8), momentsHost64F.at<double>(9));
    Mat imgHostAdjustedType = imgHost(roi);
    if (imgType != CV_8U && imgType != CV_32F)
        imgHost(roi).convertTo(imgHostAdjustedType, CV_32F);
    const cv::Moments momentsGs = cv::moments(imgHostAdjustedType, isBinary);
    CheckMoments(momentsGs, moments, order, momentsType);
 }
 #define SIZES DIFFERENT_SIZES
 #define GRAYSCALE_BINARY testing::Bool()
 #define MOMENTS_TYPE testing::Values(MatDepth(CV_32F), MatDepth(CV_64F))
 #define IMG_TYPE ALL_DEPTH
 #define USE_ROI WHOLE_SUBMAT
 #define MOMENTS_ORDER testing::Values(MaxMomentsOrder(MomentsOrder::FIRST_ORDER_MOMENTS), MaxMomentsOrder(MomentsOrder::SECOND_ORDER_MOMENTS), MaxMomentsOrder(MomentsOrder::THIRD_ORDER_MOMENTS))
 INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, Moments, testing::Combine(ALL_DEVICES, SIZES, GRAYSCALE_BINARY, MOMENTS_TYPE, IMG_TYPE, USE_ROI, MOMENTS_ORDER));
 }} // namespace
 #endif // HAVE_CUDA
--- a/modules/cudev/include/opencv2/cudev/util/atomic.hpp
+++ b/modules/cudev/include/opencv2/cudev/util/atomic.hpp
@ -83,7 +83,7 @@ __device__ __forceinline__ float atomicAdd(float* address, float val)
 __device__ static double atomicAdd(double* address, double val)
 {
-#if CV_CUDEV_ARCH >= 130
+#if CV_CUDEV_ARCH < 600
    unsigned long long int* address_as_ull = (unsigned long long int*) address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
@ -93,9 +93,7 @@ __device__ static double atomicAdd(double* address, double val)
    } while (assumed != old);
    return __longlong_as_double(old);
 #else
-    CV_UNUSED(address);
+    return ::atomicAdd(address, val);
    CV_UNUSED(val);
    return 0.0;
 #endif
 }