Merge branch 'opencv:4.x' into 4.x

pull/3471/head
Amir Hassan authored 2 years ago, committed by GitHub
commit c3e7d078f1
Changed files:
  1. modules/cudacodec/src/ffmpeg_video_source.cpp (32 changed lines)
  2. modules/cudacodec/src/video_decoder.cpp (10 changed lines)
  3. modules/cudacodec/src/video_decoder.hpp (6 changed lines)
  4. modules/cudacodec/src/video_reader.cpp (29 changed lines)
  5. modules/cudacodec/test/test_video.cpp (63 changed lines)
  6. modules/cudaimgproc/src/cuda/hist.cu (160 changed lines)
  7. modules/cudaimgproc/src/histogram.cpp (16 changed lines)
  8. modules/cudaimgproc/test/test_histogram.cpp (76 changed lines)
  9. modules/ximgproc/src/find_ellipses.cpp (12 changed lines)

@@ -66,8 +66,10 @@ static std::string fourccToString(int fourcc)
(i32_c.c[3] >= ' ' && i32_c.c[3] < 128) ? i32_c.c[3] : '?');
}
// handle old FFmpeg backend - remove when windows shared library is updated
#ifdef _WIN32
static
Codec FourccToCodec(int codec)
Codec FourccToCodecWin32Old(int codec)
{
switch (codec)
{
@@ -100,9 +102,34 @@ Codec FourccToCodec(int codec)
case CV_FOURCC_MACRO('a', 'v', '0', '1'): // fallthru
case CV_FOURCC_MACRO('A', 'V', '0', '1'): return AV1;
default:
break;
return NumCodecs;
}
}
#endif
static
Codec FourccToCodec(int codec)
{
#ifdef _WIN32 // handle old FFmpeg backend - remove when windows shared library is updated
Codec win32OldCodec = FourccToCodecWin32Old(codec);
if(win32OldCodec != NumCodecs)
return win32OldCodec;
#endif
switch (codec)
{
case CV_FOURCC_MACRO('m', 'p', 'g', '1'): return MPEG1;
case CV_FOURCC_MACRO('m', 'p', 'g', '2'): return MPEG2;
case CV_FOURCC_MACRO('F', 'M', 'P', '4'): return MPEG4;
case CV_FOURCC_MACRO('W', 'V', 'C', '1'): return VC1;
case CV_FOURCC_MACRO('h', '2', '6', '4'): return H264;
case CV_FOURCC_MACRO('h', 'e', 'v', 'c'): return HEVC;
case CV_FOURCC_MACRO('M', 'J', 'P', 'G'): return JPEG;
case CV_FOURCC_MACRO('V', 'P', '8', '0'): return VP8;
case CV_FOURCC_MACRO('V', 'P', '9', '0'): return VP9;
case CV_FOURCC_MACRO('A', 'V', '0', '1'): return AV1;
default:
break;
}
std::string msg = cv::format("Unknown codec FOURCC: 0x%08X (%s)", codec, fourccToString(codec).c_str());
CV_LOG_WARNING(NULL, msg);
CV_Error(Error::StsUnsupportedFormat, msg);
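For reference, a small sketch of how the FOURCC values in these switch statements are formed (illustrative, not part of the commit): CV_FOURCC_MACRO packs the four characters little-endian, so the lowercase 'h264' tag reported by FFmpeg matches the h264 case above.
int fourcc = CV_FOURCC_MACRO('h', '2', '6', '4'); // packs to 0x34363268
Codec codec = FourccToCodec(fourcc);              // resolves to H264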
@@ -163,7 +190,6 @@ cv::cudacodec::detail::FFmpegVideoSource::FFmpegVideoSource(const String& fname,
int codec = (int)cap.get(CAP_PROP_FOURCC);
int pixelFormat = (int)cap.get(CAP_PROP_CODEC_PIXEL_FORMAT);
format_.codec = FourccToCodec(codec);
format_.height = cap.get(CAP_PROP_FRAME_HEIGHT);
format_.width = cap.get(CAP_PROP_FRAME_WIDTH);

@@ -97,10 +97,10 @@ void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat)
cudaVideoCodec_UYVY == _codec;
#if defined (HAVE_CUDA)
#if (CUDART_VERSION >= 6500)
#if (CUDART_VERSION >= 6050)
codecSupported |= cudaVideoCodec_HEVC == _codec;
#endif
#if ((CUDART_VERSION == 7500) || (CUDART_VERSION >= 9000))
#if ((CUDART_VERSION == 7050) || (CUDART_VERSION >= 9000))
codecSupported |= cudaVideoCodec_VP8 == _codec ||
cudaVideoCodec_VP9 == _codec ||
cudaVideoCodec_AV1 == _codec ||
@@ -160,9 +160,11 @@ void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat)
createInfo_.ulCreationFlags = videoCreateFlags;
createInfo_.vidLock = lock_;
cuSafeCall(cuCtxPushCurrent(ctx_));
cuSafeCall(cuvidCreateDecoder(&decoder_, &createInfo_));
{
AutoLock autoLock(mtx_);
cuSafeCall(cuvidCreateDecoder(&decoder_, &createInfo_));
}
cuSafeCall(cuCtxPopCurrent(NULL));
inited_ = true;
}
int cv::cudacodec::detail::VideoDecoder::reconfigure(const FormatInfo& videoFormat) {

@@ -70,6 +70,7 @@ public:
void create(const FormatInfo& videoFormat);
int reconfigure(const FormatInfo& videoFormat);
void release();
bool inited() { AutoLock autoLock(mtx_); return decoder_; }
// Get the codec-type currently used.
cudaVideoCodec codec() const { return static_cast<cudaVideoCodec>(videoFormat_.codec); }
@@ -85,8 +86,6 @@ public:
unsigned long targetWidth() { return videoFormat_.width; }
unsigned long targetHeight() { return videoFormat_.height; }
bool inited() { return inited_; }
cudaVideoChromaFormat chromaFormat() const { return static_cast<cudaVideoChromaFormat>(videoFormat_.chromaFormat); }
int nBitDepthMinus8() const { return videoFormat_.nBitDepthMinus8; }
@@ -114,10 +113,9 @@ public:
private:
CUcontext ctx_ = 0;
CUvideoctxlock lock_;
CUvideodecoder decoder_ = 0;
CUvideodecoder decoder_ = 0;
FormatInfo videoFormat_ = {};
Mutex mtx_;
bool inited_ = false;
};
}}}

@@ -67,17 +67,14 @@ void cvtFromNv12(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int he
outFrame.create(height, width, CV_8UC3);
Npp8u* pSrc[2] = { decodedFrame.data, &decodedFrame.data[decodedFrame.step * height] };
NppiSize oSizeROI = { width,height };
#if (CUDART_VERSION < 9200)
CV_Error(Error::StsUnsupportedFormat, "ColorFormat::BGR is not supported until CUDA 9.2, use default ColorFormat::BGRA.");
#elif (CUDART_VERSION < 10100)
#if (CUDART_VERSION < 10010)
cv::cuda::NppStreamHandler h(stream);
if (videoFullRangeFlag)
nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI));
else {
CV_LOG_DEBUG(NULL, "Color reproduction may be inaccurate due CUDA version <= 11.0, for better results upgrade CUDA runtime or try ColorFormat::BGRA.");
nppSafeCall(nppiNV12ToBGR_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI));
}
#elif (CUDART_VERSION >= 10100)
#elif (CUDART_VERSION >= 10010)
NppStreamContext nppStreamCtx;
nppSafeCall(nppGetStreamContext(&nppStreamCtx));
nppStreamCtx.hStream = StreamAccessor::getStream(stream);
@@ -85,7 +82,6 @@ void cvtFromNv12(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int he
nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, nppStreamCtx));
else {
#if (CUDART_VERSION < 11000)
CV_LOG_DEBUG(NULL, "Color reproduction may be inaccurate due CUDA version <= 11.0, for better results upgrade CUDA runtime or try ColorFormat::BGRA.");
nppSafeCall(nppiNV12ToBGR_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, nppStreamCtx));
#else
nppSafeCall(nppiNV12ToBGR_709CSC_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, nppStreamCtx));
@@ -137,6 +133,7 @@ namespace
private:
bool internalGrab(GpuMat& frame, Stream& stream);
void waitForDecoderInit();
Ptr<VideoSource> videoSource_;
@@ -160,6 +157,15 @@ namespace
return videoSource_->format();
}
void VideoReaderImpl::waitForDecoderInit() {
for (;;) {
if (videoDecoder_->inited()) break;
if (videoParser_->hasError() || frameQueue_->isEndOfDecode())
CV_Error(Error::StsError, "Parsing/Decoding video source failed, check GPU memory is available and GPU supports hardware decoding.");
Thread::sleep(1);
}
}
VideoReaderImpl::VideoReaderImpl(const Ptr<VideoSource>& source, const int minNumDecodeSurfaces, const bool allowFrameDrop, const bool udpSource,
const Size targetSz, const Rect srcRoi, const Rect targetRoi) :
videoSource_(source),
@@ -177,6 +183,8 @@ namespace
videoParser_.reset(new VideoParser(videoDecoder_, frameQueue_, allowFrameDrop, udpSource));
videoSource_->setVideoParser(videoParser_);
videoSource_->start();
waitForDecoderInit();
videoSource_->updateFormat(videoDecoder_->format());
}
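With waitForDecoderInit() called from the constructor, format() should report valid information as soon as createVideoReader() returns. A hedged caller-side sketch (file name illustrative):
cv::Ptr<cv::cudacodec::VideoReader> reader = cv::cudacodec::createVideoReader("video.mp4");
cv::cudacodec::FormatInfo fmt = reader->format();
CV_Assert(fmt.valid);   // no need to grab a frame first, unlike before this change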
VideoReaderImpl::~VideoReaderImpl()
@@ -307,6 +315,15 @@ namespace
bool VideoReaderImpl::set(const ColorFormat colorFormat_) {
if (!ValidColorFormat(colorFormat_)) return false;
if (colorFormat_ == ColorFormat::BGR) {
#if (CUDART_VERSION < 9020)
CV_LOG_DEBUG(NULL, "ColorFormat::BGR is not supported until CUDA 9.2, use default ColorFormat::BGRA.");
return false;
#elif (CUDART_VERSION < 11000)
if (!videoDecoder_->format().videoFullRangeFlag)
CV_LOG_INFO(NULL, "Color reproduction may be inaccurate due CUDA version <= 11.0, for better results upgrade CUDA runtime or try ColorFormat::BGRA.");
#endif
}
colorFormat = colorFormat_;
return true;
}
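A minimal caller-side sketch (assumption: the application may run against CUDA runtimes older than 9.2): when set() rejects ColorFormat::BGR, fall back to the default BGRA output.
if (!reader->set(cv::cudacodec::ColorFormat::BGR))
    reader->set(cv::cudacodec::ColorFormat::BGRA);   // BGRA path is available on all supported runtimes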

@@ -230,14 +230,15 @@ CUDA_TEST_P(Scaling, Reader)
static_cast<int>(params.targetSz.width * targetRoiIn.width), static_cast<int>(params.targetSz.height * targetRoiIn.height));
cv::Ptr<cv::cudacodec::VideoReader> reader = cv::cudacodec::createVideoReader(inputFile, {}, params);
const cudacodec::FormatInfo format = reader->format();
ASSERT_TRUE(format.valid);
ASSERT_TRUE(reader->set(cudacodec::ColorFormat::GRAY));
GpuMat frame;
ASSERT_TRUE(reader->nextFrame(frame));
const cudacodec::FormatInfo format = reader->format();
Size targetSzOut = params.targetSz;
Rect srcRoiOut = params.srcRoi, targetRoiOut = params.targetRoi;
ForceAlignment(srcRoiOut, targetRoiOut, targetSzOut);
ASSERT_TRUE(format.valid && format.targetSz == targetSzOut && format.srcRoi == srcRoiOut && format.targetRoi == targetRoiOut);
ASSERT_TRUE(format.targetSz == targetSzOut && format.srcRoi == srcRoiOut && format.targetRoi == targetRoiOut);
ASSERT_TRUE(frame.size() == targetSzOut);
GpuMat frameGs;
cv::cuda::resize(frameOr(srcRoiOut), frameGs, targetRoiOut.size(), 0, 0, INTER_AREA);
@@ -280,15 +281,11 @@ CUDA_TEST_P(DisplayResolution, Reader)
CUDA_TEST_P(Video, Reader)
{
cv::cuda::setDevice(GET_PARAM(0).deviceID());
const std::string relativeFilePath = GET_PARAM(1);
// CUDA demuxer has to fall back to ffmpeg to process "cv/video/768x576.avi"
if (GET_PARAM(1) == "cv/video/768x576.avi" && !videoio_registry::hasBackend(CAP_FFMPEG))
throw SkipTestException("FFmpeg backend not found");
#ifdef _WIN32 // handle old FFmpeg backend
if (GET_PARAM(1) == "/cv/tracking/faceocc2/data/faceocc2.webm")
throw SkipTestException("Feature not yet supported by Windows FFmpeg shared library!");
#endif
if (relativeFilePath == "cv/video/768x576.avi" && !videoio_registry::hasBackend(CAP_FFMPEG))
throw SkipTestException("FFmpeg backend not found - SKIP");
const std::vector<std::pair< cudacodec::ColorFormat, int>> formatsToChannels = {
{cudacodec::ColorFormat::GRAY,1},
@@ -297,7 +294,7 @@ CUDA_TEST_P(Video, Reader)
{cudacodec::ColorFormat::NV_NV12,1}
};
std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../" + GET_PARAM(1);
std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../" + relativeFilePath;
cv::Ptr<cv::cudacodec::VideoReader> reader = cv::cudacodec::createVideoReader(inputFile);
ASSERT_FALSE(reader->set(cudacodec::ColorFormat::RGB));
cv::cudacodec::FormatInfo fmt = reader->format();
@@ -310,8 +307,6 @@ CUDA_TEST_P(Video, Reader)
double colorFormat;
ASSERT_TRUE(reader->get(cudacodec::VideoReaderProps::PROP_COLOR_FORMAT, colorFormat) && static_cast<cudacodec::ColorFormat>(colorFormat) == formatToChannels.first);
ASSERT_TRUE(reader->nextFrame(frame));
if(!fmt.valid)
fmt = reader->format();
const int height = formatToChannels.first == cudacodec::ColorFormat::NV_NV12 ? static_cast<int>(1.5 * fmt.height) : fmt.height;
ASSERT_TRUE(frame.cols == fmt.width && frame.rows == height);
ASSERT_FALSE(frame.empty());
@@ -326,6 +321,7 @@ CUDA_TEST_P(ColorConversion, Reader)
const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../" + get<0>(GET_PARAM(2));
const bool videoFullRangeFlag = get<1>(GET_PARAM(2));
cv::Ptr<cv::cudacodec::VideoReader> reader = cv::cudacodec::createVideoReader(inputFile);
cv::cudacodec::FormatInfo fmt = reader->format();
reader->set(colorFormat);
cv::VideoCapture cap(inputFile);
@@ -336,8 +332,8 @@ CUDA_TEST_P(ColorConversion, Reader)
reader->nextFrame(frame);
frame.download(frameFromDevice);
cap.read(frameHost);
const cv::cudacodec::FormatInfo fmt = reader->format();
ASSERT_TRUE(fmt.valid && fmt.videoFullRangeFlag == videoFullRangeFlag);
fmt = reader->format();
ASSERT_TRUE(fmt.videoFullRangeFlag == videoFullRangeFlag);
if (colorFormat == cv::cudacodec::ColorFormat::BGRA)
cv::cvtColor(frameHost, frameHostGs, COLOR_BGR2BGRA);
else
@@ -384,7 +380,7 @@ CUDA_TEST_P(ReconfigureDecoderWithScaling, Reader)
if (nFrames++ == 0)
initialSize = frame.size();
fmt = reader->format();
ASSERT_TRUE(fmt.valid && (frame.size() == initialSize));
ASSERT_TRUE(frame.size() == initialSize);
ASSERT_TRUE((frame.size() == targetSzOut) && (fmt.targetSz == targetSzOut) && (fmt.srcRoi == srcRoiOut) && (fmt.targetRoi == targetRoiOut));
// simple check - zero borders, non zero contents
ASSERT_TRUE(!cuda::absSum(frame, mask)[0] && cuda::sum(frame)[0]);
@@ -413,7 +409,7 @@ CUDA_TEST_P(ReconfigureDecoder, Reader)
initialSize = frame.size();
initialCodedSize = Size(fmt.ulWidth, fmt.ulHeight);
}
ASSERT_TRUE(fmt.valid && (frame.size() == initialSize));
ASSERT_TRUE(frame.size() == initialSize);
ASSERT_TRUE(fmt.srcRoi.empty());
const bool resChanged = (initialCodedSize.width != fmt.ulWidth) || (initialCodedSize.height != fmt.ulHeight);
if (resChanged)
@@ -541,11 +537,6 @@ CUDA_TEST_P(CheckDecodeSurfaces, Reader)
{
cv::Ptr<cv::cudacodec::VideoReader> reader = cv::cudacodec::createVideoReader(inputFile);
cv::cudacodec::FormatInfo fmt = reader->format();
if (!fmt.valid) {
reader->grab();
fmt = reader->format();
ASSERT_TRUE(fmt.valid);
}
ulNumDecodeSurfaces = fmt.ulNumDecodeSurfaces;
}
@@ -554,11 +545,6 @@ CUDA_TEST_P(CheckDecodeSurfaces, Reader)
params.minNumDecodeSurfaces = ulNumDecodeSurfaces - 1;
cv::Ptr<cv::cudacodec::VideoReader> reader = cv::cudacodec::createVideoReader(inputFile, {}, params);
cv::cudacodec::FormatInfo fmt = reader->format();
if (!fmt.valid) {
reader->grab();
fmt = reader->format();
ASSERT_TRUE(fmt.valid);
}
ASSERT_TRUE(fmt.ulNumDecodeSurfaces == ulNumDecodeSurfaces);
for (int i = 0; i < 100; i++) ASSERT_TRUE(reader->grab());
}
@@ -568,11 +554,6 @@ CUDA_TEST_P(CheckDecodeSurfaces, Reader)
params.minNumDecodeSurfaces = ulNumDecodeSurfaces + 1;
cv::Ptr<cv::cudacodec::VideoReader> reader = cv::cudacodec::createVideoReader(inputFile, {}, params);
cv::cudacodec::FormatInfo fmt = reader->format();
if (!fmt.valid) {
reader->grab();
fmt = reader->format();
ASSERT_TRUE(fmt.valid);
}
ASSERT_TRUE(fmt.ulNumDecodeSurfaces == ulNumDecodeSurfaces + 1);
for (int i = 0; i < 100; i++) ASSERT_TRUE(reader->grab());
}
@@ -626,10 +607,6 @@ CUDA_TEST_P(TransCode, H264ToH265)
cv::cuda::Stream stream;
for (int i = 0; i < nFrames; ++i) {
ASSERT_TRUE(reader->nextFrame(frame, stream));
if (!fmt.valid) {
fmt = reader->format();
ASSERT_TRUE(fmt.valid);
}
ASSERT_FALSE(frame.empty());
Mat tst; frame.download(tst);
if (writer.empty()) {
@@ -837,13 +814,13 @@ INSTANTIATE_TEST_CASE_P(CUDA_Codec, Scaling, testing::Combine(
INSTANTIATE_TEST_CASE_P(CUDA_Codec, DisplayResolution, ALL_DEVICES);
#define VIDEO_SRC_R "highgui/video/big_buck_bunny.mp4", "cv/video/768x576.avi", "cv/video/1920x1080.avi", "highgui/video/big_buck_bunny.avi", \
#define VIDEO_SRC_R testing::Values("highgui/video/big_buck_bunny.mp4", "cv/video/768x576.avi", "cv/video/1920x1080.avi", "highgui/video/big_buck_bunny.avi", \
"highgui/video/big_buck_bunny.h264", "highgui/video/big_buck_bunny.h265", "highgui/video/big_buck_bunny.mpg", \
"highgui/video/sample_322x242_15frames.yuv420p.libvpx-vp9.mp4", "highgui/video/sample_322x242_15frames.yuv420p.libaom-av1.mp4", \
"cv/tracking/faceocc2/data/faceocc2.webm"
INSTANTIATE_TEST_CASE_P(CUDA_Codec, Video, testing::Combine(
ALL_DEVICES,
testing::Values(VIDEO_SRC_R)));
"highgui/video/sample_322x242_15frames.yuv420p.libvpx-vp9.mp4")
//, "highgui/video/sample_322x242_15frames.yuv420p.libaom-av1.mp4", \
"cv/tracking/faceocc2/data/faceocc2.webm", "highgui/video/sample_322x242_15frames.yuv420p.mpeg2video.mp4", "highgui/video/sample_322x242_15frames.yuv420p.mjpeg.mp4")
INSTANTIATE_TEST_CASE_P(CUDA_Codec, Video, testing::Combine(ALL_DEVICES,VIDEO_SRC_R));
const color_conversion_params_t color_conversion_params[] =
{
@@ -878,9 +855,11 @@ INSTANTIATE_TEST_CASE_P(CUDA_Codec, CheckExtraData, testing::Combine(
ALL_DEVICES,
testing::ValuesIn(check_extra_data_params)));
#define VIDEO_SRC_KEY "highgui/video/big_buck_bunny.mp4", "cv/video/768x576.avi", "cv/video/1920x1080.avi", "highgui/video/big_buck_bunny.avi", \
"highgui/video/big_buck_bunny.h264", "highgui/video/big_buck_bunny.h265", "highgui/video/big_buck_bunny.mpg"
INSTANTIATE_TEST_CASE_P(CUDA_Codec, CheckKeyFrame, testing::Combine(
ALL_DEVICES,
testing::Values(VIDEO_SRC_R)));
testing::Values(VIDEO_SRC_KEY)));
INSTANTIATE_TEST_CASE_P(CUDA_Codec, CheckParams, ALL_DEVICES);

@@ -52,38 +52,41 @@ using namespace cv::cuda::device;
namespace hist
{
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
template<bool fourByteAligned>
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist, const int offsetX = 0)
{
__shared__ int shist[256];
const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
shist[tid] = 0;
__syncthreads();
if (y < rows)
{
const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
unsigned int data = rowPtr[x];
if (y < rows) {
const uchar* rowPtr = &src[y * step];
// load uncoalesced head
if (!fourByteAligned && threadIdx.x == 0) {
for (int x = 0; x < min(alignedOffset, cols); x++)
Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
}
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
// coalesced loads
const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
const unsigned int data = rowPtrIntAligned[x];
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
}
if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
unsigned int data = ((const uchar*)rowPtr)[x];
Emulation::smem::atomicAdd(&shist[data], 1);
}
// load uncoalesced tail
if (threadIdx.x == 0) {
const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
for (int x = iTailStart; x < cols; x++)
Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
}
}
@@ -94,61 +97,70 @@ namespace hist
::atomicAdd(hist + tid, histVal);
}
void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));
histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
if(offsetX)
histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
else
histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
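Worked example of the head/body/tail split these kernels perform (illustrative numbers, not from the commit), assuming offsetX is the ROI's misalignment in bytes and is less than 4:
// For cols = 10 pixels starting 1 byte into a 4-byte word:
const int offsetX = 1, cols = 10;
const int alignedOffset = 4 - offsetX;             // 3 head pixels, counted by thread 0 with byte loads
const int cols_4 = (cols - alignedOffset) / 4;     // 1 coalesced 32-bit word = 4 pixels, spread across threads
const int iTailStart = cols_4 * 4 + alignedOffset; // tail starts at x = 7; thread 0 counts the last 3 pixels
// head (3) + body (4) + tail (cols - iTailStart = 3) == cols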
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist)
template<bool fourByteAligned>
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist, const int offsetX = 0)
{
__shared__ int shist[256];
const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
shist[tid] = 0;
__syncthreads();
if (y < rows)
{
const unsigned int* rowPtr = (const unsigned int*) (src + y * srcStep);
const unsigned int* maskRowPtr = (const unsigned int*) (mask + y * maskStep);
const uchar* rowPtr = &src[y * srcStep];
const uchar* maskRowPtr = &mask[y * maskStep];
// load uncoalesced head
if (!fourByteAligned && threadIdx.x == 0) {
for (int x = 0; x < min(alignedOffset, cols); x++) {
if (maskRowPtr[x])
Emulation::smem::atomicAdd(&shist[rowPtr[x]], 1);
}
}
const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
unsigned int data = rowPtr[x];
unsigned int m = maskRowPtr[x];
// coalesced loads
const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * srcStep] : &src[alignedOffset + y * maskStep]);
const unsigned int* maskRowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &mask[y * maskStep] : &mask[alignedOffset + y * maskStep]);
const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
const unsigned int data = rowPtrIntAligned[x];
const unsigned int m = maskRowPtrIntAligned[x];
if ((m >> 0) & 0xFFU)
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
if ((m >> 0) & 0xFFU)
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
if ((m >> 8) & 0xFFU)
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
if ((m >> 8) & 0xFFU)
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
if ((m >> 16) & 0xFFU)
if ((m >> 16) & 0xFFU)
Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
if ((m >> 24) & 0xFFU)
if ((m >> 24) & 0xFFU)
Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
}
if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
unsigned int data = ((const uchar*)rowPtr)[x];
unsigned int m = ((const uchar*)maskRowPtr)[x];
if (m)
Emulation::smem::atomicAdd(&shist[data], 1);
// load uncoalesced tail
if (threadIdx.x == 0) {
const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
for (int x = iTailStart; x < cols; x++) {
if (maskRowPtr[x])
Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
}
}
}
@@ -160,12 +172,15 @@ namespace hist
::atomicAdd(hist + tid, histVal);
}
void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream)
void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));
histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist);
if(offsetX)
histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
else
histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
@@ -186,42 +201,44 @@ namespace hist
}
}
__global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols,
int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel)
template<bool fourByteAligned>
__global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols, int* hist, const int binCount, const int binSize,
const int lowerLevel, const int upperLevel, const int offsetX)
{
extern __shared__ int shist[];
const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
if (tid < binCount)
shist[tid] = 0;
__syncthreads();
if (y < rows)
{
const uchar* rowPtr = src + y * step;
const uint* rowPtr4 = (uint*) rowPtr;
const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
const uint data = rowPtr4[x];
const uchar* rowPtr = &src[y * step];
// load uncoalesced head
if (!fourByteAligned && threadIdx.x == 0) {
for (int x = 0; x < min(alignedOffset, cols); x++)
histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
}
histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
// coalesced loads
const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
const unsigned int data = rowPtrIntAligned[x];
histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
}
if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
const uchar data = rowPtr[x];
histEvenInc(shist, data, binSize, lowerLevel, upperLevel);
}
// load uncoalesced tail
if (threadIdx.x == 0) {
const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
for (int x = iTailStart; x < cols; x++)
histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
}
}
@@ -236,7 +253,7 @@ namespace hist
}
}
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream)
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));
@@ -245,7 +262,10 @@ namespace hist
const size_t smem_size = binCount * sizeof(int);
histEven8u<<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel);
if(offsetX)
histEven8u<false><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
else
histEven8u<true><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)

@@ -68,8 +68,8 @@ void cv::cuda::histRange(InputArray, GpuMat*, const GpuMat*, Stream&) { throw_no
namespace hist
{
void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream);
void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream);
void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream);
void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream);
}
void cv::cuda::calcHist(InputArray _src, OutputArray _hist, Stream& stream)
@@ -91,10 +91,12 @@ void cv::cuda::calcHist(InputArray _src, InputArray _mask, OutputArray _hist, St
hist.setTo(Scalar::all(0), stream);
Point ofs; Size wholeSize;
src.locateROI(wholeSize, ofs);
if (mask.empty())
hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
hist::histogram256(src, hist.ptr<int>(), ofs.x, StreamAccessor::getStream(stream));
else
hist::histogram256(src, mask, hist.ptr<int>(), StreamAccessor::getStream(stream));
hist::histogram256(src, mask, hist.ptr<int>(), ofs.x, StreamAccessor::getStream(stream));
}
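A hedged sketch of where offsetX comes from (sizes are illustrative): a column ROI of a GpuMat shares the parent allocation, so its data pointer may no longer be 4-byte aligned, and locateROI() recovers the x offset that is forwarded to the kernels.
cv::cuda::GpuMat whole(32, 129, CV_8UC1);
cv::cuda::GpuMat roi = whole(cv::Rect(1, 0, 128, 32)); // each row starts 1 byte into the parent row
cv::Point ofs; cv::Size wholeSize;
roi.locateROI(wholeSize, ofs);                         // ofs.x == 1 here
cv::cuda::GpuMat hist;
cv::cuda::calcHist(roi, hist);                         // dispatches the unaligned kernel path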
////////////////////////////////////////////////////////////////////////
@@ -494,16 +496,18 @@ void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int
namespace hist
{
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream);
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream);
}
namespace
{
void histEven8u(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
{
Point ofs; Size wholeSize;
src.locateROI(wholeSize, ofs);
hist.create(1, histSize, CV_32S);
cudaSafeCall( cudaMemsetAsync(hist.data, 0, histSize * sizeof(int), stream) );
hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, stream);
hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, ofs.x, stream);
}
}
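Similarly, a usage sketch for the even-histogram path (values illustrative): histEven on a shifted ROI now accounts for the ROI's x offset instead of assuming 4-byte aligned rows.
cv::cuda::GpuMat img(32, 129, CV_8UC1);
cv::cuda::GpuMat roi = img(cv::Rect(1, 0, 128, 32));
cv::cuda::GpuMat hist;
cv::cuda::histEven(roi, hist, 30, 50, 200);            // 30 bins over [50, 200)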

@@ -49,15 +49,40 @@ namespace opencv_test { namespace {
///////////////////////////////////////////////////////////////////////////////////////////////////////
// HistEven
PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, cv::Size)
typedef tuple<Size, int> hist_size_to_roi_offset_params_t;
const hist_size_to_roi_offset_params_t hist_size_to_roi_offset_params[] =
{
// uchar reads only
hist_size_to_roi_offset_params_t(Size(1,32), 0),
hist_size_to_roi_offset_params_t(Size(2,32), 0),
hist_size_to_roi_offset_params_t(Size(2,32), 1),
hist_size_to_roi_offset_params_t(Size(3,32), 0),
hist_size_to_roi_offset_params_t(Size(3,32), 1),
hist_size_to_roi_offset_params_t(Size(3,32), 2),
hist_size_to_roi_offset_params_t(Size(4,32), 0),
hist_size_to_roi_offset_params_t(Size(4,32), 1),
hist_size_to_roi_offset_params_t(Size(4,32), 2),
hist_size_to_roi_offset_params_t(Size(4,32), 3),
// uchar and int reads
hist_size_to_roi_offset_params_t(Size(129,32), 0),
hist_size_to_roi_offset_params_t(Size(129,32), 1),
hist_size_to_roi_offset_params_t(Size(129,32), 2),
hist_size_to_roi_offset_params_t(Size(129,32), 3),
// int reads only
hist_size_to_roi_offset_params_t(Size(128,32), 0)
};
PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int roiOffsetX;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
size = get<0>(GET_PARAM(1));
roiOffsetX = get<1>(GET_PARAM(1));
cv::cuda::setDevice(devInfo.deviceID());
}
@@ -66,19 +91,21 @@ PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, cv::Size)
CUDA_TEST_P(HistEven, Accuracy)
{
cv::Mat src = randomMat(size, CV_8UC1);
const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
int hbins = 30;
float hranges[] = {50.0f, 200.0f};
cv::cuda::GpuMat hist;
cv::cuda::histEven(loadMat(src), hist, hbins, (int) hranges[0], (int) hranges[1]);
cv::cuda::GpuMat srcDevice = loadMat(src);
cv::cuda::histEven(srcDevice(roi), hist, hbins, (int)hranges[0], (int)hranges[1]);
cv::Mat hist_gold;
int histSize[] = {hbins};
const float* ranges[] = {hranges};
int channels[] = {0};
cv::calcHist(&src, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
Mat srcRoi = src(roi);
cv::calcHist(&srcRoi, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
hist_gold = hist_gold.t();
hist_gold.convertTo(hist_gold, CV_32S);
@@ -87,22 +114,24 @@ CUDA_TEST_P(HistEven, Accuracy)
}
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, HistEven, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES));
ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));
///////////////////////////////////////////////////////////////////////////////////////////////////////
// CalcHist
PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, cv::Size)
PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int roiOffsetX;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
size = get<0>(GET_PARAM(1));
roiOffsetX = get<1>(GET_PARAM(1));
cv::cuda::setDevice(devInfo.deviceID());
}
@@ -111,9 +140,10 @@ PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, cv::Size)
CUDA_TEST_P(CalcHist, Accuracy)
{
cv::Mat src = randomMat(size, CV_8UC1);
const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
cv::cuda::GpuMat hist;
cv::cuda::calcHist(loadMat(src), hist);
GpuMat srcDevice = loadMat(src);
cv::cuda::calcHist(srcDevice(roi), hist);
cv::Mat hist_gold;
@@ -123,7 +153,8 @@ CUDA_TEST_P(CalcHist, Accuracy)
const float* ranges[] = {hranges};
const int channels[] = {0};
cv::calcHist(&src, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
const Mat srcRoi = src(roi);
cv::calcHist(&srcRoi, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
hist_gold = hist_gold.reshape(1, 1);
hist_gold.convertTo(hist_gold, CV_32S);
@@ -131,19 +162,21 @@ CUDA_TEST_P(CalcHist, Accuracy)
}
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHist, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES));
ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));
PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, cv::Size)
PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int roiOffsetX;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
size = get<0>(GET_PARAM(1));
roiOffsetX = get<1>(GET_PARAM(1));
cv::cuda::setDevice(devInfo.deviceID());
}
@@ -152,11 +185,14 @@ PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, cv::Size)
CUDA_TEST_P(CalcHistWithMask, Accuracy)
{
cv::Mat src = randomMat(size, CV_8UC1);
const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
cv::Mat mask = randomMat(size, CV_8UC1);
cv::Mat(mask, cv::Rect(0, 0, size.width / 2, size.height / 2)).setTo(0);
cv::cuda::GpuMat hist;
cv::cuda::calcHist(loadMat(src), loadMat(mask), hist);
GpuMat srcDevice = loadMat(src);
GpuMat maskDevice = loadMat(mask);
cv::cuda::calcHist(srcDevice(roi), maskDevice(roi), hist);
cv::Mat hist_gold;
@@ -166,7 +202,8 @@ CUDA_TEST_P(CalcHistWithMask, Accuracy)
const float* ranges[] = {hranges};
const int channels[] = {0};
cv::calcHist(&src, 1, channels, mask, hist_gold, 1, histSize, ranges);
const Mat srcRoi = src(roi);
cv::calcHist(&srcRoi, 1, channels, mask(roi), hist_gold, 1, histSize, ranges);
hist_gold = hist_gold.reshape(1, 1);
hist_gold.convertTo(hist_gold, CV_32S);
@@ -174,8 +211,7 @@ CUDA_TEST_P(CalcHistWithMask, Accuracy)
}
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHistWithMask, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES));
ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));
///////////////////////////////////////////////////////////////////////////////////////////////////////
// EqualizeHist

@@ -1272,9 +1272,9 @@ void EllipseDetectorImpl::preProcessing(Mat1b &image, Mat1b &dp, Mat1b &dn) {
// buffer
int *magBuffer[3];
void *buffer = malloc((imgSize.width + 2) * (imgSize.height + 2) +
(imgSize.width + 2) * 3 * sizeof(int));
magBuffer[0] = (int *) buffer;
AutoBuffer<int> buffer((imgSize.width + 2) * (imgSize.height + 2) +
(imgSize.width + 2) * 3);
magBuffer[0] = buffer.data();
magBuffer[1] = magBuffer[0] + imgSize.width + 2;
magBuffer[2] = magBuffer[1] + imgSize.width + 2;
uchar *map = (uchar *) (magBuffer[2] + imgSize.width + 2);
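For context, a minimal standalone sketch of the cv::AutoBuffer idiom adopted here (not from the commit): the buffer is sized in elements rather than bytes and releases its storage automatically, removing the free() the old malloc() call needed.
cv::AutoBuffer<int> buf(1024);   // 1024 ints; uses a small internal buffer when the request fits, otherwise allocates
int* p = buf.data();
p[0] = 42;                       // use like a raw array; storage is released when buf leaves scope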
@@ -1300,8 +1300,8 @@ void EllipseDetectorImpl::preProcessing(Mat1b &image, Mat1b &dp, Mat1b &dn) {
// 2 - the pixel does belong to an edge
for (int i = 0; i <= imgSize.height; i++) {
int *tmpMag = magBuffer[(i > 0) + 1] + 1;
const short *tmpDx = (short *) (dx[i]);
const short *tmpDy = (short *) (dy[i]);
const short *tmpDx = dx.ptr<short>(i);
const short *tmpDy = dy.ptr<short>(i);
uchar *tmpMap;
int prevFlag = 0;
@@ -1980,4 +1980,4 @@ void findEllipses(
Mat(_ellipses).copyTo(ellipses);
}
} // namespace ximgproc
} // namespace cv
} // namespace cv
