Merge branch 'opencv:4.x' into 4.x

pull/3471/head
Amir Hassan authored 2 years ago, committed by GitHub
commit c3e7d078f1
Changed files:
  1. modules/cudacodec/src/ffmpeg_video_source.cpp (32 changed lines)
  2. modules/cudacodec/src/video_decoder.cpp (10 changed lines)
  3. modules/cudacodec/src/video_decoder.hpp (6 changed lines)
  4. modules/cudacodec/src/video_reader.cpp (29 changed lines)
  5. modules/cudacodec/test/test_video.cpp (63 changed lines)
  6. modules/cudaimgproc/src/cuda/hist.cu (160 changed lines)
  7. modules/cudaimgproc/src/histogram.cpp (16 changed lines)
  8. modules/cudaimgproc/test/test_histogram.cpp (76 changed lines)
  9. modules/ximgproc/src/find_ellipses.cpp (12 changed lines)

@@ -66,8 +66,10 @@ static std::string fourccToString(int fourcc)
(i32_c.c[3] >= ' ' && i32_c.c[3] < 128) ? i32_c.c[3] : '?');
}
// handle old FFmpeg backend - remove when windows shared library is updated
#ifdef _WIN32
static
Codec FourccToCodec(int codec)
Codec FourccToCodecWin32Old(int codec)
{
switch (codec)
{
@@ -100,9 +102,34 @@ Codec FourccToCodec(int codec)
case CV_FOURCC_MACRO('a', 'v', '0', '1'): // fallthru
case CV_FOURCC_MACRO('A', 'V', '0', '1'): return AV1;
default:
break;
return NumCodecs;
}
}
#endif
static
Codec FourccToCodec(int codec)
{
#ifdef _WIN32 // handle old FFmpeg backend - remove when windows shared library is updated
Codec win32OldCodec = FourccToCodecWin32Old(codec);
if(win32OldCodec != NumCodecs)
return win32OldCodec;
#endif
switch (codec)
{
case CV_FOURCC_MACRO('m', 'p', 'g', '1'): return MPEG1;
case CV_FOURCC_MACRO('m', 'p', 'g', '2'): return MPEG2;
case CV_FOURCC_MACRO('F', 'M', 'P', '4'): return MPEG4;
case CV_FOURCC_MACRO('W', 'V', 'C', '1'): return VC1;
case CV_FOURCC_MACRO('h', '2', '6', '4'): return H264;
case CV_FOURCC_MACRO('h', 'e', 'v', 'c'): return HEVC;
case CV_FOURCC_MACRO('M', 'J', 'P', 'G'): return JPEG;
case CV_FOURCC_MACRO('V', 'P', '8', '0'): return VP8;
case CV_FOURCC_MACRO('V', 'P', '9', '0'): return VP9;
case CV_FOURCC_MACRO('A', 'V', '0', '1'): return AV1;
default:
break;
}
std::string msg = cv::format("Unknown codec FOURCC: 0x%08X (%s)", codec, fourccToString(codec).c_str());
CV_LOG_WARNING(NULL, msg);
CV_Error(Error::StsUnsupportedFormat, msg);
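For reference, a small sketch of how the FOURCC values in these switch statements are formed (illustrative, not part of the commit): CV_FOURCC_MACRO packs the four characters little-endian, so the lowercase 'h264' tag reported by FFmpeg matches the h264 case above.
int fourcc = CV_FOURCC_MACRO('h', '2', '6', '4'); // packs to 0x34363268
Codec codec = FourccToCodec(fourcc);              // resolves to H264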
@@ -163,7 +190,6 @@ cv::cudacodec::detail::FFmpegVideoSource::FFmpegVideoSource(const String& fname,
int codec = (int)cap.get(CAP_PROP_FOURCC);
int pixelFormat = (int)cap.get(CAP_PROP_CODEC_PIXEL_FORMAT);
format_.codec = FourccToCodec(codec);
format_.height = cap.get(CAP_PROP_FRAME_HEIGHT);
format_.width = cap.get(CAP_PROP_FRAME_WIDTH);

@@ -97,10 +97,10 @@ void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat)
cudaVideoCodec_UYVY == _codec;
#if defined (HAVE_CUDA)
#if (CUDART_VERSION >= 6500)
#if (CUDART_VERSION >= 6050)
codecSupported |= cudaVideoCodec_HEVC == _codec;
#endif
#if ((CUDART_VERSION == 7500) || (CUDART_VERSION >= 9000))
#if ((CUDART_VERSION == 7050) || (CUDART_VERSION >= 9000))
codecSupported |= cudaVideoCodec_VP8 == _codec ||
cudaVideoCodec_VP9 == _codec ||
cudaVideoCodec_AV1 == _codec ||
@@ -160,9 +160,11 @@ void cv::cudacodec::detail::VideoDecoder::create(const FormatInfo& videoFormat)
createInfo_.ulCreationFlags = videoCreateFlags;
createInfo_.vidLock = lock_;
cuSafeCall(cuCtxPushCurrent(ctx_));
cuSafeCall(cuvidCreateDecoder(&decoder_, &createInfo_));
{
AutoLock autoLock(mtx_);
cuSafeCall(cuvidCreateDecoder(&decoder_, &createInfo_));
}
cuSafeCall(cuCtxPopCurrent(NULL));
inited_ = true;
}
int cv::cudacodec::detail::VideoDecoder::reconfigure(const FormatInfo& videoFormat) {

@@ -70,6 +70,7 @@ public:
void create(const FormatInfo& videoFormat);
int reconfigure(const FormatInfo& videoFormat);
void release();
bool inited() { AutoLock autoLock(mtx_); return decoder_; }
// Get the codec-type currently used.
cudaVideoCodec codec() const { return static_cast<cudaVideoCodec>(videoFormat_.codec); }
@@ -85,8 +86,6 @@ public:
unsigned long targetWidth() { return videoFormat_.width; }
unsigned long targetHeight() { return videoFormat_.height; }
bool inited() { return inited_; }
cudaVideoChromaFormat chromaFormat() const { return static_cast<cudaVideoChromaFormat>(videoFormat_.chromaFormat); }
int nBitDepthMinus8() const { return videoFormat_.nBitDepthMinus8; }
@@ -114,10 +113,9 @@ public:
private:
CUcontext ctx_ = 0;
CUvideoctxlock lock_;
CUvideodecoder decoder_ = 0;
CUvideodecoder decoder_ = 0;
FormatInfo videoFormat_ = {};
Mutex mtx_;
bool inited_ = false;
};
}}}

@@ -67,17 +67,14 @@ void cvtFromNv12(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int he
outFrame.create(height, width, CV_8UC3);
Npp8u* pSrc[2] = { decodedFrame.data, &decodedFrame.data[decodedFrame.step * height] };
NppiSize oSizeROI = { width,height };
#if (CUDART_VERSION < 9200)
CV_Error(Error::StsUnsupportedFormat, "ColorFormat::BGR is not supported until CUDA 9.2, use default ColorFormat::BGRA.");
#elif (CUDART_VERSION < 10100)
#if (CUDART_VERSION < 10010)
cv::cuda::NppStreamHandler h(stream);
if (videoFullRangeFlag)
nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI));
else {
CV_LOG_DEBUG(NULL, "Color reproduction may be inaccurate due CUDA version <= 11.0, for better results upgrade CUDA runtime or try ColorFormat::BGRA.");
nppSafeCall(nppiNV12ToBGR_8u_P2C3R(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI));
}
#elif (CUDART_VERSION >= 10100)
#elif (CUDART_VERSION >= 10010)
NppStreamContext nppStreamCtx;
nppSafeCall(nppGetStreamContext(&nppStreamCtx));
nppStreamCtx.hStream = StreamAccessor::getStream(stream);
@@ -85,7 +82,6 @@ void cvtFromNv12(const GpuMat& decodedFrame, GpuMat& outFrame, int width, int he
nppSafeCall(nppiNV12ToBGR_709HDTV_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, nppStreamCtx));
else {
#if (CUDART_VERSION < 11000)
CV_LOG_DEBUG(NULL, "Color reproduction may be inaccurate due CUDA version <= 11.0, for better results upgrade CUDA runtime or try ColorFormat::BGRA.");
nppSafeCall(nppiNV12ToBGR_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, nppStreamCtx));
#else
nppSafeCall(nppiNV12ToBGR_709CSC_8u_P2C3R_Ctx(pSrc, decodedFrame.step, outFrame.data, outFrame.step, oSizeROI, nppStreamCtx));
@@ -137,6 +133,7 @@ namespace
private:
bool internalGrab(GpuMat& frame, Stream& stream);
void waitForDecoderInit();
Ptr<VideoSource> videoSource_;
@@ -160,6 +157,15 @@ namespace
return videoSource_->format();
}
void VideoReaderImpl::waitForDecoderInit() {
for (;;) {
if (videoDecoder_->inited()) break;
if (videoParser_->hasError() || frameQueue_->isEndOfDecode())
CV_Error(Error::StsError, "Parsing/Decoding video source failed, check GPU memory is available and GPU supports hardware decoding.");
Thread::sleep(1);
}
}
VideoReaderImpl::VideoReaderImpl(const Ptr<VideoSource>& source, const int minNumDecodeSurfaces, const bool allowFrameDrop, const bool udpSource,
const Size targetSz, const Rect srcRoi, const Rect targetRoi) :
videoSource_(source),
@@ -177,6 +183,8 @@ namespace
videoParser_.reset(new VideoParser(videoDecoder_, frameQueue_, allowFrameDrop, udpSource));
videoSource_->setVideoParser(videoParser_);
videoSource_->start();
waitForDecoderInit();
videoSource_->updateFormat(videoDecoder_->format());
}
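With waitForDecoderInit() called from the constructor, format() should report valid information as soon as createVideoReader() returns. A hedged caller-side sketch (file name illustrative):
cv::Ptr<cv::cudacodec::VideoReader> reader = cv::cudacodec::createVideoReader("video.mp4");
cv::cudacodec::FormatInfo fmt = reader->format();
CV_Assert(fmt.valid);   // no need to grab a frame first, unlike before this change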
VideoReaderImpl::~VideoReaderImpl()
@@ -307,6 +315,15 @@ namespace
bool VideoReaderImpl::set(const ColorFormat colorFormat_) {
if (!ValidColorFormat(colorFormat_)) return false;
if (colorFormat_ == ColorFormat::BGR) {
#if (CUDART_VERSION < 9020)
CV_LOG_DEBUG(NULL, "ColorFormat::BGR is not supported until CUDA 9.2, use default ColorFormat::BGRA.");
return false;
#elif (CUDART_VERSION < 11000)
if (!videoDecoder_->format().videoFullRangeFlag)
CV_LOG_INFO(NULL, "Color reproduction may be inaccurate due CUDA version <= 11.0, for better results upgrade CUDA runtime or try ColorFormat::BGRA.");
#endif
}
colorFormat = colorFormat_;
return true;
}
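A minimal caller-side sketch (assumption: the application may run against CUDA runtimes older than 9.2): when set() rejects ColorFormat::BGR, fall back to the default BGRA output.
if (!reader->set(cv::cudacodec::ColorFormat::BGR))
    reader->set(cv::cudacodec::ColorFormat::BGRA);   // BGRA path is available on all supported runtimes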

@@ -230,14 +230,15 @@ CUDA_TEST_P(Scaling, Reader)
static_cast<int>(params.targetSz.width * targetRoiIn.width), static_cast<int>(params.targetSz.height * targetRoiIn.height));
cv::Ptr<cv::cudacodec::VideoReader> reader = cv::cudacodec::createVideoReader(inputFile, {}, params);
const cudacodec::FormatInfo format = reader->format();
ASSERT_TRUE(format.valid);
ASSERT_TRUE(reader->set(cudacodec::ColorFormat::GRAY));
GpuMat frame;
ASSERT_TRUE(reader->nextFrame(frame));
const cudacodec::FormatInfo format = reader->format();
Size targetSzOut = params.targetSz;
Rect srcRoiOut = params.srcRoi, targetRoiOut = params.targetRoi;
ForceAlignment(srcRoiOut, targetRoiOut, targetSzOut);
ASSERT_TRUE(format.valid && format.targetSz == targetSzOut && format.srcRoi == srcRoiOut && format.targetRoi == targetRoiOut);
ASSERT_TRUE(format.targetSz == targetSzOut && format.srcRoi == srcRoiOut && format.targetRoi == targetRoiOut);
ASSERT_TRUE(frame.size() == targetSzOut);
GpuMat frameGs;
cv::cuda::resize(frameOr(srcRoiOut), frameGs, targetRoiOut.size(), 0, 0, INTER_AREA);
@@ -280,15 +281,11 @@ CUDA_TEST_P(DisplayResolution, Reader)
CUDA_TEST_P(Video, Reader)
{
cv::cuda::setDevice(GET_PARAM(0).deviceID());
const std::string relativeFilePath = GET_PARAM(1);
// CUDA demuxer has to fall back to ffmpeg to process "cv/video/768x576.avi"
if (GET_PARAM(1) == "cv/video/768x576.avi" && !videoio_registry::hasBackend(CAP_FFMPEG))
throw SkipTestException("FFmpeg backend not found");
#ifdef _WIN32 // handle old FFmpeg backend
if (GET_PARAM(1) == "/cv/tracking/faceocc2/data/faceocc2.webm")
throw SkipTestException("Feature not yet supported by Windows FFmpeg shared library!");
#endif
if (relativeFilePath == "cv/video/768x576.avi" && !videoio_registry::hasBackend(CAP_FFMPEG))
throw SkipTestException("FFmpeg backend not found - SKIP");
const std::vector<std::pair< cudacodec::ColorFormat, int>> formatsToChannels = {
{cudacodec::ColorFormat::GRAY,1},
@@ -297,7 +294,7 @@ CUDA_TEST_P(Video, Reader)
{cudacodec::ColorFormat::NV_NV12,1}
};
std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../" + GET_PARAM(1);
std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../" + relativeFilePath;
cv::Ptr<cv::cudacodec::VideoReader> reader = cv::cudacodec::createVideoReader(inputFile);
ASSERT_FALSE(reader->set(cudacodec::ColorFormat::RGB));
cv::cudacodec::FormatInfo fmt = reader->format();
@@ -310,8 +307,6 @@ CUDA_TEST_P(Video, Reader)
double colorFormat;
ASSERT_TRUE(reader->get(cudacodec::VideoReaderProps::PROP_COLOR_FORMAT, colorFormat) && static_cast<cudacodec::ColorFormat>(colorFormat) == formatToChannels.first);
ASSERT_TRUE(reader->nextFrame(frame));
if(!fmt.valid)
fmt = reader->format();
const int height = formatToChannels.first == cudacodec::ColorFormat::NV_NV12 ? static_cast<int>(1.5 * fmt.height) : fmt.height;
ASSERT_TRUE(frame.cols == fmt.width && frame.rows == height);
ASSERT_FALSE(frame.empty());
@@ -326,6 +321,7 @@ CUDA_TEST_P(ColorConversion, Reader)
const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../" + get<0>(GET_PARAM(2));
const bool videoFullRangeFlag = get<1>(GET_PARAM(2));
cv::Ptr<cv::cudacodec::VideoReader> reader = cv::cudacodec::createVideoReader(inputFile);
cv::cudacodec::FormatInfo fmt = reader->format();
reader->set(colorFormat);
cv::VideoCapture cap(inputFile);
@@ -336,8 +332,8 @@ CUDA_TEST_P(ColorConversion, Reader)
reader->nextFrame(frame);
frame.download(frameFromDevice);
cap.read(frameHost);
const cv::cudacodec::FormatInfo fmt = reader->format();
ASSERT_TRUE(fmt.valid && fmt.videoFullRangeFlag == videoFullRangeFlag);
fmt = reader->format();
ASSERT_TRUE(fmt.videoFullRangeFlag == videoFullRangeFlag);
if (colorFormat == cv::cudacodec::ColorFormat::BGRA)
cv::cvtColor(frameHost, frameHostGs, COLOR_BGR2BGRA);
else
@@ -384,7 +380,7 @@ CUDA_TEST_P(ReconfigureDecoderWithScaling, Reader)
if (nFrames++ == 0)
initialSize = frame.size();
fmt = reader->format();
ASSERT_TRUE(fmt.valid && (frame.size() == initialSize));
ASSERT_TRUE(frame.size() == initialSize);
ASSERT_TRUE((frame.size() == targetSzOut) && (fmt.targetSz == targetSzOut) && (fmt.srcRoi == srcRoiOut) && (fmt.targetRoi == targetRoiOut));
// simple check - zero borders, non zero contents
ASSERT_TRUE(!cuda::absSum(frame, mask)[0] && cuda::sum(frame)[0]);
@@ -413,7 +409,7 @@ CUDA_TEST_P(ReconfigureDecoder, Reader)
initialSize = frame.size();
initialCodedSize = Size(fmt.ulWidth, fmt.ulHeight);
}
ASSERT_TRUE(fmt.valid && (frame.size() == initialSize));
ASSERT_TRUE(frame.size() == initialSize);
ASSERT_TRUE(fmt.srcRoi.empty());
const bool resChanged = (initialCodedSize.width != fmt.ulWidth) || (initialCodedSize.height != fmt.ulHeight);
if (resChanged)
@@ -541,11 +537,6 @@ CUDA_TEST_P(CheckDecodeSurfaces, Reader)
{
cv::Ptr<cv::cudacodec::VideoReader> reader = cv::cudacodec::createVideoReader(inputFile);
cv::cudacodec::FormatInfo fmt = reader->format();
if (!fmt.valid) {
reader->grab();
fmt = reader->format();
ASSERT_TRUE(fmt.valid);
}
ulNumDecodeSurfaces = fmt.ulNumDecodeSurfaces;
}
@@ -554,11 +545,6 @@ CUDA_TEST_P(CheckDecodeSurfaces, Reader)
params.minNumDecodeSurfaces = ulNumDecodeSurfaces - 1;
cv::Ptr<cv::cudacodec::VideoReader> reader = cv::cudacodec::createVideoReader(inputFile, {}, params);
cv::cudacodec::FormatInfo fmt = reader->format();
if (!fmt.valid) {
reader->grab();
fmt = reader->format();
ASSERT_TRUE(fmt.valid);
}
ASSERT_TRUE(fmt.ulNumDecodeSurfaces == ulNumDecodeSurfaces);
for (int i = 0; i < 100; i++) ASSERT_TRUE(reader->grab());
}
@@ -568,11 +554,6 @@ CUDA_TEST_P(CheckDecodeSurfaces, Reader)
params.minNumDecodeSurfaces = ulNumDecodeSurfaces + 1;
cv::Ptr<cv::cudacodec::VideoReader> reader = cv::cudacodec::createVideoReader(inputFile, {}, params);
cv::cudacodec::FormatInfo fmt = reader->format();
if (!fmt.valid) {
reader->grab();
fmt = reader->format();
ASSERT_TRUE(fmt.valid);
}
ASSERT_TRUE(fmt.ulNumDecodeSurfaces == ulNumDecodeSurfaces + 1);
for (int i = 0; i < 100; i++) ASSERT_TRUE(reader->grab());
}
@@ -626,10 +607,6 @@ CUDA_TEST_P(TransCode, H264ToH265)
cv::cuda::Stream stream;
for (int i = 0; i < nFrames; ++i) {
ASSERT_TRUE(reader->nextFrame(frame, stream));
if (!fmt.valid) {
fmt = reader->format();
ASSERT_TRUE(fmt.valid);
}
ASSERT_FALSE(frame.empty());
Mat tst; frame.download(tst);
if (writer.empty()) {
@@ -837,13 +814,13 @@ INSTANTIATE_TEST_CASE_P(CUDA_Codec, Scaling, testing::Combine(
INSTANTIATE_TEST_CASE_P(CUDA_Codec, DisplayResolution, ALL_DEVICES);
#define VIDEO_SRC_R "highgui/video/big_buck_bunny.mp4", "cv/video/768x576.avi", "cv/video/1920x1080.avi", "highgui/video/big_buck_bunny.avi", \
#define VIDEO_SRC_R testing::Values("highgui/video/big_buck_bunny.mp4", "cv/video/768x576.avi", "cv/video/1920x1080.avi", "highgui/video/big_buck_bunny.avi", \
"highgui/video/big_buck_bunny.h264", "highgui/video/big_buck_bunny.h265", "highgui/video/big_buck_bunny.mpg", \
"highgui/video/sample_322x242_15frames.yuv420p.libvpx-vp9.mp4", "highgui/video/sample_322x242_15frames.yuv420p.libaom-av1.mp4", \
"cv/tracking/faceocc2/data/faceocc2.webm"
INSTANTIATE_TEST_CASE_P(CUDA_Codec, Video, testing::Combine(
ALL_DEVICES,
testing::Values(VIDEO_SRC_R)));
"highgui/video/sample_322x242_15frames.yuv420p.libvpx-vp9.mp4")
//, "highgui/video/sample_322x242_15frames.yuv420p.libaom-av1.mp4", \
"cv/tracking/faceocc2/data/faceocc2.webm", "highgui/video/sample_322x242_15frames.yuv420p.mpeg2video.mp4", "highgui/video/sample_322x242_15frames.yuv420p.mjpeg.mp4")
INSTANTIATE_TEST_CASE_P(CUDA_Codec, Video, testing::Combine(ALL_DEVICES,VIDEO_SRC_R));
const color_conversion_params_t color_conversion_params[] =
{
@@ -878,9 +855,11 @@ INSTANTIATE_TEST_CASE_P(CUDA_Codec, CheckExtraData, testing::Combine(
ALL_DEVICES,
testing::ValuesIn(check_extra_data_params)));
#define VIDEO_SRC_KEY "highgui/video/big_buck_bunny.mp4", "cv/video/768x576.avi", "cv/video/1920x1080.avi", "highgui/video/big_buck_bunny.avi", \
"highgui/video/big_buck_bunny.h264", "highgui/video/big_buck_bunny.h265", "highgui/video/big_buck_bunny.mpg"
INSTANTIATE_TEST_CASE_P(CUDA_Codec, CheckKeyFrame, testing::Combine(
ALL_DEVICES,
testing::Values(VIDEO_SRC_R)));
testing::Values(VIDEO_SRC_KEY)));
INSTANTIATE_TEST_CASE_P(CUDA_Codec, CheckParams, ALL_DEVICES);

@@ -52,38 +52,41 @@ using namespace cv::cuda::device;
namespace hist
{
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
template<bool fourByteAligned>
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist, const int offsetX = 0)
{
__shared__ int shist[256];
const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
shist[tid] = 0;
__syncthreads();
if (y < rows)
{
const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
unsigned int data = rowPtr[x];
if (y < rows) {
const uchar* rowPtr = &src[y * step];
// load uncoalesced head
if (!fourByteAligned && threadIdx.x == 0) {
for (int x = 0; x < min(alignedOffset, cols); x++)
Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
}
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
// coalesced loads
const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
const unsigned int data = rowPtrIntAligned[x];
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
}
if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
unsigned int data = ((const uchar*)rowPtr)[x];
Emulation::smem::atomicAdd(&shist[data], 1);
}
// load uncoalesced tail
if (threadIdx.x == 0) {
const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
for (int x = iTailStart; x < cols; x++)
Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
}
}
@@ -94,61 +97,70 @@ namespace hist
::atomicAdd(hist + tid, histVal);
}
void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));
histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
if(offsetX)
histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
else
histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
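Worked example of the head/body/tail split these kernels perform (illustrative numbers, not from the commit), assuming offsetX is the ROI's misalignment in bytes and is less than 4:
// For cols = 10 pixels starting 1 byte into a 4-byte word:
const int offsetX = 1, cols = 10;
const int alignedOffset = 4 - offsetX;             // 3 head pixels, counted by thread 0 with byte loads
const int cols_4 = (cols - alignedOffset) / 4;     // 1 coalesced 32-bit word = 4 pixels, spread across threads
const int iTailStart = cols_4 * 4 + alignedOffset; // tail starts at x = 7; thread 0 counts the last 3 pixels
// head (3) + body (4) + tail (cols - iTailStart = 3) == cols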
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist)
template<bool fourByteAligned>
__global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist, const int offsetX = 0)
{
__shared__ int shist[256];
const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
shist[tid] = 0;
__syncthreads();
if (y < rows)
{
const unsigned int* rowPtr = (const unsigned int*) (src + y * srcStep);
const unsigned int* maskRowPtr = (const unsigned int*) (mask + y * maskStep);
const uchar* rowPtr = &src[y * srcStep];
const uchar* maskRowPtr = &mask[y * maskStep];
// load uncoalesced head
if (!fourByteAligned && threadIdx.x == 0) {
for (int x = 0; x < min(alignedOffset, cols); x++) {
if (maskRowPtr[x])
Emulation::smem::atomicAdd(&shist[rowPtr[x]], 1);
}
}
const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
unsigned int data = rowPtr[x];
unsigned int m = maskRowPtr[x];
// coalesced loads
const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * srcStep] : &src[alignedOffset + y * maskStep]);
const unsigned int* maskRowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &mask[y * maskStep] : &mask[alignedOffset + y * maskStep]);
const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
const unsigned int data = rowPtrIntAligned[x];
const unsigned int m = maskRowPtrIntAligned[x];
if ((m >> 0) & 0xFFU)
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
if ((m >> 0) & 0xFFU)
Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
if ((m >> 8) & 0xFFU)
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
if ((m >> 8) & 0xFFU)
Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
if ((m >> 16) & 0xFFU)
if ((m >> 16) & 0xFFU)
Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
if ((m >> 24) & 0xFFU)
if ((m >> 24) & 0xFFU)
Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
}
if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
unsigned int data = ((const uchar*)rowPtr)[x];
unsigned int m = ((const uchar*)maskRowPtr)[x];
if (m)
Emulation::smem::atomicAdd(&shist[data], 1);
// load uncoalesced tail
if (threadIdx.x == 0) {
const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
for (int x = iTailStart; x < cols; x++) {
if (maskRowPtr[x])
Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
}
}
}
@@ -160,12 +172,15 @@ namespace hist
::atomicAdd(hist + tid, histVal);
}
void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream)
void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));
histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist);
if(offsetX)
histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
else
histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
@@ -186,42 +201,44 @@ namespace hist
}
}
__global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols,
int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel)
template<bool fourByteAligned>
__global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols, int* hist, const int binCount, const int binSize,
const int lowerLevel, const int upperLevel, const int offsetX)
{
extern __shared__ int shist[];
const int y = blockIdx.x * blockDim.y + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
if (tid < binCount)
shist[tid] = 0;
__syncthreads();
if (y < rows)
{
const uchar* rowPtr = src + y * step;
const uint* rowPtr4 = (uint*) rowPtr;
const int cols_4 = cols / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
{
const uint data = rowPtr4[x];
const uchar* rowPtr = &src[y * step];
// load uncoalesced head
if (!fourByteAligned && threadIdx.x == 0) {
for (int x = 0; x < min(alignedOffset, cols); x++)
histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
}
histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
// coalesced loads
const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
const unsigned int data = rowPtrIntAligned[x];
histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
}
if (cols % 4 != 0 && threadIdx.x == 0)
{
for (int x = cols_4 * 4; x < cols; ++x)
{
const uchar data = rowPtr[x];
histEvenInc(shist, data, binSize, lowerLevel, upperLevel);
}
// load uncoalesced tail
if (threadIdx.x == 0) {
const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
for (int x = iTailStart; x < cols; x++)
histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
}
}
@@ -236,7 +253,7 @@ namespace hist
}
}
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream)
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream)
{
const dim3 block(32, 8);
const dim3 grid(divUp(src.rows, block.y));
@@ -245,7 +262,10 @@ namespace hist
const size_t smem_size = binCount * sizeof(int);
histEven8u<<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel);
if(offsetX)
histEven8u<false><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
else
histEven8u<true><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)

@@ -68,8 +68,8 @@ void cv::cuda::histRange(InputArray, GpuMat*, const GpuMat*, Stream&) { throw_no
namespace hist
{
void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream);
void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream);
void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream);
void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream);
}
void cv::cuda::calcHist(InputArray _src, OutputArray _hist, Stream& stream)
@@ -91,10 +91,12 @@ void cv::cuda::calcHist(InputArray _src, InputArray _mask, OutputArray _hist, St
hist.setTo(Scalar::all(0), stream);
Point ofs; Size wholeSize;
src.locateROI(wholeSize, ofs);
if (mask.empty())
hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
hist::histogram256(src, hist.ptr<int>(), ofs.x, StreamAccessor::getStream(stream));
else
hist::histogram256(src, mask, hist.ptr<int>(), StreamAccessor::getStream(stream));
hist::histogram256(src, mask, hist.ptr<int>(), ofs.x, StreamAccessor::getStream(stream));
}
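A hedged sketch of where offsetX comes from (sizes are illustrative): a column ROI of a GpuMat shares the parent allocation, so its data pointer may no longer be 4-byte aligned, and locateROI() recovers the x offset that is forwarded to the kernels.
cv::cuda::GpuMat whole(32, 129, CV_8UC1);
cv::cuda::GpuMat roi = whole(cv::Rect(1, 0, 128, 32)); // each row starts 1 byte into the parent row
cv::Point ofs; cv::Size wholeSize;
roi.locateROI(wholeSize, ofs);                         // ofs.x == 1 here
cv::cuda::GpuMat hist;
cv::cuda::calcHist(roi, hist);                         // dispatches the unaligned kernel path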
////////////////////////////////////////////////////////////////////////
@@ -494,16 +496,18 @@ void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int
namespace hist
{
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream);
void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream);
}
namespace
{
void histEven8u(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
{
Point ofs; Size wholeSize;
src.locateROI(wholeSize, ofs);
hist.create(1, histSize, CV_32S);
cudaSafeCall( cudaMemsetAsync(hist.data, 0, histSize * sizeof(int), stream) );
hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, stream);
hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, ofs.x, stream);
}
}
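Similarly, a usage sketch for the even-histogram path (values illustrative): histEven on a shifted ROI now accounts for the ROI's x offset instead of assuming 4-byte aligned rows.
cv::cuda::GpuMat img(32, 129, CV_8UC1);
cv::cuda::GpuMat roi = img(cv::Rect(1, 0, 128, 32));
cv::cuda::GpuMat hist;
cv::cuda::histEven(roi, hist, 30, 50, 200);            // 30 bins over [50, 200)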

@@ -49,15 +49,40 @@ namespace opencv_test { namespace {
///////////////////////////////////////////////////////////////////////////////////////////////////////
// HistEven
PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, cv::Size)
typedef tuple<Size, int> hist_size_to_roi_offset_params_t;
const hist_size_to_roi_offset_params_t hist_size_to_roi_offset_params[] =
{
// uchar reads only
hist_size_to_roi_offset_params_t(Size(1,32), 0),
hist_size_to_roi_offset_params_t(Size(2,32), 0),
hist_size_to_roi_offset_params_t(Size(2,32), 1),
hist_size_to_roi_offset_params_t(Size(3,32), 0),
hist_size_to_roi_offset_params_t(Size(3,32), 1),
hist_size_to_roi_offset_params_t(Size(3,32), 2),
hist_size_to_roi_offset_params_t(Size(4,32), 0),
hist_size_to_roi_offset_params_t(Size(4,32), 1),
hist_size_to_roi_offset_params_t(Size(4,32), 2),
hist_size_to_roi_offset_params_t(Size(4,32), 3),
// uchar and int reads
hist_size_to_roi_offset_params_t(Size(129,32), 0),
hist_size_to_roi_offset_params_t(Size(129,32), 1),
hist_size_to_roi_offset_params_t(Size(129,32), 2),
hist_size_to_roi_offset_params_t(Size(129,32), 3),
// int reads only
hist_size_to_roi_offset_params_t(Size(128,32), 0)
};
PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int roiOffsetX;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
size = get<0>(GET_PARAM(1));
roiOffsetX = get<1>(GET_PARAM(1));
cv::cuda::setDevice(devInfo.deviceID());
}
@@ -66,19 +91,21 @@ PARAM_TEST_CASE(HistEven, cv::cuda::DeviceInfo, cv::Size)
CUDA_TEST_P(HistEven, Accuracy)
{
cv::Mat src = randomMat(size, CV_8UC1);
const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
int hbins = 30;
float hranges[] = {50.0f, 200.0f};
cv::cuda::GpuMat hist;
cv::cuda::histEven(loadMat(src), hist, hbins, (int) hranges[0], (int) hranges[1]);
cv::cuda::GpuMat srcDevice = loadMat(src);
cv::cuda::histEven(srcDevice(roi), hist, hbins, (int)hranges[0], (int)hranges[1]);
cv::Mat hist_gold;
int histSize[] = {hbins};
const float* ranges[] = {hranges};
int channels[] = {0};
cv::calcHist(&src, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
Mat srcRoi = src(roi);
cv::calcHist(&srcRoi, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
hist_gold = hist_gold.t();
hist_gold.convertTo(hist_gold, CV_32S);
@@ -87,22 +114,24 @@ CUDA_TEST_P(HistEven, Accuracy)
}
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, HistEven, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES));
ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));
///////////////////////////////////////////////////////////////////////////////////////////////////////
// CalcHist
PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, cv::Size)
PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int roiOffsetX;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
size = get<0>(GET_PARAM(1));
roiOffsetX = get<1>(GET_PARAM(1));
cv::cuda::setDevice(devInfo.deviceID());
}
@@ -111,9 +140,10 @@ PARAM_TEST_CASE(CalcHist, cv::cuda::DeviceInfo, cv::Size)
CUDA_TEST_P(CalcHist, Accuracy)
{
cv::Mat src = randomMat(size, CV_8UC1);
const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
cv::cuda::GpuMat hist;
cv::cuda::calcHist(loadMat(src), hist);
GpuMat srcDevice = loadMat(src);
cv::cuda::calcHist(srcDevice(roi), hist);
cv::Mat hist_gold;
@@ -123,7 +153,8 @@ CUDA_TEST_P(CalcHist, Accuracy)
const float* ranges[] = {hranges};
const int channels[] = {0};
cv::calcHist(&src, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
const Mat srcRoi = src(roi);
cv::calcHist(&srcRoi, 1, channels, cv::Mat(), hist_gold, 1, histSize, ranges);
hist_gold = hist_gold.reshape(1, 1);
hist_gold.convertTo(hist_gold, CV_32S);
@@ -131,19 +162,21 @@ CUDA_TEST_P(CalcHist, Accuracy)
}
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHist, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES));
ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));
PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, cv::Size)
PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, hist_size_to_roi_offset_params_t)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int roiOffsetX;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
size = get<0>(GET_PARAM(1));
roiOffsetX = get<1>(GET_PARAM(1));
cv::cuda::setDevice(devInfo.deviceID());
}
@@ -152,11 +185,14 @@ PARAM_TEST_CASE(CalcHistWithMask, cv::cuda::DeviceInfo, cv::Size)
CUDA_TEST_P(CalcHistWithMask, Accuracy)
{
cv::Mat src = randomMat(size, CV_8UC1);
const Rect roi = Rect(roiOffsetX, 0, src.cols - roiOffsetX, src.rows);
cv::Mat mask = randomMat(size, CV_8UC1);
cv::Mat(mask, cv::Rect(0, 0, size.width / 2, size.height / 2)).setTo(0);
cv::cuda::GpuMat hist;
cv::cuda::calcHist(loadMat(src), loadMat(mask), hist);
GpuMat srcDevice = loadMat(src);
GpuMat maskDevice = loadMat(mask);
cv::cuda::calcHist(srcDevice(roi), maskDevice(roi), hist);
cv::Mat hist_gold;
@@ -166,7 +202,8 @@ CUDA_TEST_P(CalcHistWithMask, Accuracy)
const float* ranges[] = {hranges};
const int channels[] = {0};
cv::calcHist(&src, 1, channels, mask, hist_gold, 1, histSize, ranges);
const Mat srcRoi = src(roi);
cv::calcHist(&srcRoi, 1, channels, mask(roi), hist_gold, 1, histSize, ranges);
hist_gold = hist_gold.reshape(1, 1);
hist_gold.convertTo(hist_gold, CV_32S);
@@ -174,8 +211,7 @@ CUDA_TEST_P(CalcHistWithMask, Accuracy)
}
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, CalcHistWithMask, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES));
ALL_DEVICES, testing::ValuesIn(hist_size_to_roi_offset_params)));
///////////////////////////////////////////////////////////////////////////////////////////////////////
// EqualizeHist

@@ -1272,9 +1272,9 @@ void EllipseDetectorImpl::preProcessing(Mat1b &image, Mat1b &dp, Mat1b &dn) {
// buffer
int *magBuffer[3];
void *buffer = malloc((imgSize.width + 2) * (imgSize.height + 2) +
(imgSize.width + 2) * 3 * sizeof(int));
magBuffer[0] = (int *) buffer;
AutoBuffer<int> buffer((imgSize.width + 2) * (imgSize.height + 2) +
(imgSize.width + 2) * 3);
magBuffer[0] = buffer.data();
magBuffer[1] = magBuffer[0] + imgSize.width + 2;
magBuffer[2] = magBuffer[1] + imgSize.width + 2;
uchar *map = (uchar *) (magBuffer[2] + imgSize.width + 2);
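For context, a minimal standalone sketch of the cv::AutoBuffer idiom adopted here (not from the commit): the buffer is sized in elements rather than bytes and releases its storage automatically, removing the free() the old malloc() call needed.
cv::AutoBuffer<int> buf(1024);   // 1024 ints; uses a small internal buffer when the request fits, otherwise allocates
int* p = buf.data();
p[0] = 42;                       // use like a raw array; storage is released when buf leaves scope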
@@ -1300,8 +1300,8 @@ void EllipseDetectorImpl::preProcessing(Mat1b &image, Mat1b &dp, Mat1b &dn) {
// 2 - the pixel does belong to an edge
for (int i = 0; i <= imgSize.height; i++) {
int *tmpMag = magBuffer[(i > 0) + 1] + 1;
const short *tmpDx = (short *) (dx[i]);
const short *tmpDy = (short *) (dy[i]);
const short *tmpDx = dx.ptr<short>(i);
const short *tmpDy = dy.ptr<short>(i);
uchar *tmpMap;
int prevFlag = 0;
@@ -1980,4 +1980,4 @@ void findEllipses(
Mat(_ellipses).copyTo(ellipses);
}
} // namespace ximgproc
} // namespace cv
} // namespace cv
