diff --git a/modules/gpu/src/cuda/icf-sc.cu b/modules/gpu/src/cuda/icf-sc.cu index 5334441d8d..12b511fb0b 100644 --- a/modules/gpu/src/cuda/icf-sc.cu +++ b/modules/gpu/src/cuda/icf-sc.cu @@ -88,19 +88,23 @@ namespace icf { return (w < 0 || h < 0)? 0.f : (float)(w * h); } - __global__ void overlap(const uint* n, const Detection* detections, uchar* overlaps) + texture tdetections; + + __global__ void overlap(const uint* n, uchar* overlaps) { const int idx = threadIdx.x; const int total = *n; - for (int i = idx; i < total; i += 192) + for (int i = idx + 1; i < total; i += 192) { - const Detection& a = detections[i]; + const uint4 _a = tex2D(tdetections, i, 0); + const Detection& a = *((Detection*)(&_a)); bool excluded = false; for (int j = i + 1; j < total; ++j) { - const Detection& b = detections[j]; + const uint4 _b = tex2D(tdetections, j, 0); + const Detection& b = *((Detection*)(&_b)); float ovl = overlapArea(a, b) / ::min(a.w * a.h, b.w * b.h); if (ovl > 0.65f) @@ -115,7 +119,7 @@ namespace icf { } } - __global__ void collect(const uint* n, const Detection* detections, uchar* overlaps) + __global__ void collect(const uint* n, uchar* overlaps, uint* ctr, uint4* suppressed) { const int idx = threadIdx.x; const int total = *n; @@ -124,19 +128,24 @@ namespace icf { { if (!overlaps[i]) { - const Detection& det = detections[i]; - // printf("%d: %d %d %d %d %f\n", i, det.x, det.y, det.w, det.h, det.confidence ); + int oidx = atomicInc(ctr, 50); + suppressed[oidx] = tex2D(tdetections, i + 1, 0); } } } - void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections) + void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections, PtrStepSzb suppressed) { int block = 192; int grid = 1; - overlap<<>>((uint*)ndetections.ptr(0), (Detection*)objects.ptr(0), (uchar*)overlaps.ptr(0)); - collect<<>>((uint*)ndetections.ptr(0), (Detection*)objects.ptr(0), (uchar*)overlaps.ptr(0)); + cudaChannelFormatDesc desc = cudaCreateChannelDesc(); + size_t offset; + cudaSafeCall( cudaBindTexture2D(&offset, tdetections, objects.data, desc, objects.cols / sizeof(uint4), objects.rows, objects.step)); + + overlap<<>>((uint*)ndetections.ptr(0), (uchar*)overlaps.ptr(0)); + collect<<>>((uint*)ndetections.ptr(0), (uchar*)overlaps.ptr(0), (uint*)suppressed.ptr(0), ((uint4*)suppressed.ptr(0)) + 1); + // if (!stream) { cudaSafeCall( cudaGetLastError()); diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 35bd72e552..5324e2e56e 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -86,7 +86,7 @@ namespace icf { void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle, const int fw, const int fh, const int bins, cudaStream_t stream); - void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections); + void suppress(const PtrStepSzb& objects, PtrStepSzb overlaps, PtrStepSzi ndetections, PtrStepSzb suppressed); } namespace imgproc { @@ -312,6 +312,7 @@ struct cv::gpu::SCascade::Fields hogluv.setTo(cv::Scalar::all(0)); overlaps.create(1, 5000, CV_8UC1); + suppressed.create(1, sizeof(Detection) * 51, CV_8UC1); return true; } @@ -447,7 +448,9 @@ public: { ensureSizeIsEnough(objects.rows, objects.cols, CV_8UC1, overlaps); overlaps.setTo(0); - device::icf::suppress(objects, overlaps, ndetections); + suppressed.setTo(0); + + device::icf::suppress(objects, overlaps, ndetections, suppressed); // std::cout << cv::Mat(overlaps) << std::endl; } @@ -484,6 +487,9 @@ public: // used for area overlap computing during GpuMat overlaps; + // used for suppression + GpuMat suppressed; + // Cascade from xml GpuMat octaves; GpuMat stages; @@ -525,7 +531,6 @@ bool cv::gpu::SCascade::load(const FileNode& fn) void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _objects, Stream& s) const { CV_Assert(fields); - const GpuMat colored = image.getGpuMat(); // only color images are supperted @@ -545,6 +550,7 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _ colored.copyTo(flds.hogluv); } + GpuMat spr(objects, cv::Rect(0, 0, flds.suppressed.cols, flds.suppressed.rows)); GpuMat tmp = GpuMat(objects, cv::Rect(0, 0, sizeof(Detection), 1)); objects = GpuMat(objects, cv::Rect( sizeof(Detection), 0, objects.cols - sizeof(Detection), 1)); @@ -552,8 +558,11 @@ void cv::gpu::SCascade::detect(InputArray image, InputArray _rois, OutputArray _ flds.detect(rois, tmp, objects, stream); - // if (rejCriteria != NO_REJECT) - flds.suppress(tmp, objects); + if (rejCriteria != NO_REJECT) + { + flds.suppress(tmp, objects); + flds.suppressed.copyTo(spr); + } } void cv::gpu::SCascade::genRoi(InputArray _roi, OutputArray _mask, Stream& stream) const