diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp index 2224194bbd..e6efcc2d62 100644 --- a/modules/gpu/perf/perf_objdetect.cpp +++ b/modules/gpu/perf/perf_objdetect.cpp @@ -176,33 +176,35 @@ PERF_TEST_P(SoftCascadeTest, detect, { if (runOnGpu) { - cv::Mat cpu = readImage (GetParam().second); + cv::Mat cpu = readImage (GET_PARAM(1)); ASSERT_FALSE(cpu.empty()); cv::gpu::GpuMat colored(cpu); cv::gpu::SoftCascade cascade; - ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().first))); + ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GET_PARAM(0)))); - cv::gpu::GpuMat objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); - - rois.setTo(0); - cv::gpu::GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2)); - sub.setTo(cv::Scalar::all(1)); - cascade.detectMultiScale(colored, rois, objectBoxes); + cv::gpu::GpuMat objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois; + rois.setTo(1); + cv::gpu::transpose(rois, trois); + cascade.detectMultiScale(colored, trois, objectBoxes); TEST_CYCLE() { - cascade.detectMultiScale(colored, rois, objectBoxes); + cascade.detectMultiScale(colored, trois, objectBoxes); } - } else + } + else { - cv::Mat colored = readImage(GetParam().second); + cv::Mat colored = readImage(GET_PARAM(1)); ASSERT_FALSE(colored.empty()); cv::SoftCascade cascade; - ASSERT_TRUE(cascade.load(getDataPath(GetParam().first))); + ASSERT_TRUE(cascade.load(getDataPath(GET_PARAM(0)))); + + std::vector rois; - std::vector rois, objectBoxes; + typedef cv::SoftCascade::Detection Detection; + std::vectorobjectBoxes; cascade.detectMultiScale(colored, rois, objectBoxes); TEST_CYCLE() @@ -262,13 +264,16 @@ PERF_TEST_P(SoftCascadeTestRoi, detectInRoi, sub.setTo(1); } + cv::gpu::GpuMat trois; + cv::gpu::transpose(rois, trois); + cv::gpu::GpuMat curr = objectBoxes; - cascade.detectMultiScale(colored, rois, curr); + cascade.detectMultiScale(colored, trois, curr); TEST_CYCLE() { curr = objectBoxes; - cascade.detectMultiScale(colored, rois, curr); + cascade.detectMultiScale(colored, trois, curr); } } else @@ -301,7 +306,10 @@ PERF_TEST_P(SoftCascadeTestRoi, detectEachRoi, sub.setTo(1); cv::gpu::GpuMat curr = objectBoxes; - cascade.detectMultiScale(colored, rois, curr); + cv::gpu::GpuMat trois; + cv::gpu::transpose(rois, trois); + + cascade.detectMultiScale(colored, trois, curr); TEST_CYCLE() { @@ -372,7 +380,7 @@ PERF_TEST_P(ImageAndCascade, ObjDetect_LBPClassifier, cv::Mat img = readImage(GetParam().first, cv::IMREAD_GRAYSCALE); ASSERT_FALSE(img.empty()); - if (PERF_RUN_GPU()) + if (runOnGpu) { cv::gpu::CascadeClassifier_GPU d_cascade; ASSERT_TRUE(d_cascade.load(perf::TestBase::getDataPath(GetParam().second))); diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index 4bde7f7ea1..8df6907df2 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -86,7 +86,6 @@ namespace icf { } texture thogluv; - texture troi; template __device__ __forceinline__ float rescale(const Level& level, Node& node) @@ -130,11 +129,6 @@ namespace icf { float relScale = level.relScale; float farea = scaledRect.z * scaledRect.w; - dprintf("%d: feature %d box %d %d %d %d\n",threadIdx.x, (node.threshold >> 28), scaledRect.x, scaledRect.y, - scaledRect.z, scaledRect.w); - dprintf("%d: rescale: %f [%f %f] selected %f\n",threadIdx.x, level.relScale, level.scaling[0], level.scaling[1], - level.scaling[(node.threshold >> 28) > 6]); - // rescale scaledRect.x = __float2int_rn(relScale * scaledRect.x); scaledRect.y = __float2int_rn(relScale * scaledRect.y); @@ -146,15 +140,7 @@ namespace icf { const float expected_new_area = farea * relScale * relScale; float approx = __fdividef(sarea, expected_new_area); - dprintf("%d: new rect: %d box %d %d %d %d rel areas %f %f\n",threadIdx.x, (node.threshold >> 28), - scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea); - - float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx; - - rootThreshold *= level.scaling[(node.threshold >> 28) > 6]; - - dprintf("%d: approximation %f %d -> %f %f\n",threadIdx.x, approx, (node.threshold & 0x0FFFFFFFU), rootThreshold, - level.scaling[(node.threshold >> 28) > 6]); + float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx * level.scaling[(node.threshold >> 28) > 6]; return rootThreshold; } @@ -162,33 +148,17 @@ namespace icf { template __device__ __forceinline__ int get(int x, int y, uchar4 area) { - - dprintf("%d: feature box %d %d %d %d\n",threadIdx.x, area.x, area.y, area.z, area.w); - dprintf("%d: extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",threadIdx.x, - x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w, - x + area.x, y + area.w); - dprintf("%d: at point %d %d with offset %d\n", x, y, 0); - int a = tex2D(thogluv, x + area.x, y + area.y); int b = tex2D(thogluv, x + area.z, y + area.y); int c = tex2D(thogluv, x + area.z, y + area.w); int d = tex2D(thogluv, x + area.x, y + area.w); - dprintf("%d retruved integral values: %d %d %d %d\n",threadIdx.x, a, b, c, d); - return (a - b + c - d); } template<> __device__ __forceinline__ int get(int x, int y, uchar4 area) { - - dprintf("%d: feature box %d %d %d %d\n",threadIdx.x, area.x, area.y, area.z, area.w); - dprintf("%d: extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n",threadIdx.x, - x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w, - x + area.x, y + area.w); - dprintf("%d: at point %d %d with offset %d\n", x, y, 0); - x += area.x; y += area.y; int a = tex2D(thogluv, x, y); @@ -196,11 +166,10 @@ namespace icf { int c = tex2D(thogluv, x + area.z, y + area.w); int d = tex2D(thogluv, x, y + area.w); - dprintf("%d retruved integral values: %d %d %d %d\n",threadIdx.x, a, b, c, d); - return (a - b + c - d); } + texture troi; #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 template __global__ void test_kernel_warp(const Level* levels, const Octave* octaves, const float* stages, @@ -210,12 +179,21 @@ namespace icf { const int y = blockIdx.y * blockDim.y + threadIdx.y; const int x = blockIdx.x; + __shared__ volatile char roiCache[8]; + + if (!threadIdx.y && !threadIdx.x) + { + ((float2*)roiCache)[threadIdx.x] = tex2D(troi, blockIdx.y, x); + } + + __syncthreads(); + + if (!roiCache[threadIdx.y]) return; + Level level = levels[downscales + blockIdx.z]; if(x >= level.workRect.x || y >= level.workRect.y) return; - if (!tex2D(troi, x, y)) return; - Octave octave = octaves[level.octave]; int st = octave.index * octave.stages; const int stEnd = st + 1024; @@ -282,9 +260,9 @@ namespace icf { // if (blockIdx.z != 31) return; if(x >= level.workRect.x || y >= level.workRect.y) return; - int roi = tex2D(troi, x, y); - printf("%d\n", roi); - if (!roi) return; + // int roi = tex2D(troi, x, y); + // printf("%d\n", roi); + // if (!roi) return; Octave octave = octaves[level.octave]; @@ -357,8 +335,8 @@ namespace icf { cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); - cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols, roi.rows, roi.step)); + cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc(); + cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step)); test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, 0); cudaSafeCall( cudaGetLastError()); @@ -391,8 +369,8 @@ namespace icf { cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); - cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc(); - cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols, roi.rows, roi.step)); + cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc(); + cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step)); if (scale >= downscales) test_kernel_warp<<>>(l, oct, st, nd, lf, det, max_det, ctr, scale); diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 8b73ae6393..e7fcfff27c 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -481,7 +481,7 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& CV_Assert(colored.type() == CV_8UC3); // we guess user knows about shrincage - CV_Assert((rois.size() == getRoiSize()) && (rois.type() == CV_8UC1)); + CV_Assert((rois.size().width == getRoiSize().height) && (rois.type() == CV_8UC1)); // only this window size allowed CV_Assert(colored.cols == Filds::FRAME_WIDTH && colored.rows == Filds::FRAME_HEIGHT); diff --git a/modules/gpu/test/test_softcascade.cpp b/modules/gpu/test/test_softcascade.cpp index 0b266f827b..04fa9b1811 100644 --- a/modules/gpu/test/test_softcascade.cpp +++ b/modules/gpu/test/test_softcascade.cpp @@ -47,7 +47,7 @@ using cv::gpu::GpuMat; // show detection results on input image with cv::imshow -//#define SHOW_DETECTIONS +#define SHOW_DETECTIONS #if defined SHOW_DETECTIONS # define SHOW(res) \ @@ -154,26 +154,30 @@ GPU_TEST_P(SoftCascadeTest, detectInROI, cv::gpu::SoftCascade cascade; ASSERT_TRUE(cascade.load(cvtest::TS::ptr()->get_data_path() + GET_PARAM(0))); - GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); + GpuMat colored(coloredCpu), objectBoxes(1, 16384, CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1), trois; rois.setTo(0); int nroi = GET_PARAM(2); + cv::Mat result(coloredCpu); cv::RNG rng; for (int i = 0; i < nroi; ++i) { cv::Rect r = getFromTable(rng(10)); GpuMat sub(rois, r); sub.setTo(1); + r.x *= 4; r.y *= 4; r.width *= 4; r.height *= 4; + cv::rectangle(result, r, cv::Scalar(0, 0, 255, 255), 1); } - cascade.detectMultiScale(colored, rois, objectBoxes); + cv::gpu::transpose(rois, trois); + + cascade.detectMultiScale(colored, trois, objectBoxes); /// cv::Mat dt(objectBoxes); typedef cv::gpu::SoftCascade::Detection detection_t; detection_t* dts = (detection_t*)dt.data; - cv::Mat result(coloredCpu); printTotal(std::cout, dt.cols); for (int i = 0; i < (int)(dt.cols / sizeof(detection_t)); ++i) @@ -204,8 +208,11 @@ GPU_TEST_P(SoftCascadeTest, detectInLevel, GpuMat colored(coloredCpu), objectBoxes(1, 100 * sizeof(detection_t), CV_8UC1), rois(cascade.getRoiSize(), CV_8UC1); rois.setTo(1); + cv::gpu::GpuMat trois; + cv::gpu::transpose(rois, trois); + int level = GET_PARAM(2); - cascade.detectMultiScale(colored, rois, objectBoxes, 1, level); + cascade.detectMultiScale(colored, trois, objectBoxes, 1, level); cv::Mat dt(objectBoxes); @@ -246,6 +253,9 @@ TEST(SoftCascadeTest, detect) GpuMat sub(rois, cv::Rect(rois.cols / 4, rois.rows / 4,rois.cols / 2, rois.rows / 2)); sub.setTo(cv::Scalar::all(1)); - cascade.detectMultiScale(colored, rois, objectBoxes); + cv::gpu::GpuMat trois; + cv::gpu::transpose(rois, trois); + + cascade.detectMultiScale(colored, trois, objectBoxes); } #endif \ No newline at end of file