diff --git a/modules/gpu/perf/perf_objdetect.cpp b/modules/gpu/perf/perf_objdetect.cpp index 48a355d6a7..e272d65352 100644 --- a/modules/gpu/perf/perf_objdetect.cpp +++ b/modules/gpu/perf/perf_objdetect.cpp @@ -104,7 +104,7 @@ PERF_TEST_P(SoftCascade, detect, Values(make_pair("cv/cascadeandhog cv::gpu::SoftCascade cascade; ASSERT_TRUE(cascade.load(perf::TestBase::getDataPath(GetParam().first))); - cv::gpu::GpuMat rois, objectBoxes(1, 1000, CV_8UC1); + cv::gpu::GpuMat rois, objectBoxes(1, 1000, CV_8UC4); cascade.detectMultiScale(colored, rois, objectBoxes); TEST_CYCLE() @@ -117,7 +117,7 @@ PERF_TEST_P(SoftCascade, detect, Values(make_pair("cv/cascadeandhog ASSERT_FALSE(colored.empty()); cv::SoftCascade cascade; - ASSERT_TRUE(cascade.load(GetParam().first)); + ASSERT_TRUE(cascade.load(getDataPath(GetParam().first))); std::vector rois, objectBoxes; cascade.detectMultiScale(colored, rois, objectBoxes); diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index c8dff34bdc..4bf410fc59 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -57,14 +57,6 @@ namespace cv { namespace gpu { namespace device { namespace icf { -// enum { -// HOG_BINS = 6, -// HOG_LUV_BINS = 10, -// WIDTH = 640, -// HEIGHT = 480, -// GREY_OFFSET = HEIGHT * HOG_LUV_BINS -// }; - // ToDo: use textures or ancached load instruction. __global__ void magToHist(const uchar* __restrict__ mag, const float* __restrict__ angle, const int angPitch, @@ -94,13 +86,6 @@ namespace icf { } texture thogluv; - // ToDo: do it in load time - // __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node) - // { - // scaledRect = node.rect; - // return (float)(node.threshold & 0x0FFFFFFFU); - // } - __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node) { float relScale = level.relScale; @@ -119,17 +104,12 @@ namespace icf { float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); - float approx = 1.f; - // if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON) - { - const float expected_new_area = farea * relScale * relScale; - approx = sarea / expected_new_area; - } + const float expected_new_area = farea * relScale * relScale; + float approx = sarea / expected_new_area; dprintf("new rect: %d box %d %d %d %d rel areas %f %f\n", (node.threshold >> 28), scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea); - float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx; rootThreshold *= level.scaling[(node.threshold >> 28) > 6]; @@ -139,7 +119,7 @@ namespace icf { return rootThreshold; } - __device__ __forceinline__ int get(const int x, int y, int channel, uchar4 area) + __device__ __forceinline__ int get(const int x, int y, uchar4 area) { dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w); @@ -149,9 +129,6 @@ namespace icf { x + area.x, y + area.w); dprintf("at point %d %d with offset %d\n", x, y, 0); - int offset = channel * 121; - y += offset; - int a = tex2D(thogluv, x + area.x, y + area.y); int b = tex2D(thogluv, x + area.z, y + area.y); int c = tex2D(thogluv, x + area.z, y + area.w); @@ -163,7 +140,7 @@ namespace icf { } __global__ void test_kernel(const Level* levels, const Octave* octaves, const float* stages, - const Node* nodes, const float* leaves, PtrStepSz objects) + const Node* nodes, const float* leaves, PtrStepSz objects, uint* ctr) { const int y = blockIdx.y * blockDim.y + threadIdx.y; const int x = blockIdx.x * blockDim.x + threadIdx.x; @@ -179,7 +156,7 @@ namespace icf { float confidence = 0.f; -// #pragma unroll 8 +// #pragma unroll 2 for(; st < stEnd; ++st) { dprintf("\n\nstage: %d\n", st); @@ -190,7 +167,7 @@ namespace icf { node.threshold >> 28, node.threshold & 0x0FFFFFFFU); float threshold = rescale(level, node.rect, node); - int sum = get(x, y, (node.threshold >> 28), node.rect); + int sum = get(x, y + (node.threshold >> 28) * 121, node.rect); dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w, threshold); @@ -200,29 +177,30 @@ namespace icf { node = nodes[nId + next]; threshold = rescale(level, node.rect, node); - sum = get(x, y, (node.threshold >> 28), node.rect); + sum = get(x, y + (node.threshold >> 28) * 121, node.rect); const int lShift = (next - 1) * 2 + (int)(sum >= threshold); float impact = leaves[st * 4 + lShift]; confidence += impact; - if (confidence <= stages[st]) st = stEnd + 1; + if (confidence <= stages[st]) st = stEnd + 10; dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact); dprintf("extracted stage: %f\n", stages[st]); dprintf("computed score: %f\n\n", confidence); } - // if (st == stEnd) - // printf("%d %d %d\n", x, y, st); - - uchar4 val; - val.x = (int)confidence; - if (x == y) objects(0, threadIdx.x) = val; - + if(st == stEnd) + { + int idx = atomicInc(ctr, objects.cols); + uchar4 val; + val.x = x * 4; + objects(0, idx) = val; + } } void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, - const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz objects) + const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, + PtrStepSz objects, PtrStepSzi counter) { int fw = 160; int fh = 120; @@ -235,11 +213,12 @@ namespace icf { const float* st = (const float*)stages.ptr(); const Node* nd = (const Node*)nodes.ptr(); const float* lf = (const float*)leaves.ptr(); + uint* ctr = (uint*)counter.ptr(); cudaChannelFormatDesc desc = cudaCreateChannelDesc(); cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); - test_kernel<<>>(l, oct, st, nd, lf, objects); + test_kernel<<>>(l, oct, st, nd, lf, objects, ctr); cudaSafeCall( cudaGetLastError()); cudaSafeCall( cudaDeviceSynchronize()); diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index ffbf380c69..320fbb3437 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -60,7 +60,8 @@ namespace icf { void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle, const int fw, const int fh, const int bins); void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, - const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz objects); + const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz objects, + PtrStepSzi counter); } }}} @@ -75,6 +76,7 @@ struct cv::gpu::SoftCascade::Filds shrunk.create(FRAME_HEIGHT / 4 * HOG_LUV_BINS, FRAME_WIDTH / 4, CV_8UC1); integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1); hogluv.create((FRAME_HEIGHT / 4 + 1) * HOG_LUV_BINS, FRAME_WIDTH / 4 + 1, CV_32SC1); + detCounter.create(1,1, CV_32SC1); } // scales range @@ -90,6 +92,8 @@ struct cv::gpu::SoftCascade::Filds GpuMat leaves; GpuMat levels; + GpuMat detCounter; + // preallocated buffer 640x480x10 for hogluv + 640x480 got gray GpuMat plane; @@ -127,7 +131,8 @@ struct cv::gpu::SoftCascade::Filds bool fill(const FileNode &root, const float mins, const float maxs); void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const { - device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects); + cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int)); + device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter); } private: @@ -506,14 +511,13 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& GpuMat sum(flds.hogluv, cv::Rect(0, (fh + 1) * i, fw + 1, fh + 1)); cv::gpu::integralBuffered(channel, sum, flds.integralBuffer); } - #endif cudaStream_t stream = StreamAccessor::getStream(s); - // detection flds.detect(objects, stream); -// // flds.storage.frame(colored, stream); + // cv::Mat out(flds.detCounter); + // std::cout << out << std::endl; } #endif \ No newline at end of file