diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index 5cde710708..37c6e30235 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -42,11 +42,48 @@ #include #include +#include +#include -namespace cv { namespace gpu { +namespace cv { namespace gpu { namespace device { +namespace icf { - namespace device { + enum { + HOG_BINS = 6, + HOG_LUV_BINS = 10, + WIDTH = 640, + HEIGHT = 480, + GREY_OFFSET = HEIGHT * HOG_LUV_BINS + }; + + __global__ void magToHist(const uchar* __restrict__ mag, + const float* __restrict__ angle, const int angPitch, + uchar* __restrict__ hog, const int hogPitch) + { + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = blockIdx.x * blockDim.x + threadIdx.x; + + const int bin = (int)(angle[y * angPitch + x]); + const uchar val = mag[y * angPitch + x]; + + hog[((HEIGHT * bin) + y) * hogPitch + x] = val; + } + + void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle) + { + const uchar* mag = (const uchar*)hogluv.ptr(HEIGHT * HOG_BINS); + uchar* hog = (uchar*)hogluv.ptr(); + const float* angle = (const float*)nangle.ptr(); + + dim3 block(32, 8); + dim3 grid(WIDTH / 32, HEIGHT / 8); + + magToHist<<>>(mag, angle, nangle.step / sizeof(float), hog, hogluv.step); + cudaSafeCall( cudaGetLastError() ); + cudaSafeCall( cudaDeviceSynchronize() ); + } +} enum { HOG_BINS = 6, @@ -185,65 +222,175 @@ __global__ void intCol(ushort* __restrict__ sum, const int pitch) } -__global__ void detect(const cv::gpu::icf::Cascade cascade, const uchar* __restrict__ hogluv, const int pitch) +__global__ void detect(const cv::gpu::icf::Cascade cascade, const uchar* __restrict__ hogluv, const int pitch, + PtrStepSz objects) { - cascade.detectAt(); + cascade.detectAt(hogluv, pitch, objects); } } -void __device icf::Cascade::detectAt() const +float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect, + const int channel, const float threshold) const { + float relScale = level.relScale; + float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); + + // rescale + scaledRect.x = __float2int_rn(relScale * scaledRect.x); + scaledRect.y = __float2int_rn(relScale * scaledRect.y); + scaledRect.z = __float2int_rn(relScale * scaledRect.z); + scaledRect.w = __float2int_rn(relScale * scaledRect.w); + + float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); + float approx = 1.f; + if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON) + { + const float expected_new_area = farea * relScale * relScale; + approx = expected_new_area / sarea; + } + + // compensation areas rounding + float rootThreshold = threshold / approx; + rootThreshold *= level.scaling[(int)(channel > 6)]; + + return rootThreshold; } -void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv, cudaStream_t stream) const +typedef unsigned char uchar; +float __device get(const uchar* __restrict__ hogluv, const int pitch, + const int x, const int y, int channel, uchar4 area) { - // detection kernel - dim3 block(32, 8, 1); - dim3 grid(32 * ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 64); - device::detect<<>>(*this, hogluv, hogluv.step / sizeof(ushort)); - if (!stream) - cudaSafeCall( cudaDeviceSynchronize() ); + const uchar* curr = hogluv + ((channel * 121) + y) * pitch; + + int a = curr[area.y * pitch + x + area.x]; + int b = curr[area.y * pitch + x + area.z]; + int c = curr[area.w * pitch + x + area.z]; + int d = curr[area.w * pitch + x + area.x]; + return (a - b + c - d); } -void icf::ChannelStorage::frame(const cv::gpu::PtrStepSz& rgb, cudaStream_t stream) + +void __device icf::Cascade::detectAt(const uchar* __restrict__ hogluv, const int pitch, + PtrStepSz& objects) const { - // color convertin kernel - dim3 block(32, 8); - dim3 grid(FRAME_WIDTH / 32, FRAME_HEIGHT / 8); + const icf::Level* lls = (const icf::Level*)levels.ptr(); + Level level = lls[0]; - uchar * channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_BINS); - device::rgb2grayluv<<>>((uchar3*)rgb.ptr(), channels, - rgb.step / sizeof(uchar3), dmem.step); - cudaSafeCall( cudaGetLastError()); + const int y = blockIdx.y * blockDim.y + threadIdx.y; + const int x = blockIdx.x * blockDim.x + threadIdx.x; - // hog calculation kernel - channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_LUV_BINS); - device::gray2hog<<>>(channels, (uchar*)dmem.ptr(), dmem.step, magnitudeScaling); - cudaSafeCall( cudaGetLastError() ); + if (x >= level.workRect.x || y >= level.workRect.y) return; - const int shrWidth = FRAME_WIDTH / shrinkage; - const int shrHeight = FRAME_HEIGHT / shrinkage; + const Octave octave = ((const Octave*)octaves.ptr())[level.octave]; + const int stBegin = octave.index * octave.stages, stEnd = stBegin + octave.stages; - // decimate kernel - grid = dim3(shrWidth / 32, shrHeight / 8); - device::decimate<4><<>>((uchar*)dmem.ptr(), (uchar*)shrunk.ptr(), dmem.step, shrunk.step); - cudaSafeCall( cudaGetLastError() ); + float detectionScore = 0.f; - // integrate rows - block = dim3(shrWidth, 1); - grid = dim3(shrHeight * HOG_LUV_BINS, 1); - device::intRow<<>>((uchar*)shrunk.ptr(), (ushort*)hogluv.ptr(), - shrunk.step, hogluv.step / sizeof(ushort)); - cudaSafeCall( cudaGetLastError() ); + int st = stBegin; + for(; st < stEnd; ++st) + { + const float stage = stages(0, st); + { + const int nId = st * 3; + + // work with root node + const Node node = ((const Node*)nodes.ptr())[nId]; + const Feature feature = ((const Feature*)features.ptr())[node.feature]; - // integrate cols - block = dim3(128, 1); - grid = dim3(shrWidth * HOG_LUV_BINS, 1); - device::intCol<<>>((ushort*)hogluv.ptr(), hogluv.step / hogluv.step / sizeof(ushort)); + uchar4 scaledRect = feature.rect; + float threshold = rescale(level, scaledRect, feature.channel, node.threshold); + + float sum = get(hogluv,pitch, x, y, feature.channel, scaledRect); + + int next = 1 + (int)(sum >= threshold); + + // leaves + const Node leaf = ((const Node*)nodes.ptr())[nId + next]; + const Feature fLeaf = ((const Feature*)features.ptr())[leaf.feature]; + + scaledRect = fLeaf.rect; + threshold = rescale(level, scaledRect, feature.channel, node.threshold); + sum = get(hogluv, pitch, x, y, fLeaf.channel, scaledRect); + + const int lShift = (next - 1) * 2 + (int)(sum >= threshold); + float impact = leaves(0, (st * 4) + lShift); + + detectionScore += impact; + } + + if (detectionScore <= stage) break; + } + + // if (!threadIdx.x && !threadIdx.y)// printf("%f %d\n", detectionScore, st); + // printf("x %d y %d: %d\n", x, y, st); + + if (st == stEnd) + { + // printf(" got %d\n", st); + uchar4 a; + a.x = level.workRect.x; + a.y = level.workRect.y; + objects(0, threadIdx.x) = a; + } +} + +void icf::Cascade::detect(const cv::gpu::PtrStepSzb& hogluv, PtrStepSz objects, + cudaStream_t stream) const +{ + // detection kernel + dim3 block(32, 8, 1); + // dim3 grid(32 * ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 1); + dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 1); + device::detect<<>>(*this, hogluv, hogluv.step / sizeof(ushort), objects); cudaSafeCall( cudaGetLastError() ); + if (!stream) + cudaSafeCall( cudaDeviceSynchronize() ); + +} + +//////////////////////////////////////////////////// + + + +void icf::ChannelStorage::frame(const cv::gpu::PtrStepSz& rgb, cudaStream_t stream) +{ +// // // color convertin kernel +// // dim3 block(32, 8); +// // dim3 grid(FRAME_WIDTH / 32, FRAME_HEIGHT / 8); + +// // uchar * channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_BINS); +// // device::rgb2grayluv<<>>((uchar3*)rgb.ptr(), channels, +// // rgb.step / sizeof(uchar3), dmem.step); +// // cudaSafeCall( cudaGetLastError()); + +// // // hog calculation kernel +// // channels = (uchar*)dmem.ptr(FRAME_HEIGHT * HOG_LUV_BINS); +// // device::gray2hog<<>>(channels, (uchar*)dmem.ptr(), dmem.step, magnitudeScaling); +// // cudaSafeCall( cudaGetLastError() ); + +// // const int shrWidth = FRAME_WIDTH / shrinkage; +// // const int shrHeight = FRAME_HEIGHT / shrinkage; + +// // // decimate kernel +// // grid = dim3(shrWidth / 32, shrHeight / 8); +// // device::decimate<4><<>>((uchar*)dmem.ptr(), (uchar*)shrunk.ptr(), dmem.step, shrunk.step); +// // cudaSafeCall( cudaGetLastError() ); + +// // // integrate rows +// // block = dim3(shrWidth, 1); +// // grid = dim3(shrHeight * HOG_LUV_BINS, 1); +// // device::intRow<<>>((uchar*)shrunk.ptr(), (ushort*)hogluv.ptr(), +// // shrunk.step, hogluv.step / sizeof(ushort)); +// // cudaSafeCall( cudaGetLastError() ); + +// // // integrate cols +// // block = dim3(128, 1); +// // grid = dim3(shrWidth * HOG_LUV_BINS, 1); +// // device::intCol<<>>((ushort*)hogluv.ptr(), hogluv.step / hogluv.step / sizeof(ushort)); +// // cudaSafeCall( cudaGetLastError() ); } }} \ No newline at end of file diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index 69d21fdd9b..454dad8812 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -59,6 +59,42 @@ using cv::gpu::PtrStepSzf; typedef unsigned char uchar; +struct __align__(16) Octave +{ + ushort index; + ushort stages; + ushort shrinkage; + ushort2 size; + float scale; + + Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc) + : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {} +}; + +struct __align__(8) Level //is actually 24 bytes +{ + int octave; + + // float origScale; //not actually used + float relScale; + float shrScale; // used for marking detection + float scaling[2]; // calculated according to Dollal paper + + // for 640x480 we can not get overflow + uchar2 workRect; + uchar2 objSize; + + Level(int idx, const Octave& oct, const float scale, const int w, const int h) + : octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage) + { + workRect.x = round(w / (float)oct.shrinkage); + workRect.y = round(h / (float)oct.shrinkage); + + objSize.x = round(oct.size.x * relScale); + objSize.y = round(oct.size.y * relScale); + } +}; + struct Cascade { Cascade() {} @@ -66,8 +102,10 @@ struct Cascade const cv::gpu::PtrStepSzf& lvs, const cv::gpu::PtrStepSzb& fts, const cv::gpu::PtrStepSzb& lls) : octaves(octs), stages(sts), nodes(nds), leaves(lvs), features(fts), levels(lls) {} - void detect(const cv::gpu::PtrStepSzb& hogluv, cudaStream_t stream) const; - void __device detectAt() const; + void detect(const cv::gpu::PtrStepSzb& hogluv, cv::gpu::PtrStepSz objects, cudaStream_t stream) const; + void __device detectAt(const uchar* __restrict__ hogluv, const int pitch, PtrStepSz& objects) const; + float __device rescale(const icf::Level& level, uchar4& scaledRect, + const int channel, const float threshold) const; PtrStepSzb octaves; PtrStepSzf stages; @@ -108,18 +146,6 @@ struct ChannelStorage static const float magnitudeScaling = 1.f ;// / sqrt(2); }; -struct __align__(16) Octave -{ - ushort index; - ushort stages; - ushort shrinkage; - ushort2 size; - float scale; - - Octave(const ushort i, const ushort s, const ushort sh, const ushort2 sz, const float sc) - : index(i), stages(s), shrinkage(sh), size(sz), scale(sc) {} -}; - struct __align__(8) Node { int feature; @@ -135,30 +161,6 @@ struct __align__(8) Feature Feature(const int c, const uchar4 r) : channel(c), rect(r) {} }; - -struct __align__(8) Level //is actually 24 bytes -{ - int octave; - - // float origScale; //not actually used - float relScale; - float shrScale; // used for marking detection - float scaling[2]; // calculated according to Dollal paper - - // for 640x480 we can not get overflow - uchar2 workRect; - uchar2 objSize; - - Level(int idx, const Octave& oct, const float scale, const int w, const int h) - : octave(idx), relScale(scale / oct.scale), shrScale (relScale / (float)oct.shrinkage) - { - workRect.x = round(w / (float)oct.shrinkage); - workRect.y = round(h / (float)oct.shrinkage); - - objSize.x = round(oct.size.x * relScale); - objSize.y = round(oct.size.y * relScale); - } -}; }}} #endif \ No newline at end of file diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index b2419c12cf..abcae73dcb 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -41,6 +41,7 @@ //M*/ #include +#include "opencv2/highgui/highgui.hpp" #if !defined (HAVE_CUDA) @@ -58,6 +59,12 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat&, const GpuMat&, GpuMat #include +namespace cv { namespace gpu { namespace device { +namespace icf { + void fillBins(cv::gpu::PtrStepSzb hogluv,const cv::gpu::PtrStepSzf& nangle); +} +}}} + struct cv::gpu::SoftCascade::Filds { // scales range @@ -81,6 +88,16 @@ struct cv::gpu::SoftCascade::Filds // 161x121x10 GpuMat hogluv; + // will be removed in final version + // temporial mat for cvtColor + GpuMat luv; + + // temporial mat for integrall + GpuMat integralBuffer; + + // temp matrix for sobel and cartToPolar + GpuMat dfdx, dfdy, angle, mag, nmag, nangle; + std::vector scales; icf::Cascade cascade; @@ -100,9 +117,9 @@ struct cv::gpu::SoftCascade::Filds }; bool fill(const FileNode &root, const float mins, const float maxs); - void detect(cudaStream_t stream) const + void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const { - cascade.detect(hogluv, stream); + cascade.detect(hogluv, objects, stream); } private: @@ -284,7 +301,18 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float // allocate buffers dmem.create(FRAME_HEIGHT * (HOG_LUV_BINS + 1), FRAME_WIDTH, CV_8UC1); shrunk.create(FRAME_HEIGHT / shrinkage * HOG_LUV_BINS, FRAME_WIDTH / shrinkage, CV_8UC1); - hogluv.create( (FRAME_HEIGHT / shrinkage * HOG_LUV_BINS) + 1, (FRAME_WIDTH / shrinkage) + 1, CV_16UC1); + // hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_16UC1); + hogluv.create( (FRAME_HEIGHT / shrinkage + 1) * HOG_LUV_BINS, (FRAME_WIDTH / shrinkage + 1), CV_32SC1); + luv.create(FRAME_HEIGHT, FRAME_WIDTH, CV_8UC3); + integralBuffer.create(shrunk.rows + 1 * HOG_LUV_BINS, shrunk.cols + 1, CV_32SC1); + + dfdx.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); + dfdy.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); + angle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); + mag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); + + nmag.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); + nangle.create(FRAME_HEIGHT, FRAME_WIDTH, CV_32FC1); storage = icf::ChannelStorage(dmem, shrunk, hogluv, shrinkage); return true; @@ -393,21 +421,71 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c return true; } -void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& image, const GpuMat& /*rois*/, - GpuMat& /*objects*/, const int /*rejectfactor*/, Stream s) +void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& /*rois*/, + GpuMat& objects, const int /*rejectfactor*/, Stream s) { // only color images are supperted - CV_Assert(image.type() == CV_8UC3); + CV_Assert(colored.type() == CV_8UC3); - // only this window size allowed - CV_Assert(image.cols == 640 && image.rows == 480); + // // only this window size allowed + CV_Assert(colored.cols == 640 && colored.rows == 480); Filds& flds = *filds; + GpuMat& dmem = flds.dmem; + cudaMemset(dmem.data, 0, dmem.step * dmem.rows); + GpuMat& shrunk = flds.shrunk; + int w = shrunk.cols; + int h = colored.rows / flds.storage.shrinkage; cudaStream_t stream = StreamAccessor::getStream(s); - flds.storage.frame(image, stream); - flds.detect(stream); + std::vector splited; + for(int i = 0; i < 3; ++i) + { + splited.push_back(GpuMat(dmem, cv::Rect(0, colored.rows * (7 + i), colored.cols, colored.rows))); + } + + GpuMat gray(dmem, cv::Rect(0, colored.rows * 10, colored.cols, colored.rows) ); + + cv::gpu::cvtColor(colored, gray, CV_RGB2GRAY); + + //create hog + cv::gpu::Sobel(gray, flds.dfdx, CV_32F, 1, 0, 3, 0.25); + cv::gpu::Sobel(gray, flds.dfdy, CV_32F, 0, 1, 3, 0.25); + + cv::gpu::cartToPolar(flds.dfdx, flds.dfdy, flds.mag, flds.angle, true); + + cv::gpu::multiply(flds.mag, cv::Scalar::all(1.0 / ::log(2)), flds.nmag); + cv::gpu::multiply(flds.angle, cv::Scalar::all(1.0 / 60.0), flds.nangle); + + GpuMat magCannel(dmem, cv::Rect(0, colored.rows * 6, colored.cols, colored.rows)); + flds.nmag.convertTo(magCannel, CV_8UC1); + device::icf::fillBins(dmem, flds.nangle); + + // create luv + cv::gpu::cvtColor(colored, flds.luv, CV_BGR2Luv); + cv::gpu::split(flds.luv, splited); + + GpuMat plane(dmem, cv::Rect(0, 0, colored.cols, colored.rows * Filds::HOG_LUV_BINS)); + cv::gpu::resize(plane, flds.shrunk, cv::Size(), 0.25, 0.25, CV_INTER_AREA); + // cv::Mat cpu(plane); + // cv::imshow("channels", cpu); + // cv::waitKey(0); + + // fer debug purpose + // cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows); + + for(int i = 0; i < Filds::HOG_LUV_BINS; ++i) + { + GpuMat channel(shrunk, cv::Rect(0, h * i, w, h )); + GpuMat sum(flds.hogluv, cv::Rect(0, (h + 1) * i, w + 1, h + 1)); + cv::gpu::integralBuffered(channel, sum, flds.integralBuffer); + } + + // detection + flds.detect(objects, stream); + + // flds.storage.frame(colored, stream); } #endif \ No newline at end of file