From 8108bd30febf17e81f8329ccb65f695dd335a471 Mon Sep 17 00:00:00 2001 From: "marina.kolpakova" Date: Thu, 27 Sep 2012 12:44:06 +0400 Subject: [PATCH] optimize memory usage --- modules/gpu/src/cuda/isf-sc.cu | 306 +++++++++++++------------------- modules/gpu/src/icf.hpp | 29 +-- modules/gpu/src/softcascade.cpp | 88 +++++---- 3 files changed, 181 insertions(+), 242 deletions(-) diff --git a/modules/gpu/src/cuda/isf-sc.cu b/modules/gpu/src/cuda/isf-sc.cu index 714bdfa44a..c8dff34bdc 100644 --- a/modules/gpu/src/cuda/isf-sc.cu +++ b/modules/gpu/src/cuda/isf-sc.cu @@ -42,18 +42,17 @@ #include #include -// #include #include -// #include +#include -// //#define LOG_CUDA_CASCADE +// #define LOG_CUDA_CASCADE -// #if defined LOG_CUDA_CASCADE -// # define dprintf(format, ...) \ -// do { printf(format, __VA_ARGS__); } while (0) -// #else -// # define dprintf(format, ...) -// #endif +#if defined LOG_CUDA_CASCADE +# define dprintf(format, ...) \ + do { printf(format, __VA_ARGS__); } while (0) +#else +# define dprintf(format, ...) +#endif namespace cv { namespace gpu { namespace device { namespace icf { @@ -94,32 +93,128 @@ namespace icf { cudaSafeCall( cudaDeviceSynchronize() ); } - texture tnode; + texture thogluv; + // ToDo: do it in load time + // __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node) + // { + // scaledRect = node.rect; + // return (float)(node.threshold & 0x0FFFFFFFU); + // } + + __device__ __forceinline__ float rescale(const Level& level, uchar4& scaledRect, const Node& node) + { + float relScale = level.relScale; + float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); + + dprintf("feature %d box %d %d %d %d\n", (node.threshold >> 28), scaledRect.x, scaledRect.y, + scaledRect.z, scaledRect.w); + dprintf("rescale: %f [%f %f] selected %f\n",level.relScale, level.scaling[0], level.scaling[1], + level.scaling[(node.threshold >> 28) > 6]); + + // rescale + scaledRect.x = __float2int_rn(relScale * scaledRect.x); + scaledRect.y = __float2int_rn(relScale * scaledRect.y); + scaledRect.z = __float2int_rn(relScale * scaledRect.z); + scaledRect.w = __float2int_rn(relScale * scaledRect.w); + + float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); + + float approx = 1.f; + // if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON) + { + const float expected_new_area = farea * relScale * relScale; + approx = sarea / expected_new_area; + } + + dprintf("new rect: %d box %d %d %d %d rel areas %f %f\n", (node.threshold >> 28), + scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea); + + + float rootThreshold = (node.threshold & 0x0FFFFFFFU) * approx; + rootThreshold *= level.scaling[(node.threshold >> 28) > 6]; + + dprintf("approximation %f %d -> %f %f\n", approx, (node.threshold & 0x0FFFFFFFU), rootThreshold, + level.scaling[(node.threshold >> 28) > 6]); + + return rootThreshold; + } + + __device__ __forceinline__ int get(const int x, int y, int channel, uchar4 area) + { + + dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w); + dprintf("get for channel %d\n", channel); + dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n", + x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w, + x + area.x, y + area.w); + dprintf("at point %d %d with offset %d\n", x, y, 0); + + int offset = channel * 121; + y += offset; + + int a = tex2D(thogluv, x + area.x, y + area.y); + int b = tex2D(thogluv, x + area.z, y + area.y); + int c = tex2D(thogluv, x + area.z, y + area.w); + int d = tex2D(thogluv, x + area.x, y + area.w); + + dprintf(" retruved integral values: %d %d %d %d\n", a, b, c, d); + + return (a - b + c - d); + } + __global__ void test_kernel(const Level* levels, const Octave* octaves, const float* stages, - const Node* nodes, - PtrStepSz objects) + const Node* nodes, const float* leaves, PtrStepSz objects) { const int y = blockIdx.y * blockDim.y + threadIdx.y; const int x = blockIdx.x * blockDim.x + threadIdx.x; Level level = levels[blockIdx.z]; + + // if (x > 0 || y > 0 || blockIdx.z > 0) return; if(x >= level.workRect.x || y >= level.workRect.y) return; Octave octave = octaves[level.octave]; + int st = octave.index * octave.stages; const int stEnd = st + 1000;//octave.stages; float confidence = 0.f; -#pragma unroll 8 +// #pragma unroll 8 for(; st < stEnd; ++st) { + dprintf("\n\nstage: %d\n", st); const int nId = st * 3; - const Node node = nodes[nId]; + Node node = nodes[nId]; + + dprintf("Node: [%d %d %d %d] %d %d\n", node.rect.x, node.rect.y, node.rect.z, node.rect.w, + node.threshold >> 28, node.threshold & 0x0FFFFFFFU); + + float threshold = rescale(level, node.rect, node); + int sum = get(x, y, (node.threshold >> 28), node.rect); + + dprintf("Node: [%d %d %d %d] %f\n", node.rect.x, node.rect.y, node.rect.z, + node.rect.w, threshold); + + int next = 1 + (int)(sum >= threshold); + dprintf("go: %d (%d >= %f)\n\n" ,next, sum, threshold); + + node = nodes[nId + next]; + threshold = rescale(level, node.rect, node); + sum = get(x, y, (node.threshold >> 28), node.rect); - const float stage = stages[st]; - confidence += node.rect.x * stage; + const int lShift = (next - 1) * 2 + (int)(sum >= threshold); + float impact = leaves[st * 4 + lShift]; + confidence += impact; + + if (confidence <= stages[st]) st = stEnd + 1; + dprintf("decided: %d (%d >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact); + dprintf("extracted stage: %f\n", stages[st]); + dprintf("computed score: %f\n\n", confidence); } + // if (st == stEnd) + // printf("%d %d %d\n", x, y, st); + uchar4 val; val.x = (int)confidence; if (x == y) objects(0, threadIdx.x) = val; @@ -127,188 +222,27 @@ namespace icf { } void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, - const PtrStepSzb& nodes, const PtrStepSzb& features, - PtrStepSz objects) + const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz objects) { int fw = 160; int fh = 120; + dim3 block(32, 8); dim3 grid(fw / 32, fh / 8, 47); + const Level* l = (const Level*)levels.ptr(); const Octave* oct = ((const Octave*)octaves.ptr()); const float* st = (const float*)stages.ptr(); const Node* nd = (const Node*)nodes.ptr(); - // cudaSafeCall( cudaBindTexture(0, tnode, nodes.data, rgb.cols / size) ); + const float* lf = (const float*)leaves.ptr(); + + cudaChannelFormatDesc desc = cudaCreateChannelDesc(); + cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step)); - test_kernel<<>>(l, oct, st, nd, objects); + test_kernel<<>>(l, oct, st, nd, lf, objects); cudaSafeCall( cudaGetLastError()); cudaSafeCall( cudaDeviceSynchronize()); } } -}}} - -// __global__ void detect(const cv::gpu::icf::Cascade cascade, const int* __restrict__ hogluv, const int pitch, -// PtrStepSz objects) -// { -// cascade.detectAt(hogluv, pitch, objects); -// } - -// } - -// float __device icf::Cascade::rescale(const icf::Level& level, uchar4& scaledRect, -// const int channel, const float threshold) const -// { -// dprintf("feature %d box %d %d %d %d\n", channel, scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w); -// dprintf("rescale: %f [%f %f]\n",level.relScale, level.scaling[0], level.scaling[1]); - -// float relScale = level.relScale; -// float farea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); - -// // rescale -// scaledRect.x = __float2int_rn(relScale * scaledRect.x); -// scaledRect.y = __float2int_rn(relScale * scaledRect.y); -// scaledRect.z = __float2int_rn(relScale * scaledRect.z); -// scaledRect.w = __float2int_rn(relScale * scaledRect.w); - -// float sarea = (scaledRect.z - scaledRect.x) * (scaledRect.w - scaledRect.y); - - -// float approx = 1.f; -// if (fabs(farea - 0.f) > FLT_EPSILON && fabs(farea - 0.f) > FLT_EPSILON) -// { -// const float expected_new_area = farea * relScale * relScale; -// approx = expected_new_area / sarea; -// } - -// dprintf("new rect: %d box %d %d %d %d rel areas %f %f\n", channel, -// scaledRect.x, scaledRect.y, scaledRect.z, scaledRect.w, farea * relScale * relScale, sarea); - -// // compensation areas rounding -// float rootThreshold = threshold / approx; -// // printf(" approx %f\n", rootThreshold); -// rootThreshold *= level.scaling[(int)(channel > 6)]; - -// dprintf("approximation %f %f -> %f %f\n", approx, threshold, rootThreshold, level.scaling[(int)(channel > 6)]); - -// return rootThreshold; -// } - -// typedef unsigned char uchar; -// float __device get(const int* __restrict__ hogluv, const int pitch, -// const int x, const int y, int channel, uchar4 area) -// { -// dprintf("feature box %d %d %d %d ", area.x, area.y, area.z, area.w); -// dprintf("get for channel %d\n", channel); -// dprintf("extract feature for: [%d %d] [%d %d] [%d %d] [%d %d]\n", -// x + area.x, y + area.y, x + area.z, y + area.y, x + area.z,y + area.w, -// x + area.x, y + area.w); -// dprintf("at point %d %d with offset %d\n", x, y, 0); - -// const int* curr = hogluv + ((channel * 121) + y) * pitch; - -// int a = curr[area.y * pitch + x + area.x]; -// int b = curr[area.y * pitch + x + area.z]; -// int c = curr[area.w * pitch + x + area.z]; -// int d = curr[area.w * pitch + x + area.x]; - -// dprintf(" retruved integral values: %d %d %d %d\n", a, b, c, d); - -// return (a - b + c - d); -// } - - -// void __device icf::Cascade::detectAt(const int* __restrict__ hogluv, const int pitch, -// PtrStepSz& objects) const -// { -// const icf::Level* lls = (const icf::Level*)levels.ptr(); - -// const int y = blockIdx.y * blockDim.y + threadIdx.y; -// const int x = blockIdx.x * blockDim.x + threadIdx.x; -// // if (x > 0 || y > 0) return; - -// Level level = lls[blockIdx.z]; -// if (x >= level.workRect.x || y >= level.workRect.y) return; - -// dprintf("level: %d (%f %f) [%f %f] (%d %d) (%d %d)\n", level.octave, level.relScale, level.shrScale, -// level.scaling[0], level.scaling[1], level.workRect.x, level.workRect.y, level.objSize.x, level.objSize.y); - -// const Octave octave = ((const Octave*)octaves.ptr())[level.octave]; -// // printf("Octave: %d %d %d (%d %d) %f\n", octave.index, octave.stages, -// // octave.shrinkage, octave.size.x, octave.size.y, octave.scale); - -// const int stBegin = octave.index * octave.stages, stEnd = stBegin + octave.stages; - -// float detectionScore = 0.f; - -// int st = stBegin; -// for(; st < stEnd; ++st) -// { -// const float stage = stages(0, st); -// dprintf("Stage: %f\n", stage); -// { -// const int nId = st * 3; - -// // work with root node -// const Node node = ((const Node*)nodes.ptr())[nId]; - -// dprintf("Node: %d %f\n", node.feature, node.threshold); - -// const Feature feature = ((const Feature*)features.ptr())[node.feature]; - -// uchar4 scaledRect = feature.rect; -// float threshold = rescale(level, scaledRect, feature.channel, node.threshold); - -// float sum = get(hogluv,pitch, x, y, feature.channel, scaledRect); - -// dprintf("root feature %d %f\n",feature.channel, sum); - -// int next = 1 + (int)(sum >= threshold); - -// dprintf("go: %d (%f >= %f)\n\n" ,next, sum, threshold); - -// // leaves -// const Node leaf = ((const Node*)nodes.ptr())[nId + next]; -// const Feature fLeaf = ((const Feature*)features.ptr())[leaf.feature]; - -// scaledRect = fLeaf.rect; -// threshold = rescale(level, scaledRect, fLeaf.channel, leaf.threshold); -// sum = get(hogluv, pitch, x, y, fLeaf.channel, scaledRect); - -// const int lShift = (next - 1) * 2 + (int)(sum >= threshold); -// float impact = leaves(0, (st * 4) + lShift); - -// detectionScore += impact; - -// dprintf("decided: %d (%f >= %f) %d %f\n\n" ,next, sum, threshold, lShift, impact); -// dprintf("extracted stage:\n"); -// dprintf("ct %f\n", stage); -// dprintf("computed score %f\n\n", detectionScore); -// dprintf("\n\n"); -// } - -// if (detectionScore <= stage || st - stBegin == 100) break; -// } - -// dprintf("x %d y %d: %d\n", x, y, st - stBegin); - -// if (st == stEnd) -// { -// uchar4 a; -// a.x = level.workRect.x; -// a.y = level.workRect.y; -// objects(0, threadIdx.x) = a; -// } -// } - -// void icf::Cascade::detect(const cv::gpu::PtrStepSzi& hogluv, PtrStepSz objects, cudaStream_t stream) const -// { -// dim3 block(32, 8, 1); -// dim3 grid(ChannelStorage::FRAME_WIDTH / 32, ChannelStorage::FRAME_HEIGHT / 8, 47); -// device::detect<<>>(*this, hogluv, hogluv.step / sizeof(int), objects); -// cudaSafeCall( cudaGetLastError() ); -// if (!stream) -// cudaSafeCall( cudaDeviceSynchronize() ); -// } - -// }} \ No newline at end of file +}}} \ No newline at end of file diff --git a/modules/gpu/src/icf.hpp b/modules/gpu/src/icf.hpp index 51ea2c068a..ecd1886d38 100644 --- a/modules/gpu/src/icf.hpp +++ b/modules/gpu/src/icf.hpp @@ -40,11 +40,13 @@ // //M -#include #ifndef __OPENCV_ICF_HPP__ #define __OPENCV_ICF_HPP__ +#include +#include + // #if defined __CUDACC__ // # define __device __device__ __forceinline__ // #else @@ -92,20 +94,27 @@ struct __align__(8) Level //is actually 24 bytes struct __align__(8) Node { - // int feature; uchar4 rect; - float threshold; + // ushort channel; + uint threshold; - Node(const uchar4 c, const int t) : rect(c), threshold(t) {} + enum { THRESHOLD_MASK = 0x0FFFFFFF }; + + Node(const uchar4 r, const uint ch, const uint t) : rect(r), threshold(t + (ch << 28)) + { + // printf("%d\n", t); + // printf("[%d %d %d %d] %d, %d\n",rect.x, rect.y, rect.z, rect.w, (int)(threshold >> 28), + // (int)(0x0FFFFFFF & threshold)); + } }; -struct __align__(8) Feature -{ - int channel; - uchar4 rect; +// struct __align__(8) Feature +// { +// int channel; +// uchar4 rect; - Feature(const int c, const uchar4 r) : channel(c), rect(r) {} -}; +// Feature(const int c, const uchar4 r) : channel(c), rect(r) {} +// }; } }}} // struct Cascade diff --git a/modules/gpu/src/softcascade.cpp b/modules/gpu/src/softcascade.cpp index 8d75176ab3..ffbf380c69 100644 --- a/modules/gpu/src/softcascade.cpp +++ b/modules/gpu/src/softcascade.cpp @@ -60,19 +60,10 @@ namespace icf { void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle, const int fw, const int fh, const int bins); void detect(const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages, - const PtrStepSzb& nodes, const PtrStepSzb& features, - PtrStepSz objects); + const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv, PtrStepSz objects); } }}} -// namespace { -// char *itoa(long i, char* s, int /*dummy_radix*/) -// { -// sprintf(s, "%ld", i); -// return s; -// } -// } - struct cv::gpu::SoftCascade::Filds { @@ -97,7 +88,6 @@ struct cv::gpu::SoftCascade::Filds GpuMat stages; GpuMat nodes; GpuMat leaves; - GpuMat features; GpuMat levels; // preallocated buffer 640x480x10 for hogluv + 640x480 got gray @@ -137,7 +127,7 @@ struct cv::gpu::SoftCascade::Filds bool fill(const FileNode &root, const float mins, const float maxs); void detect(cv::gpu::GpuMat objects, cudaStream_t stream) const { - device::icf::detect(levels, octaves, stages, nodes, features, objects); + device::icf::detect(levels, octaves, stages, nodes, leaves, hogluv, objects); } private: @@ -216,10 +206,9 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float if (fn.empty()) return false; std::vector voctaves; - std::vector vstages; + std::vector vstages; std::vector vnodes; - std::vector vleaves; - std::vector vfeatures; + std::vector vleaves; scales.clear(); FileNodeIterator it = fn.begin(), it_end = fn.end(); @@ -245,6 +234,8 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float FileNode ffs = fns[SC_FEATURES]; if (ffs.empty()) return false; + FileNodeIterator ftrs = ffs.begin(); + fns = fns[SC_STAGES]; if (fn.empty()) return false; @@ -263,10 +254,21 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float FileNodeIterator inIt = fns.begin(), inIt_end = fns.end(); for (; inIt != inIt_end;) { - int feature = (int)(*(inIt +=2)++) + feature_offset; - float th = (float)(*(inIt++)); + // int feature = (int)(*(inIt +=2)) + feature_offset; + inIt +=3; + // extract feature, Todo:check it + uint th = saturate_cast((float)(*(inIt++))); + cv::FileNode ftn = (*ftrs)[SC_F_RECT]; + cv::FileNodeIterator r_it = ftn.begin(); uchar4 rect; - vnodes.push_back(Node(rect, th)); + rect.x = saturate_cast((int)*(r_it++)); + rect.y = saturate_cast((int)*(r_it++)); + rect.z = saturate_cast((int)*(r_it++)); + rect.w = saturate_cast((int)*(r_it++)); + + uint channel = saturate_cast((int)(*ftrs)[SC_F_CHANNEL]); + vnodes.push_back(Node(rect, channel, th)); + ++ftrs; } fns = (*ftr)[SC_LEAF]; @@ -276,19 +278,6 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float } } - st = ffs.begin(), st_end = ffs.end(); - for (; st != st_end; ++st ) - { - cv::FileNode rn = (*st)[SC_F_RECT]; - cv::FileNodeIterator r_it = rn.begin(); - uchar4 rect; - rect.x = saturate_cast((int)*(r_it++)); - rect.y = saturate_cast((int)*(r_it++)); - rect.z = saturate_cast((int)*(r_it++)); - rect.w = saturate_cast((int)*(r_it++)); - vfeatures.push_back(Feature((int)(*st)[SC_F_CHANNEL], rect)); - } - feature_offset += octave.stages * 3; ++octIndex; } @@ -306,9 +295,6 @@ inline bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float leaves.upload(cv::Mat(vleaves).reshape(1,1)); CV_Assert(!leaves.empty()); - features.upload(cv::Mat(1, vfeatures.size() * sizeof(Feature), CV_8UC1, (uchar*)&(vfeatures[0]) )); - CV_Assert(!features.empty()); - // compute levels calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES); CV_Assert(!levels.empty()); @@ -425,7 +411,14 @@ bool cv::gpu::SoftCascade::load( const string& filename, const float minScale, c return true; } -// #define USE_REFERENCE_VALUES +#define USE_REFERENCE_VALUES +namespace { + char *itoa(long i, char* s, int /*dummy_radix*/) + { + sprintf(s, "%ld", i); + return s; + } +} void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& /*rois*/, GpuMat& objects, const int /*rejectfactor*/, Stream s) { @@ -438,17 +431,20 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat& Filds& flds = *filds; #if defined USE_REFERENCE_VALUES -// cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows); -// cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ); -// char buff[33]; - -// for(int i = 0; i < Filds::HOG_LUV_BINS; ++i) -// { -// cv::Mat channel; -// imgs[std::string("channel") + itoa(i, buff, 10)] >> channel; -// GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121)); -// gchannel.upload(channel); -// } + cudaMemset(flds.hogluv.data, 0, flds.hogluv.step * flds.hogluv.rows); + + cv::FileStorage imgs("/home/kellan/testInts.xml", cv::FileStorage::READ); + char buff[33]; + + for(int i = 0; i < Filds::HOG_LUV_BINS; ++i) + { + cv::Mat channel; + imgs[std::string("channel") + itoa(i, buff, 10)] >> channel; + + // std::cout << "channel " << i << std::endl << channel << std::endl; + GpuMat gchannel(flds.hogluv, cv::Rect(0, 121 * i, 161, 121)); + gchannel.upload(channel); + } #else GpuMat& plane = flds.plane; GpuMat& shrunk = flds.shrunk;