kernel policy

pull/158/head
marina.kolpakova 12 years ago
parent 916ba4c0ea
commit 0898c3c651
  1. 62
      modules/gpu/src/cuda/isf-sc.cu
  2. 27
      modules/gpu/src/icf.hpp
  3. 44
      modules/gpu/src/softcascade.cpp

@ -303,21 +303,16 @@ namespace icf {
}
#endif
void detect(const PtrStepSzb& roi, const PtrStepSzb& levels, const PtrStepSzb& octaves, const PtrStepSzf& stages,
const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv,
PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales)
template<>
void CascadeInvoker<CascadePolicy>::operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv,
PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales, const int scale) const
{
int fw = 160;
int fh = 120;
dim3 block(32, 8);
dim3 grid(fw, fh / 8, downscales);
dim3 grid(fw, fh / 8, (scale == -1) ? downscales : 1);
const Level* l = (const Level*)levels.ptr();
const Octave* oct = ((const Octave*)octaves.ptr());
const float* st = (const float*)stages.ptr();
const Node* nd = (const Node*)nodes.ptr();
const float* lf = (const float*)leaves.ptr();
uint* ctr = (uint*)counter.ptr();
Detection* det = (Detection*)objects.ptr();
uint max_det = objects.cols / sizeof(Detection);
@ -328,44 +323,21 @@ namespace icf {
cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
test_kernel_warp<false><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, 0);
cudaSafeCall( cudaGetLastError());
grid = dim3(fw, fh / 8, 47 - downscales);
test_kernel_warp<true><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, downscales);
cudaSafeCall( cudaGetLastError());
cudaSafeCall( cudaDeviceSynchronize());
}
void detectAtScale(const int scale, const PtrStepSzb& roi, const PtrStepSzb& levels, const PtrStepSzb& octaves,
const PtrStepSzf& stages, const PtrStepSzb& nodes, const PtrStepSzf& leaves, const PtrStepSzi& hogluv,
PtrStepSz<uchar4> objects, PtrStepSzi counter, const int downscales)
{
int fw = 160;
int fh = 120;
dim3 block(32, 8);
dim3 grid(fw, fh / 8, 1);
const Level* l = (const Level*)levels.ptr();
const Octave* oct = ((const Octave*)octaves.ptr());
const float* st = (const float*)stages.ptr();
const Node* nd = (const Node*)nodes.ptr();
const float* lf = (const float*)leaves.ptr();
uint* ctr = (uint*)counter.ptr();
Detection* det = (Detection*)objects.ptr();
uint max_det = objects.cols / sizeof(Detection);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
cudaSafeCall( cudaBindTexture2D(0, thogluv, hogluv.data, desc, hogluv.cols, hogluv.rows, hogluv.step));
cudaChannelFormatDesc desc_roi = cudaCreateChannelDesc<float2>();
cudaSafeCall( cudaBindTexture2D(0, troi, roi.data, desc_roi, roi.cols / 8, roi.rows, roi.step));
if (scale == -1)
{
test_kernel_warp<false><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, 0);
cudaSafeCall( cudaGetLastError());
if (scale >= downscales)
test_kernel_warp<true><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, scale);
grid = dim3(fw, fh / 8, 47 - downscales);
test_kernel_warp<true><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, downscales);
}
else
test_kernel_warp<false><<<grid, block>>>(l, oct, st, nd, lf, det, max_det, ctr, scale);
{
if (scale >= downscales)
test_kernel_warp<true><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
else
test_kernel_warp<false><<<grid, block>>>(levels, octaves, stages, nodes, leaves, det, max_det, ctr, scale);
}
cudaSafeCall( cudaGetLastError());
cudaSafeCall( cudaDeviceSynchronize());

@ -124,6 +124,33 @@ struct __align__(16) Detection
: x(_x), y(_y), w(_w), h(_h), confidence(c), kind(0) {};
};
// Compile-time launch policy consumed by CascadeInvoker<Policy>.
// STA_X/STA_Y give the CUDA thread-block geometry (32 x 8 threads,
// matching the dim3 block(32, 8) used by the kernel launches).
struct CascadePolicy
{
enum {STA_X = 32, STA_Y = 8};
};
// Host-side functor that caches raw device pointers to the soft-cascade
// data (levels/octaves/stages/nodes/leaves) and launches the detection
// kernel. Policy supplies the launch geometry (e.g. CascadePolicy).
// NOTE(review): the referenced device buffers must outlive this invoker;
// it stores raw pointers, not owning handles.
template<typename Policy>
struct CascadeInvoker
{
    // Default-constructed invoker holds null pointers; it must be
    // reassigned from a fully-initialized invoker before use.
    CascadeInvoker(): levels(0), octaves(0), stages(0), nodes(0), leaves(0) {}

    // Reinterprets the packed GpuMat buffers as typed cascade tables.
    CascadeInvoker(const PtrStepSzb& _levels, const PtrStepSzb& _octaves, const PtrStepSzf& _stages,
                   const PtrStepSzb& _nodes, const PtrStepSzf& _leaves)
    : levels((const Level*)_levels.ptr()), octaves((const Octave*)_octaves.ptr()), stages((const float*)_stages.ptr()),
      nodes((const Node*)_nodes.ptr()), leaves((const float*)_leaves.ptr())
    {}

    const Level*  levels;   // per-scale level descriptors
    const Octave* octaves;  // octave descriptors
    const float*  stages;   // stage thresholds
    const Node*   nodes;    // decision-tree nodes
    const float*  leaves;   // leaf responses

    // Runs detection into `objects`, bumping `counter` per detection.
    // scale == -1 (the default) processes all scales in one launch;
    // any other value restricts the launch to that single scale.
    // Fixed: parameter was misspelled "csale" in the declaration while the
    // out-of-line definition names it "scale"; renamed for consistency
    // (parameter names in a declaration do not affect callers).
    void operator()(const PtrStepSzb& roi, const PtrStepSzi& hogluv, PtrStepSz<uchar4> objects,
                    PtrStepSzi counter, const int downscales, const int scale = -1) const;
};
}
}}}

@ -69,29 +69,6 @@ namespace cv { namespace gpu { namespace device {
namespace icf {
void fillBins(cv::gpu::PtrStepSzb hogluv, const cv::gpu::PtrStepSzf& nangle,
const int fw, const int fh, const int bins);
void detect(const PtrStepSzb& rois,
const PtrStepSzb& levels,
const PtrStepSzb& octaves,
const PtrStepSzf& stages,
const PtrStepSzb& nodes,
const PtrStepSzf& leaves,
const PtrStepSzi& hogluv,
PtrStepSz<uchar4> objects,
PtrStepSzi counter,
const int downscales);
void detectAtScale(const int scale,
const PtrStepSzb& rois,
const PtrStepSzb& levels,
const PtrStepSzb& octaves,
const PtrStepSzf& stages,
const PtrStepSzb& nodes,
const PtrStepSzf& leaves,
const PtrStepSzi& hogluv,
PtrStepSz<uchar4> objects,
PtrStepSzi counter,
const int downscales);
}
namespace imgproc
{
@ -150,6 +127,8 @@ struct cv::gpu::SoftCascade::Filds
std::vector<float> scales;
device::icf::CascadeInvoker<device::icf::CascadePolicy> invoker;
static const int shrinkage = 4;
enum { BOOST = 0 };
@ -166,17 +145,11 @@ struct cv::gpu::SoftCascade::Filds
};
bool fill(const FileNode &root, const float mins, const float maxs);
void detect(const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const
{
cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
device::icf::detect(roi, levels, octaves, stages, nodes, leaves, hogluv, objects , detCounter, downscales);
}
void detectAtScale(int scale, const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const
void detect(int scale, const cv::gpu::GpuMat& roi, cv::gpu::GpuMat& objects, cudaStream_t stream) const
{
cudaMemset(detCounter.data, 0, detCounter.step * detCounter.rows * sizeof(int));
device::icf::detectAtScale(scale, roi, levels, octaves, stages, nodes, leaves, hogluv, objects,
detCounter, downscales);
// device::icf::CascadeInvoker<device::icf::CascadePolicy> invoker(levels, octaves, stages, nodes, leaves);
invoker(roi, hogluv, objects, detCounter, downscales, scale);
}
void preprocess(const cv::gpu::GpuMat& colored)
@ -439,6 +412,8 @@ bool cv::gpu::SoftCascade::Filds::fill(const FileNode &root, const float mins, c
calcLevels(voctaves, FRAME_WIDTH, FRAME_HEIGHT, TOTAL_SCALES);
CV_Assert(!levels.empty());
invoker = device::icf::CascadeInvoker<device::icf::CascadePolicy>(levels, octaves, stages, nodes, leaves);
return true;
}
@ -569,10 +544,7 @@ void cv::gpu::SoftCascade::detectMultiScale(const GpuMat& colored, const GpuMat&
flds.preprocess(colored);
if (specificScale == -1)
flds.detect(rois,objects, 0);
else
flds.detectAtScale(specificScale, rois, objects, 0);
flds.detect(specificScale, rois, objects, 0);
cv::Mat out(flds.detCounter);
int ndetections = *(out.data);

Loading…
Cancel
Save