Open Source Computer Vision Library https://opencv.org/
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2604 lines
105 KiB

/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
////////////////////////////////////////////////////////////////////////////////
//
// NVIDIA CUDA implementation of Viola-Jones Object Detection Framework
//
// The algorithm and code are explained in the upcoming GPU Computing Gems
// chapter in detail:
//
// Anton Obukhov, "Haar Classifiers for Object Detection with CUDA"
// PDF URL placeholder
// email: aobukhov@nvidia.com, devsupport@nvidia.com
//
// Credits for help with the code to:
// Alexey Mendelenko, Cyril Crassin, and Mikhail Smirnov.
//
////////////////////////////////////////////////////////////////////////////////
#include <algorithm>
#include <cstdio>
#include "NCV.hpp"
#include "NPP_staging/NPP_staging.hpp"
#include "NCVRuntimeTemplates.hpp"
#include "NCVHaarObjectDetection.hpp"
void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights);
//==============================================================================
//
// BlockScan file
//
//==============================================================================
//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
//assuming size <= WARP_SIZE and size is power of 2
template <class T>
inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
{
Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
s_Data[pos] = 0;
pos += K_WARP_SIZE;
s_Data[pos] = idata;
for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1)
{
s_Data[pos] += s_Data[pos - offset];
}
return s_Data[pos];
}
template <class T>
inline __device__ T warpScanExclusive(T idata, volatile T *s_Data)
{
return warpScanInclusive(idata, s_Data) - idata;
}
template <class T, Ncv32u tiNumScanThreads>
inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)
{
if (tiNumScanThreads > K_WARP_SIZE)
{
//Bottom-level inclusive warp scan
T warpResult = warpScanInclusive(idata, s_Data);
//Save top elements of each warp for exclusive warp scan
//sync to wait for warp scans to complete (because s_Data is being overwritten)
__syncthreads();
if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
{
s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
}
//wait for warp scans to complete
__syncthreads();
if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
{
//grab top warp elements
T val = s_Data[threadIdx.x];
//calculate exclusive scan and write back to shared memory
s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
}
//return updated warp scans with exclusive scan results
__syncthreads();
return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];
}
else
{
return warpScanInclusive(idata, s_Data);
}
}
//==============================================================================
//
// HaarClassifierCascade file
//
//==============================================================================
const Ncv32u MAX_GRID_DIM = 65535;
const Ncv32u NUM_THREADS_ANCHORSPARALLEL = 64;
#define NUM_THREADS_CLASSIFIERPARALLEL_LOG2 6
#define NUM_THREADS_CLASSIFIERPARALLEL (1 << NUM_THREADS_CLASSIFIERPARALLEL_LOG2)
/** \internal
* Haar features solid array.
*/
texture<uint2, 1, cudaReadModeElementType> texHaarFeatures;
/** \internal
* Haar classifiers flattened trees container.
* Two parts: first contains root nodes, second - nodes that are referred by root nodes.
* Drawback: breaks tree locality (might cause more cache misses
* Advantage: No need to introduce additional 32-bit field to index root nodes offsets
*/
texture<uint4, 1, cudaReadModeElementType> texHaarClassifierNodes;
texture<Ncv32u, 1, cudaReadModeElementType> texIImage;
__device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages)
{
return d_Stages[iStage];
}
template <NcvBool tbCacheTextureCascade>
__device__ HaarClassifierNode128 getClassifierNode(Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes)
{
HaarClassifierNode128 tmpNode;
if (tbCacheTextureCascade)
{
tmpNode._ui4 = tex1Dfetch(texHaarClassifierNodes, iNode);
}
else
{
tmpNode = d_ClassifierNodes[iNode];
}
return tmpNode;
}
template <NcvBool tbCacheTextureCascade>
__device__ void getFeature(Ncv32u iFeature, HaarFeature64 *d_Features,
Ncv32f *weight,
Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight)
{
HaarFeature64 feature;
if (tbCacheTextureCascade)
{
feature._ui2 = tex1Dfetch(texHaarFeatures, iFeature);
}
else
{
feature = d_Features[iFeature];
}
feature.getRect(rectX, rectY, rectWidth, rectHeight);
*weight = feature.getWeight();
}
template <NcvBool tbCacheTextureIImg>
__device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg)
{
if (tbCacheTextureIImg)
{
return tex1Dfetch(texIImage, x);
}
else
{
return d_IImg[x];
}
}
__device__ Ncv32f reduceSpecialization(Ncv32f partialSum)
{
__shared__ volatile Ncv32f reductor[NUM_THREADS_CLASSIFIERPARALLEL];
reductor[threadIdx.x] = partialSum;
__syncthreads();
#if defined CPU_FP_COMPLIANCE
if (!threadIdx.x)
{
Ncv32f sum = 0.0f;
for (int i=0; i<NUM_THREADS_CLASSIFIERPARALLEL; i++)
{
sum += reductor[i];
}
reductor[0] = sum;
}
#else
#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 8
if (threadIdx.x < 128)
{
reductor[threadIdx.x] += reductor[threadIdx.x + 128];
}
__syncthreads();
#endif
#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 7
if (threadIdx.x < 64)
{
reductor[threadIdx.x] += reductor[threadIdx.x + 64];
}
__syncthreads();
#endif
if (threadIdx.x < 32)
{
#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 6
reductor[threadIdx.x] += reductor[threadIdx.x + 32];
#endif
#if NUM_THREADS_CLASSIFIERPARALLEL_LOG2 >= 5
reductor[threadIdx.x] += reductor[threadIdx.x + 16];
#endif
reductor[threadIdx.x] += reductor[threadIdx.x + 8];
reductor[threadIdx.x] += reductor[threadIdx.x + 4];
reductor[threadIdx.x] += reductor[threadIdx.x + 2];
reductor[threadIdx.x] += reductor[threadIdx.x + 1];
}
#endif
__syncthreads();
return reductor[0];
}
__device__ Ncv32u d_outMaskPosition;
__inline __device__ void compactBlockWriteOutAnchorParallel(NcvBool threadPassFlag,
Ncv32u threadElem,
Ncv32u *vectorOut)
{
#if __CUDA_ARCH__ >= 110
Ncv32u passMaskElem = threadPassFlag ? 1 : 0;
__shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL * 2];
Ncv32u incScan = blockScanInclusive<Ncv32u, NUM_THREADS_ANCHORSPARALLEL>(passMaskElem, shmem);
__syncthreads();
Ncv32u excScan = incScan - passMaskElem;
__shared__ Ncv32u numPassed;
__shared__ Ncv32u outMaskOffset;
if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1)
{
numPassed = incScan;
outMaskOffset = atomicAdd(&d_outMaskPosition, incScan);
}
__syncthreads();
if (threadPassFlag)
{
shmem[excScan] = threadElem;
}
__syncthreads();
if (threadIdx.x < numPassed)
{
vectorOut[outMaskOffset + threadIdx.x] = shmem[threadIdx.x];
}
#endif
}
template <NcvBool tbInitMaskPositively,
NcvBool tbCacheTextureIImg,
NcvBool tbCacheTextureCascade,
NcvBool tbReadPixelIndexFromVector,
NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
Ncv32f *d_weights, Ncv32u weightsStride,
HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
Ncv32u y_offs;
Ncv32u x_offs;
Ncv32u maskOffset;
Ncv32u outMaskVal;
NcvBool bInactiveThread = false;
if (tbReadPixelIndexFromVector)
{
maskOffset = (MAX_GRID_DIM * blockIdx.y + blockIdx.x) * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;
if (maskOffset >= mask1Dlen)
{
if (tbDoAtomicCompaction) bInactiveThread = true; else return;
}
if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread)
{
outMaskVal = d_inMask[maskOffset];
y_offs = outMaskVal >> 16;
x_offs = outMaskVal & 0xFFFF;
}
}
else
{
y_offs = blockIdx.y;
x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;
if (x_offs >= mask2Dstride)
{
if (tbDoAtomicCompaction) bInactiveThread = true; else return;
}
if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread)
{
maskOffset = y_offs * mask2Dstride + x_offs;
if ((x_offs >= anchorsRoi.width) ||
(!tbInitMaskPositively &&
d_inMask != d_outMask &&
d_inMask[maskOffset] == OBJDET_MASK_ELEMENT_INVALID_32U))
{
if (tbDoAtomicCompaction)
{
bInactiveThread = true;
}
else
{
d_outMask[maskOffset] = OBJDET_MASK_ELEMENT_INVALID_32U;
return;
}
}
outMaskVal = (y_offs << 16) | x_offs;
}
}
NcvBool bPass = true;
if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread)
{
Ncv32f pixelStdDev = d_weights[y_offs * weightsStride + x_offs];
for (Ncv32u iStage = startStageInc; iStage<endStageExc; iStage++)
{
Ncv32f curStageSum = 0.0f;
HaarStage64 curStage = getStage(iStage, d_Stages);
Ncv32u numRootNodesInStage = curStage.getNumClassifierRootNodes();
Ncv32u curRootNodeOffset = curStage.getStartClassifierRootNodeOffset();
Ncv32f stageThreshold = curStage.getStageThreshold();
while (numRootNodesInStage--)
{
NcvBool bMoreNodesToTraverse = true;
Ncv32u iNode = curRootNodeOffset;
while (bMoreNodesToTraverse)
{
HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
Ncv32u iFeature = featuresDesc.getFeaturesOffset();
Ncv32f curNodeVal = 0.0f;
for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
{
Ncv32f rectWeight;
Ncv32u rectX, rectY, rectWidth, rectHeight;
getFeature<tbCacheTextureCascade>
(iFeature + iRect, d_Features,
&rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
Ncv32u iioffsTR = iioffsTL + rectWidth;
Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
Ncv32u iioffsBR = iioffsBL + rectWidth;
Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);
#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
#else
curNodeVal += (Ncv32f)rectSum * rectWeight;
#endif
}
HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
Ncv32f nodeThreshold = curNode.getThreshold();
HaarClassifierNodeDescriptor32 nextNodeDescriptor;
nextNodeDescriptor = (curNodeVal < scaleArea * pixelStdDev * nodeThreshold) ? nodeLeft : nodeRight;
if (nextNodeDescriptor.isLeaf())
{
Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();
curStageSum += tmpLeafValue;
bMoreNodesToTraverse = false;
}
else
{
iNode = nextNodeDescriptor.getNextNodeOffset();
}
}
__syncthreads();
curRootNodeOffset++;
}
if (curStageSum < stageThreshold)
{
bPass = false;
outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;
break;
}
}
}
__syncthreads();
if (!tbDoAtomicCompaction)
{
if (!tbReadPixelIndexFromVector ||
(tbReadPixelIndexFromVector && (!bPass || d_inMask != d_outMask)))
{
d_outMask[maskOffset] = outMaskVal;
}
}
else
{
compactBlockWriteOutAnchorParallel(bPass && !bInactiveThread,
outMaskVal,
d_outMask);
}
}
template <NcvBool tbCacheTextureIImg,
NcvBool tbCacheTextureCascade,
NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
Ncv32f *d_weights, Ncv32u weightsStride,
HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
Ncv32u maskOffset = MAX_GRID_DIM * blockIdx.y + blockIdx.x;
if (maskOffset >= mask1Dlen)
{
return;
}
Ncv32u outMaskVal = d_inMask[maskOffset];
Ncv32u y_offs = outMaskVal >> 16;
Ncv32u x_offs = outMaskVal & 0xFFFF;
Ncv32f pixelStdDev = d_weights[y_offs * weightsStride + x_offs];
NcvBool bPass = true;
for (Ncv32u iStage = startStageInc; iStage<endStageExc; iStage++)
{
//this variable is subject to reduction
Ncv32f curStageSum = 0.0f;
HaarStage64 curStage = getStage(iStage, d_Stages);
Ncv32s numRootNodesInStage = curStage.getNumClassifierRootNodes();
Ncv32u curRootNodeOffset = curStage.getStartClassifierRootNodeOffset() + threadIdx.x;
Ncv32f stageThreshold = curStage.getStageThreshold();
Ncv32u numRootChunks = (numRootNodesInStage + NUM_THREADS_CLASSIFIERPARALLEL - 1) >> NUM_THREADS_CLASSIFIERPARALLEL_LOG2;
for (Ncv32u chunkId=0; chunkId<numRootChunks; chunkId++)
{
NcvBool bMoreNodesToTraverse = true;
if (chunkId * NUM_THREADS_CLASSIFIERPARALLEL + threadIdx.x < numRootNodesInStage)
{
Ncv32u iNode = curRootNodeOffset;
while (bMoreNodesToTraverse)
{
HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
Ncv32u iFeature = featuresDesc.getFeaturesOffset();
Ncv32f curNodeVal = 0.0f;
//TODO: fetch into shmem if size suffices. Shmem can be shared with reduce
for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
{
Ncv32f rectWeight;
Ncv32u rectX, rectY, rectWidth, rectHeight;
getFeature<tbCacheTextureCascade>
(iFeature + iRect, d_Features,
&rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);
Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
Ncv32u iioffsTR = iioffsTL + rectWidth;
Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
Ncv32u iioffsBR = iioffsBL + rectWidth;
Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);
#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
#else
curNodeVal += (Ncv32f)rectSum * rectWeight;
#endif
}
HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
Ncv32f nodeThreshold = curNode.getThreshold();
HaarClassifierNodeDescriptor32 nextNodeDescriptor;
nextNodeDescriptor = (curNodeVal < scaleArea * pixelStdDev * nodeThreshold) ? nodeLeft : nodeRight;
if (nextNodeDescriptor.isLeaf())
{
Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();
curStageSum += tmpLeafValue;
bMoreNodesToTraverse = false;
}
else
{
iNode = nextNodeDescriptor.getNextNodeOffset();
}
}
}
__syncthreads();
curRootNodeOffset += NUM_THREADS_CLASSIFIERPARALLEL;
}
Ncv32f finalStageSum = reduceSpecialization(curStageSum);
if (finalStageSum < stageThreshold)
{
bPass = false;
outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;
break;
}
}
if (!tbDoAtomicCompaction)
{
if (!bPass || d_inMask != d_outMask)
{
if (!threadIdx.x)
{
d_outMask[maskOffset] = outMaskVal;
}
}
}
else
{
#if __CUDA_ARCH__ >= 110
if (bPass && !threadIdx.x)
{
Ncv32u outMaskOffset = atomicAdd(&d_outMaskPosition, 1);
d_outMask[outMaskOffset] = outMaskVal;
}
#endif
}
}
template <NcvBool tbMaskByInmask,
NcvBool tbDoAtomicCompaction>
__global__ void initializeMaskVector(Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u step)
{
Ncv32u y_offs = blockIdx.y;
Ncv32u x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;
Ncv32u outMaskOffset = y_offs * gridDim.x * blockDim.x + x_offs;
Ncv32u y_offs_upsc = step * y_offs;
Ncv32u x_offs_upsc = step * x_offs;
Ncv32u inMaskOffset = y_offs_upsc * mask2Dstride + x_offs_upsc;
Ncv32u outElem = OBJDET_MASK_ELEMENT_INVALID_32U;
if (x_offs_upsc < anchorsRoi.width &&
(!tbMaskByInmask || d_inMask[inMaskOffset] != OBJDET_MASK_ELEMENT_INVALID_32U))
{
outElem = (y_offs_upsc << 16) | x_offs_upsc;
}
if (!tbDoAtomicCompaction)
{
d_outMask[outMaskOffset] = outElem;
}
else
{
compactBlockWriteOutAnchorParallel(outElem != OBJDET_MASK_ELEMENT_INVALID_32U,
outElem,
d_outMask);
}
}
struct applyHaarClassifierAnchorParallelFunctor
{
dim3 gridConf, blockConf;
cudaStream_t cuStream;
//Kernel arguments are stored as members;
Ncv32u *d_IImg;
Ncv32u IImgStride;
Ncv32f *d_weights;
Ncv32u weightsStride;
HaarFeature64 *d_Features;
HaarClassifierNode128 *d_ClassifierNodes;
HaarStage64 *d_Stages;
Ncv32u *d_inMask;
Ncv32u *d_outMask;
Ncv32u mask1Dlen;
Ncv32u mask2Dstride;
NcvSize32u anchorsRoi;
Ncv32u startStageInc;
Ncv32u endStageExc;
Ncv32f scaleArea;
//Arguments are passed through the constructor
applyHaarClassifierAnchorParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
Ncv32u *_d_IImg, Ncv32u _IImgStride,
Ncv32f *_d_weights, Ncv32u _weightsStride,
HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,
Ncv32u *_d_inMask, Ncv32u *_d_outMask,
Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
Ncv32u _endStageExc, Ncv32f _scaleArea) :
gridConf(_gridConf),
blockConf(_blockConf),
cuStream(_cuStream),
d_IImg(_d_IImg),
IImgStride(_IImgStride),
d_weights(_d_weights),
weightsStride(_weightsStride),
d_Features(_d_Features),
d_ClassifierNodes(_d_ClassifierNodes),
d_Stages(_d_Stages),
d_inMask(_d_inMask),
d_outMask(_d_outMask),
mask1Dlen(_mask1Dlen),
mask2Dstride(_mask2Dstride),
anchorsRoi(_anchorsRoi),
startStageInc(_startStageInc),
endStageExc(_endStageExc),
scaleArea(_scaleArea)
{}
template<class TList>
void call(TList tl)
{
applyHaarClassifierAnchorParallel <
Loki::TL::TypeAt<TList, 0>::Result::value,
Loki::TL::TypeAt<TList, 1>::Result::value,
Loki::TL::TypeAt<TList, 2>::Result::value,
Loki::TL::TypeAt<TList, 3>::Result::value,
Loki::TL::TypeAt<TList, 4>::Result::value >
<<<gridConf, blockConf, 0, cuStream>>>
(d_IImg, IImgStride,
d_weights, weightsStride,
d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, startStageInc,
endStageExc, scaleArea);
}
};
void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively,
NcvBool tbCacheTextureIImg,
NcvBool tbCacheTextureCascade,
NcvBool tbReadPixelIndexFromVector,
NcvBool tbDoAtomicCompaction,
dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,
Ncv32u *d_IImg, Ncv32u IImgStride,
Ncv32f *d_weights, Ncv32u weightsStride,
HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc,
Ncv32u endStageExc, Ncv32f scaleArea)
{
//Second parameter is the number of "dynamic" template parameters
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor>
::call( applyHaarClassifierAnchorParallelFunctor(gridConf, blockConf, cuStream,
d_IImg, IImgStride,
d_weights, weightsStride,
d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, startStageInc,
endStageExc, scaleArea),
0xC001C0DE, //this is dummy int for the va_args C compatibility
tbInitMaskPositively,
tbCacheTextureIImg,
tbCacheTextureCascade,
tbReadPixelIndexFromVector,
tbDoAtomicCompaction);
}
struct applyHaarClassifierClassifierParallelFunctor
{
dim3 gridConf, blockConf;
cudaStream_t cuStream;
//Kernel arguments are stored as members;
Ncv32u *d_IImg;
Ncv32u IImgStride;
Ncv32f *d_weights;
Ncv32u weightsStride;
HaarFeature64 *d_Features;
HaarClassifierNode128 *d_ClassifierNodes;
HaarStage64 *d_Stages;
Ncv32u *d_inMask;
Ncv32u *d_outMask;
Ncv32u mask1Dlen;
Ncv32u mask2Dstride;
NcvSize32u anchorsRoi;
Ncv32u startStageInc;
Ncv32u endStageExc;
Ncv32f scaleArea;
//Arguments are passed through the constructor
applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
Ncv32u *_d_IImg, Ncv32u _IImgStride,
Ncv32f *_d_weights, Ncv32u _weightsStride,
HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,
Ncv32u *_d_inMask, Ncv32u *_d_outMask,
Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
Ncv32u _endStageExc, Ncv32f _scaleArea) :
gridConf(_gridConf),
blockConf(_blockConf),
cuStream(_cuStream),
d_IImg(_d_IImg),
IImgStride(_IImgStride),
d_weights(_d_weights),
weightsStride(_weightsStride),
d_Features(_d_Features),
d_ClassifierNodes(_d_ClassifierNodes),
d_Stages(_d_Stages),
d_inMask(_d_inMask),
d_outMask(_d_outMask),
mask1Dlen(_mask1Dlen),
mask2Dstride(_mask2Dstride),
anchorsRoi(_anchorsRoi),
startStageInc(_startStageInc),
endStageExc(_endStageExc),
scaleArea(_scaleArea)
{}
template<class TList>
void call(TList tl)
{
applyHaarClassifierClassifierParallel <
Loki::TL::TypeAt<TList, 0>::Result::value,
Loki::TL::TypeAt<TList, 1>::Result::value,
Loki::TL::TypeAt<TList, 2>::Result::value >
<<<gridConf, blockConf, 0, cuStream>>>
(d_IImg, IImgStride,
d_weights, weightsStride,
d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, startStageInc,
endStageExc, scaleArea);
}
};
void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg,
NcvBool tbCacheTextureCascade,
NcvBool tbDoAtomicCompaction,
dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,
Ncv32u *d_IImg, Ncv32u IImgStride,
Ncv32f *d_weights, Ncv32u weightsStride,
HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u startStageInc,
Ncv32u endStageExc, Ncv32f scaleArea)
{
//Second parameter is the number of "dynamic" template parameters
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor>
::call( applyHaarClassifierClassifierParallelFunctor(gridConf, blockConf, cuStream,
d_IImg, IImgStride,
d_weights, weightsStride,
d_Features, d_ClassifierNodes, d_Stages,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, startStageInc,
endStageExc, scaleArea),
0xC001C0DE, //this is dummy int for the va_args C compatibility
tbCacheTextureIImg,
tbCacheTextureCascade,
tbDoAtomicCompaction);
}
struct initializeMaskVectorFunctor
{
dim3 gridConf, blockConf;
cudaStream_t cuStream;
//Kernel arguments are stored as members;
Ncv32u *d_inMask;
Ncv32u *d_outMask;
Ncv32u mask1Dlen;
Ncv32u mask2Dstride;
NcvSize32u anchorsRoi;
Ncv32u step;
//Arguments are passed through the constructor
initializeMaskVectorFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
Ncv32u *_d_inMask, Ncv32u *_d_outMask,
Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
NcvSize32u _anchorsRoi, Ncv32u _step) :
gridConf(_gridConf),
blockConf(_blockConf),
cuStream(_cuStream),
d_inMask(_d_inMask),
d_outMask(_d_outMask),
mask1Dlen(_mask1Dlen),
mask2Dstride(_mask2Dstride),
anchorsRoi(_anchorsRoi),
step(_step)
{}
template<class TList>
void call(TList tl)
{
initializeMaskVector <
Loki::TL::TypeAt<TList, 0>::Result::value,
Loki::TL::TypeAt<TList, 1>::Result::value >
<<<gridConf, blockConf, 0, cuStream>>>
(d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, step);
}
};
void initializeMaskVectorDynTemplate(NcvBool tbMaskByInmask,
NcvBool tbDoAtomicCompaction,
dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,
Ncv32u *d_inMask, Ncv32u *d_outMask,
Ncv32u mask1Dlen, Ncv32u mask2Dstride,
NcvSize32u anchorsRoi, Ncv32u step)
{
//Second parameter is the number of "dynamic" template parameters
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 2, initializeMaskVectorFunctor>
::call( initializeMaskVectorFunctor(gridConf, blockConf, cuStream,
d_inMask, d_outMask,
mask1Dlen, mask2Dstride,
anchorsRoi, step),
0xC001C0DE, //this is dummy int for the va_args C compatibility
tbMaskByInmask,
tbDoAtomicCompaction);
}
Ncv32u getStageNumWithNotLessThanNclassifiers(Ncv32u N, HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages)
{
Ncv32u i = 0;
for (; i<haar.NumStages; i++)
{
if (h_HaarStages.ptr()[i].getNumClassifierRootNodes() >= N)
{
break;
}
}
return i;
}
template <class T>
void swap(T &p1, T &p2)
{
T tmp = p1;
p1 = p2;
p2 = tmp;
}
NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImage,
NCVMatrix<Ncv32f> &d_weights,
NCVMatrixAlloc<Ncv32u> &d_pixelMask,
Ncv32u &numDetections,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarStage64> &d_HaarStages,
NCVVector<HaarClassifierNode128> &d_HaarNodes,
NCVVector<HaarFeature64> &d_HaarFeatures,
NcvBool bMaskElements,
NcvSize32u anchorsRoi,
Ncv32u pixelStep,
Ncv32f scaleArea,
INCVMemAllocator &gpuAllocator,
INCVMemAllocator &cpuAllocator,
cudaDeviceProp &devProp,
cudaStream_t cuStream)
{
ncvAssertReturn(d_integralImage.memType() == d_weights.memType() &&
d_integralImage.memType() == d_pixelMask.memType() &&
d_integralImage.memType() == gpuAllocator.memType() &&
(d_integralImage.memType() == NCVMemoryTypeDevice ||
d_integralImage.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);
ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() &&
d_HaarStages.memType() == d_HaarFeatures.memType() &&
(d_HaarStages.memType() == NCVMemoryTypeDevice ||
d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);
ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);
ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);
ncvAssertReturn((d_integralImage.ptr() != NULL && d_weights.ptr() != NULL && d_pixelMask.ptr() != NULL &&
h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL &&
d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR);
ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 &&
d_pixelMask.width() >= anchorsRoi.width && d_pixelMask.height() >= anchorsRoi.height &&
d_weights.width() >= anchorsRoi.width && d_weights.height() >= anchorsRoi.height &&
d_integralImage.width() >= anchorsRoi.width + haar.ClassifierSize.width &&
d_integralImage.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID);
ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE);
ncvAssertReturn(d_HaarStages.length() >= haar.NumStages &&
d_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
d_HaarFeatures.length() >= haar.NumFeatures &&
d_HaarStages.length() == h_HaarStages.length() &&
haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);
ncvAssertReturn(haar.bNeedsTiltedII == false || gpuAllocator.isCounting(), NCV_NOIMPL_HAAR_TILTED_FEATURES);
ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);
NCV_SET_SKIP_COND(gpuAllocator.isCounting());
#if defined _SELF_TEST_
NCVStatus ncvStat;
NCVMatrixAlloc<Ncv32u> h_integralImage(cpuAllocator, d_integralImage.width, d_integralImage.height, d_integralImage.pitch);
ncvAssertReturn(h_integralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVMatrixAlloc<Ncv32f> h_weights(cpuAllocator, d_weights.width, d_weights.height, d_weights.pitch);
ncvAssertReturn(h_weights.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVMatrixAlloc<Ncv32u> h_pixelMask(cpuAllocator, d_pixelMask.width, d_pixelMask.height, d_pixelMask.pitch);
ncvAssertReturn(h_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVVectorAlloc<HaarClassifierNode128> h_HaarNodes(cpuAllocator, d_HaarNodes.length);
ncvAssertReturn(h_HaarNodes.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVVectorAlloc<HaarFeature64> h_HaarFeatures(cpuAllocator, d_HaarFeatures.length);
ncvAssertReturn(h_HaarFeatures.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVMatrixAlloc<Ncv32u> h_pixelMask_d(cpuAllocator, d_pixelMask.width, d_pixelMask.height, d_pixelMask.pitch);
ncvAssertReturn(h_pixelMask_d.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCV_SKIP_COND_BEGIN
ncvStat = d_pixelMask.copySolid(h_pixelMask, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvStat = d_integralImage.copySolid(h_integralImage, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvStat = d_weights.copySolid(h_weights, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvStat = d_HaarNodes.copySolid(h_HaarNodes, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvStat = d_HaarFeatures.copySolid(h_HaarFeatures, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);
for (Ncv32u i=0; i<(Ncv32u)anchorsRoi.height; i++)
{
for (Ncv32u j=0; j<d_pixelMask.stride(); j++)
{
if ((i%pixelStep==0) && (j%pixelStep==0) && (j<(Ncv32u)anchorsRoi.width))
{
if (!bMaskElements || h_pixelMask.ptr[i*d_pixelMask.stride()+j] != OBJDET_MASK_ELEMENT_INVALID_32U)
{
h_pixelMask.ptr[i*d_pixelMask.stride()+j] = (i << 16) | j;
}
}
else
{
h_pixelMask.ptr[i*d_pixelMask.stride()+j] = OBJDET_MASK_ELEMENT_INVALID_32U;
}
}
}
NCV_SKIP_COND_END
#endif
NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment(), anchorsRoi.height * d_pixelMask.stride());
ncvAssertReturn(d_vecPixelMask.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);
NCVVectorAlloc<Ncv32u> d_vecPixelMaskTmp(gpuAllocator, d_vecPixelMask.length());
ncvAssertReturn(d_vecPixelMaskTmp.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVVectorAlloc<Ncv32u> hp_pool32u(cpuAllocator, 2);
ncvAssertReturn(hp_pool32u.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
Ncv32u *hp_zero = &hp_pool32u.ptr()[0];
Ncv32u *hp_numDet = &hp_pool32u.ptr()[1];
NCV_SKIP_COND_BEGIN
*hp_zero = 0;
*hp_numDet = 0;
NCV_SKIP_COND_END
Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) *
(haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER));
NcvBool bTexCacheCascade = devProp.major < 2;
NcvBool bTexCacheIImg = true; //this works better even on Fermi so far
NcvBool bDoAtomicCompaction = devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);
NCVVector<Ncv32u> *d_ptrNowData = &d_vecPixelMask;
NCVVector<Ncv32u> *d_ptrNowTmp = &d_vecPixelMaskTmp;
Ncv32u szNppCompactTmpBuf;
nppsStCompactGetSize_32u(d_vecPixelMask.length(), &szNppCompactTmpBuf, devProp);
if (bDoAtomicCompaction)
{
szNppCompactTmpBuf = 0;
}
NCVVectorAlloc<Ncv8u> d_tmpBufCompact(gpuAllocator, szNppCompactTmpBuf);
NCV_SKIP_COND_BEGIN
if (bTexCacheIImg)
{
cudaChannelFormatDesc cfdTexIImage;
cfdTexIImage = cudaCreateChannelDesc<Ncv32u>();
size_t alignmentOffset;
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texIImage, d_integralImage.ptr(), cfdTexIImage,
(anchorsRoi.height + haar.ClassifierSize.height) * d_integralImage.pitch()), NCV_CUDA_ERROR);
ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
}
if (bTexCacheCascade)
{
cudaChannelFormatDesc cfdTexHaarFeatures;
cudaChannelFormatDesc cfdTexHaarClassifierNodes;
cfdTexHaarFeatures = cudaCreateChannelDesc<uint2>();
cfdTexHaarClassifierNodes = cudaCreateChannelDesc<uint4>();
size_t alignmentOffset;
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarFeatures,
d_HaarFeatures.ptr(), cfdTexHaarFeatures,sizeof(HaarFeature64) * haar.NumFeatures), NCV_CUDA_ERROR);
ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarClassifierNodes,
d_HaarNodes.ptr(), cfdTexHaarClassifierNodes, sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes), NCV_CUDA_ERROR);
ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
}
Ncv32u stageStartAnchorParallel = 0;
Ncv32u stageMiddleSwitch = getStageNumWithNotLessThanNclassifiers(NUM_THREADS_CLASSIFIERPARALLEL,
haar, h_HaarStages);
Ncv32u stageEndClassifierParallel = haar.NumStages;
if (stageMiddleSwitch == 0)
{
stageMiddleSwitch = 1;
}
//create stages subdivision for pixel-parallel processing
const Ncv32u compactEveryNstage = bDoAtomicCompaction ? 7 : 1;
Ncv32u curStop = stageStartAnchorParallel;
std::vector<Ncv32u> pixParallelStageStops;
while (curStop < stageMiddleSwitch)
{
pixParallelStageStops.push_back(curStop);
curStop += compactEveryNstage;
}
if (curStop > compactEveryNstage && curStop - stageMiddleSwitch > compactEveryNstage / 2)
{
pixParallelStageStops[pixParallelStageStops.size()-1] =
(stageMiddleSwitch - (curStop - 2 * compactEveryNstage)) / 2;
}
pixParallelStageStops.push_back(stageMiddleSwitch);
Ncv32u pixParallelStageStopsIndex = 0;
if (pixelStep != 1 || bMaskElements)
{
if (bDoAtomicCompaction)
{
ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
}
dim3 gridInit((((anchorsRoi.width + pixelStep - 1) / pixelStep + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),
(anchorsRoi.height + pixelStep - 1) / pixelStep);
dim3 blockInit(NUM_THREADS_ANCHORSPARALLEL);
if (gridInit.x == 0 || gridInit.y == 0)
{
numDetections = 0;
return NCV_SUCCESS;
}
initializeMaskVectorDynTemplate(bMaskElements,
bDoAtomicCompaction,
gridInit, blockInit, cuStream,
d_ptrNowData->ptr(),
d_ptrNowTmp->ptr(),
d_vecPixelMask.length(), d_pixelMask.stride(),
anchorsRoi, pixelStep);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
if (bDoAtomicCompaction)
{
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
swap(d_ptrNowData, d_ptrNowTmp);
}
else
{
NCVStatus nppSt;
nppSt = nppsStCompact_32u(d_ptrNowTmp->ptr(), d_vecPixelMask.length(),
d_ptrNowData->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
ncvAssertReturn(nppSt == NPPST_SUCCESS, NCV_NPP_ERROR);
}
numDetections = *hp_numDet;
}
else
{
//
// 1. Run the first pixel-input pixel-parallel classifier for few stages
//
if (bDoAtomicCompaction)
{
ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
}
dim3 grid1(((d_pixelMask.stride() + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),
anchorsRoi.height);
dim3 block1(NUM_THREADS_ANCHORSPARALLEL);
applyHaarClassifierAnchorParallelDynTemplate(
true, //tbInitMaskPositively
bTexCacheIImg, //tbCacheTextureIImg
bTexCacheCascade, //tbCacheTextureCascade
pixParallelStageStops[pixParallelStageStopsIndex] != 0,//tbReadPixelIndexFromVector
bDoAtomicCompaction, //tbDoAtomicCompaction
grid1,
block1,
cuStream,
d_integralImage.ptr(), d_integralImage.stride(),
d_weights.ptr(), d_weights.stride(),
d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
d_ptrNowData->ptr(),
bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
0,
d_pixelMask.stride(),
anchorsRoi,
pixParallelStageStops[pixParallelStageStopsIndex],
pixParallelStageStops[pixParallelStageStopsIndex+1],
scaleAreaPixels);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
if (bDoAtomicCompaction)
{
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
}
else
{
NCVStatus nppSt;
nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), d_vecPixelMask.length(),
d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
ncvAssertReturnNcvStat(nppSt);
}
swap(d_ptrNowData, d_ptrNowTmp);
numDetections = *hp_numDet;
pixParallelStageStopsIndex++;
}
//
// 2. Run pixel-parallel stages
//
for (; pixParallelStageStopsIndex < pixParallelStageStops.size()-1; pixParallelStageStopsIndex++)
{
if (numDetections == 0)
{
break;
}
if (bDoAtomicCompaction)
{
ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
}
dim3 grid2((numDetections + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL);
if (numDetections > MAX_GRID_DIM)
{
grid2.x = MAX_GRID_DIM;
grid2.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM;
}
dim3 block2(NUM_THREADS_ANCHORSPARALLEL);
applyHaarClassifierAnchorParallelDynTemplate(
false, //tbInitMaskPositively
bTexCacheIImg, //tbCacheTextureIImg
bTexCacheCascade, //tbCacheTextureCascade
pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements,//tbReadPixelIndexFromVector
bDoAtomicCompaction, //tbDoAtomicCompaction
grid2,
block2,
cuStream,
d_integralImage.ptr(), d_integralImage.stride(),
d_weights.ptr(), d_weights.stride(),
d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
d_ptrNowData->ptr(),
bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
numDetections,
d_pixelMask.stride(),
anchorsRoi,
pixParallelStageStops[pixParallelStageStopsIndex],
pixParallelStageStops[pixParallelStageStopsIndex+1],
scaleAreaPixels);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
if (bDoAtomicCompaction)
{
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
}
else
{
NCVStatus nppSt;
nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,
d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
ncvAssertReturnNcvStat(nppSt);
}
swap(d_ptrNowData, d_ptrNowTmp);
numDetections = *hp_numDet;
}
//
// 3. Run all left stages in one stage-parallel kernel
//
if (numDetections > 0 && stageMiddleSwitch < stageEndClassifierParallel)
{
if (bDoAtomicCompaction)
{
ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
}
dim3 grid3(numDetections);
if (numDetections > MAX_GRID_DIM)
{
grid3.x = MAX_GRID_DIM;
grid3.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM;
}
dim3 block3(NUM_THREADS_CLASSIFIERPARALLEL);
applyHaarClassifierClassifierParallelDynTemplate(
bTexCacheIImg, //tbCacheTextureIImg
bTexCacheCascade, //tbCacheTextureCascade
bDoAtomicCompaction, //tbDoAtomicCompaction
grid3,
block3,
cuStream,
d_integralImage.ptr(), d_integralImage.stride(),
d_weights.ptr(), d_weights.stride(),
d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
d_ptrNowData->ptr(),
bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
numDetections,
d_pixelMask.stride(),
anchorsRoi,
stageMiddleSwitch,
stageEndClassifierParallel,
scaleAreaPixels);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
if (bDoAtomicCompaction)
{
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
}
else
{
NCVStatus nppSt;
nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,
d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
ncvAssertReturnNcvStat(nppSt);
}
swap(d_ptrNowData, d_ptrNowTmp);
numDetections = *hp_numDet;
}
if (d_ptrNowData != &d_vecPixelMask)
{
d_vecPixelMaskTmp.copySolid(d_vecPixelMask, cuStream);
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
}
#if defined _SELF_TEST_
ncvStat = d_pixelMask.copySolid(h_pixelMask_d, 0);
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
if (bDoAtomicCompaction)
{
std::sort(h_pixelMask_d.ptr, h_pixelMask_d.ptr + numDetections);
}
Ncv32u fpu_oldcw, fpu_cw;
_controlfp_s(&fpu_cw, 0, 0);
fpu_oldcw = fpu_cw;
_controlfp_s(&fpu_cw, _PC_24, _MCW_PC);
Ncv32u numDetGold;
ncvStat = ncvApplyHaarClassifierCascade_host(h_integralImage, h_weights, h_pixelMask, numDetGold, haar,
h_HaarStages, h_HaarNodes, h_HaarFeatures,
bMaskElements, anchorsRoi, pixelStep, scaleArea);
ncvAssertReturnNcvStat(ncvStat);
_controlfp_s(&fpu_cw, fpu_oldcw, _MCW_PC);
bool bPass = true;
if (numDetGold != numDetections)
{
printf("NCVHaarClassifierCascade::applyHaarClassifierCascade numdetections don't match: cpu=%d, gpu=%d\n", numDetGold, numDetections);
bPass = false;
}
else
{
for (Ncv32u i=0; i<std::max(numDetGold, numDetections) && bPass; i++)
{
if (h_pixelMask.ptr[i] != h_pixelMask_d.ptr[i])
{
printf("NCVHaarClassifierCascade::applyHaarClassifierCascade self test failed: i=%d, cpu=%d, gpu=%d\n", i, h_pixelMask.ptr[i], h_pixelMask_d.ptr[i]);
bPass = false;
}
}
}
printf("NCVHaarClassifierCascade::applyHaarClassifierCascade %s\n", bPass?"PASSED":"FAILED");
#endif
NCV_SKIP_COND_END
return NCV_SUCCESS;
}
//==============================================================================
//
// HypothesesOperations file
//
//==============================================================================
const Ncv32u NUM_GROW_THREADS = 128;
__device__ __host__ NcvRect32u pixelToRect(Ncv32u pixel, Ncv32u width, Ncv32u height, Ncv32f scale)
{
NcvRect32u res;
res.x = (Ncv32u)(scale * (pixel & 0xFFFF));
res.y = (Ncv32u)(scale * (pixel >> 16));
res.width = (Ncv32u)(scale * width);
res.height = (Ncv32u)(scale * height);
return res;
}
__global__ void growDetectionsKernel(Ncv32u *pixelMask, Ncv32u numElements,
NcvRect32u *hypotheses,
Ncv32u rectWidth, Ncv32u rectHeight, Ncv32f curScale)
{
Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
Ncv32u elemAddr = blockId * NUM_GROW_THREADS + threadIdx.x;
if (elemAddr >= numElements)
{
return;
}
hypotheses[elemAddr] = pixelToRect(pixelMask[elemAddr], rectWidth, rectHeight, curScale);
}
NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
Ncv32u numPixelMaskDetections,
NCVVector<NcvRect32u> &hypotheses,
Ncv32u &totalDetections,
Ncv32u totalMaxDetections,
Ncv32u rectWidth,
Ncv32u rectHeight,
Ncv32f curScale,
cudaStream_t cuStream)
{
ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR);
ncvAssertReturn(pixelMask.memType() == hypotheses.memType() &&
pixelMask.memType() == NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);
ncvAssertReturn(rectWidth > 0 && rectHeight > 0 && curScale > 0, NCV_INVALID_ROI);
ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE);
ncvAssertReturn(totalMaxDetections <= hypotheses.length() &&
numPixelMaskDetections <= pixelMask.length() &&
totalMaxDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT);
NCVStatus ncvStat = NCV_SUCCESS;
Ncv32u numDetsToCopy = numPixelMaskDetections;
if (numDetsToCopy == 0)
{
return ncvStat;
}
if (totalDetections + numPixelMaskDetections > totalMaxDetections)
{
ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
numDetsToCopy = totalMaxDetections - totalDetections;
}
dim3 block(NUM_GROW_THREADS);
dim3 grid((numDetsToCopy + NUM_GROW_THREADS - 1) / NUM_GROW_THREADS);
if (grid.x > 65535)
{
grid.y = (grid.x + 65534) / 65535;
grid.x = 65535;
}
growDetectionsKernel<<<grid, block, 0, cuStream>>>(pixelMask.ptr(), numDetsToCopy,
hypotheses.ptr() + totalDetections,
rectWidth, rectHeight, curScale);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
totalDetections += numDetsToCopy;
return ncvStat;
}
//==============================================================================
//
// Visualize file
//
//==============================================================================
const Ncv32u NUMTHREADS_DRAWRECTS = 32;
const Ncv32u NUMTHREADS_DRAWRECTS_LOG2 = 5;
template <class T>
__global__ void drawRects(T *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
T color)
{
Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
if (blockId > numRects * 4)
{
return;
}
NcvRect32u curRect = d_rects[blockId >> 2];
NcvBool bVertical = blockId & 0x1;
NcvBool bTopLeft = blockId & 0x2;
Ncv32u pt0x, pt0y;
if (bVertical)
{
Ncv32u numChunks = (curRect.height + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;
pt0x = bTopLeft ? curRect.x : curRect.x + curRect.width - 1;
pt0y = curRect.y;
if (pt0x < dstWidth)
{
for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
{
Ncv32u ptY = pt0y + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
if (ptY < pt0y + curRect.height && ptY < dstHeight)
{
d_dst[ptY * dstStride + pt0x] = color;
}
}
}
}
else
{
Ncv32u numChunks = (curRect.width + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;
pt0x = curRect.x;
pt0y = bTopLeft ? curRect.y : curRect.y + curRect.height - 1;
if (pt0y < dstHeight)
{
for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
{
Ncv32u ptX = pt0x + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
if (ptX < pt0x + curRect.width && ptX < dstWidth)
{
d_dst[pt0y * dstStride + ptX] = color;
}
}
}
}
}
template <class T>
static NCVStatus drawRectsWrapperDevice(T *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
T color,
cudaStream_t cuStream)
{
ncvAssertReturn(d_dst != NULL && d_rects != NULL, NCV_NULL_PTR);
ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);
if (numRects == 0)
{
return NCV_SUCCESS;
}
#if defined _SELF_TEST_
T *h_dst;
ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * dstHeight * sizeof(T)), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * dstHeight * sizeof(T), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
NcvRect32s *h_rects;
ncvAssertCUDAReturn(cudaMallocHost(&h_rects, numRects * sizeof(NcvRect32s)), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_rects, d_rects, numRects * sizeof(NcvRect32s), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
ncvAssertReturnNcvStat(drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color));
#endif
dim3 grid(numRects * 4);
dim3 block(NUMTHREADS_DRAWRECTS);
if (grid.x > 65535)
{
grid.y = (grid.x + 65534) / 65535;
grid.x = 65535;
}
drawRects<T><<<grid, block>>>(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
#if defined _SELF_TEST_
T *h_dst_after;
ncvAssertCUDAReturn(cudaMallocHost(&h_dst_after, dstStride * dstHeight * sizeof(T)), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_dst_after, d_dst, dstStride * dstHeight * sizeof(T), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
bool bPass = true;
for (Ncv32u i=0; i<dstHeight && bPass; i++)
{
for (Ncv32u j=0; j<dstWidth && bPass; j++)
{
if (h_dst[i*dstStride+j] != h_dst_after[i*dstStride+j])
{
printf("::drawRectsWrapperDevice self test failed: i=%d, j=%d, cpu=%d, gpu=%d\n", i, j, h_dst[i*dstStride+j], h_dst_after[i*dstStride+j]);
bPass = false;
}
}
}
ncvAssertCUDAReturn(cudaFreeHost(h_dst_after), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaFreeHost(h_dst), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaFreeHost(h_rects), NCV_CUDA_ERROR);
printf("::drawRectsWrapperDevice %s\n", bPass?"PASSED":"FAILED");
#endif
return NCV_SUCCESS;
}
NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
Ncv8u color,
cudaStream_t cuStream)
{
return drawRectsWrapperDevice(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color, cuStream);
}
NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
Ncv32u color,
cudaStream_t cuStream)
{
return drawRectsWrapperDevice(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color, cuStream);
}
//==============================================================================
//
// Pipeline file
//
//==============================================================================
NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
NcvSize32u srcRoi,
NCVVector<NcvRect32u> &d_dstRects,
Ncv32u &dstNumRects,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarStage64> &d_HaarStages,
NCVVector<HaarClassifierNode128> &d_HaarNodes,
NCVVector<HaarFeature64> &d_HaarFeatures,
NcvSize32u minObjSize,
Ncv32u minNeighbors, //default 4
Ncv32f scaleStep, //default 1.2f
Ncv32u pixelStep, //default 1
Ncv32u flags, //default NCVPipeObjDet_Default
INCVMemAllocator &gpuAllocator,
INCVMemAllocator &cpuAllocator,
cudaDeviceProp &devProp,
cudaStream_t cuStream)
{
ncvAssertReturn(d_srcImg.memType() == d_dstRects.memType() &&
d_srcImg.memType() == gpuAllocator.memType() &&
(d_srcImg.memType() == NCVMemoryTypeDevice ||
d_srcImg.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);
ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() &&
d_HaarStages.memType() == d_HaarFeatures.memType() &&
(d_HaarStages.memType() == NCVMemoryTypeDevice ||
d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);
ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);
ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);
ncvAssertReturn((d_srcImg.ptr() != NULL && d_dstRects.ptr() != NULL &&
h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL &&
d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR);
ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0 &&
d_srcImg.width() >= srcRoi.width && d_srcImg.height() >= srcRoi.height &&
srcRoi.width >= minObjSize.width && srcRoi.height >= minObjSize.height &&
d_dstRects.length() >= 1, NCV_DIMENSIONS_INVALID);
ncvAssertReturn(scaleStep > 1.0f, NCV_INVALID_SCALE);
ncvAssertReturn(d_HaarStages.length() >= haar.NumStages &&
d_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
d_HaarFeatures.length() >= haar.NumFeatures &&
d_HaarStages.length() == h_HaarStages.length() &&
haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);
ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES);
ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);
//TODO: set NPP active stream to cuStream
NCVStatus ncvStat;
NCV_SET_SKIP_COND(gpuAllocator.isCounting());
Ncv32u integralWidth = d_srcImg.width() + 1;
Ncv32u integralHeight = d_srcImg.height() + 1;
NCVMatrixAlloc<Ncv32u> d_integralImage(gpuAllocator, integralWidth, integralHeight);
ncvAssertReturn(d_integralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVMatrixAlloc<Ncv64u> d_sqIntegralImage(gpuAllocator, integralWidth, integralHeight);
ncvAssertReturn(d_sqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVMatrixAlloc<Ncv32f> d_rectStdDev(gpuAllocator, d_srcImg.width(), d_srcImg.height());
ncvAssertReturn(d_rectStdDev.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVMatrixAlloc<Ncv32u> d_pixelMask(gpuAllocator, d_srcImg.width(), d_srcImg.height());
ncvAssertReturn(d_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVMatrixAlloc<Ncv32u> d_scaledIntegralImage(gpuAllocator, integralWidth, integralHeight);
ncvAssertReturn(d_scaledIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVMatrixAlloc<Ncv64u> d_scaledSqIntegralImage(gpuAllocator, integralWidth, integralHeight);
ncvAssertReturn(d_scaledSqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVVectorAlloc<NcvRect32u> d_hypothesesIntermediate(gpuAllocator, d_srcImg.width() * d_srcImg.height());
ncvAssertReturn(d_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVVectorAlloc<NcvRect32u> h_hypothesesIntermediate(cpuAllocator, d_srcImg.width() * d_srcImg.height());
ncvAssertReturn(h_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCVStatus nppStat;
Ncv32u szTmpBufIntegral, szTmpBufSqIntegral;
nppStat = nppiStIntegralGetSize_8u32u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufIntegral, devProp);
ncvAssertReturnNcvStat(nppStat);
nppStat = nppiStSqrIntegralGetSize_8u64u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufSqIntegral, devProp);
ncvAssertReturnNcvStat(nppStat);
NCVVectorAlloc<Ncv8u> d_tmpIIbuf(gpuAllocator, std::max(szTmpBufIntegral, szTmpBufSqIntegral));
ncvAssertReturn(d_tmpIIbuf.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
NCV_SKIP_COND_BEGIN
nppStat = nppiStIntegral_8u32u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),
d_integralImage.ptr(), d_integralImage.pitch(),
NcvSize32u(d_srcImg.width(), d_srcImg.height()),
d_tmpIIbuf.ptr(), szTmpBufIntegral, devProp);
ncvAssertReturnNcvStat(nppStat);
nppStat = nppiStSqrIntegral_8u64u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),
d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
NcvSize32u(d_srcImg.width(), d_srcImg.height()),
d_tmpIIbuf.ptr(), szTmpBufSqIntegral, devProp);
ncvAssertReturnNcvStat(nppStat);
NCV_SKIP_COND_END
dstNumRects = 0;
Ncv32u lastCheckedScale = 0;
NcvBool bReverseTraverseScale = ((flags & NCVPipeObjDet_FindLargestObject) != 0);
std::vector<Ncv32u> scalesVector;
NcvBool bFoundLargestFace = false;
for (Ncv32f scaleIter = 1.0f; ; scaleIter *= scaleStep)
{
Ncv32u scale = (Ncv32u)scaleIter;
if (lastCheckedScale == scale)
{
continue;
}
lastCheckedScale = scale;
if (haar.ClassifierSize.width * (Ncv32s)scale < minObjSize.width ||
haar.ClassifierSize.height * (Ncv32s)scale < minObjSize.height)
{
continue;
}
NcvSize32s srcRoi, srcIIRoi, scaledIIRoi, searchRoi;
srcRoi.width = d_srcImg.width();
srcRoi.height = d_srcImg.height();
srcIIRoi.width = srcRoi.width + 1;
srcIIRoi.height = srcRoi.height + 1;
scaledIIRoi.width = srcIIRoi.width / scale;
scaledIIRoi.height = srcIIRoi.height / scale;
searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width;
searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height;
if (searchRoi.width <= 0 || searchRoi.height <= 0)
{
break;
}
scalesVector.push_back(scale);
if (gpuAllocator.isCounting())
{
break;
}
}
if (bReverseTraverseScale)
{
std::reverse(scalesVector.begin(), scalesVector.end());
}
//TODO: handle _fair_scale_ flag
for (Ncv32u i=0; i<scalesVector.size(); i++)
{
Ncv32u scale = scalesVector[i];
NcvSize32u srcRoi, scaledIIRoi, searchRoi;
NcvSize32u srcIIRoi;
srcRoi.width = d_srcImg.width();
srcRoi.height = d_srcImg.height();
srcIIRoi.width = srcRoi.width + 1;
srcIIRoi.height = srcRoi.height + 1;
scaledIIRoi.width = srcIIRoi.width / scale;
scaledIIRoi.height = srcIIRoi.height / scale;
searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width;
searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height;
NCV_SKIP_COND_BEGIN
nppStat = nppiStDownsampleNearest_32u_C1R(
d_integralImage.ptr(), d_integralImage.pitch(),
d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
srcIIRoi, scale, true);
ncvAssertReturnNcvStat(nppStat);
nppStat = nppiStDownsampleNearest_64u_C1R(
d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
srcIIRoi, scale, true);
ncvAssertReturnNcvStat(nppStat);
const NcvRect32u rect(
HAAR_STDDEV_BORDER,
HAAR_STDDEV_BORDER,
haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER,
haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER);
nppStat = nppiStRectStdDev_32f_C1R(
d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
d_rectStdDev.ptr(), d_rectStdDev.pitch(),
NcvSize32u(searchRoi.width, searchRoi.height), rect,
(Ncv32f)scale*scale, true);
ncvAssertReturnNcvStat(nppStat);
NCV_SKIP_COND_END
Ncv32u detectionsOnThisScale;
ncvStat = ncvApplyHaarClassifierCascade_device(
d_scaledIntegralImage, d_rectStdDev, d_pixelMask,
detectionsOnThisScale,
haar, h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures, false,
searchRoi, pixelStep, (Ncv32f)scale*scale,
gpuAllocator, cpuAllocator, devProp, cuStream);
ncvAssertReturnNcvStat(nppStat);
NCV_SKIP_COND_BEGIN
NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment());
ncvStat = ncvGrowDetectionsVector_device(
d_vecPixelMask,
detectionsOnThisScale,
d_hypothesesIntermediate,
dstNumRects,
d_hypothesesIntermediate.length(),
haar.ClassifierSize.width,
haar.ClassifierSize.height,
(Ncv32f)scale,
cuStream);
ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
if (flags & NCVPipeObjDet_FindLargestObject)
{
if (dstNumRects == 0)
{
continue;
}
if (dstNumRects != 0)
{
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream,
dstNumRects * sizeof(NcvRect32u));
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
}
Ncv32u numStrongHypothesesNow = dstNumRects;
ncvStat = ncvFilterHypotheses_host(
h_hypothesesIntermediate,
numStrongHypothesesNow,
minNeighbors,
RECT_SIMILARITY_PROPORTION,
NULL);
ncvAssertReturnNcvStat(ncvStat);
if (numStrongHypothesesNow > 0)
{
NcvRect32u maxRect = h_hypothesesIntermediate.ptr()[0];
for (Ncv32u j=1; j<numStrongHypothesesNow; j++)
{
if (maxRect.width < h_hypothesesIntermediate.ptr()[j].width)
{
maxRect = h_hypothesesIntermediate.ptr()[j];
}
}
h_hypothesesIntermediate.ptr()[0] = maxRect;
dstNumRects = 1;
ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream, sizeof(NcvRect32u));
ncvAssertReturnNcvStat(ncvStat);
bFoundLargestFace = true;
break;
}
}
NCV_SKIP_COND_END
if (gpuAllocator.isCounting())
{
break;
}
}
NCVStatus ncvRetCode = NCV_SUCCESS;
NCV_SKIP_COND_BEGIN
if (flags & NCVPipeObjDet_FindLargestObject)
{
if (!bFoundLargestFace)
{
dstNumRects = 0;
}
}
else
{
//TODO: move hypotheses filtration to GPU pipeline (the only CPU-resident element of the pipeline left)
if (dstNumRects != 0)
{
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream,
dstNumRects * sizeof(NcvRect32u));
ncvAssertReturnNcvStat(ncvStat);
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
}
ncvStat = ncvFilterHypotheses_host(
h_hypothesesIntermediate,
dstNumRects,
minNeighbors,
RECT_SIMILARITY_PROPORTION,
NULL);
ncvAssertReturnNcvStat(ncvStat);
if (dstNumRects > d_dstRects.length())
{
ncvRetCode = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
dstNumRects = d_dstRects.length();
}
if (dstNumRects != 0)
{
ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream,
dstNumRects * sizeof(NcvRect32u));
ncvAssertReturnNcvStat(ncvStat);
}
}
if (flags & NCVPipeObjDet_VisualizeInPlace)
{
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
ncvDrawRects_8u_device(d_srcImg.ptr(), d_srcImg.stride(),
d_srcImg.width(), d_srcImg.height(),
d_dstRects.ptr(), dstNumRects, 255, cuStream);
}
NCV_SKIP_COND_END
return ncvRetCode;
}
//==============================================================================
//
// Purely Host code: classifier IO, mock-ups
//
//==============================================================================
#ifdef _SELF_TEST_
#include <float.h>
#endif
NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
NCVMatrix<Ncv32f> &h_weights,
NCVMatrixAlloc<Ncv32u> &h_pixelMask,
Ncv32u &numDetections,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarClassifierNode128> &h_HaarNodes,
NCVVector<HaarFeature64> &h_HaarFeatures,
NcvBool bMaskElements,
NcvSize32u anchorsRoi,
Ncv32u pixelStep,
Ncv32f scaleArea)
{
ncvAssertReturn(h_integralImage.memType() == h_weights.memType() &&
h_integralImage.memType() == h_pixelMask.memType() &&
(h_integralImage.memType() == NCVMemoryTypeHostPageable ||
h_integralImage.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR);
ncvAssertReturn(h_HaarStages.memType() == h_HaarNodes.memType() &&
h_HaarStages.memType() == h_HaarFeatures.memType() &&
(h_HaarStages.memType() == NCVMemoryTypeHostPageable ||
h_HaarStages.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR);
ncvAssertReturn(h_integralImage.ptr() != NULL && h_weights.ptr() != NULL && h_pixelMask.ptr() != NULL &&
h_HaarStages.ptr() != NULL && h_HaarNodes.ptr() != NULL && h_HaarFeatures.ptr() != NULL, NCV_NULL_PTR);
ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 &&
h_pixelMask.width() >= anchorsRoi.width && h_pixelMask.height() >= anchorsRoi.height &&
h_weights.width() >= anchorsRoi.width && h_weights.height() >= anchorsRoi.height &&
h_integralImage.width() >= anchorsRoi.width + haar.ClassifierSize.width &&
h_integralImage.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID);
ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE);
ncvAssertReturn(h_HaarStages.length() >= haar.NumStages &&
h_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
h_HaarFeatures.length() >= haar.NumFeatures &&
h_HaarStages.length() == h_HaarStages.length() &&
haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);
ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES);
ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);
Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) *
(haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER));
for (Ncv32u i=0; i<anchorsRoi.height; i++)
{
for (Ncv32u j=0; j<h_pixelMask.stride(); j++)
{
if (i % pixelStep != 0 || j % pixelStep != 0 || j >= anchorsRoi.width)
{
h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U;
}
else
{
for (Ncv32u iStage = 0; iStage < haar.NumStages; iStage++)
{
Ncv32f curStageSum = 0.0f;
Ncv32u numRootNodesInStage = h_HaarStages.ptr()[iStage].getNumClassifierRootNodes();
Ncv32u curRootNodeOffset = h_HaarStages.ptr()[iStage].getStartClassifierRootNodeOffset();
if (iStage == 0)
{
if (bMaskElements && h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U)
{
break;
}
else
{
h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = ((i << 16) | j);
}
}
else if (h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U)
{
break;
}
while (numRootNodesInStage--)
{
NcvBool bMoreNodesToTraverse = true;
Ncv32u curNodeOffset = curRootNodeOffset;
while (bMoreNodesToTraverse)
{
HaarClassifierNode128 curNode = h_HaarNodes.ptr()[curNodeOffset];
Ncv32u curNodeFeaturesNum = curNode.getFeatureDesc().getNumFeatures();
Ncv32u curNodeFeaturesOffs = curNode.getFeatureDesc().getFeaturesOffset();
Ncv32f curNodeVal = 0.f;
for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
{
HaarFeature64 feature = h_HaarFeatures.ptr()[curNodeFeaturesOffs + iRect];
Ncv32u rectX, rectY, rectWidth, rectHeight;
feature.getRect(&rectX, &rectY, &rectWidth, &rectHeight);
Ncv32f rectWeight = feature.getWeight();
Ncv32u iioffsTL = (i + rectY) * h_integralImage.stride() + (j + rectX);
Ncv32u iioffsTR = iioffsTL + rectWidth;
Ncv32u iioffsBL = iioffsTL + rectHeight * h_integralImage.stride();
Ncv32u iioffsBR = iioffsBL + rectWidth;
Ncv32u iivalTL = h_integralImage.ptr()[iioffsTL];
Ncv32u iivalTR = h_integralImage.ptr()[iioffsTR];
Ncv32u iivalBL = h_integralImage.ptr()[iioffsBL];
Ncv32u iivalBR = h_integralImage.ptr()[iioffsBR];
Ncv32u rectSum = iivalBR - iivalBL + iivalTL - iivalTR;
curNodeVal += (Ncv32f)rectSum * rectWeight;
}
HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
Ncv32f nodeThreshold = curNode.getThreshold();
HaarClassifierNodeDescriptor32 nextNodeDescriptor;
if (curNodeVal < scaleAreaPixels * h_weights.ptr()[i * h_weights.stride() + j] * nodeThreshold)
{
nextNodeDescriptor = nodeLeft;
}
else
{
nextNodeDescriptor = nodeRight;
}
NcvBool tmpIsLeaf = nextNodeDescriptor.isLeaf();
if (tmpIsLeaf)
{
Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValueHost();
curStageSum += tmpLeafValue;
bMoreNodesToTraverse = false;
}
else
{
curNodeOffset = nextNodeDescriptor.getNextNodeOffset();
}
}
curRootNodeOffset++;
}
Ncv32f tmpStageThreshold = h_HaarStages.ptr()[iStage].getStageThreshold();
if (curStageSum < tmpStageThreshold)
{
//drop
h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U;
break;
}
}
}
}
}
std::sort(h_pixelMask.ptr(), h_pixelMask.ptr() + anchorsRoi.height * h_pixelMask.stride());
Ncv32u i = 0;
for (; i<anchorsRoi.height * h_pixelMask.stride(); i++)
{
if (h_pixelMask.ptr()[i] == OBJDET_MASK_ELEMENT_INVALID_32U)
{
break;
}
}
numDetections = i;
return NCV_SUCCESS;
}
NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
Ncv32u numPixelMaskDetections,
NCVVector<NcvRect32u> &hypotheses,
Ncv32u &totalDetections,
Ncv32u totalMaxDetections,
Ncv32u rectWidth,
Ncv32u rectHeight,
Ncv32f curScale)
{
ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR);
ncvAssertReturn(pixelMask.memType() == hypotheses.memType() &&
pixelMask.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);
ncvAssertReturn(rectWidth > 0 && rectHeight > 0 && curScale > 0, NCV_INVALID_ROI);
ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE);
ncvAssertReturn(totalMaxDetections <= hypotheses.length() &&
numPixelMaskDetections <= pixelMask.length() &&
totalMaxDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT);
NCVStatus ncvStat = NCV_SUCCESS;
Ncv32u numDetsToCopy = numPixelMaskDetections;
if (numDetsToCopy == 0)
{
return ncvStat;
}
if (totalDetections + numPixelMaskDetections > totalMaxDetections)
{
ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
numDetsToCopy = totalMaxDetections - totalDetections;
}
for (Ncv32u i=0; i<numDetsToCopy; i++)
{
hypotheses.ptr()[totalDetections + i] = pixelToRect(pixelMask.ptr()[i], rectWidth, rectHeight, curScale);
}
totalDetections += numDetsToCopy;
return ncvStat;
}
NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,
Ncv32u &numHypotheses,
Ncv32u minNeighbors,
Ncv32f intersectEps,
NCVVector<Ncv32u> *hypothesesWeights)
{
ncvAssertReturn(hypotheses.memType() == NCVMemoryTypeHostPageable ||
hypotheses.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
if (hypothesesWeights != NULL)
{
ncvAssertReturn(hypothesesWeights->memType() == NCVMemoryTypeHostPageable ||
hypothesesWeights->memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
}
if (numHypotheses == 0)
{
return NCV_SUCCESS;
}
std::vector<NcvRect32u> rects(numHypotheses);
memcpy(&rects[0], hypotheses.ptr(), numHypotheses * sizeof(NcvRect32u));
std::vector<Ncv32u> weights;
if (hypothesesWeights != NULL)
{
groupRectangles(rects, minNeighbors, intersectEps, &weights);
}
else
{
groupRectangles(rects, minNeighbors, intersectEps, NULL);
}
numHypotheses = (Ncv32u)rects.size();
if (numHypotheses > 0)
{
memcpy(hypotheses.ptr(), &rects[0], numHypotheses * sizeof(NcvRect32u));
}
if (hypothesesWeights != NULL)
{
memcpy(hypothesesWeights->ptr(), &weights[0], numHypotheses * sizeof(Ncv32u));
}
return NCV_SUCCESS;
}
template <class T>
static NCVStatus drawRectsWrapperHost(T *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *h_rects,
Ncv32u numRects,
T color)
{
ncvAssertReturn(h_dst != NULL && h_rects != NULL, NCV_NULL_PTR);
ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
ncvAssertReturn(numRects != 0, NCV_SUCCESS);
ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);
for (Ncv32u i=0; i<numRects; i++)
{
NcvRect32u rect = h_rects[i];
if (rect.x < dstWidth)
{
for (Ncv32u i=rect.y; i<rect.y+rect.height && i<dstHeight; i++)
{
h_dst[i*dstStride+rect.x] = color;
}
}
if (rect.x+rect.width-1 < dstWidth)
{
for (Ncv32u i=rect.y; i<rect.y+rect.height && i<dstHeight; i++)
{
h_dst[i*dstStride+rect.x+rect.width-1] = color;
}
}
if (rect.y < dstHeight)
{
for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)
{
h_dst[rect.y*dstStride+j] = color;
}
}
if (rect.y + rect.height - 1 < dstHeight)
{
for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)
{
h_dst[(rect.y+rect.height-1)*dstStride+j] = color;
}
}
}
return NCV_SUCCESS;
}
NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *h_rects,
Ncv32u numRects,
Ncv8u color)
{
return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);
}
NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *h_rects,
Ncv32u numRects,
Ncv32u color)
{
return drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color);
}
NCVStatus loadFromXML(const std::string &filename,
HaarClassifierCascadeDescriptor &haar,
std::vector<HaarStage64> &haarStages,
std::vector<HaarClassifierNode128> &haarClassifierNodes,
std::vector<HaarFeature64> &haarFeatures);
#define NVBIN_HAAR_SIZERESERVED 16
#define NVBIN_HAAR_VERSION 0x1
static NCVStatus loadFromNVBIN(const std::string &filename,
HaarClassifierCascadeDescriptor &haar,
std::vector<HaarStage64> &haarStages,
std::vector<HaarClassifierNode128> &haarClassifierNodes,
std::vector<HaarFeature64> &haarFeatures)
{
FILE *fp = fopen(filename.c_str(), "rb");
ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
Ncv32u fileVersion;
ncvAssertReturn(1 == fread(&fileVersion, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);
Ncv32u fsize;
ncvAssertReturn(1 == fread(&fsize, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
fseek(fp, 0, SEEK_END);
Ncv32u fsizeActual = ftell(fp);
ncvAssertReturn(fsize == fsizeActual, NCV_FILE_ERROR);
std::vector<unsigned char> fdata;
fdata.resize(fsize);
Ncv32u dataOffset = 0;
fseek(fp, 0, SEEK_SET);
ncvAssertReturn(1 == fread(&fdata[0], fsize, 1, fp), NCV_FILE_ERROR);
fclose(fp);
//data
dataOffset = NVBIN_HAAR_SIZERESERVED;
haar.NumStages = *(Ncv32u *)(&fdata[0]+dataOffset);
dataOffset += sizeof(Ncv32u);
haar.NumClassifierRootNodes = *(Ncv32u *)(&fdata[0]+dataOffset);
dataOffset += sizeof(Ncv32u);
haar.NumClassifierTotalNodes = *(Ncv32u *)(&fdata[0]+dataOffset);
dataOffset += sizeof(Ncv32u);
haar.NumFeatures = *(Ncv32u *)(&fdata[0]+dataOffset);
dataOffset += sizeof(Ncv32u);
haar.ClassifierSize = *(NcvSize32u *)(&fdata[0]+dataOffset);
dataOffset += sizeof(NcvSize32u);
haar.bNeedsTiltedII = *(NcvBool *)(&fdata[0]+dataOffset);
dataOffset += sizeof(NcvBool);
haar.bHasStumpsOnly = *(NcvBool *)(&fdata[0]+dataOffset);
dataOffset += sizeof(NcvBool);
haarStages.resize(haar.NumStages);
haarClassifierNodes.resize(haar.NumClassifierTotalNodes);
haarFeatures.resize(haar.NumFeatures);
Ncv32u szStages = haar.NumStages * sizeof(HaarStage64);
Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128);
Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64);
memcpy(&haarStages[0], &fdata[0]+dataOffset, szStages);
dataOffset += szStages;
memcpy(&haarClassifierNodes[0], &fdata[0]+dataOffset, szClassifiers);
dataOffset += szClassifiers;
memcpy(&haarFeatures[0], &fdata[0]+dataOffset, szFeatures);
dataOffset += szFeatures;
return NCV_SUCCESS;
}
NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,
Ncv32u &numNodes, Ncv32u &numFeatures)
{
NCVStatus ncvStat;
std::string fext = filename.substr(filename.find_last_of(".") + 1);
std::transform(fext.begin(), fext.end(), fext.begin(), ::tolower);
if (fext == "nvbin")
{
FILE *fp = fopen(filename.c_str(), "rb");
ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
Ncv32u fileVersion;
ncvAssertReturn(1 == fread(&fileVersion, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);
fseek(fp, NVBIN_HAAR_SIZERESERVED, SEEK_SET);
Ncv32u tmp;
ncvAssertReturn(1 == fread(&numStages, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
ncvAssertReturn(1 == fread(&tmp, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
ncvAssertReturn(1 == fread(&numNodes, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
ncvAssertReturn(1 == fread(&numFeatures, sizeof(Ncv32u), 1, fp), NCV_FILE_ERROR);
fclose(fp);
}
else if (fext == "xml")
{
HaarClassifierCascadeDescriptor haar;
std::vector<HaarStage64> haarStages;
std::vector<HaarClassifierNode128> haarNodes;
std::vector<HaarFeature64> haarFeatures;
ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures);
ncvAssertReturnNcvStat(ncvStat);
numStages = haar.NumStages;
numNodes = haar.NumClassifierTotalNodes;
numFeatures = haar.NumFeatures;
}
else
{
return NCV_HAAR_XML_LOADING_EXCEPTION;
}
return NCV_SUCCESS;
}
NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarClassifierNode128> &h_HaarNodes,
NCVVector<HaarFeature64> &h_HaarFeatures)
{
ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned &&
h_HaarNodes.memType() == NCVMemoryTypeHostPinned &&
h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
NCVStatus ncvStat;
std::string fext = filename.substr(filename.find_last_of(".") + 1);
std::transform(fext.begin(), fext.end(), fext.begin(), ::tolower);
std::vector<HaarStage64> haarStages;
std::vector<HaarClassifierNode128> haarNodes;
std::vector<HaarFeature64> haarFeatures;
if (fext == "nvbin")
{
ncvStat = loadFromNVBIN(filename, haar, haarStages, haarNodes, haarFeatures);
ncvAssertReturnNcvStat(ncvStat);
}
else if (fext == "xml")
{
ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures);
ncvAssertReturnNcvStat(ncvStat);
}
else
{
return NCV_HAAR_XML_LOADING_EXCEPTION;
}
ncvAssertReturn(h_HaarStages.length() >= haarStages.size(), NCV_MEM_INSUFFICIENT_CAPACITY);
ncvAssertReturn(h_HaarNodes.length() >= haarNodes.size(), NCV_MEM_INSUFFICIENT_CAPACITY);
ncvAssertReturn(h_HaarFeatures.length() >= haarFeatures.size(), NCV_MEM_INSUFFICIENT_CAPACITY);
memcpy(h_HaarStages.ptr(), &haarStages[0], haarStages.size()*sizeof(HaarStage64));
memcpy(h_HaarNodes.ptr(), &haarNodes[0], haarNodes.size()*sizeof(HaarClassifierNode128));
memcpy(h_HaarFeatures.ptr(), &haarFeatures[0], haarFeatures.size()*sizeof(HaarFeature64));
return NCV_SUCCESS;
}
NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename,
HaarClassifierCascadeDescriptor haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarClassifierNode128> &h_HaarNodes,
NCVVector<HaarFeature64> &h_HaarFeatures)
{
ncvAssertReturn(h_HaarStages.length() >= haar.NumStages, NCV_INCONSISTENT_INPUT);
ncvAssertReturn(h_HaarNodes.length() >= haar.NumClassifierTotalNodes, NCV_INCONSISTENT_INPUT);
ncvAssertReturn(h_HaarFeatures.length() >= haar.NumFeatures, NCV_INCONSISTENT_INPUT);
ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned &&
h_HaarNodes.memType() == NCVMemoryTypeHostPinned &&
h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
Ncv32u szStages = haar.NumStages * sizeof(HaarStage64);
Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128);
Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64);
Ncv32u dataOffset = 0;
std::vector<unsigned char> fdata;
fdata.resize(szStages+szClassifiers+szFeatures+1024, 0);
//header
*(Ncv32u *)(&fdata[0]+dataOffset) = NVBIN_HAAR_VERSION;
//data
dataOffset = NVBIN_HAAR_SIZERESERVED;
*(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumStages;
dataOffset += sizeof(Ncv32u);
*(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierRootNodes;
dataOffset += sizeof(Ncv32u);
*(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierTotalNodes;
dataOffset += sizeof(Ncv32u);
*(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumFeatures;
dataOffset += sizeof(Ncv32u);
*(NcvSize32u *)(&fdata[0]+dataOffset) = haar.ClassifierSize;
dataOffset += sizeof(NcvSize32u);
*(NcvBool *)(&fdata[0]+dataOffset) = haar.bNeedsTiltedII;
dataOffset += sizeof(NcvBool);
*(NcvBool *)(&fdata[0]+dataOffset) = haar.bHasStumpsOnly;
dataOffset += sizeof(NcvBool);
memcpy(&fdata[0]+dataOffset, h_HaarStages.ptr(), szStages);
dataOffset += szStages;
memcpy(&fdata[0]+dataOffset, h_HaarNodes.ptr(), szClassifiers);
dataOffset += szClassifiers;
memcpy(&fdata[0]+dataOffset, h_HaarFeatures.ptr(), szFeatures);
dataOffset += szFeatures;
Ncv32u fsize = dataOffset;
//TODO: CRC32 here
//update header
dataOffset = sizeof(Ncv32u);
*(Ncv32u *)(&fdata[0]+dataOffset) = fsize;
FILE *fp = fopen(filename.c_str(), "wb");
ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
fwrite(&fdata[0], fsize, 1, fp);
fclose(fp);
return NCV_SUCCESS;
}