[~] Minor refactoring, clean-up

[+] Added 128-bit transpose
Anton Obukhov 14 years ago
parent e2caf4a3ed
commit 0c325cace3
  1. 306
  2. 230
  3. 239
  4. 122
  5. 345
  6. 105
  7. 4
  8. 2
  9. 28

@ -63,8 +63,6 @@
#include "NCVRuntimeTemplates.hpp"
#include "NCVHaarObjectDetection.hpp"
void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights);
@ -785,7 +783,6 @@ void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively,
//Second parameter is the number of "dynamic" template parameters
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor>
::call( &functor,
0xC001C0DE, //this is dummy int for the va_args C compatibility
@ -890,7 +887,6 @@ void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg
//Second parameter is the number of "dynamic" template parameters
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor>
::call( &functor,
0xC001C0DE, //this is dummy int for the va_args C compatibility
@ -957,7 +953,6 @@ void initializeMaskVectorDynTemplate(NcvBool tbMaskByInmask,
//Second parameter is the number of "dynamic" template parameters
NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 2, initializeMaskVectorFunctor>
::call( &functor,
0xC001C0DE, //this is dummy int for the va_args C compatibility
@ -1554,172 +1549,6 @@ NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
// Visualize file
/**
 * Kernel that draws rectangle outlines into an image, one rectangle edge per thread block.
 *
 * Block layout: the linear block id is blockIdx.y * 65535 + blockIdx.x. Rectangle
 * (blockId >> 2) is handled by four consecutive blocks; bit 0 of the id selects a
 * vertical (1) or horizontal (0) edge, bit 1 selects the edge that starts at the
 * rectangle origin (non-zero) or the opposite one (zero).
 * Thread layout: threads walk the edge in chunks of NUMTHREADS_DRAWRECTS pixels,
 * offset by threadIdx.x (assumes blockDim.x == NUMTHREADS_DRAWRECTS - confirm at the launch site).
 *
 * \param d_dst      Destination image (device pointer), indexed as row * dstStride + column
 * \param dstStride  Row stride of d_dst, in elements
 * \param dstWidth   Image width in elements; pixels at or beyond it are not written
 * \param dstHeight  Image height in rows; pixels at or beyond it are not written
 * \param d_rects    Rectangles to draw (device pointer), numRects entries
 * \param numRects   Number of rectangles
 * \param color      Value written to every outline pixel
 */
template <class T>
__global__ void drawRects(T *d_dst,
                          Ncv32u dstStride,
                          Ncv32u dstWidth,
                          Ncv32u dstHeight,
                          NcvRect32u *d_rects,
                          Ncv32u numRects,
                          T color)
{
    Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;

    // Valid edge ids are [0, numRects * 4). A 2D grid may contain surplus blocks, and the
    // id numRects * 4 itself would index d_rects[numRects], so it must be rejected as well.
    if (blockId >= numRects * 4)
    {
        return;
    }

    NcvRect32u curRect = d_rects[blockId >> 2];
    NcvBool bVertical = blockId & 0x1;
    NcvBool bTopLeft = blockId & 0x2;

    Ncv32u pt0x, pt0y;
    if (bVertical)
    {
        // Left or right edge: fixed column, threads spread over the rows.
        Ncv32u numChunks = (curRect.height + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;

        pt0x = bTopLeft ? curRect.x : curRect.x + curRect.width - 1;
        pt0y = curRect.y;

        // Skip the whole edge when its column lies outside the image.
        if (pt0x < dstWidth)
        {
            for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
            {
                Ncv32u ptY = pt0y + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
                if (ptY < pt0y + curRect.height && ptY < dstHeight)
                {
                    d_dst[ptY * dstStride + pt0x] = color;
                }
            }
        }
    }
    else
    {
        // Top or bottom edge: fixed row, threads spread over the columns.
        Ncv32u numChunks = (curRect.width + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;

        pt0x = curRect.x;
        pt0y = bTopLeft ? curRect.y : curRect.y + curRect.height - 1;

        // Skip the whole edge when its row lies outside the image.
        if (pt0y < dstHeight)
        {
            for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
            {
                Ncv32u ptX = pt0x + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
                if (ptX < pt0x + curRect.width && ptX < dstWidth)
                {
                    d_dst[pt0y * dstStride + ptX] = color;
                }
            }
        }
    }
}
// Host-side launcher for the drawRects kernel: validates the arguments, sizes the grid
// as four blocks per rectangle, launches the kernel and checks the launch status.
// When _SELF_TEST_ is defined it also snapshots the image before the launch, draws the
// same rectangles on the host, and compares that result with the device output.
template <class T>
static NCVStatus drawRectsWrapperDevice(T *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
T color,
cudaStream_t cuStream)
// Argument validation: non-null pointers, non-empty image, stride at least one row wide,
// and no more rectangles than pixels.
ncvAssertReturn(d_dst != NULL && d_rects != NULL, NCV_NULL_PTR);
ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);
// NOTE(review): the statement guarded by this check is not visible in this view.
if (numRects == 0)
#if defined _SELF_TEST_
// Self-test, part 1: copy the untouched image and the rectangles to pinned host memory
// and draw the rectangles with the host implementation.
T *h_dst;
ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * dstHeight * sizeof(T)), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * dstHeight * sizeof(T), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
// NOTE(review): d_rects is NcvRect32u* but the host copy is typed NcvRect32s*, and it is
// then passed to drawRectsWrapperHost, which is declared with NcvRect32u* - confirm this
// self-test path still compiles.
NcvRect32s *h_rects;
ncvAssertCUDAReturn(cudaMallocHost(&h_rects, numRects * sizeof(NcvRect32s)), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_rects, d_rects, numRects * sizeof(NcvRect32s), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
ncvAssertReturnNcvStat(drawRectsWrapperHost(h_dst, dstStride, dstWidth, dstHeight, h_rects, numRects, color));
// One block per rectangle edge. When that exceeds 65535 blocks the grid is folded into
// two dimensions with grid.x fixed at 65535, matching the kernel's
// blockIdx.y * 65535 + blockIdx.x block id.
dim3 grid(numRects * 4);
if (grid.x > 65535)
grid.y = (grid.x + 65534) / 65535;
grid.x = 65535;
// NOTE(review): the launch configuration does not pass cuStream, so the stream parameter
// appears unused here; `block` is not declared in the visible lines - confirm both.
drawRects<T><<<grid, block>>>(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color);
ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
#if defined _SELF_TEST_
// Self-test, part 2: copy the device result back and compare it element by element with
// the host-drawn image, stopping at the first mismatch.
T *h_dst_after;
ncvAssertCUDAReturn(cudaMallocHost(&h_dst_after, dstStride * dstHeight * sizeof(T)), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_dst_after, d_dst, dstStride * dstHeight * sizeof(T), cudaMemcpyDeviceToHost), NCV_CUDA_ERROR);
bool bPass = true;
for (Ncv32u i=0; i<dstHeight && bPass; i++)
for (Ncv32u j=0; j<dstWidth && bPass; j++)
if (h_dst[i*dstStride+j] != h_dst_after[i*dstStride+j])
printf("::drawRectsWrapperDevice self test failed: i=%d, j=%d, cpu=%d, gpu=%d\n", i, j, h_dst[i*dstStride+j], h_dst_after[i*dstStride+j]);
bPass = false;
// Release the pinned host buffers and report the overall result.
ncvAssertCUDAReturn(cudaFreeHost(h_dst_after), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaFreeHost(h_dst), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaFreeHost(h_rects), NCV_CUDA_ERROR);
printf("::drawRectsWrapperDevice %s\n", bPass?"PASSED":"FAILED");
// 8-bit entry point for device-side rectangle drawing: forwards every argument
// unchanged to drawRectsWrapperDevice instantiated for Ncv8u and returns its status.
NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
                                 Ncv32u dstStride,
                                 Ncv32u dstWidth,
                                 Ncv32u dstHeight,
                                 NcvRect32u *d_rects,
                                 Ncv32u numRects,
                                 Ncv8u color,
                                 cudaStream_t cuStream)
{
    const NCVStatus drawStatus = drawRectsWrapperDevice<Ncv8u>(
        d_dst, dstStride, dstWidth, dstHeight,
        d_rects, numRects, color, cuStream);
    return drawStatus;
}
// 32-bit entry point for device-side rectangle drawing: forwards every argument
// unchanged to drawRectsWrapperDevice instantiated for Ncv32u and returns its status.
NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
                                  Ncv32u dstStride,
                                  Ncv32u dstWidth,
                                  Ncv32u dstHeight,
                                  NcvRect32u *d_rects,
                                  Ncv32u numRects,
                                  Ncv32u color,
                                  cudaStream_t cuStream)
{
    const NCVStatus drawStatus = drawRectsWrapperDevice<Ncv32u>(
        d_dst, dstStride, dstWidth, dstHeight,
        d_rects, numRects, color, cuStream);
    return drawStatus;
}
// Pipeline file
@ -1901,13 +1730,13 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
nppStat = nppiStDownsampleNearest_32u_C1R(
nppStat = nppiStDecimate_32u_C1R(
d_integralImage.ptr(), d_integralImage.pitch(),
d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
srcIIRoi, scale, true);
nppStat = nppiStDownsampleNearest_64u_C1R(
nppStat = nppiStDecimate_64u_C1R(
d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
srcIIRoi, scale, true);
@ -1969,7 +1798,7 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
Ncv32u numStrongHypothesesNow = dstNumRects;
ncvStat = ncvFilterHypotheses_host(
ncvStat = ncvGroupRectangles_host(
@ -2031,7 +1860,7 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
ncvStat = ncvFilterHypotheses_host(
ncvStat = ncvGroupRectangles_host(
@ -2285,133 +2114,6 @@ NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
// Groups detection hypotheses on the host: copies the first numHypotheses rectangles
// into a std::vector, runs groupRectangles on it, then writes the grouped rectangles
// (and, when requested, their weights) back and updates numHypotheses to the new count.
NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,
Ncv32u &numHypotheses,
Ncv32u minNeighbors,
Ncv32f intersectEps,
NCVVector<Ncv32u> *hypothesesWeights)
// Both vectors must reside in host memory (pageable or pinned).
ncvAssertReturn(hypotheses.memType() == NCVMemoryTypeHostPageable ||
hypotheses.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
if (hypothesesWeights != NULL)
ncvAssertReturn(hypothesesWeights->memType() == NCVMemoryTypeHostPageable ||
hypothesesWeights->memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
// NOTE(review): the statement guarded by this check is not visible in this view; the
// &rects[0] access below relies on numHypotheses being non-zero at that point.
if (numHypotheses == 0)
std::vector<NcvRect32u> rects(numHypotheses);
memcpy(&rects[0], hypotheses.ptr(), numHypotheses * sizeof(NcvRect32u));
std::vector<Ncv32u> weights;
// Weights are collected only when the caller supplied an output vector; the two calls
// below are presumably the alternative branches (the `else` is not visible here).
if (hypothesesWeights != NULL)
groupRectangles(rects, minNeighbors, intersectEps, &weights);
groupRectangles(rects, minNeighbors, intersectEps, NULL);
numHypotheses = (Ncv32u)rects.size();
// Copy the grouped results back over the caller's storage.
if (numHypotheses > 0)
memcpy(hypotheses.ptr(), &rects[0], numHypotheses * sizeof(NcvRect32u));
// NOTE(review): no visible check that hypothesesWeights can hold numHypotheses entries
// or that weights has that many elements - confirm against groupRectangles.
if (hypothesesWeights != NULL)
memcpy(hypothesesWeights->ptr(), &weights[0], numHypotheses * sizeof(Ncv32u));
// Host implementation of rectangle drawing: for every rectangle writes `color` along its
// left, right, top and bottom edges in h_dst, skipping edges and pixels that fall
// outside dstWidth x dstHeight.
template <class T>
static NCVStatus drawRectsWrapperHost(T *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *h_rects,
Ncv32u numRects,
T color)
ncvAssertReturn(h_dst != NULL && h_rects != NULL, NCV_NULL_PTR);
ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
// Presumably an early exit with NCV_SUCCESS when there is nothing to draw
// (ncvAssertReturn is defined elsewhere - confirm).
ncvAssertReturn(numRects != 0, NCV_SUCCESS);
ncvAssertReturn(numRects <= dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);
for (Ncv32u i=0; i<numRects; i++)
NcvRect32u rect = h_rects[i];
// Left edge. The inner loop index `i` shadows the rectangle index; `rect` has already
// been copied, so the outer loop is unaffected.
if (rect.x < dstWidth)
for (Ncv32u i=rect.y; i<rect.y+rect.height && i<dstHeight; i++)
h_dst[i*dstStride+rect.x] = color;
// Right edge.
if (rect.x+rect.width-1 < dstWidth)
for (Ncv32u i=rect.y; i<rect.y+rect.height && i<dstHeight; i++)
h_dst[i*dstStride+rect.x+rect.width-1] = color;
// Top edge.
if (rect.y < dstHeight)
for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)
h_dst[rect.y*dstStride+j] = color;
// Bottom edge.
if (rect.y + rect.height - 1 < dstHeight)
for (Ncv32u j=rect.x; j<rect.x+rect.width && j<dstWidth; j++)
h_dst[(rect.y+rect.height-1)*dstStride+j] = color;
// 8-bit entry point for host-side rectangle drawing: forwards every argument
// unchanged to drawRectsWrapperHost instantiated for Ncv8u and returns its status.
NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
                               Ncv32u dstStride,
                               Ncv32u dstWidth,
                               Ncv32u dstHeight,
                               NcvRect32u *h_rects,
                               Ncv32u numRects,
                               Ncv8u color)
{
    const NCVStatus drawStatus = drawRectsWrapperHost<Ncv8u>(
        h_dst, dstStride, dstWidth, dstHeight,
        h_rects, numRects, color);
    return drawStatus;
}
// 32-bit entry point for host-side rectangle drawing: forwards every argument
// unchanged to drawRectsWrapperHost instantiated for Ncv32u and returns its status.
NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
                                Ncv32u dstStride,
                                Ncv32u dstWidth,
                                Ncv32u dstHeight,
                                NcvRect32u *h_rects,
                                Ncv32u numRects,
                                Ncv32u color)
{
    const NCVStatus drawStatus = drawRectsWrapperHost<Ncv32u>(
        h_dst, dstStride, dstWidth, dstHeight,
        h_rects, numRects, color);
    return drawStatus;
}
NCVStatus loadFromXML(const std::string &filename,
HaarClassifierCascadeDescriptor &haar,
std::vector<HaarStage64> &haarStages,

@ -346,153 +346,107 @@ enum
NCVPipeObjDet_VisualizeInPlace = 0x004,
// Declaration of ncvDetectObjectsMultiScale_device without an export qualifier.
// Inputs are the source image and its ROI, the cascade descriptor with its stage
// (host and device), node and feature vectors, and the detection parameters;
// d_dstRects and dstNumRects are passed by non-const reference.
// The same parameter list is declared again directly below with NCV_EXPORTS.
NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
NcvSize32u srcRoi,
NCVVector<NcvRect32u> &d_dstRects,
Ncv32u &dstNumRects,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarStage64> &d_HaarStages,
NCVVector<HaarClassifierNode128> &d_HaarNodes,
NCVVector<HaarFeature64> &d_HaarFeatures,
NcvSize32u minObjSize,
Ncv32u minNeighbors, //default 4
Ncv32f scaleStep, //default 1.2f
Ncv32u pixelStep, //default 1
Ncv32u flags, //default NCVPipeObjDet_Default
INCVMemAllocator &gpuAllocator,
INCVMemAllocator &cpuAllocator,
cudaDeviceProp &devProp,
cudaStream_t cuStream);
// NCV_EXPORTS-qualified declaration of the same function; the parameter list is
// identical to the unqualified declaration above.
NCV_EXPORTS NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
NcvSize32u srcRoi,
NCVVector<NcvRect32u> &d_dstRects,
Ncv32u &dstNumRects,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarStage64> &d_HaarStages,
NCVVector<HaarClassifierNode128> &d_HaarNodes,
NCVVector<HaarFeature64> &d_HaarFeatures,
NcvSize32u minObjSize,
Ncv32u minNeighbors, //default 4
Ncv32f scaleStep, //default 1.2f
Ncv32u pixelStep, //default 1
Ncv32u flags, //default NCVPipeObjDet_Default
INCVMemAllocator &gpuAllocator,
INCVMemAllocator &cpuAllocator,
cudaDeviceProp &devProp,
cudaStream_t cuStream);
// Declaration of ncvApplyHaarClassifierCascade_device without an export qualifier.
// Compared with the _host declaration that follows, it additionally takes the
// device-side stage vector, both memory allocators, the device properties and a
// CUDA stream.
NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImage,
NCVMatrix<Ncv32f> &d_weights,
NCVMatrixAlloc<Ncv32u> &d_pixelMask,
Ncv32u &numDetections,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarStage64> &d_HaarStages,
NCVVector<HaarClassifierNode128> &d_HaarNodes,
NCVVector<HaarFeature64> &d_HaarFeatures,
NcvBool bMaskElements,
NcvSize32u anchorsRoi,
Ncv32u pixelStep,
Ncv32f scaleArea,
INCVMemAllocator &gpuAllocator,
INCVMemAllocator &cpuAllocator,
cudaDeviceProp &devProp,
cudaStream_t cuStream);
// Declaration of ncvApplyHaarClassifierCascade_host without an export qualifier;
// all matrices and vectors carry the h_ prefix and no allocator, device-property
// or stream arguments are taken.
NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
NCVMatrix<Ncv32f> &h_weights,
NCVMatrixAlloc<Ncv32u> &h_pixelMask,
Ncv32u &numDetections,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarClassifierNode128> &h_HaarNodes,
NCVVector<HaarFeature64> &h_HaarFeatures,
NcvBool bMaskElements,
NcvSize32u anchorsRoi,
Ncv32u pixelStep,
Ncv32f scaleArea);
// Declarations of the four rectangle-drawing entry points: 8-bit and 32-bit element
// types, each in a device (d_ pointers plus a CUDA stream) and a host (h_ pointers)
// variant. Every variant takes the destination buffer with its stride, width and
// height, the rectangle array with its count, and the color to write.
// NOTE(review): none of these carries NCV_EXPORTS and no export-qualified counterpart
// appears in the visible lines - confirm where they are declared after this change.
NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
Ncv8u color,
cudaStream_t cuStream);
NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *d_rects,
Ncv32u numRects,
Ncv32u color,
cudaStream_t cuStream);
NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *h_rects,
Ncv32u numRects,
Ncv8u color);
NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst,
Ncv32u dstStride,
Ncv32u dstWidth,
Ncv32u dstHeight,
NcvRect32u *h_rects,
Ncv32u numRects,
Ncv32u color);
// NCV_EXPORTS-qualified declaration of ncvApplyHaarClassifierCascade_device.
// Takes device-side integral image, weights and pixel mask, the cascade descriptor
// with host and device stage vectors plus device node/feature vectors, the
// per-call parameters, both allocators, the device properties and a CUDA stream;
// numDetections is passed by non-const reference.
NCV_EXPORTS NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &d_integralImage,
NCVMatrix<Ncv32f> &d_weights,
NCVMatrixAlloc<Ncv32u> &d_pixelMask,
Ncv32u &numDetections,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarStage64> &d_HaarStages,
NCVVector<HaarClassifierNode128> &d_HaarNodes,
NCVVector<HaarFeature64> &d_HaarFeatures,
NcvBool bMaskElements,
NcvSize32u anchorsRoi,
Ncv32u pixelStep,
Ncv32f scaleArea,
INCVMemAllocator &gpuAllocator,
INCVMemAllocator &cpuAllocator,
cudaDeviceProp &devProp,
cudaStream_t cuStream);
// NCV_EXPORTS-qualified declaration of ncvApplyHaarClassifierCascade_host: the same
// per-call parameters, with host-side (h_) matrices and vectors only.
NCV_EXPORTS NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
NCVMatrix<Ncv32f> &h_weights,
NCVMatrixAlloc<Ncv32u> &h_pixelMask,
Ncv32u &numDetections,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarClassifierNode128> &h_HaarNodes,
NCVVector<HaarFeature64> &h_HaarFeatures,
NcvBool bMaskElements,
NcvSize32u anchorsRoi,
Ncv32u pixelStep,
Ncv32f scaleArea);
// Declaration of ncvGrowDetectionsVector_device without an export qualifier; it differs
// from the _host declaration below only by the trailing CUDA stream argument.
// totalDetections is passed by non-const reference.
NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
Ncv32u numPixelMaskDetections,
NCVVector<NcvRect32u> &hypotheses,
Ncv32u &totalDetections,
Ncv32u totalMaxDetections,
Ncv32u rectWidth,
Ncv32u rectHeight,
Ncv32f curScale,
cudaStream_t cuStream);
// Host counterpart of the declaration above (no stream argument).
NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
Ncv32u numPixelMaskDetections,
NCVVector<NcvRect32u> &hypotheses,
Ncv32u &totalDetections,
Ncv32u totalMaxDetections,
Ncv32u rectWidth,
Ncv32u rectHeight,
Ncv32f curScale);
// Declaration of ncvFilterHypotheses_host; hypothesesWeights is a pointer and may be
// NULL according to the definition shown earlier in this diff.
// NOTE(review): no NCV_EXPORTS counterpart of this declaration appears in the visible
// lines, while call sites in this diff also show ncvGroupRectangles_host - presumably
// the function was renamed or moved; confirm.
NCVStatus ncvFilterHypotheses_host(NCVVector<NcvRect32u> &hypotheses,
Ncv32u &numHypotheses,
Ncv32u minNeighbors,
Ncv32f intersectEps,
NCVVector<Ncv32u> *hypothesesWeights);
// Declaration of ncvHaarGetClassifierSize: takes a file name and three Ncv32u
// references (numStages, numNodes, numFeatures).
NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,
Ncv32u &numNodes, Ncv32u &numFeatures);
// Declaration of ncvHaarLoadFromFile_host: takes a file name, the cascade descriptor by
// reference and the host stage/node/feature vectors.
NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarClassifierNode128> &h_HaarNodes,
NCVVector<HaarFeature64> &h_HaarFeatures);
// Declaration of ncvHaarStoreNVBIN_host: same arguments as the loader, except that the
// cascade descriptor is taken by value.
NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename,
HaarClassifierCascadeDescriptor haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarClassifierNode128> &h_HaarNodes,
NCVVector<HaarFeature64> &h_HaarFeatures);
// NCV_EXPORTS-qualified declaration of ncvGrowDetectionsVector_device; it differs from
// the _host declaration below only by the trailing CUDA stream argument.
// totalDetections is passed by non-const reference.
NCV_EXPORTS NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
Ncv32u numPixelMaskDetections,
NCVVector<NcvRect32u> &hypotheses,
Ncv32u &totalDetections,
Ncv32u totalMaxDetections,
Ncv32u rectWidth,
Ncv32u rectHeight,
Ncv32f curScale,
cudaStream_t cuStream);
// NCV_EXPORTS-qualified host counterpart (no stream argument).
NCV_EXPORTS NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
Ncv32u numPixelMaskDetections,
NCVVector<NcvRect32u> &hypotheses,
Ncv32u &totalDetections,
Ncv32u totalMaxDetections,
Ncv32u rectWidth,
Ncv32u rectHeight,
Ncv32f curScale);
// NCV_EXPORTS-qualified declaration of ncvHaarGetClassifierSize: takes a file name and
// three Ncv32u references (numStages, numNodes, numFeatures).
NCV_EXPORTS NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages,
Ncv32u &numNodes, Ncv32u &numFeatures);
// NCV_EXPORTS-qualified declaration of ncvHaarLoadFromFile_host: takes a file name, the
// cascade descriptor by reference and the host stage/node/feature vectors.
NCV_EXPORTS NCVStatus ncvHaarLoadFromFile_host(const std::string &filename,
HaarClassifierCascadeDescriptor &haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarClassifierNode128> &h_HaarNodes,
NCVVector<HaarFeature64> &h_HaarFeatures);
// NCV_EXPORTS-qualified declaration of ncvHaarStoreNVBIN_host: same arguments as the
// loader, except that the cascade descriptor is taken by value.
NCV_EXPORTS NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename,
HaarClassifierCascadeDescriptor haar,
NCVVector<HaarStage64> &h_HaarStages,
NCVVector<HaarClassifierNode128> &h_HaarNodes,
NCVVector<HaarFeature64> &h_HaarFeatures);

@ -44,10 +44,6 @@
#include <cuda_runtime.h>
#include "NPP_staging.hpp"
#if defined _SELF_TEST_
#include <stdio.h>
texture<Ncv8u, 1, cudaReadModeElementType> tex8u;
texture<Ncv32u, 1, cudaReadModeElementType> tex32u;
@ -161,12 +157,6 @@ const Ncv32u NUM_SCAN_THREADS = 256;
const Ncv32u LOG2_NUM_SCAN_THREADS = 8;
// Empty tag types used as compile-time true/false markers.
struct T_true {};
struct T_false {};
// Primary template: for two different types, is_same derives from T_false.
template <typename T, typename U> struct is_same : T_false {};
// Partial specialization: when both arguments are the same type, is_same derives from T_true.
template <typename T> struct is_same<T, T> : T_true {};
template<class T_in, class T_out>
struct _scanElemOp
@ -175,13 +165,16 @@ struct _scanElemOp
return scanElemOp( elem, Int2Type<(int)tbDoSqr>() );
template <int v> struct Int2Type { enum { value = v }; };
static inline __host__ __device__ T_out scanElemOp(T_in elem, Int2Type<0>)
return (T_out)elem;
static inline __host__ __device__ T_out scanElemOp(T_in elem, Int2Type<1>)
return (T_out)(elem*elem);
@ -190,25 +183,25 @@ private:
template<class T>
inline __device__ T readElem(T *d_src, Ncv32u srcStride, Ncv32u curElemOffs);
inline __device__ T readElem(T *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs);
inline __device__ Ncv8u readElem<Ncv8u>(Ncv8u *d_src, Ncv32u srcStride, Ncv32u curElemOffs)
inline __device__ Ncv8u readElem<Ncv8u>(Ncv8u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
return tex1Dfetch(tex8u, srcStride * blockIdx.x + curElemOffs);
return tex1Dfetch(tex8u, texOffs + srcStride * blockIdx.x + curElemOffs);
inline __device__ Ncv32u readElem<Ncv32u>(Ncv32u *d_src, Ncv32u srcStride, Ncv32u curElemOffs)
inline __device__ Ncv32u readElem<Ncv32u>(Ncv32u *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
return d_src[curElemOffs];
inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u srcStride, Ncv32u curElemOffs)
inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u texOffs, Ncv32u srcStride, Ncv32u curElemOffs)
return d_src[curElemOffs];
@ -233,7 +226,7 @@ inline __device__ Ncv32f readElem<Ncv32f>(Ncv32f *d_src, Ncv32u srcStride, Ncv32
* \return None
template <class T_in, class T_out, bool tbDoSqr>
__global__ void scanRows(T_in *d_src, Ncv32u srcWidth, Ncv32u srcStride,
__global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u srcStride,
T_out *d_II, Ncv32u IIstride)
//advance pointers to the current line
@ -263,7 +256,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u srcWidth, Ncv32u srcStride,
if (curElemOffs < srcWidth)
//load elements
curElem = readElem<T_in>(d_src, srcStride, curElemOffs);
curElem = readElem<T_in>(d_src, texOffs, srcStride, curElemOffs);
curElemMod = _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(curElem);
@ -298,55 +291,28 @@ NCVStatus scanRowsWrapperDevice(T_in *d_src, Ncv32u srcStride,
T_out *d_dst, Ncv32u dstStride, NcvSize32u roi)
cudaChannelFormatDesc cfdTex;
size_t alignmentOffset = 0;
if (sizeof(T_in) == 1)
cfdTex = cudaCreateChannelDesc<Ncv8u>();
size_t alignmentOffset;
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
if (alignmentOffset > 0)
ncvAssertCUDAReturn(cudaUnbindTexture(tex8u), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, alignmentOffset + roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR);
<T_in, T_out, tbDoSqr>
<<<roi.height, NUM_SCAN_THREADS, 0, nppStGetActiveCUDAstream()>>>
(d_src, roi.width, srcStride, d_dst, dstStride);
(d_src, (Ncv32u)alignmentOffset, roi.width, srcStride, d_dst, dstStride);
#if defined _SELF_TEST_
T_in *h_src;
T_out *h_dst;
ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStride * roi.height * sizeof(T_in)), NPPST_MEM_ALLOC_ERR);
ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * roi.height * sizeof(T_out)), NPPST_MEM_ALLOC_ERR);
memset(h_src, 0, srcStride * roi.height * sizeof(T_in));
memset(h_dst, 0, dstStride * roi.height * sizeof(T_out));
ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStride * roi.height * sizeof(T_in), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * roi.height * sizeof(T_out), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
NcvBool bPass = true;
for (Ncv32u i=0; i<roi.height && bPass; i++)
T_out curElem = 0;
for (Ncv32u j=0; j<roi.width+1 && bPass; j++)
if (curElem != h_dst[i * dstStride + j])
printf("CIntegralImage::scanRowsWrapperDevice self test failed: i=%d, j=%d, cpu=%d, gpu=%d\n", i, j, curElem, h_dst[i * dstStride + j]);
bPass = false;
if (j < roi.width)
curElem += scanElemOp<T_op>(h_src[i*srcStride+j]);
ncvAssertCUDAReturn(cudaFreeHost(h_src), NPPST_MEMFREE_ERR);
ncvAssertCUDAReturn(cudaFreeHost(h_dst), NPPST_MEMFREE_ERR);
printf("CIntegralImage::scanRowsWrapperDevice %s\n", bPass?"PASSED":"FAILED");
Ncv32u getPaddedDimension(Ncv32u dim, Ncv32u elemTypeSize, Ncv32u allocatorAlignment)
static Ncv32u getPaddedDimension(Ncv32u dim, Ncv32u elemTypeSize, Ncv32u allocatorAlignment)
Ncv32u alignMask = allocatorAlignment-1;
Ncv32u inverseAlignMask = ~alignMask;
@ -676,7 +642,7 @@ NCVStatus nppiStSqrIntegral_8u64u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
// DownsampleNearest.cu
// Decimate.cu
@ -686,25 +652,25 @@ const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_Y = 8;
template<class T, NcvBool tbCacheTexture>
__device__ T getElem_DownsampleNearest(Ncv32u x, T *d_src);
__device__ T getElem_Decimate(Ncv32u x, T *d_src);
__device__ Ncv32u getElem_DownsampleNearest<Ncv32u, true>(Ncv32u x, Ncv32u *d_src)
__device__ Ncv32u getElem_Decimate<Ncv32u, true>(Ncv32u x, Ncv32u *d_src)
return tex1Dfetch(tex32u, x);
__device__ Ncv32u getElem_DownsampleNearest<Ncv32u, false>(Ncv32u x, Ncv32u *d_src)
__device__ Ncv32u getElem_Decimate<Ncv32u, false>(Ncv32u x, Ncv32u *d_src)
return d_src[x];
__device__ Ncv64u getElem_DownsampleNearest<Ncv64u, true>(Ncv32u x, Ncv64u *d_src)
__device__ Ncv64u getElem_Decimate<Ncv64u, true>(Ncv32u x, Ncv64u *d_src)
uint2 tmp = tex1Dfetch(tex64u, x);
Ncv64u res = (Ncv64u)tmp.y;
@ -715,14 +681,14 @@ __device__ Ncv64u getElem_DownsampleNearest<Ncv64u, true>(Ncv32u x, Ncv64u *d_sr
__device__ Ncv64u getElem_DownsampleNearest<Ncv64u, false>(Ncv32u x, Ncv64u *d_src)
__device__ Ncv64u getElem_Decimate<Ncv64u, false>(Ncv32u x, Ncv64u *d_src)
return d_src[x];
template <class T, NcvBool tbCacheTexture>
__global__ void downsampleNearest_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep,
__global__ void decimate_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep,
NcvSize32u dstRoi, Ncv32u scale)
int curX = blockIdx.x * blockDim.x + threadIdx.x;
@ -733,12 +699,12 @@ __global__ void downsampleNearest_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u
d_dst[curY * dstStep + curX] = getElem_DownsampleNearest<T, tbCacheTexture>((curY * srcStep + curX) * scale, d_src);
d_dst[curY * dstStep + curX] = getElem_Decimate<T, tbCacheTexture>((curY * srcStep + curX) * scale, d_src);
template <class T>
static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
static NCVStatus decimateWrapperDevice(T *d_src, Ncv32u srcStep,
T *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture)
@ -761,7 +727,7 @@ static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
if (!readThruTexture)
<T, false>
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
(d_src, srcStep, d_dst, dstStep, dstRoi, scale);
@ -787,7 +753,7 @@ static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
<T, true>
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
(d_src, srcStep, d_dst, dstStep, dstRoi, scale);
@ -795,39 +761,12 @@ static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
// Self-test section: copies the source and destination images to pinned host memory
// and checks that every destination element (i, j) equals the source element at
// (i * scale, j * scale), stopping at the first mismatch.
#if defined _SELF_TEST_
T *h_src;
T *h_dst;
ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStep * srcRoi.height * sizeof(T)), NPPST_MEM_ALLOC_ERR);
ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStep * dstRoi.height * sizeof(T)), NPPST_MEM_ALLOC_ERR);
ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStep * srcRoi.height * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStep * dstRoi.height * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
bool bPass = true;
for (Ncv32u i=0; i<dstRoi.height && bPass; i++)
for (Ncv32u j=0; j<dstRoi.width && bPass; j++)
if (h_dst[i*dstStep+j] != h_src[i*scale*srcStep + j*scale])
// NOTE(review): the values are cast to long long but printed with %ld; %lld would match
// the cast. i and j are Ncv32u but printed with %d.
printf("::downsampleNearestWrapperDevice self test failed: i=%d, j=%d, cpu=%ld, gpu=%ld\n", i, j, (long long)h_src[i*scale*srcStep + j*scale], (long long)h_dst[i*dstStep+j]);
bPass = false;
// Release the pinned host buffers and report the overall result.
ncvAssertCUDAReturn(cudaFreeHost(h_src), NPPST_MEMFREE_ERR);
ncvAssertCUDAReturn(cudaFreeHost(h_dst), NPPST_MEMFREE_ERR);
printf("::downsampleNearestWrapperDevice %s\n", bPass?"PASSED":"FAILED");
template <class T>
static NCVStatus downsampleNearestWrapperHost(T *h_src, Ncv32u srcStep,
static NCVStatus decimateWrapperHost(T *h_src, Ncv32u srcStep,
T *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale)
@ -856,40 +795,40 @@ static NCVStatus downsampleNearestWrapperHost(T *h_src, Ncv32u srcStep,
#define implementNppDownsampleNearest(bit, typ) \
NCVStatus nppiStDownsampleNearest_##bit##typ##_C1R(Ncv##bit##typ *d_src, Ncv32u srcStep, \
#define implementNppDecimate(bit, typ) \
NCVStatus nppiStDecimate_##bit##typ##_C1R(Ncv##bit##typ *d_src, Ncv32u srcStep, \
Ncv##bit##typ *d_dst, Ncv32u dstStep, \
NcvSize32u srcRoi, Ncv32u scale, NcvBool readThruTexture) \
{ \
return downsampleNearestWrapperDevice<Ncv##bit##u>((Ncv##bit##u *)d_src, srcStep, \
return decimateWrapperDevice<Ncv##bit##u>((Ncv##bit##u *)d_src, srcStep, \
(Ncv##bit##u *)d_dst, dstStep, \
srcRoi, scale, readThruTexture); \
#define implementNppDownsampleNearestHost(bit, typ) \
NCVStatus nppiStDownsampleNearest_##bit##typ##_C1R_host(Ncv##bit##typ *h_src, Ncv32u srcStep, \
#define implementNppDecimateHost(bit, typ) \
NCVStatus nppiStDecimate_##bit##typ##_C1R_host(Ncv##bit##typ *h_src, Ncv32u srcStep, \
Ncv##bit##typ *h_dst, Ncv32u dstStep, \
NcvSize32u srcRoi, Ncv32u scale) \
{ \
return downsampleNearestWrapperHost<Ncv##bit##u>((Ncv##bit##u *)h_src, srcStep, \
return decimateWrapperHost<Ncv##bit##u>((Ncv##bit##u *)h_src, srcStep, \
(Ncv##bit##u *)h_dst, dstStep, \
srcRoi, scale); \
implementNppDownsampleNearest(32, u)
implementNppDownsampleNearest(32, s)
implementNppDownsampleNearest(32, f)
implementNppDownsampleNearest(64, u)
implementNppDownsampleNearest(64, s)
implementNppDownsampleNearest(64, f)
implementNppDownsampleNearestHost(32, u)
implementNppDownsampleNearestHost(32, s)
implementNppDownsampleNearestHost(32, f)
implementNppDownsampleNearestHost(64, u)
implementNppDownsampleNearestHost(64, s)
implementNppDownsampleNearestHost(64, f)
implementNppDecimate(32, u)
implementNppDecimate(32, s)
implementNppDecimate(32, f)
implementNppDecimate(64, u)
implementNppDecimate(64, s)
implementNppDecimate(64, f)
implementNppDecimateHost(32, u)
implementNppDecimateHost(32, s)
implementNppDecimateHost(32, f)
implementNppDecimateHost(64, u)
implementNppDecimateHost(64, s)
implementNppDecimateHost(64, f)
@ -1051,46 +990,6 @@ NCVStatus nppiStRectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep,
// Self-test section: copies the sum / squared-sum inputs and the device result to pinned
// host memory, recomputes the result with the host implementation, and compares the two
// using a relative error threshold.
#if defined _SELF_TEST_
Ncv32u *h_sum;
Ncv64u *h_sqsum;
Ncv32f *h_norm_d;
// The input images are read over roi.height plus the rectangle's vertical extent.
Ncv32u ExtHeight = roi.height + rect.y + rect.height;
ncvAssertCUDAReturn(cudaMallocHost(&h_sum, sumStep * ExtHeight * sizeof(Ncv32u)), NPPST_MEM_ALLOC_ERR);
ncvAssertCUDAReturn(cudaMallocHost(&h_sqsum, sqsumStep * ExtHeight * sizeof(Ncv64u)), NPPST_MEM_ALLOC_ERR);
// NOTE(review): h_norm_d is Ncv32f* but is sized with sizeof(Ncv32u), while the copy
// below uses sizeof(Ncv32f); harmless only if both types have the same size - confirm.
ncvAssertCUDAReturn(cudaMallocHost(&h_norm_d, normStep * roi.height * sizeof(Ncv32u)), NPPST_MEM_ALLOC_ERR);
ncvAssertCUDAReturn(cudaMemcpy(h_sum, d_sum, sumStep * ExtHeight * sizeof(Ncv32u), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_sqsum, d_sqsum, sqsumStep * ExtHeight * sizeof(Ncv64u), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_norm_d, d_norm, normStep * roi.height * sizeof(Ncv32f), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
Ncv32f *h_norm_h;
ncvAssertCUDAReturn(cudaMallocHost(&h_norm_h, normStep * roi.height * sizeof(Ncv32u)), NPPST_MEM_ALLOC_ERR);
// NOTE(review): sqsumStep is passed right after h_sum, although h_sum was allocated and
// copied using sumStep; this looks like it should be sumStep - confirm against the
// signature of nppRectStdDev_32f_C1R_host.
ncvAssertReturnNcvStat(nppRectStdDev_32f_C1R_host(h_sum, sqsumStep, h_sqsum, sqsumStep, h_norm_h, normStep, roi, rect, scaleArea));
// Maximum accepted relative difference between host and device results.
const Ncv64f relEPS = 0.005;
bool bPass = true;
for (Ncv32u i=0; i<roi.height && bPass; i++)
for (Ncv32u j=0; j<roi.width && bPass; j++)
Ncv64f absErr = fabs(h_norm_h[i * normStep + j] - h_norm_d[i * normStep + j]);
// NOTE(review): divides by the host value without a zero check.
Ncv64f relErr = absErr / h_norm_h[i * normStep + j];
if (relErr > relEPS)
printf("::ncvRectStdDev_32f_C1R self test failed: i=%d, j=%d, cpu=%f, gpu=%f\n", i, j, h_norm_h[i * normStep + j], h_norm_d[i * normStep + j]);
bPass = false;
// Release the pinned host buffers and report the overall result.
ncvAssertCUDAReturn(cudaFreeHost(h_sum), NPPST_MEMFREE_ERR);
ncvAssertCUDAReturn(cudaFreeHost(h_sqsum), NPPST_MEMFREE_ERR);
ncvAssertCUDAReturn(cudaFreeHost(h_norm_d), NPPST_MEMFREE_ERR);
ncvAssertCUDAReturn(cudaFreeHost(h_norm_h), NPPST_MEMFREE_ERR);
printf("::ncvRectStdDev_32f_C1R %s\n", bPass?"PASSED":"FAILED");
@ -1251,34 +1150,6 @@ NCVStatus transposeWrapperDevice(T *d_src, Ncv32u srcStride,
(d_src, srcStride, d_dst, dstStride, srcRoi);
// Self-test section: copies source and destination (sized up to whole tiles) to pinned
// host memory and checks that source element (i, j) equals destination element (j, i)
// over the source ROI, stopping at the first mismatch.
#if defined _SELF_TEST_
// Buffer extents rounded up to the launched grid, in units of TRANSPOSE_TILE_DIM.
Ncv32u widthExt = grid.x * TRANSPOSE_TILE_DIM;
Ncv32u heightExt = grid.y * TRANSPOSE_TILE_DIM;
T *h_src;
T *h_dst;
ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStride * heightExt * sizeof(T)), NPPST_MEM_ALLOC_ERR);
ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * widthExt * sizeof(T)), NPPST_MEM_ALLOC_ERR);
memset(h_src, 0, srcStride * heightExt * sizeof(T));
memset(h_dst, 0, dstStride * widthExt * sizeof(T));
// NOTE(review): these copies read heightExt / widthExt rows from the device buffers,
// which assumes the device allocations extend to whole tiles - confirm.
ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStride * heightExt * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * widthExt * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
NcvBool bPass = true;
for (Ncv32u i=0; i<srcRoi.height && bPass; i++)
for (Ncv32u j=0; j<srcRoi.width && bPass; j++)
if (h_src[i * srcStride + j] != h_dst[j * dstStride + i])
// NOTE(review): the message prints h_src[j * srcStride + i] and h_dst[i * dstStride + j],
// i.e. the indices swapped relative to the elements compared above, and uses %d for
// values of type T.
printf("CIntegralImage::transposeWrapperDevice self test failed: i=%d, j=%d, cpu=%d, gpu=%d\n", i, j, h_src[j * srcStride + i], h_dst[i * dstStride + j]);
bPass = false;
// Release the pinned host buffers and report the overall result.
ncvAssertCUDAReturn(cudaFreeHost(h_src), NPPST_MEMFREE_ERR);
ncvAssertCUDAReturn(cudaFreeHost(h_dst), NPPST_MEMFREE_ERR);
printf("CIntegralImage::transposeWrapperDevice %s\n", bPass?"PASSED":"FAILED");
@ -1341,6 +1212,20 @@ implementNppTransposeHost(64,s)
// Device transpose for 128-bit elements: reinterprets both untyped buffers as uint4
// arrays and forwards to transposeWrapperDevice<uint4>, returning its status.
NCVStatus nppiStTranspose_128_C1R(void *d_src, Ncv32u srcStep,
                                  void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi)
{
    uint4 *src128 = (uint4 *)d_src;
    uint4 *dst128 = (uint4 *)d_dst;
    return transposeWrapperDevice<uint4>(src128, srcStep, dst128, dstStep, srcRoi);
}
// Host transpose for 128-bit elements: reinterprets both untyped buffers as uint4
// arrays and forwards to transposeWrapperHost<uint4>, returning its status.
NCVStatus nppiStTranspose_128_C1R_host(void *d_src, Ncv32u srcStep,
                                       void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi)
{
    uint4 *src128 = (uint4 *)d_src;
    uint4 *dst128 = (uint4 *)d_dst;
    return transposeWrapperHost<uint4>(src128, srcStep, dst128, dstStep, srcRoi);
}
// Compact.cu

@ -96,65 +96,65 @@ cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream);
* \return NCV status code
NCVStatus nppiStDownsampleNearest_32u_C1R(Ncv32u *d_src, Ncv32u srcStep,
Ncv32u *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
NCVStatus nppiStDecimate_32u_C1R(Ncv32u *d_src, Ncv32u srcStep,
Ncv32u *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit signed pixels, single channel.
* \see nppiStDownsampleNearest_32u_C1R
* \see nppiStDecimate_32u_C1R
NCVStatus nppiStDownsampleNearest_32s_C1R(Ncv32s *d_src, Ncv32u srcStep,
Ncv32s *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
NCVStatus nppiStDecimate_32s_C1R(Ncv32s *d_src, Ncv32u srcStep,
Ncv32s *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit float pixels, single channel.
* \see nppiStDownsampleNearest_32u_C1R
* \see nppiStDecimate_32u_C1R
NCVStatus nppiStDownsampleNearest_32f_C1R(Ncv32f *d_src, Ncv32u srcStep,
Ncv32f *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
NCVStatus nppiStDecimate_32f_C1R(Ncv32f *d_src, Ncv32u srcStep,
Ncv32f *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit unsigned pixels, single channel.
* \see nppiStDownsampleNearest_32u_C1R
* \see nppiStDecimate_32u_C1R
NCVStatus nppiStDownsampleNearest_64u_C1R(Ncv64u *d_src, Ncv32u srcStep,
Ncv64u *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
NCVStatus nppiStDecimate_64u_C1R(Ncv64u *d_src, Ncv32u srcStep,
Ncv64u *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit signed pixels, single channel.
* \see nppiStDownsampleNearest_32u_C1R
* \see nppiStDecimate_32u_C1R
NCVStatus nppiStDownsampleNearest_64s_C1R(Ncv64s *d_src, Ncv32u srcStep,
Ncv64s *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
NCVStatus nppiStDecimate_64s_C1R(Ncv64s *d_src, Ncv32u srcStep,
Ncv64s *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit float pixels, single channel.
* \see nppiStDownsampleNearest_32u_C1R
* \see nppiStDecimate_32u_C1R
NCVStatus nppiStDownsampleNearest_64f_C1R(Ncv64f *d_src, Ncv32u srcStep,
Ncv64f *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
NCVStatus nppiStDecimate_64f_C1R(Ncv64f *d_src, Ncv32u srcStep,
Ncv64f *d_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale,
NcvBool readThruTexture);
@ -170,59 +170,59 @@ NCVStatus nppiStDownsampleNearest_64f_C1R(Ncv64f *d_src, Ncv32u srcStep,
* \return NCV status code
NCVStatus nppiStDownsampleNearest_32u_C1R_host(Ncv32u *h_src, Ncv32u srcStep,
Ncv32u *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
NCVStatus nppiStDecimate_32u_C1R_host(Ncv32u *h_src, Ncv32u srcStep,
Ncv32u *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit signed pixels, single channel. Host implementation.
* \see nppiStDownsampleNearest_32u_C1R_host
* \see nppiStDecimate_32u_C1R_host
NCVStatus nppiStDownsampleNearest_32s_C1R_host(Ncv32s *h_src, Ncv32u srcStep,
Ncv32s *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
NCVStatus nppiStDecimate_32s_C1R_host(Ncv32s *h_src, Ncv32u srcStep,
Ncv32s *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
* Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit float pixels, single channel. Host implementation.
* \see nppiStDownsampleNearest_32u_C1R_host
* \see nppiStDecimate_32u_C1R_host
NCVStatus nppiStDownsampleNearest_32f_C1R_host(Ncv32f *h_src, Ncv32u srcStep,
Ncv32f *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
NCVStatus nppiStDecimate_32f_C1R_host(Ncv32f *h_src, Ncv32u srcStep,
Ncv32f *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit unsigned pixels, single channel. Host implementation.
* \see nppiStDownsampleNearest_32u_C1R_host
* \see nppiStDecimate_32u_C1R_host
NCVStatus nppiStDownsampleNearest_64u_C1R_host(Ncv64u *h_src, Ncv32u srcStep,
Ncv64u *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
NCVStatus nppiStDecimate_64u_C1R_host(Ncv64u *h_src, Ncv32u srcStep,
Ncv64u *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit signed pixels, single channel. Host implementation.
* \see nppiStDownsampleNearest_32u_C1R_host
* \see nppiStDecimate_32u_C1R_host
NCVStatus nppiStDownsampleNearest_64s_C1R_host(Ncv64s *h_src, Ncv32u srcStep,
Ncv64s *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
NCVStatus nppiStDecimate_64s_C1R_host(Ncv64s *h_src, Ncv32u srcStep,
Ncv64s *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit float pixels, single channel. Host implementation.
* \see nppiStDownsampleNearest_32u_C1R_host
* \see nppiStDecimate_32u_C1R_host
NCVStatus nppiStDownsampleNearest_64f_C1R_host(Ncv64f *h_src, Ncv32u srcStep,
Ncv64f *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
NCVStatus nppiStDecimate_64f_C1R_host(Ncv64f *h_src, Ncv32u srcStep,
Ncv64f *h_dst, Ncv32u dstStep,
NcvSize32u srcRoi, Ncv32u scale);
@ -333,6 +333,15 @@ NCVStatus nppiStTranspose_64f_C1R(Ncv64f *d_src, Ncv32u srcStride,
Ncv64f *d_dst, Ncv32u dstStride, NcvSize32u srcRoi);
* Transposes an image. 128-bit pixels of any type, single channel
* \see nppiStTranspose_32u_C1R
NCVStatus nppiStTranspose_128_C1R(void *d_src, Ncv32u srcStep,
void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi);
* Transposes an image. 32-bit unsigned pixels, single channel. Host implementation
@ -394,6 +403,15 @@ NCVStatus nppiStTranspose_64f_C1R_host(Ncv64f *h_src, Ncv32u srcStride,
Ncv64f *h_dst, Ncv32u dstStride, NcvSize32u srcRoi);
* Transposes an image. 128-bit pixels of any type, single channel. Host implementation
* \see nppiStTranspose_32u_C1R_host
NCVStatus nppiStTranspose_128_C1R_host(void *d_src, Ncv32u srcStep,
void *d_dst, Ncv32u dstStep, NcvSize32u srcRoi);
* Calculates the size of the temporary buffer for integral image creation

@ -40,14 +40,9 @@
#if !defined (HAVE_CUDA)
#else /* !defined (HAVE_CUDA) */
#include <ios>
#include <stdarg.h>
#include <vector>
#include "NCV.hpp"
@ -182,6 +177,78 @@ NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType, const void *src, NC
/**
 * Copy dispatcher for pitched 2D regions.
 *
 * Copies `height` rows of `widthbytes` bytes each from `src` to `dst`. Consecutive rows are
 * `srcPitch` / `dstPitch` bytes apart. The transfer routine is selected from the memory
 * types of both sides:
 *   - host -> host: row-by-row memcpy
 *   - any combination involving device memory: cudaMemcpy2DAsync on `cuStream` when the
 *     stream is non-zero, blocking cudaMemcpy2D otherwise
 *
 * \param dst        destination buffer
 * \param dstPitch   distance in bytes between destination rows
 * \param dstType    memory type of the destination
 * \param src        source buffer
 * \param srcPitch   distance in bytes between source rows
 * \param srcType    memory type of the source
 * \param widthbytes number of bytes copied per row
 * \param height     number of rows
 * \param cuStream   CUDA stream for the asynchronous path; 0 selects the blocking copy
 *
 * \return NCV_SUCCESS when the copy was issued, NCV_MEM_RESIDENCE_ERROR when either memory
 *         type is neither host (pageable/pinned) nor device; a failing CUDA call is
 *         reported through ncvAssertCUDAReturn with NCV_CUDA_ERROR.
 */
NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
                             const void *src, Ncv32u srcPitch, NCVMemoryType srcType,
                             Ncv32u widthbytes, Ncv32u height, cudaStream_t cuStream)
{
    const bool dstOnHost   = (dstType == NCVMemoryTypeHostPageable || dstType == NCVMemoryTypeHostPinned);
    const bool srcOnHost   = (srcType == NCVMemoryTypeHostPageable || srcType == NCVMemoryTypeHostPinned);
    const bool dstOnDevice = (dstType == NCVMemoryTypeDevice);
    const bool srcOnDevice = (srcType == NCVMemoryTypeDevice);

    //anything that is neither host nor device memory cannot be copied
    if (!(dstOnHost || dstOnDevice) || !(srcOnHost || srcOnDevice))
    {
        return NCV_MEM_RESIDENCE_ERROR;
    }

    if (dstOnHost && srcOnHost)
    {
        //advance byte pointers instead of multiplying 32-bit values, so large images cannot wrap
        char *dstRow = (char *)dst;
        const char *srcRow = (const char *)src;
        for (Ncv32u i = 0; i < height; i++)
        {
            memcpy(dstRow, srcRow, widthbytes);
            dstRow += dstPitch;
            srcRow += srcPitch;
        }
        return NCV_SUCCESS;
    }

    //at least one side lives on the device: pick the matching transfer direction
    cudaMemcpyKind copyKind;
    if (dstOnHost)
    {
        copyKind = cudaMemcpyDeviceToHost;
    }
    else if (srcOnHost)
    {
        copyKind = cudaMemcpyHostToDevice;
    }
    else
    {
        copyKind = cudaMemcpyDeviceToDevice;
    }

    //exactly one copy is issued: asynchronous on a user stream, blocking otherwise
    if (cuStream != 0)
    {
        ncvAssertCUDAReturn(cudaMemcpy2DAsync(dst, dstPitch, src, srcPitch, widthbytes, height, copyKind, cuStream), NCV_CUDA_ERROR);
    }
    else
    {
        ncvAssertCUDAReturn(cudaMemcpy2D(dst, dstPitch, src, srcPitch, widthbytes, height, copyKind), NCV_CUDA_ERROR);
    }

    return NCV_SUCCESS;
}
// NCVMemStackAllocator class members implementation
@ -195,8 +262,10 @@ NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment)
NcvBool bProperAlignment = (alignment & (alignment-1)) == 0;
ncvAssertPrintCheck(bProperAlignment, "NCVMemStackAllocator ctor:: alignment not power of 2");
@ -573,4 +642,264 @@ double ncvEndQueryTimerMs(NcvTimer t)
return res;
#endif /* !defined (HAVE_CUDA) */
// Operations with rectangles
//from OpenCV
void groupRectangles(std::vector<NcvRect32u> &hypotheses, int groupThreshold, double eps, std::vector<Ncv32u> *weights);
/**
 * Groups the first `numHypotheses` rectangles of `hypotheses` with groupRectangles()
 * and writes the grouped rectangles back in place.
 *
 * \param hypotheses        host vector (pageable or pinned) holding the rectangles; overwritten with the result
 * \param numHypotheses     in: number of valid rectangles; out: number of rectangles after grouping
 * \param minNeighbors      forwarded to groupRectangles() as its group threshold
 * \param intersectEps      forwarded to groupRectangles() as its eps
 * \param hypothesesWeights optional host vector receiving the weights produced by groupRectangles(); may be NULL
 *
 * \return NCV_SUCCESS on success, NCV_MEM_RESIDENCE_ERROR when a vector is not in host memory,
 *         NCV_DIMENSIONS_INVALID when `numHypotheses` exceeds the length of `hypotheses`
 */
NCVStatus ncvGroupRectangles_host(NCVVector<NcvRect32u> &hypotheses,
                                  Ncv32u &numHypotheses,
                                  Ncv32u minNeighbors,
                                  Ncv32f intersectEps,
                                  NCVVector<Ncv32u> *hypothesesWeights)
{
    ncvAssertReturn(hypotheses.memType() == NCVMemoryTypeHostPageable ||
                    hypotheses.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
    if (hypothesesWeights != NULL)
    {
        ncvAssertReturn(hypothesesWeights->memType() == NCVMemoryTypeHostPageable ||
                        hypothesesWeights->memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);
    }

    //nothing to group
    if (numHypotheses == 0)
    {
        return NCV_SUCCESS;
    }

    //never read past the end of the caller's vector
    ncvAssertReturn(numHypotheses <= hypotheses.length(), NCV_DIMENSIONS_INVALID);

    std::vector<NcvRect32u> rects(numHypotheses);
    memcpy(&rects[0], hypotheses.ptr(), numHypotheses * sizeof(NcvRect32u));

    //weights are only requested from groupRectangles() when the caller wants them
    std::vector<Ncv32u> weights;
    groupRectangles(rects, minNeighbors, intersectEps, (hypothesesWeights != NULL) ? &weights : NULL);

    numHypotheses = (Ncv32u)rects.size();
    if (numHypotheses > 0)
    {
        memcpy(hypotheses.ptr(), &rects[0], numHypotheses * sizeof(NcvRect32u));

        //taking &weights[0] of an empty vector is undefined, and no more entries
        //than groupRectangles() produced may be copied out
        if (hypothesesWeights != NULL && !weights.empty())
        {
            size_t numWeights = weights.size();
            if (numWeights > numHypotheses)
            {
                numWeights = numHypotheses;
            }
            memcpy(hypothesesWeights->ptr(), &weights[0], numWeights * sizeof(Ncv32u));
        }
    }

    return NCV_SUCCESS;
}
/**
 * Host implementation: draws the one-pixel outline of every rectangle into the image.
 *
 * For each rectangle the left, right, top and bottom edges are written with `color`.
 * An edge lying outside the image is skipped and the others are clipped to
 * dstWidth x dstHeight. Rectangles with zero width or height are ignored.
 *
 * \param h_dst     destination image, indexed as h_dst[y * dstStride + x]
 * \param dstStride distance between consecutive rows, in elements of T (must be >= dstWidth)
 * \param dstWidth  image width in pixels
 * \param dstHeight image height in pixels
 * \param h_rects   array of numRects rectangles
 * \param numRects  number of rectangles
 * \param color     value written to the outline pixels
 *
 * \return NCV_SUCCESS, or the status attached to the failed argument check
 */
template <class T>
static NCVStatus drawRectsWrapperHost(T *h_dst,
                                      Ncv32u dstStride,
                                      Ncv32u dstWidth,
                                      Ncv32u dstHeight,
                                      NcvRect32u *h_rects,
                                      Ncv32u numRects,
                                      T color)
{
    ncvAssertReturn(h_dst != NULL && h_rects != NULL, NCV_NULL_PTR);
    ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
    ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
    ncvAssertReturn(numRects != 0, NCV_SUCCESS);
    //64-bit product: dstWidth * dstHeight may not fit into 32 bits
    ncvAssertReturn((Ncv64u)numRects <= (Ncv64u)dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);

    for (Ncv32u rectIdx = 0; rectIdx < numRects; rectIdx++)
    {
        const NcvRect32u rect = h_rects[rectIdx];

        //a zero-sized rectangle has no outline; without this check "x + width - 1"
        //would wrap around and address the column/row in front of the rectangle
        if (rect.width == 0 || rect.height == 0)
        {
            continue;
        }

        //all coordinate sums are done in 64 bits so they cannot wrap
        const Ncv64u xLast = (Ncv64u)rect.x + rect.width - 1;
        const Ncv64u yLast = (Ncv64u)rect.y + rect.height - 1;
        //exclusive loop bounds, clipped to the image
        const Ncv64u xEnd = (xLast + 1 < dstWidth) ? xLast + 1 : dstWidth;
        const Ncv64u yEnd = (yLast + 1 < dstHeight) ? yLast + 1 : dstHeight;

        //left edge
        if (rect.x < dstWidth)
        {
            for (Ncv64u y = rect.y; y < yEnd; y++)
            {
                h_dst[y * dstStride + rect.x] = color;
            }
        }
        //right edge
        if (xLast < dstWidth)
        {
            for (Ncv64u y = rect.y; y < yEnd; y++)
            {
                h_dst[y * dstStride + xLast] = color;
            }
        }
        //top edge
        if (rect.y < dstHeight)
        {
            for (Ncv64u x = rect.x; x < xEnd; x++)
            {
                h_dst[(Ncv64u)rect.y * dstStride + x] = color;
            }
        }
        //bottom edge
        if (yLast < dstHeight)
        {
            for (Ncv64u x = rect.x; x < xEnd; x++)
            {
                h_dst[yLast * dstStride + x] = color;
            }
        }
    }

    return NCV_SUCCESS;
}
/**
 * Draws rectangle outlines into an 8-bit image on the host.
 * All arguments are passed unchanged to drawRectsWrapperHost.
 */
NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
                               NcvRect32u *h_rects, Ncv32u numRects, Ncv8u color)
{
    return drawRectsWrapperHost<Ncv8u>(h_dst, dstStride, dstWidth, dstHeight,
                                       h_rects, numRects, color);
}
/**
 * Draws rectangle outlines into a 32-bit image on the host.
 * All arguments are passed unchanged to drawRectsWrapperHost.
 */
NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
                                NcvRect32u *h_rects, Ncv32u numRects, Ncv32u color)
{
    return drawRectsWrapperHost<Ncv32u>(h_dst, dstStride, dstWidth, dstHeight,
                                        h_rects, numRects, color);
}
/**
 * Kernel drawing one-pixel rectangle outlines; every block draws one edge of one rectangle.
 *
 * Block layout: the flat block id is blockIdx.y * 65535 + blockIdx.x. Four consecutive ids
 * belong to one rectangle (id >> 2); bit 0 selects a vertical edge, bit 1 selects the
 * top/left edge over the bottom/right one. Blocks beyond the last rectangle exit at once.
 * Thread layout: the threads of a block walk along the edge in chunks of NUMTHREADS_DRAWRECTS
 * pixels - assumes the kernel is launched with NUMTHREADS_DRAWRECTS threads per block.
 *
 * Edges outside the image are skipped and the rest is clipped to dstWidth x dstHeight.
 * Rectangles with zero width or height are ignored.
 *
 * \param d_dst     destination image, indexed as d_dst[y * dstStride + x]
 * \param dstStride distance between consecutive rows, in elements of T
 * \param dstWidth  image width in pixels
 * \param dstHeight image height in pixels
 * \param d_rects   array of numRects rectangles
 * \param numRects  number of rectangles
 * \param color     value written to the outline pixels
 */
template <class T>
__global__ void drawRects(T *d_dst,
                          Ncv32u dstStride,
                          Ncv32u dstWidth,
                          Ncv32u dstHeight,
                          NcvRect32u *d_rects,
                          Ncv32u numRects,
                          T color)
{
    const Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
    const Ncv32u rectId = blockId >> 2;
    //the grid may be rounded up, so surplus blocks must not touch d_rects
    //(the rectangle index is compared directly, which also avoids overflow in numRects * 4)
    if (rectId >= numRects)
    {
        return;
    }

    const NcvRect32u curRect = d_rects[rectId];
    //a zero-sized rectangle has no outline; "x + width - 1" would wrap around otherwise
    if (curRect.width == 0 || curRect.height == 0)
    {
        return;
    }

    const NcvBool bVertical = (blockId & 0x1) != 0;
    const NcvBool bTopLeft = (blockId & 0x2) != 0;

    Ncv32u pt0x, pt0y;
    if (bVertical)
    {
        //left or right edge: iterate over rows
        Ncv32u numChunks = (curRect.height + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;

        pt0x = bTopLeft ? curRect.x : curRect.x + curRect.width - 1;
        pt0y = curRect.y;

        if (pt0x < dstWidth)
        {
            for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
            {
                Ncv32u ptY = pt0y + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
                if (ptY < pt0y + curRect.height && ptY < dstHeight)
                {
                    d_dst[(size_t)ptY * dstStride + pt0x] = color;
                }
            }
        }
    }
    else
    {
        //top or bottom edge: iterate over columns
        Ncv32u numChunks = (curRect.width + NUMTHREADS_DRAWRECTS - 1) >> NUMTHREADS_DRAWRECTS_LOG2;

        pt0x = curRect.x;
        pt0y = bTopLeft ? curRect.y : curRect.y + curRect.height - 1;

        if (pt0y < dstHeight)
        {
            for (Ncv32u chunkId = 0; chunkId < numChunks; chunkId++)
            {
                Ncv32u ptX = pt0x + chunkId * NUMTHREADS_DRAWRECTS + threadIdx.x;
                if (ptX < pt0x + curRect.width && ptX < dstWidth)
                {
                    d_dst[(size_t)pt0y * dstStride + ptX] = color;
                }
            }
        }
    }
}
/**
 * Validates the arguments and launches the drawRects kernel on `cuStream`.
 *
 * Launch configuration: four blocks per rectangle (one per edge) with
 * NUMTHREADS_DRAWRECTS threads each. When more than 65535 blocks are needed they are
 * folded into a 2D grid of width 65535.
 *
 * \param d_dst     destination image in device memory
 * \param dstStride distance between consecutive rows, in elements of T (must be >= dstWidth)
 * \param dstWidth  image width in pixels
 * \param dstHeight image height in pixels
 * \param d_rects   device array of numRects rectangles
 * \param numRects  number of rectangles
 * \param color     value written to the outline pixels
 * \param cuStream  CUDA stream the kernel is launched on
 *
 * \return NCV_SUCCESS when the kernel was launched (or there was nothing to draw), the status
 *         of the failed argument check otherwise; a launch error is reported through
 *         ncvAssertCUDAReturn with NCV_CUDA_ERROR.
 */
template <class T>
static NCVStatus drawRectsWrapperDevice(T *d_dst,
                                        Ncv32u dstStride,
                                        Ncv32u dstWidth,
                                        Ncv32u dstHeight,
                                        NcvRect32u *d_rects,
                                        Ncv32u numRects,
                                        T color,
                                        cudaStream_t cuStream)
{
    ncvAssertReturn(d_dst != NULL && d_rects != NULL, NCV_NULL_PTR);
    ncvAssertReturn(dstWidth > 0 && dstHeight > 0, NCV_DIMENSIONS_INVALID);
    ncvAssertReturn(dstStride >= dstWidth, NCV_INVALID_STEP);
    //64-bit product: dstWidth * dstHeight may not fit into 32 bits
    ncvAssertReturn((Ncv64u)numRects <= (Ncv64u)dstWidth * dstHeight, NCV_DIMENSIONS_INVALID);
    //the block count must fit into a 65535 x 65535 grid (this also keeps numRects * 4 from wrapping)
    ncvAssertReturn((Ncv64u)numRects * 4 <= (Ncv64u)65535 * 65535, NCV_DIMENSIONS_INVALID);

    if (numRects == 0)
    {
        return NCV_SUCCESS;
    }

    dim3 grid(numRects * 4);
    dim3 block(NUMTHREADS_DRAWRECTS);
    if (grid.x > 65535)
    {
        //fold the 1D grid into 2D; surplus blocks are rejected inside the kernel
        grid.y = (grid.x + 65534) / 65535;
        grid.x = 65535;
    }

    //launch on the caller's stream instead of the default one
    drawRects<T><<<grid, block, 0, cuStream>>>(d_dst, dstStride, dstWidth, dstHeight, d_rects, numRects, color);

    ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);

    return NCV_SUCCESS;
}
/**
 * Draws rectangle outlines into an 8-bit image on the device.
 * All arguments are passed unchanged to drawRectsWrapperDevice.
 */
NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
                                 NcvRect32u *d_rects, Ncv32u numRects, Ncv8u color,
                                 cudaStream_t cuStream)
{
    return drawRectsWrapperDevice<Ncv8u>(d_dst, dstStride, dstWidth, dstHeight,
                                         d_rects, numRects, color, cuStream);
}
/**
 * Draws rectangle outlines into a 32-bit image on the device.
 * All arguments are passed unchanged to drawRectsWrapperDevice.
 */
NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
                                  NcvRect32u *d_rects, Ncv32u numRects, Ncv32u color,
                                  cudaStream_t cuStream)
{
    return drawRectsWrapperDevice<Ncv32u>(d_dst, dstStride, dstWidth, dstHeight,
                                          d_rects, numRects, color, cuStream);
}

@ -129,8 +129,8 @@ struct NcvRect8u
Ncv8u y;
Ncv8u width;
Ncv8u height;
NcvRect8u() : x(0), y(0), width(0), height(0) {};
NcvRect8u(Ncv8u x, Ncv8u y, Ncv8u width, Ncv8u height) : x(x), y(y), width(width), height(height) {}
__host__ __device__ NcvRect8u() : x(0), y(0), width(0), height(0) {};
__host__ __device__ NcvRect8u(Ncv8u x, Ncv8u y, Ncv8u width, Ncv8u height) : x(x), y(y), width(width), height(height) {}
@ -140,8 +140,8 @@ struct NcvRect32s
Ncv32s y; ///< y-coordinate of upper left corner.
Ncv32s width; ///< Rectangle width.
Ncv32s height; ///< Rectangle height.
NcvRect32s() : x(0), y(0), width(0), height(0) {};
NcvRect32s(Ncv32s x, Ncv32s y, Ncv32s width, Ncv32s height) : x(x), y(y), width(width), height(height) {}
__host__ __device__ NcvRect32s() : x(0), y(0), width(0), height(0) {};
__host__ __device__ NcvRect32s(Ncv32s x, Ncv32s y, Ncv32s width, Ncv32s height) : x(x), y(y), width(width), height(height) {}
@ -151,8 +151,8 @@ struct NcvRect32u
Ncv32u y; ///< y-coordinate of upper left corner.
Ncv32u width; ///< Rectangle width.
Ncv32u height; ///< Rectangle height.
NcvRect32u() : x(0), y(0), width(0), height(0) {};
NcvRect32u(Ncv32u x, Ncv32u y, Ncv32u width, Ncv32u height) : x(x), y(y), width(width), height(height) {}
__host__ __device__ NcvRect32u() : x(0), y(0), width(0), height(0) {};
__host__ __device__ NcvRect32u(Ncv32u x, Ncv32u y, Ncv32u width, Ncv32u height) : x(x), y(y), width(width), height(height) {}
@ -160,8 +160,8 @@ struct NcvSize32s
Ncv32s width; ///< Rectangle width.
Ncv32s height; ///< Rectangle height.
NcvSize32s() : width(0), height(0) {};
NcvSize32s(Ncv32s width, Ncv32s height) : width(width), height(height) {}
__host__ __device__ NcvSize32s() : width(0), height(0) {};
__host__ __device__ NcvSize32s(Ncv32s width, Ncv32s height) : width(width), height(height) {}
@ -169,8 +169,8 @@ struct NcvSize32u
Ncv32u width; ///< Rectangle width.
Ncv32u height; ///< Rectangle height.
NcvSize32u() : width(0), height(0) {};
NcvSize32u(Ncv32u width, Ncv32u height) : width(width), height(height) {}
__host__ __device__ NcvSize32u() : width(0), height(0) {};
__host__ __device__ NcvSize32u(Ncv32u width, Ncv32u height) : width(width), height(height) {}
@ -275,6 +275,7 @@ enum NCVStatus
//NCV statuses
@ -501,13 +502,18 @@ private:
* Copy dispatcher
* Copy dispatchers
NCV_EXPORTS NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType,
const void *src, NCVMemoryType srcType,
size_t sz, cudaStream_t cuStream);
NCV_EXPORTS NCVStatus memSegCopyHelper2D(void *dst, Ncv32u dstPitch, NCVMemoryType dstType,
const void *src, Ncv32u srcPitch, NCVMemoryType srcType,
Ncv32u widthbytes, Ncv32u height, cudaStream_t cuStream);
* NCVVector (1D)
@ -532,7 +538,7 @@ public:
_memtype = NCVMemoryTypeNone;
NCVStatus copySolid(NCVVector<T> &dst, cudaStream_t cuStream, size_t howMuch=0)
NCVStatus copySolid(NCVVector<T> &dst, cudaStream_t cuStream, size_t howMuch=0) const
if (howMuch == 0)
@ -600,7 +606,6 @@ public:
this->_memtype = this->allocatedMem.begin.memtype;
NCVStatus ncvStat;
@ -611,25 +616,22 @@ public:
// True when the segment owned by this object has a non-NULL pointer,
// or when the allocator reports isCounting().
NcvBool isMemAllocated() const
return (this->allocatedMem.begin.ptr != NULL) || (this->allocator.isCounting());
// Returns the alignment value reported by the allocator bound to this object.
Ncv32u getAllocatorsAlignment() const
return allocator.alignment();
// Returns a copy of the memory segment descriptor held by this object.
NCVMemSegment getSegment() const
return allocatedMem;
INCVMemAllocator &allocator;
NCVMemSegment allocatedMem;
@ -658,7 +660,6 @@ public:
this->bReused = true;
NCVVectorReuse(const NCVMemSegment &memSegment, Ncv32u length)
this->bReused = false;
@ -674,7 +675,6 @@ public:
this->bReused = true;
// Returns the bReused flag, which the constructors set to true once binding succeeded.
NcvBool isMemReused() const
return this->bReused;
@ -703,7 +703,6 @@ public:
virtual ~NCVMatrix() {}
void clear()
_ptr = NULL;
@ -713,14 +712,13 @@ public:
_memtype = NCVMemoryTypeNone;
// Row pitch expressed in elements of T (pitch divided by sizeof(T));
// presumably _pitch is stored in bytes - confirm where _pitch is assigned.
Ncv32u stride() const
return _pitch / sizeof(T);
NCVStatus copySolid(NCVMatrix<T> &dst, cudaStream_t cuStream, size_t howMuch=0)
//a side effect of this function is that it copies everything in a single chunk, so the "padding" will be overwritten
NCVStatus copySolid(NCVMatrix<T> &dst, cudaStream_t cuStream, size_t howMuch=0) const
if (howMuch == 0)
@ -748,6 +746,24 @@ public:
return ncvStat;
// Copies the top-left roi.width x roi.height region of this matrix into dst,
// honouring the pitch of both matrices (the transfer is done by memSegCopyHelper2D).
// Returns NCV_MEM_COPY_ERROR when the ROI does not fit into either matrix,
// NCV_NULL_PTR when a matrix with a memory type has no pointer, and NCV_SUCCESS
// without copying when this matrix has memory type NCVMemoryTypeNone.
NCVStatus copy2D(NCVMatrix<T> &dst, NcvSize32u roi, cudaStream_t cuStream) const
{
    const bool roiFitsSource = roi.width <= this->width() && roi.height <= this->height();
    const bool roiFitsTarget = roi.width <= dst.width() && roi.height <= dst.height();
    ncvAssertReturn(roiFitsSource && roiFitsTarget, NCV_MEM_COPY_ERROR);

    const bool sourceUsable = (this->_memtype == NCVMemoryTypeNone) || (this->_ptr != NULL);
    const bool targetUsable = (dst._memtype == NCVMemoryTypeNone) || (dst._ptr != NULL);
    ncvAssertReturn(sourceUsable && targetUsable, NCV_NULL_PTR);

    // a matrix without a memory type has nothing to copy
    if (this->_memtype == NCVMemoryTypeNone)
    {
        return NCV_SUCCESS;
    }

    return memSegCopyHelper2D(dst._ptr, dst._pitch, dst._memtype,
                              this->_ptr, this->_pitch, this->_memtype,
                              roi.width * sizeof(T), roi.height, cuStream);
}
T *ptr() const {return this->_ptr;}
Ncv32u width() const {return this->_width;}
Ncv32u height() const {return this->_height;}
@ -817,19 +833,16 @@ public:
// True when the segment owned by this object has a non-NULL pointer,
// or when the allocator reports isCounting().
NcvBool isMemAllocated() const
return (this->allocatedMem.begin.ptr != NULL) || (this->allocator.isCounting());
// Returns the alignment value reported by the allocator bound to this object.
Ncv32u getAllocatorsAlignment() const
return allocator.alignment();
// Returns a copy of the memory segment descriptor held by this object.
NCVMemSegment getSegment() const
return allocatedMem;
@ -888,6 +901,23 @@ public:
this->bReused = true;
NCVMatrixReuse(const NCVMatrix<T> &mat, NcvRect32u roi)
this->bReused = false;
ncvAssertPrintReturn(roi.x < mat.width() && roi.y < mat.height() && \
roi.x + roi.width <= mat.width() && roi.y + roi.height <= mat.height(),
"NCVMatrixReuse ctor:: memory binding failed due to mismatching ROI and source matrix dims", );
this->_width = roi.width;
this->_height = roi.height;
this->_pitch = mat.pitch();
this->_ptr = mat.ptr() + roi.y * mat.stride() + roi.x;
this->_memtype = mat.memType();
this->bReused = true;
NcvBool isMemReused() const
@ -899,4 +929,27 @@ private:
NcvBool bReused;
* Operations with rectangles
NCV_EXPORTS NCVStatus ncvGroupRectangles_host(NCVVector<NcvRect32u> &hypotheses, Ncv32u &numHypotheses,
Ncv32u minNeighbors, Ncv32f intersectEps, NCVVector<Ncv32u> *hypothesesWeights);
NCV_EXPORTS NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
NcvRect32u *h_rects, Ncv32u numRects, Ncv8u color);
NCV_EXPORTS NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
NcvRect32u *h_rects, Ncv32u numRects, Ncv32u color);
NCV_EXPORTS NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
NcvRect32u *d_rects, Ncv32u numRects, Ncv8u color, cudaStream_t cuStream);
NCV_EXPORTS NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst, Ncv32u dstStride, Ncv32u dstWidth, Ncv32u dstHeight,
NcvRect32u *d_rects, Ncv32u numRects, Ncv32u color, cudaStream_t cuStream);
#endif // _ncv_hpp_

@ -150,14 +150,14 @@ namespace NCVRuntimeTemplateBool
//Convenience function used by the user
//Takes a variable argument list, transforms it into a list
static void call(Func *functor, int dummy, ...)
static void call(Func *functor, ...)
//Vector used to collect arguments
std::vector<int> templateParamList;
//Variable argument list manipulation
va_list listPointer;
va_start(listPointer, dummy);
va_start(listPointer, functor);
//Collect parameters into the list
for(int i=0; i<NumArguments; i++)

@ -134,7 +134,7 @@ bool TestHypothesesFilter::process()
Ncv32u numHypothesesSrc = h_vecSrc.length();
ncvStat = ncvFilterHypotheses_host(h_vecSrc, numHypothesesSrc, this->minNeighbors, this->eps, NULL);
ncvStat = ncvGroupRectangles_host(h_vecSrc, numHypothesesSrc, this->minNeighbors, this->eps, NULL);
ncvAssertReturn(ncvStat == NCV_SUCCESS, false);

@ -83,17 +83,17 @@ bool TestResize<T>::process()
if (sizeof(T) == sizeof(Ncv32u))
ncvStat = nppiStDownsampleNearest_32u_C1R((Ncv32u *)d_img.ptr(), d_img.pitch(),
(Ncv32u *)d_small.ptr(), d_small.pitch(),
srcSize, this->scaleFactor,
ncvStat = nppiStDecimate_32u_C1R((Ncv32u *)d_img.ptr(), d_img.pitch(),
(Ncv32u *)d_small.ptr(), d_small.pitch(),
srcSize, this->scaleFactor,
else if (sizeof(T) == sizeof(Ncv64u))
ncvStat = nppiStDownsampleNearest_64u_C1R((Ncv64u *)d_img.ptr(), d_img.pitch(),
(Ncv64u *)d_small.ptr(), d_small.pitch(),
srcSize, this->scaleFactor,
ncvStat = nppiStDecimate_64u_C1R((Ncv64u *)d_img.ptr(), d_img.pitch(),
(Ncv64u *)d_small.ptr(), d_small.pitch(),
srcSize, this->scaleFactor,
@ -107,15 +107,15 @@ bool TestResize<T>::process()
if (sizeof(T) == sizeof(Ncv32u))
ncvStat = nppiStDownsampleNearest_32u_C1R_host((Ncv32u *)h_img.ptr(), h_img.pitch(),
(Ncv32u *)h_small.ptr(), h_small.pitch(),
srcSize, this->scaleFactor);
ncvStat = nppiStDecimate_32u_C1R_host((Ncv32u *)h_img.ptr(), h_img.pitch(),
(Ncv32u *)h_small.ptr(), h_small.pitch(),
srcSize, this->scaleFactor);
else if (sizeof(T) == sizeof(Ncv64u))
ncvStat = nppiStDownsampleNearest_64u_C1R_host((Ncv64u *)h_img.ptr(), h_img.pitch(),
(Ncv64u *)h_small.ptr(), h_small.pitch(),
srcSize, this->scaleFactor);
ncvStat = nppiStDecimate_64u_C1R_host((Ncv64u *)h_img.ptr(), h_img.pitch(),
(Ncv64u *)h_small.ptr(), h_small.pitch(),
srcSize, this->scaleFactor);
