diff --git a/3rdparty/NPP_staging/NPP_staging_static_Windows_32_v1.lib b/3rdparty/NPP_staging/NPP_staging_static_Windows_32_v1.lib deleted file mode 100644 index 98d5c227a1..0000000000 Binary files a/3rdparty/NPP_staging/NPP_staging_static_Windows_32_v1.lib and /dev/null differ diff --git a/3rdparty/NPP_staging/NPP_staging_static_Windows_64_v1.lib b/3rdparty/NPP_staging/NPP_staging_static_Windows_64_v1.lib deleted file mode 100644 index f8372bf108..0000000000 Binary files a/3rdparty/NPP_staging/NPP_staging_static_Windows_64_v1.lib and /dev/null differ diff --git a/3rdparty/NPP_staging/libNPP_staging_static_Darwin_64_v1.a b/3rdparty/NPP_staging/libNPP_staging_static_Darwin_64_v1.a deleted file mode 100644 index ab0150ef0d..0000000000 Binary files a/3rdparty/NPP_staging/libNPP_staging_static_Darwin_64_v1.a and /dev/null differ diff --git a/3rdparty/NPP_staging/libNPP_staging_static_Linux_32_v1.a b/3rdparty/NPP_staging/libNPP_staging_static_Linux_32_v1.a deleted file mode 100644 index 6c16959f3e..0000000000 Binary files a/3rdparty/NPP_staging/libNPP_staging_static_Linux_32_v1.a and /dev/null differ diff --git a/3rdparty/NPP_staging/libNPP_staging_static_Linux_64_v1.a b/3rdparty/NPP_staging/libNPP_staging_static_Linux_64_v1.a deleted file mode 100644 index 56c7bd5624..0000000000 Binary files a/3rdparty/NPP_staging/libNPP_staging_static_Linux_64_v1.a and /dev/null differ diff --git a/3rdparty/NPP_staging/npp_staging.h b/3rdparty/NPP_staging/npp_staging.h deleted file mode 100644 index 6bb1e67d59..0000000000 --- a/3rdparty/NPP_staging/npp_staging.h +++ /dev/null @@ -1,784 +0,0 @@ -/* -* Copyright 1993-2010 NVIDIA Corporation. All rights reserved. -* -* NOTICE TO USER: -* -* This source code is subject to NVIDIA ownership rights under U.S. and -* international Copyright laws. -* -* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE -* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR -* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH -* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF -* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, -* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE -* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE -* OR PERFORMANCE OF THIS SOURCE CODE. -* -* U.S. Government End Users. This source code is a "commercial item" as -* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of -* "commercial computer software" and "commercial computer software -* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) -* and is provided to the U.S. Government only as a commercial end item. -* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through -* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the -* source code with only those rights set forth herein. 
-*/ -#ifndef _npp_staging_h_ -#define _npp_staging_h_ - - -/** -* \file npp_staging.h -* NPP Staging Library (will become part of NPP next release) -*/ - - -#ifdef __cplusplus - - -/** \defgroup ctassert Compile-time assert functionality -* @{ -*/ - - - /** - * Compile-time assert namespace - */ - namespace NppStCTprep - { - template - struct CT_ASSERT_FAILURE; - - template <> - struct CT_ASSERT_FAILURE {}; - - template - struct assertTest{}; - } - - - #define NPPST_CT_PREP_PASTE_AUX(a,b) a##b ///< Concatenation indirection macro - #define NPPST_CT_PREP_PASTE(a,b) NPPST_CT_PREP_PASTE_AUX(a, b) ///< Concatenation macro - - - /** - * Performs compile-time assertion of a condition on the file scope - */ - #define NPPST_CT_ASSERT(X) \ - typedef NppStCTprep::assertTest)> \ - NPPST_CT_PREP_PASTE(__ct_assert_typedef_, __LINE__) - - -/*@}*/ - - -#endif - - -/** \defgroup typedefs NPP Integral and compound types of guaranteed size - * @{ - */ - - -typedef bool NppStBool; ///< Bool of size less than integer -typedef long long NppSt64s; ///< 64-bit signed integer -typedef unsigned long long NppSt64u; ///< 64-bit unsigned integer -typedef int NppSt32s; ///< 32-bit signed integer -typedef unsigned int NppSt32u; ///< 32-bit unsigned integer -typedef short NppSt16s; ///< 16-bit signed short -typedef unsigned short NppSt16u; ///< 16-bit unsigned short -typedef char NppSt8s; ///< 8-bit signed char -typedef unsigned char NppSt8u; ///< 8-bit unsigned char -typedef float NppSt32f; ///< 32-bit IEEE-754 (single precision) float -typedef double NppSt64f; ///< 64-bit IEEE-754 (double precision) float - - -/** - * 2D Rectangle, 8-bit unsigned fields - * This struct contains position and size information of a rectangle in two space - */ -struct NppStRect8u -{ - NppSt8u x; ///< x-coordinate of upper left corner - NppSt8u y; ///< y-coordinate of upper left corner - NppSt8u width; ///< Rectangle width - NppSt8u height; ///< Rectangle height -#ifdef __cplusplus - NppStRect8u() : x(0), y(0), width(0), height(0) {}; - NppStRect8u(NppSt8u x, NppSt8u y, NppSt8u width, NppSt8u height) : x(x), y(y), width(width), height(height) {} -#endif -}; - - -/** - * 2D Rectangle, 32-bit signed fields - * This struct contains position and size information of a rectangle in two space - */ -struct NppStRect32s -{ - NppSt32s x; ///< x-coordinate of upper left corner - NppSt32s y; ///< y-coordinate of upper left corner - NppSt32s width; ///< Rectangle width - NppSt32s height; ///< Rectangle height -#ifdef __cplusplus - NppStRect32s() : x(0), y(0), width(0), height(0) {}; - NppStRect32s(NppSt32s x, NppSt32s y, NppSt32s width, NppSt32s height) : x(x), y(y), width(width), height(height) {} -#endif -}; - - -/** - * 2D Rectangle, 32-bit unsigned fields - * This struct contains position and size information of a rectangle in two space - */ -struct NppStRect32u -{ - NppSt32u x; ///< x-coordinate of upper left corner - NppSt32u y; ///< y-coordinate of upper left corner - NppSt32u width; ///< Rectangle width - NppSt32u height; ///< Rectangle height -#ifdef __cplusplus - NppStRect32u() : x(0), y(0), width(0), height(0) {}; - NppStRect32u(NppSt32u x, NppSt32u y, NppSt32u width, NppSt32u height) : x(x), y(y), width(width), height(height) {} -#endif -}; - - -/** - * 2D Size, 32-bit signed fields - * This struct typically represents the size of a a rectangular region in two space - */ -struct NppStSize32s -{ - NppSt32s width; ///< Rectangle width - NppSt32s height; ///< Rectangle height -#ifdef __cplusplus - NppStSize32s() : width(0), height(0) {}; - 
NppStSize32s(NppSt32s width, NppSt32s height) : width(width), height(height) {} -#endif -}; - - -/** - * 2D Size, 32-bit unsigned fields - * This struct typically represents the size of a a rectangular region in two space - */ -struct NppStSize32u -{ - NppSt32u width; ///< Rectangle width - NppSt32u height; ///< Rectangle height -#ifdef __cplusplus - NppStSize32u() : width(0), height(0) {}; - NppStSize32u(NppSt32u width, NppSt32u height) : width(width), height(height) {} -#endif -}; - - -/** - * Error Status Codes - * - * Almost all NPP function return error-status information using - * these return codes. - * Negative return codes indicate errors, positive return codes indicate - * warnings, a return code of 0 indicates success. - */ -enum NppStStatus -{ - //already present in NPP - __NPP_SUCCESS = 0, ///< Successful operation (same as NPP_NO_ERROR) - __NPP_ERROR = -1, ///< Unknown error - __NPP_CUDA_KERNEL_EXECUTION_ERROR = -3, ///< CUDA kernel execution error - __NPP_NULL_POINTER_ERROR = -4, ///< NULL pointer argument error - __NPP_TEXTURE_BIND_ERROR = -24, ///< CUDA texture binding error or non-zero offset returned - __NPP_MEMCPY_ERROR = -13, ///< CUDA memory copy error - __NPP_MEM_ALLOC_ERR = -12, ///< CUDA memory allocation error - __NPP_MEMFREE_ERR = -15, ///< CUDA memory deallocation error - - //to be added - NPP_INVALID_ROI, ///< Invalid region of interest argument - NPP_INVALID_STEP, ///< Invalid image lines step argument (check sign, alignment, relation to image width) - NPP_INVALID_SCALE, ///< Invalid scale parameter passed - NPP_MEM_INSUFFICIENT_BUFFER, ///< Insufficient user-allocated buffer - NPP_MEM_RESIDENCE_ERROR, ///< Memory residence error detected (check if pointers should be device or pinned) - NPP_MEM_INTERNAL_ERROR, ///< Internal memory management error -}; - - -/*@}*/ - - -#ifdef __cplusplus - - -/** \defgroup ct_typesize_checks Client-side sizeof types compile-time check -* @{ -*/ - NPPST_CT_ASSERT(sizeof(NppStBool) <= 4); - NPPST_CT_ASSERT(sizeof(NppSt64s) == 8); - NPPST_CT_ASSERT(sizeof(NppSt64u) == 8); - NPPST_CT_ASSERT(sizeof(NppSt32s) == 4); - NPPST_CT_ASSERT(sizeof(NppSt32u) == 4); - NPPST_CT_ASSERT(sizeof(NppSt16s) == 2); - NPPST_CT_ASSERT(sizeof(NppSt16u) == 2); - NPPST_CT_ASSERT(sizeof(NppSt8s) == 1); - NPPST_CT_ASSERT(sizeof(NppSt8u) == 1); - NPPST_CT_ASSERT(sizeof(NppSt32f) == 4); - NPPST_CT_ASSERT(sizeof(NppSt64f) == 8); - NPPST_CT_ASSERT(sizeof(NppStRect8u) == sizeof(NppSt32u)); - NPPST_CT_ASSERT(sizeof(NppStRect32s) == 4 * sizeof(NppSt32s)); - NPPST_CT_ASSERT(sizeof(NppStRect32u) == 4 * sizeof(NppSt32u)); - NPPST_CT_ASSERT(sizeof(NppStSize32u) == 2 * sizeof(NppSt32u)); -/*@}*/ - - -#endif - - -#ifdef __cplusplus -extern "C" { -#endif - - -/** \defgroup core_npp NPP Core - * Basic functions for CUDA streams management. - * WARNING: These functions couldn't be exported into DLL, so they can be used only with static version of NPP_staging - * @{ - */ - - -/** - * Gets an active CUDA stream used by NPP (Not an API yet!) - * \return Current CUDA stream - */ -cudaStream_t nppStGetActiveCUDAstream(); - - -/** - * Sets an active CUDA stream used by NPP (Not an API yet!) - * \param cudaStream [IN] cudaStream CUDA stream to become current - * \return CUDA stream used before - */ -cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream); - - -/*@}*/ - - -/** \defgroup nppi NPP Image Processing -* @{ -*/ - - -/** - * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit unsigned pixels, single channel. 
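For context, a minimal host-side sketch of how the two stream accessors above are intended to be used. Only nppStGetActiveCUDAstream/nppStSetActiveCUDAstream come from this header; the user-created stream and the bracketing CUDA calls are illustrative assumptions:

    // Sketch: route NPP_staging work onto a user stream, then restore the old one.
    cudaStream_t myStream;
    cudaStreamCreate(&myStream);
    cudaStream_t oldStream = nppStSetActiveCUDAstream(myStream); // returns the previous stream
    // ... call NPP_staging functions here; they enqueue onto myStream ...
    cudaStreamSynchronize(myStream);
    nppStSetActiveCUDAstream(oldStream);
    cudaStreamDestroy(myStream);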
- * - * \param d_src [IN] Source image pointer (CUDA device memory) - * \param srcStep [IN] Source image line step - * \param d_dst [OUT] Destination image pointer (CUDA device memory) - * \param dstStep [IN] Destination image line step - * \param srcRoi [IN] Region of interest in the source image - * \param scale [IN] Downsampling scale factor (positive integer) - * \param readThruTexture [IN] Performance hint to cache source in texture (true) or read directly (false) - * - * \return NPP status code - */ -NppStStatus nppiStDownsampleNearest_32u_C1R(NppSt32u *d_src, NppSt32u srcStep, - NppSt32u *d_dst, NppSt32u dstStep, - NppStSize32u srcRoi, NppSt32u scale, - NppStBool readThruTexture); - - -/** - * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit signed pixels, single channel. - * \see nppiStDownsampleNearest_32u_C1R - */ -NppStStatus nppiStDownsampleNearest_32s_C1R(NppSt32s *d_src, NppSt32u srcStep, - NppSt32s *d_dst, NppSt32u dstStep, - NppStSize32u srcRoi, NppSt32u scale, - NppStBool readThruTexture); - - -/** - * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit float pixels, single channel. - * \see nppiStDownsampleNearest_32u_C1R - */ -NppStStatus nppiStDownsampleNearest_32f_C1R(NppSt32f *d_src, NppSt32u srcStep, - NppSt32f *d_dst, NppSt32u dstStep, - NppStSize32u srcRoi, NppSt32u scale, - NppStBool readThruTexture); - - -/** -* Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit unsigned pixels, single channel. -* \see nppiStDownsampleNearest_32u_C1R -*/ -NppStStatus nppiStDownsampleNearest_64u_C1R(NppSt64u *d_src, NppSt32u srcStep, - NppSt64u *d_dst, NppSt32u dstStep, - NppStSize32u srcRoi, NppSt32u scale, - NppStBool readThruTexture); - - -/** - * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit signed pixels, single channel. - * \see nppiStDownsampleNearest_32u_C1R - */ -NppStStatus nppiStDownsampleNearest_64s_C1R(NppSt64s *d_src, NppSt32u srcStep, - NppSt64s *d_dst, NppSt32u dstStep, - NppStSize32u srcRoi, NppSt32u scale, - NppStBool readThruTexture); - - -/** - * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit float pixels, single channel. - * \see nppiStDownsampleNearest_32u_C1R - */ -NppStStatus nppiStDownsampleNearest_64f_C1R(NppSt64f *d_src, NppSt32u srcStep, - NppSt64f *d_dst, NppSt32u dstStep, - NppStSize32u srcRoi, NppSt32u scale, - NppStBool readThruTexture); - - -/** - * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit unsigned pixels, single channel. Host implementation. - * - * \param h_src [IN] Source image pointer (Host or pinned memory) - * \param srcStep [IN] Source image line step - * \param h_dst [OUT] Destination image pointer (Host or pinned memory) - * \param dstStep [IN] Destination image line step - * \param srcRoi [IN] Region of interest in the source image - * \param scale [IN] Downsampling scale factor (positive integer) - * - * \return NPP status code - */ -NppStStatus nppiStDownsampleNearest_32u_C1R_host(NppSt32u *h_src, NppSt32u srcStep, - NppSt32u *h_dst, NppSt32u dstStep, - NppStSize32u srcRoi, NppSt32u scale); - - -/** - * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit signed pixels, single channel. Host implementation. 
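A hedged usage sketch for the device-side decimation family above. The pitched allocations and the assumption that the output extent is srcRoi divided by scale are mine, not guarantees stated by the header:

    // Sketch: 2x nearest-neighbor decimation of a 640x480 single-channel 32u image.
    NppStSize32u srcRoi(640, 480);
    const NppSt32u scale = 2;
    NppSt32u *d_src = 0, *d_dst = 0;
    size_t srcPitch = 0, dstPitch = 0;
    cudaMallocPitch((void**)&d_src, &srcPitch, srcRoi.width * sizeof(NppSt32u), srcRoi.height);
    cudaMallocPitch((void**)&d_dst, &dstPitch,
                    (srcRoi.width / scale) * sizeof(NppSt32u), srcRoi.height / scale); // assumed output extent
    NppStStatus st = nppiStDownsampleNearest_32u_C1R(d_src, (NppSt32u)srcPitch,
                                                     d_dst, (NppSt32u)dstPitch,
                                                     srcRoi, scale,
                                                     true /* readThruTexture hint */);
    // st == 0 (__NPP_SUCCESS) on success, per the status enum above.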
- * \see nppiStDownsampleNearest_32u_C1R_host - */ -NppStStatus nppiStDownsampleNearest_32s_C1R_host(NppSt32s *h_src, NppSt32u srcStep, - NppSt32s *h_dst, NppSt32u dstStep, - NppStSize32u srcRoi, NppSt32u scale); - - -/** - * Downsamples (decimates) an image using the nearest neighbor algorithm. 32-bit float pixels, single channel. Host implementation. - * \see nppiStDownsampleNearest_32u_C1R_host - */ -NppStStatus nppiStDownsampleNearest_32f_C1R_host(NppSt32f *h_src, NppSt32u srcStep, - NppSt32f *h_dst, NppSt32u dstStep, - NppStSize32u srcRoi, NppSt32u scale); - - -/** - * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit unsigned pixels, single channel. Host implementation. - * \see nppiStDownsampleNearest_32u_C1R_host - */ -NppStStatus nppiStDownsampleNearest_64u_C1R_host(NppSt64u *h_src, NppSt32u srcStep, - NppSt64u *h_dst, NppSt32u dstStep, - NppStSize32u srcRoi, NppSt32u scale); - - -/** - * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit signed pixels, single channel. Host implementation. - * \see nppiStDownsampleNearest_32u_C1R_host - */ -NppStStatus nppiStDownsampleNearest_64s_C1R_host(NppSt64s *h_src, NppSt32u srcStep, - NppSt64s *h_dst, NppSt32u dstStep, - NppStSize32u srcRoi, NppSt32u scale); - - -/** - * Downsamples (decimates) an image using the nearest neighbor algorithm. 64-bit float pixels, single channel. Host implementation. - * \see nppiStDownsampleNearest_32u_C1R_host - */ -NppStStatus nppiStDownsampleNearest_64f_C1R_host(NppSt64f *h_src, NppSt32u srcStep, - NppSt64f *h_dst, NppSt32u dstStep, - NppStSize32u srcRoi, NppSt32u scale); - - -/** - * Computes standard deviation for each rectangular region of the input image using integral images. - * - * \param d_sum [IN] Integral image pointer (CUDA device memory) - * \param sumStep [IN] Integral image line step - * \param d_sqsum [IN] Squared integral image pointer (CUDA device memory) - * \param sqsumStep [IN] Squared integral image line step - * \param d_norm [OUT] Stddev image pointer (CUDA device memory). Each pixel contains stddev of a rect with top-left corner at the original location in the image - * \param normStep [IN] Stddev image line step - * \param roi [IN] Region of interest in the source image - * \param rect [IN] Rectangular region to calculate stddev over - * \param scaleArea [IN] Multiplication factor to account decimated scale - * \param readThruTexture [IN] Performance hint to cache source in texture (true) or read directly (false) - * - * \return NPP status code - */ -NppStStatus nppiStRectStdDev_32f_C1R(NppSt32u *d_sum, NppSt32u sumStep, - NppSt64u *d_sqsum, NppSt32u sqsumStep, - NppSt32f *d_norm, NppSt32u normStep, - NppStSize32u roi, NppStRect32u rect, - NppSt32f scaleArea, NppStBool readThruTexture); - - -/** - * Computes standard deviation for each rectangular region of the input image using integral images. Host implementation - * - * \param h_sum [IN] Integral image pointer (Host or pinned memory) - * \param sumStep [IN] Integral image line step - * \param h_sqsum [IN] Squared integral image pointer (Host or pinned memory) - * \param sqsumStep [IN] Squared integral image line step - * \param h_norm [OUT] Stddev image pointer (Host or pinned memory). 
Each pixel contains stddev of a rect with top-left corner at the original location in the image - * \param normStep [IN] Stddev image line step - * \param roi [IN] Region of interest in the source image - * \param rect [IN] Rectangular region to calculate stddev over - * \param scaleArea [IN] Multiplication factor to account decimated scale - * - * \return NPP status code - */ -NppStStatus nppiStRectStdDev_32f_C1R_host(NppSt32u *h_sum, NppSt32u sumStep, - NppSt64u *h_sqsum, NppSt32u sqsumStep, - NppSt32f *h_norm, NppSt32u normStep, - NppStSize32u roi, NppStRect32u rect, - NppSt32f scaleArea); - - -/** - * Transposes an image. 32-bit unsigned pixels, single channel - * - * \param d_src [IN] Source image pointer (CUDA device memory) - * \param srcStride [IN] Source image line step - * \param d_dst [OUT] Destination image pointer (CUDA device memory) - * \param dstStride [IN] Destination image line step - * \param srcRoi [IN] Region of interest of the source image - * - * \return NPP status code - */ -NppStStatus nppiStTranspose_32u_C1R(NppSt32u *d_src, NppSt32u srcStride, - NppSt32u *d_dst, NppSt32u dstStride, NppStSize32u srcRoi); - - -/** - * Transposes an image. 32-bit signed pixels, single channel - * \see nppiStTranspose_32u_C1R - */ -NppStStatus nppiStTranspose_32s_C1R(NppSt32s *d_src, NppSt32u srcStride, - NppSt32s *d_dst, NppSt32u dstStride, NppStSize32u srcRoi); - - -/** - * Transposes an image. 32-bit float pixels, single channel - * \see nppiStTranspose_32u_C1R - */ -NppStStatus nppiStTranspose_32f_C1R(NppSt32f *d_src, NppSt32u srcStride, - NppSt32f *d_dst, NppSt32u dstStride, NppStSize32u srcRoi); - - -/** - * Transposes an image. 64-bit unsigned pixels, single channel - * \see nppiStTranspose_32u_C1R - */ -NppStStatus nppiStTranspose_64u_C1R(NppSt64u *d_src, NppSt32u srcStride, - NppSt64u *d_dst, NppSt32u dstStride, NppStSize32u srcRoi); - - -/** - * Transposes an image. 64-bit signed pixels, single channel - * \see nppiStTranspose_32u_C1R - */ -NppStStatus nppiStTranspose_64s_C1R(NppSt64s *d_src, NppSt32u srcStride, - NppSt64s *d_dst, NppSt32u dstStride, NppStSize32u srcRoi); - - -/** - * Transposes an image. 64-bit float pixels, single channel - * \see nppiStTranspose_32u_C1R - */ -NppStStatus nppiStTranspose_64f_C1R(NppSt64f *d_src, NppSt32u srcStride, - NppSt64f *d_dst, NppSt32u dstStride, NppStSize32u srcRoi); - - -/** - * Transposes an image. 32-bit unsigned pixels, single channel. Host implementation - * - * \param h_src [IN] Source image pointer (Host or pinned memory) - * \param srcStride [IN] Source image line step - * \param h_dst [OUT] Destination image pointer (Host or pinned memory) - * \param dstStride [IN] Destination image line step - * \param srcRoi [IN] Region of interest of the source image - * - * \return NPP status code - */ -NppStStatus nppiStTranspose_32u_C1R_host(NppSt32u *h_src, NppSt32u srcStride, - NppSt32u *h_dst, NppSt32u dstStride, NppStSize32u srcRoi); - - -/** - * Transposes an image. 32-bit signed pixels, single channel. Host implementation - * \see nppiStTranspose_32u_C1R_host - */ -NppStStatus nppiStTranspose_32s_C1R_host(NppSt32s *h_src, NppSt32u srcStride, - NppSt32s *h_dst, NppSt32u dstStride, NppStSize32u srcRoi); - - -/** - * Transposes an image. 32-bit float pixels, single channel. Host implementation - * \see nppiStTranspose_32u_C1R_host - */ -NppStStatus nppiStTranspose_32f_C1R_host(NppSt32f *h_src, NppSt32u srcStride, - NppSt32f *h_dst, NppSt32u dstStride, NppStSize32u srcRoi); - - -/** - * Transposes an image. 
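A usage sketch for the transpose family above; treating the stride parameters as byte pitches is an assumption, though it is consistent with the arithm.cpp call sites in this patch, which pass GpuMat::step:

    // Sketch: transpose a w x h 32-bit image into an h x w destination.
    const NppSt32u w = 1024, h = 768; // assumed extents
    NppStSize32u srcRoi(w, h);
    NppSt32u *d_src = 0, *d_dst = 0;
    size_t srcPitch = 0, dstPitch = 0;
    cudaMallocPitch((void**)&d_src, &srcPitch, w * sizeof(NppSt32u), h);
    cudaMallocPitch((void**)&d_dst, &dstPitch, h * sizeof(NppSt32u), w); // note swapped extents
    nppiStTranspose_32u_C1R(d_src, (NppSt32u)srcPitch,
                            d_dst, (NppSt32u)dstPitch, srcRoi);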
64-bit unsigned pixels, single channel. Host implementation - * \see nppiStTranspose_32u_C1R_host - */ -NppStStatus nppiStTranspose_64u_C1R_host(NppSt64u *h_src, NppSt32u srcStride, - NppSt64u *h_dst, NppSt32u dstStride, NppStSize32u srcRoi); - - -/** - * Transposes an image. 64-bit signed pixels, single channel. Host implementation - * \see nppiStTranspose_32u_C1R_host - */ -NppStStatus nppiStTranspose_64s_C1R_host(NppSt64s *h_src, NppSt32u srcStride, - NppSt64s *h_dst, NppSt32u dstStride, NppStSize32u srcRoi); - - -/** - * Transposes an image. 64-bit float pixels, single channel. Host implementation - * \see nppiStTranspose_32u_C1R_host - */ -NppStStatus nppiStTranspose_64f_C1R_host(NppSt64f *h_src, NppSt32u srcStride, - NppSt64f *h_dst, NppSt32u dstStride, NppStSize32u srcRoi); - - -/** - * Calculates the size of the temporary buffer for integral image creation - * - * \param roiSize [IN] Size of the input image - * \param pBufsize [OUT] Pointer to host variable that returns the size of the temporary buffer (in bytes) - * - * \return NPP status code - */ -NppStStatus nppiStIntegralGetSize_8u32u(NppStSize32u roiSize, NppSt32u *pBufsize); - - -/** - * Calculates the size of the temporary buffer for integral image creation - * \see nppiStIntegralGetSize_8u32u - */ -NppStStatus nppiStIntegralGetSize_32f32f(NppStSize32u roiSize, NppSt32u *pBufsize); - - -/** - * Creates an integral image representation for the input image - * - * \param d_src [IN] Source image pointer (CUDA device memory) - * \param srcStep [IN] Source image line step - * \param d_dst [OUT] Destination integral image pointer (CUDA device memory) - * \param dstStep [IN] Destination image line step - * \param roiSize [IN] Region of interest of the source image - * \param pBuffer [IN] Pointer to the pre-allocated temporary buffer (CUDA device memory) - * \param bufSize [IN] Size of the pBuffer in bytes - * - * \return NPP status code - */ -NppStStatus nppiStIntegral_8u32u_C1R(NppSt8u *d_src, NppSt32u srcStep, - NppSt32u *d_dst, NppSt32u dstStep, NppStSize32u roiSize, - NppSt8u *pBuffer, NppSt32u bufSize); - - -/** - * Creates an integral image representation for the input image - * \see nppiStIntegral_8u32u_C1R - */ -NppStStatus nppiStIntegral_32f32f_C1R(NppSt32f *d_src, NppSt32u srcStep, - NppSt32f *d_dst, NppSt32u dstStep, NppStSize32u roiSize, - NppSt8u *pBuffer, NppSt32u bufSize); - - -/** - * Creates an integral image representation for the input image. Host implementation - * - * \param h_src [IN] Source image pointer (Host or pinned memory) - * \param srcStep [IN] Source image line step - * \param h_dst [OUT] Destination integral image pointer (Host or pinned memory) - * \param dstStep [IN] Destination image line step - * \param roiSize [IN] Region of interest of the source image - * - * \return NPP status code - */ -NppStStatus nppiStIntegral_8u32u_C1R_host(NppSt8u *h_src, NppSt32u srcStep, - NppSt32u *h_dst, NppSt32u dstStep, NppStSize32u roiSize); - - -/** - * Creates an integral image representation for the input image. 
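The integral-image functions above follow a two-phase GetSize/compute pattern. A sketch under my own allocation assumptions; the (w+1) x (h+1) destination mirrors what cv::gpu::integralBuffered() in this patch allocates:

    // Sketch: query scratch size, allocate, then build the integral image.
    NppStSize32u roi(640, 480);
    NppSt32u bufSize = 0;
    nppiStIntegralGetSize_8u32u(roi, &bufSize);
    NppSt8u *d_buf = 0, *d_src = 0;
    NppSt32u *d_sum = 0;
    size_t srcPitch = 0, sumPitch = 0;
    cudaMalloc((void**)&d_buf, bufSize);
    cudaMallocPitch((void**)&d_src, &srcPitch, roi.width, roi.height);
    cudaMallocPitch((void**)&d_sum, &sumPitch,
                    (roi.width + 1) * sizeof(NppSt32u), roi.height + 1); // integral gains a zero row/column
    nppiStIntegral_8u32u_C1R(d_src, (NppSt32u)srcPitch,
                             d_sum, (NppSt32u)sumPitch, roi,
                             d_buf, bufSize);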
Host implementation - * \see nppiStIntegral_8u32u_C1R_host - */ -NppStStatus nppiStIntegral_32f32f_C1R_host(NppSt32f *h_src, NppSt32u srcStep, - NppSt32f *h_dst, NppSt32u dstStep, NppStSize32u roiSize); - - -/** - * Calculates the size of the temporary buffer for squared integral image creation - * - * \param roiSize [IN] Size of the input image - * \param pBufsize [OUT] Pointer to host variable that returns the size of the temporary buffer (in bytes) - * - * \return NPP status code - */ -NppStStatus nppiStSqrIntegralGetSize_8u64u(NppStSize32u roiSize, NppSt32u *pBufsize); - - -/** - * Creates a squared integral image representation for the input image - * - * \param d_src [IN] Source image pointer (CUDA device memory) - * \param srcStep [IN] Source image line step - * \param d_dst [OUT] Destination squared integral image pointer (CUDA device memory) - * \param dstStep [IN] Destination image line step - * \param roiSize [IN] Region of interest of the source image - * \param pBuffer [IN] Pointer to the pre-allocated temporary buffer (CUDA device memory) - * \param bufSize [IN] Size of the pBuffer in bytes - * - * \return NPP status code - */ -NppStStatus nppiStSqrIntegral_8u64u_C1R(NppSt8u *d_src, NppSt32u srcStep, - NppSt64u *d_dst, NppSt32u dstStep, NppStSize32u roiSize, - NppSt8u *pBuffer, NppSt32u bufSize); - - -/** - * Creates a squared integral image representation for the input image. Host implementation - * - * \param h_src [IN] Source image pointer (Host or pinned memory) - * \param srcStep [IN] Source image line step - * \param h_dst [OUT] Destination squared integral image pointer (Host or pinned memory) - * \param dstStep [IN] Destination image line step - * \param roiSize [IN] Region of interest of the source image - * - * \return NPP status code - */ -NppStStatus nppiStSqrIntegral_8u64u_C1R_host(NppSt8u *h_src, NppSt32u srcStep, - NppSt64u *h_dst, NppSt32u dstStep, NppStSize32u roiSize); - - -/*@}*/ - - -/** \defgroup npps NPP Signal Processing -* @{ -*/ - - -/** - * Calculates the size of the temporary buffer for vector compaction. 32-bit unsigned values - * - * \param srcLen [IN] Length of the input vector in elements - * \param pBufsize [OUT] Pointer to host variable that returns the size of the temporary buffer (in bytes) - * - * \return NPP status code - */ -NppStStatus nppsStCompactGetSize_32u(NppSt32u srcLen, NppSt32u *pBufsize); - - -/** - * Calculates the size of the temporary buffer for vector compaction. 32-bit signed values - * \see nppsStCompactGetSize_32u - */ -NppStStatus nppsStCompactGetSize_32s(NppSt32u srcLen, NppSt32u *pBufsize); - - -/** - * Calculates the size of the temporary buffer for vector compaction. 32-bit float values - * \see nppsStCompactGetSize_32u - */ -NppStStatus nppsStCompactGetSize_32f(NppSt32u srcLen, NppSt32u *pBufsize); - - -/** - * Compacts the input vector by removing elements of specified value. 
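A sketch pairing the size query above with the compaction call documented next; the vector length and sentinel value are assumptions, and the pinned destination-length pointer follows the requirement stated in the parameter docs below:

    // Sketch: drop a sentinel value from a device vector.
    const NppSt32u srcLen = 4096; // assumed length
    NppSt32u bufSize = 0;
    nppsStCompactGetSize_32u(srcLen, &bufSize);
    NppSt32u *d_src = 0, *d_dst = 0;
    NppSt8u *d_buf = 0;
    cudaMalloc((void**)&d_src, srcLen * sizeof(NppSt32u));
    cudaMalloc((void**)&d_dst, srcLen * sizeof(NppSt32u));
    cudaMalloc((void**)&d_buf, bufSize);
    NppSt32u *hp_dstLen = 0;
    cudaMallocHost((void**)&hp_dstLen, sizeof(NppSt32u)); // pinned memory, per the docs
    nppsStCompact_32u(d_src, srcLen, d_dst, hp_dstLen,
                      0xFFFFFFFFu /* elemRemove */, d_buf, bufSize);
    cudaStreamSynchronize(nppStGetActiveCUDAstream()); // *hp_dstLen valid after this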
32-bit unsigned values - * - * \param d_src [IN] Source vector pointer (CUDA device memory) - * \param srcLen [IN] Source vector length - * \param d_dst [OUT] Destination vector pointer (CUDA device memory) - * \param p_dstLen [OUT] Pointer to the destination vector length (Pinned memory or NULL) - * \param elemRemove [IN] The value to be removed - * \param pBuffer [IN] Pointer to the pre-allocated temporary buffer (CUDA device memory) - * \param bufSize [IN] Size of the pBuffer in bytes - * - * \return NPP status code - */ -NppStStatus nppsStCompact_32u(NppSt32u *d_src, NppSt32u srcLen, - NppSt32u *d_dst, NppSt32u *p_dstLen, - NppSt32u elemRemove, - NppSt8u *pBuffer, NppSt32u bufSize); - - -/** - * Compacts the input vector by removing elements of specified value. 32-bit signed values - * \see nppsStCompact_32u - */ -NppStStatus nppsStCompact_32s(NppSt32s *d_src, NppSt32u srcLen, - NppSt32s *d_dst, NppSt32u *p_dstLen, - NppSt32s elemRemove, - NppSt8u *pBuffer, NppSt32u bufSize); - - -/** - * Compacts the input vector by removing elements of specified value. 32-bit float values - * \see nppsStCompact_32u - */ -NppStStatus nppsStCompact_32f(NppSt32f *d_src, NppSt32u srcLen, - NppSt32f *d_dst, NppSt32u *p_dstLen, - NppSt32f elemRemove, - NppSt8u *pBuffer, NppSt32u bufSize); - - -/** - * Compacts the input vector by removing elements of specified value. 32-bit unsigned values. Host implementation - * - * \param h_src [IN] Source vector pointer (CUDA device memory) - * \param srcLen [IN] Source vector length - * \param h_dst [OUT] Destination vector pointer (CUDA device memory) - * \param dstLen [OUT] Pointer to the destination vector length (can be NULL) - * \param elemRemove [IN] The value to be removed - * - * \return NPP status code - */ -NppStStatus nppsStCompact_32u_host(NppSt32u *h_src, NppSt32u srcLen, - NppSt32u *h_dst, NppSt32u *dstLen, NppSt32u elemRemove); - - -/** - * Compacts the input vector by removing elements of specified value. 32-bit signed values. Host implementation - * \see nppsStCompact_32u_host - */ -NppStStatus nppsStCompact_32s_host(NppSt32s *h_src, NppSt32u srcLen, - NppSt32s *h_dst, NppSt32u *dstLen, NppSt32s elemRemove); - - -/** - * Compacts the input vector by removing elements of specified value. 32-bit float values. 
Host implementation - * \see nppsStCompact_32u_host - */ -NppStStatus nppsStCompact_32f_host(NppSt32f *h_src, NppSt32u srcLen, - NppSt32f *h_dst, NppSt32u *dstLen, NppSt32f elemRemove); - - -/*@}*/ - - -#ifdef __cplusplus -} -#endif - - -#endif // _npp_staging_h_ diff --git a/modules/gpu/CMakeLists.txt b/modules/gpu/CMakeLists.txt index f25036ac24..e99cbc78fc 100644 --- a/modules/gpu/CMakeLists.txt +++ b/modules/gpu/CMakeLists.txt @@ -36,10 +36,12 @@ file(GLOB lib_device_hdrs "src/opencv2/gpu/device/*.h*") source_group("Device" FILES ${lib_device_hdrs}) if (HAVE_CUDA AND MSVC) - file(GLOB ncv_srcs "src/nvidia/*.cpp") - file(GLOB ncv_hdrs "src/nvidia/*.h*") - file(GLOB ncv_cuda "src/nvidia/*.cu") - source_group("Src\\NVidia" FILES ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda}) + file(GLOB_RECURSE ncv_srcs "src/nvidia/*.cpp") + file(GLOB_RECURSE ncv_cuda "src/nvidia/*.cu") + file(GLOB_RECURSE ncv_hdr1 "src/nvidia/*.hpp") + file(GLOB_RECURSE ncv_hdr2 "src/nvidia/*.h") + source_group("Src\\NVidia" FILES ${ncv_srcs} ${ncv_hdr1} ${ncv_hdr2} ${ncv_cuda}) + include_directories("src/nvidia/core" "src/nvidia/NPP_staging") endif() if (HAVE_CUDA) @@ -74,17 +76,13 @@ if (HAVE_CUDA) string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE}") string(REPLACE "/EHsc-" "/EHs" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") - endif() - - - include(FindNPP_staging.cmake) - include_directories(${NPPST_INC}) + endif() CUDA_COMPILE(cuda_objs ${lib_cuda} ${ncv_cuda}) #CUDA_BUILD_CLEAN_TARGET() endif() -add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdrs} ${ncv_cuda} ${cuda_objs}) +add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${lib_cuda_hdrs} ${lib_device_hdrs} ${ncv_srcs} ${ncv_hdr1} ${ncv_hdr2} ${ncv_cuda} ${cuda_objs}) if(PCHSupport_FOUND) set(pch_header ${CMAKE_CURRENT_SOURCE_DIR}/src/precomp.hpp) @@ -117,8 +115,7 @@ set_target_properties(${the_target} PROPERTIES target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${DEPS} ) if (HAVE_CUDA) - target_link_libraries(${the_target} ${CUDA_LIBRARIES} ${CUDA_NPP_LIBRARIES}) - target_link_libraries(${the_target} ${NPPST_LIB}) + target_link_libraries(${the_target} ${CUDA_LIBRARIES} ${CUDA_NPP_LIBRARIES}) CUDA_ADD_CUFFT_TO_TARGET(${the_target}) endif() diff --git a/modules/gpu/FindNPP_staging.cmake b/modules/gpu/FindNPP_staging.cmake deleted file mode 100644 index e478695c69..0000000000 --- a/modules/gpu/FindNPP_staging.cmake +++ /dev/null @@ -1,24 +0,0 @@ -if(CMAKE_SIZEOF_VOID_P EQUAL 4) - set(BIT_SUFF 32) -else() - set(BIT_SUFF 64) -endif() - -if (APPLE) - set(PLATFORM_SUFF Darwin) -elseif (UNIX) - set(PLATFORM_SUFF Linux) -else() - set(PLATFORM_SUFF Windows) -endif() - -set(LIB_FILE NPP_staging_static_${PLATFORM_SUFF}_${BIT_SUFF}_v1) - -find_library(NPPST_LIB - NAMES "${LIB_FILE}" "lib${LIB_FILE}" - PATHS "${CMAKE_SOURCE_DIR}/3rdparty/NPP_staging" - DOC "NPP staging library" - ) - -SET(NPPST_INC "${CMAKE_SOURCE_DIR}//3rdparty/NPP_staging") - \ No newline at end of file diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp index 5a54072a82..c6a23debaa 100644 --- a/modules/gpu/src/arithm.cpp +++ b/modules/gpu/src/arithm.cpp @@ -83,25 +83,25 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst) sz.width = src.cols; sz.height = src.rows; - nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), src.step, dst.ptr(), dst.step, sz) ); + 
nppSafeCall( nppiTranspose_8u_C1R(src.ptr(), src.step, dst.ptr(), dst.step, sz) ); } else if (src.elemSize() == 4) { - NppStSize32u sz; + NcvSize32u sz; sz.width = src.cols; sz.height = src.rows; - nppSafeCall( nppiStTranspose_32u_C1R(const_cast(src.ptr()), src.step, - dst.ptr(), dst.step, sz) ); + nppSafeCall( nppiStTranspose_32u_C1R(const_cast(src.ptr()), src.step, + dst.ptr(), dst.step, sz) ); } else // if (src.elemSize() == 8) { - NppStSize32u sz; + NcvSize32u sz; sz.width = src.cols; sz.height = src.rows; - nppSafeCall( nppiStTranspose_64u_C1R(const_cast(src.ptr()), src.step, - dst.ptr(), dst.step, sz) ); + nppSafeCall( nppiStTranspose_64u_C1R(const_cast(src.ptr()), src.step, + dst.ptr(), dst.step, sz) ); } cudaSafeCall( cudaThreadSynchronize() ); diff --git a/modules/gpu/src/cascadeclassifier.cpp b/modules/gpu/src/cascadeclassifier.cpp index 15e5d7ffb8..37acc59bd1 100644 --- a/modules/gpu/src/cascadeclassifier.cpp +++ b/modules/gpu/src/cascadeclassifier.cpp @@ -126,7 +126,7 @@ struct cv::gpu::CascadeClassifier_GPU::CascadeClassifierImpl minNeighbors, scaleStep, 1, flags, - *gpuAllocator, *cpuAllocator, devProp.major, devProp.minor, 0); + *gpuAllocator, *cpuAllocator, devProp, 0); ncvAssertReturnNcvStat(ncvStat); ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR); @@ -146,8 +146,8 @@ private: ncvAssertCUDAReturn(cudaGetDeviceProperties(&devProp, devId), NCV_CUDA_ERROR); // Load the classifier from file (assuming its size is about 1 mb) using a simple allocator - gpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeDevice); - cpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeHostPinned); + gpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeDevice, devProp.textureAlignment); + cpuCascadeAllocator = new NCVMemNativeAllocator(NCVMemoryTypeHostPinned, devProp.textureAlignment); ncvAssertPrintReturn(gpuCascadeAllocator->isInitialized(), "Error creating cascade GPU allocator", NCV_CUDA_ERROR); ncvAssertPrintReturn(cpuCascadeAllocator->isInitialized(), "Error creating cascade CPU allocator", NCV_CUDA_ERROR); @@ -212,7 +212,7 @@ private: roi.height = d_src.height(); Ncv32u numDetections; ncvStat = ncvDetectObjectsMultiScale_device(d_src, roi, d_rects, numDetections, haar, *h_haarStages, - *d_haarStages, *d_haarNodes, *d_haarFeatures, haar.ClassifierSize, 4, 1.2f, 1, 0, gpuCounter, cpuCounter, devProp.major, devProp.minor, 0); + *d_haarStages, *d_haarNodes, *d_haarFeatures, haar.ClassifierSize, 4, 1.2f, 1, 0, gpuCounter, cpuCounter, devProp, 0); ncvAssertReturnNcvStat(ncvStat); ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR); diff --git a/modules/gpu/src/imgproc_gpu.cpp b/modules/gpu/src/imgproc_gpu.cpp index 1711c3c9ee..15a95a4479 100644 --- a/modules/gpu/src/imgproc_gpu.cpp +++ b/modules/gpu/src/imgproc_gpu.cpp @@ -560,16 +560,19 @@ void cv::gpu::integralBuffered(const GpuMat& src, GpuMat& sum, GpuMat& buffer) sum.create(src.rows + 1, src.cols + 1, CV_32S); - NppStSize32u roiSize; + NcvSize32u roiSize; roiSize.width = src.cols; roiSize.height = src.rows; - NppSt32u bufSize; - nppSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize) ); + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) ); + + Ncv32u bufSize; + nppSafeCall( nppiStIntegralGetSize_8u32u(roiSize, &bufSize, prop) ); ensureSizeIsEnough(1, bufSize, CV_8UC1, buffer); - nppSafeCall( nppiStIntegral_8u32u_C1R(const_cast(src.ptr()), src.step, - sum.ptr(), sum.step, roiSize, buffer.ptr(), bufSize) ); + nppSafeCall( 
nppiStIntegral_8u32u_C1R(const_cast(src.ptr()), src.step, + sum.ptr(), sum.step, roiSize, buffer.ptr(), bufSize, prop) ); cudaSafeCall( cudaThreadSynchronize() ); } @@ -600,19 +603,20 @@ void cv::gpu::sqrIntegral(const GpuMat& src, GpuMat& sqsum) { CV_Assert(src.type() == CV_8U); - NppStSize32u roiSize; + NcvSize32u roiSize; roiSize.width = src.cols; roiSize.height = src.rows; - NppSt32u bufSize; - nppSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize)); + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, cv::gpu::getDevice()) ); + + Ncv32u bufSize; + nppSafeCall(nppiStSqrIntegralGetSize_8u64u(roiSize, &bufSize, prop)); GpuMat buf(1, bufSize, CV_8U); sqsum.create(src.rows + 1, src.cols + 1, CV_64F); - nppSafeCall(nppiStSqrIntegral_8u64u_C1R( - const_cast(src.ptr(0)), src.step, - sqsum.ptr(0), sqsum.step, roiSize, - buf.ptr(0), bufSize)); + nppSafeCall(nppiStSqrIntegral_8u64u_C1R(const_cast(src.ptr(0)), src.step, + sqsum.ptr(0), sqsum.step, roiSize, buf.ptr(0), bufSize, prop)); cudaSafeCall( cudaThreadSynchronize() ); } diff --git a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu index a501d6525e..dc4796989a 100644 --- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu +++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.cu @@ -57,8 +57,8 @@ #include -#include "npp.h" #include "NCV.hpp" +#include "NPP_staging/NPP_staging.hpp" #include "NCVRuntimeTemplates.hpp" #include "NCVHaarObjectDetection.hpp" @@ -970,8 +970,7 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix &d_integralImag Ncv32f scaleArea, INCVMemAllocator &gpuAllocator, INCVMemAllocator &cpuAllocator, - Ncv32u devPropMajor, - Ncv32u devPropMinor, + cudaDeviceProp &devProp, cudaStream_t cuStream) { ncvAssertReturn(d_integralImage.memType() == d_weights.memType() && @@ -1077,15 +1076,15 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix &d_integralImag Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) * (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER)); - NcvBool bTexCacheCascade = devPropMajor < 2; + NcvBool bTexCacheCascade = devProp.major < 2; NcvBool bTexCacheIImg = true; //this works better even on Fermi so far - NcvBool bDoAtomicCompaction = devPropMajor >= 2 || (devPropMajor == 1 && devPropMinor >= 3); + NcvBool bDoAtomicCompaction = devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3); NCVVector *d_ptrNowData = &d_vecPixelMask; NCVVector *d_ptrNowTmp = &d_vecPixelMaskTmp; Ncv32u szNppCompactTmpBuf; - nppsStCompactGetSize_32u(d_vecPixelMask.length(), &szNppCompactTmpBuf); + nppsStCompactGetSize_32u(d_vecPixelMask.length(), &szNppCompactTmpBuf, devProp); if (bDoAtomicCompaction) { szNppCompactTmpBuf = 0; @@ -1185,11 +1184,11 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix &d_integralImag } else { - NppStStatus nppSt; + NCVStatus nppSt; nppSt = nppsStCompact_32u(d_ptrNowTmp->ptr(), d_vecPixelMask.length(), d_ptrNowData->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U, - d_tmpBufCompact.ptr(), szNppCompactTmpBuf); - ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR); + d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp); + ncvAssertReturn(nppSt == NPPST_SUCCESS, NCV_NPP_ERROR); } numDetections = *hp_numDet; } @@ -1240,11 +1239,11 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix &d_integralImag } else { - NppStStatus nppSt; + NCVStatus nppSt; nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), d_vecPixelMask.length(), d_ptrNowTmp->ptr(), hp_numDet, 
OBJDET_MASK_ELEMENT_INVALID_32U, - d_tmpBufCompact.ptr(), szNppCompactTmpBuf); - ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR); + d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp); + ncvAssertReturnNcvStat(nppSt); } swap(d_ptrNowData, d_ptrNowTmp); @@ -1310,11 +1309,11 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix &d_integralImag } else { - NppStStatus nppSt; + NCVStatus nppSt; nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections, d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U, - d_tmpBufCompact.ptr(), szNppCompactTmpBuf); - ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR); + d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp); + ncvAssertReturnNcvStat(nppSt); } swap(d_ptrNowData, d_ptrNowTmp); @@ -1371,11 +1370,11 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix &d_integralImag } else { - NppStStatus nppSt; + NCVStatus nppSt; nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections, d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U, - d_tmpBufCompact.ptr(), szNppCompactTmpBuf); - ncvAssertReturn(nppSt == NPP_SUCCESS, NCV_NPP_ERROR); + d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp); + ncvAssertReturnNcvStat(nppSt); } swap(d_ptrNowData, d_ptrNowTmp); @@ -1715,8 +1714,7 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix &d_srcImg, INCVMemAllocator &gpuAllocator, INCVMemAllocator &cpuAllocator, - Ncv32u devPropMajor, - Ncv32u devPropMinor, + cudaDeviceProp &devProp, cudaStream_t cuStream) { ncvAssertReturn(d_srcImg.memType() == d_dstRects.memType() && @@ -1773,12 +1771,12 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix &d_srcImg, NCVVectorAlloc h_hypothesesIntermediate(cpuAllocator, d_srcImg.width() * d_srcImg.height()); ncvAssertReturn(h_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); - NppStStatus nppStat; + NCVStatus nppStat; Ncv32u szTmpBufIntegral, szTmpBufSqIntegral; - nppStat = nppiStIntegralGetSize_8u32u(NppStSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufIntegral); - ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR); - nppStat = nppiStSqrIntegralGetSize_8u64u(NppStSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufSqIntegral); - ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR); + nppStat = nppiStIntegralGetSize_8u32u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufIntegral, devProp); + ncvAssertReturnNcvStat(nppStat); + nppStat = nppiStSqrIntegralGetSize_8u64u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufSqIntegral, devProp); + ncvAssertReturnNcvStat(nppStat); NCVVectorAlloc d_tmpIIbuf(gpuAllocator, std::max(szTmpBufIntegral, szTmpBufSqIntegral)); ncvAssertReturn(d_tmpIIbuf.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); @@ -1786,15 +1784,15 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix &d_srcImg, nppStat = nppiStIntegral_8u32u_C1R(d_srcImg.ptr(), d_srcImg.pitch(), d_integralImage.ptr(), d_integralImage.pitch(), - NppStSize32u(d_srcImg.width(), d_srcImg.height()), - d_tmpIIbuf.ptr(), szTmpBufIntegral); - ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR); + NcvSize32u(d_srcImg.width(), d_srcImg.height()), + d_tmpIIbuf.ptr(), szTmpBufIntegral, devProp); + ncvAssertReturnNcvStat(nppStat); nppStat = nppiStSqrIntegral_8u64u_C1R(d_srcImg.ptr(), d_srcImg.pitch(), d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(), - NppStSize32u(d_srcImg.width(), d_srcImg.height()), - d_tmpIIbuf.ptr(), szTmpBufSqIntegral); - ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR); + NcvSize32u(d_srcImg.width(), d_srcImg.height()), + 
d_tmpIIbuf.ptr(), szTmpBufSqIntegral, devProp); + ncvAssertReturnNcvStat(nppStat); NCV_SKIP_COND_END @@ -1859,7 +1857,7 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix &d_srcImg, Ncv32u scale = scalesVector[i]; NcvSize32u srcRoi, scaledIIRoi, searchRoi; - NppStSize32u srcIIRoi; + NcvSize32u srcIIRoi; srcRoi.width = d_srcImg.width(); srcRoi.height = d_srcImg.height(); srcIIRoi.width = srcRoi.width + 1; @@ -1875,15 +1873,15 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix &d_srcImg, d_integralImage.ptr(), d_integralImage.pitch(), d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(), srcIIRoi, scale, true); - ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR); + ncvAssertReturnNcvStat(nppStat); nppStat = nppiStDownsampleNearest_64u_C1R( d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(), d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(), srcIIRoi, scale, true); - ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR); + ncvAssertReturnNcvStat(nppStat); - const NppStRect32u rect( + const NcvRect32u rect( HAAR_STDDEV_BORDER, HAAR_STDDEV_BORDER, haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER, @@ -1892,9 +1890,9 @@ d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(), d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(), d_rectStdDev.ptr(), d_rectStdDev.pitch(), - NppStSize32u(searchRoi.width, searchRoi.height), rect, + NcvSize32u(searchRoi.width, searchRoi.height), rect, (Ncv32f)scale*scale, true); - ncvAssertReturn(nppStat == NPP_SUCCESS, NCV_NPP_ERROR); + ncvAssertReturnNcvStat(nppStat); NCV_SKIP_COND_END @@ -1904,8 +1902,8 @@ detectionsOnThisScale, haar, h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures, false, searchRoi, pixelStep, (Ncv32f)scale*scale, - gpuAllocator, cpuAllocator, devPropMajor, devPropMinor, cuStream); - ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat); + gpuAllocator, cpuAllocator, devProp, cuStream); + ncvAssertReturnNcvStat(ncvStat); NCV_SKIP_COND_BEGIN @@ -2250,6 +2248,10 @@ NCVStatus ncvGrowDetectionsVector_host(NCVVector &pixelMask, return ncvStat; } + + + + NCVStatus ncvFilterHypotheses_host(NCVVector &hypotheses, Ncv32u &numHypotheses, Ncv32u minNeighbors, @@ -2539,7 +2541,7 @@ NCVStatus ncvHaarLoadFromFile_host(const std::string &filename, } -NCVStatus ncvHaarStoreNVBIN_host(std::string &filename, +NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename, HaarClassifierCascadeDescriptor haar, NCVVector &h_HaarStages, NCVVector &h_HaarNodes, diff --git a/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp b/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp index d9f500d7a0..bb463d8f4c 100644 --- a/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp +++ b/modules/gpu/src/nvidia/NCVHaarObjectDetection.hpp @@ -75,13 +75,13 @@ struct HaarFeature64 #define HaarFeature64_CreateCheck_MaxRectField 0xFF - __host__ NCVStatus setRect(Ncv32u rectX, Ncv32u rectY, Ncv32u rectWidth, Ncv32u rectHeight, Ncv32u clsWidth, Ncv32u clsHeight) + __host__ NCVStatus setRect(Ncv32u rectX, Ncv32u rectY, Ncv32u rectWidth, Ncv32u rectHeight, Ncv32u /*clsWidth*/, Ncv32u /*clsHeight*/) { ncvAssertReturn(rectWidth <= HaarFeature64_CreateCheck_MaxRectField && rectHeight <= HaarFeature64_CreateCheck_MaxRectField, NCV_HAAR_TOO_LARGE_FEATURES); - ((NcvRect8u*)&(this->_ui2.x))->x = rectX; - ((NcvRect8u*)&(this->_ui2.x))->y = rectY; - ((NcvRect8u*)&(this->_ui2.x))->width = rectWidth; -
((NcvRect8u*)&(this->_ui2.x))->height = rectHeight; + ((NcvRect8u*)&(this->_ui2.x))->x = (Ncv8u)rectX; + ((NcvRect8u*)&(this->_ui2.x))->y = (Ncv8u)rectY; + ((NcvRect8u*)&(this->_ui2.x))->width = (Ncv8u)rectWidth; + ((NcvRect8u*)&(this->_ui2.x))->height = (Ncv8u)rectHeight; return NCV_SUCCESS; } @@ -306,11 +306,11 @@ struct HaarStage64 }; -NPPST_CT_ASSERT(sizeof(HaarFeature64) == 8); -NPPST_CT_ASSERT(sizeof(HaarFeatureDescriptor32) == 4); -NPPST_CT_ASSERT(sizeof(HaarClassifierNodeDescriptor32) == 4); -NPPST_CT_ASSERT(sizeof(HaarClassifierNode128) == 16); -NPPST_CT_ASSERT(sizeof(HaarStage64) == 8); +NCV_CT_ASSERT(sizeof(HaarFeature64) == 8); +NCV_CT_ASSERT(sizeof(HaarFeatureDescriptor32) == 4); +NCV_CT_ASSERT(sizeof(HaarClassifierNodeDescriptor32) == 4); +NCV_CT_ASSERT(sizeof(HaarClassifierNode128) == 16); +NCV_CT_ASSERT(sizeof(HaarStage64) == 8); //============================================================================== @@ -347,7 +347,7 @@ enum NCVPipeObjDet_VisualizeInPlace = 0x004, }; - +NCV_EXPORTS NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix &d_srcImg, NcvSize32u srcRoi, NCVVector &d_dstRects, @@ -367,15 +367,14 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix &d_srcImg, INCVMemAllocator &gpuAllocator, INCVMemAllocator &cpuAllocator, - Ncv32u devPropMajor, - Ncv32u devPropMinor, + cudaDeviceProp &devProp, cudaStream_t cuStream); #define OBJDET_MASK_ELEMENT_INVALID_32U 0xFFFFFFFF #define HAAR_STDDEV_BORDER 1 - +NCV_EXPORTS NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix &d_integralImage, NCVMatrix &d_weights, NCVMatrixAlloc &d_pixelMask, @@ -391,11 +390,10 @@ NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix &d_integralImag Ncv32f scaleArea, INCVMemAllocator &gpuAllocator, INCVMemAllocator &cpuAllocator, - Ncv32u devPropMajor, - Ncv32u devPropMinor, + cudaDeviceProp &devProp, cudaStream_t cuStream); - +NCV_EXPORTS NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix &h_integralImage, NCVMatrix &h_weights, NCVMatrixAlloc &h_pixelMask, @@ -409,7 +407,7 @@ NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix &h_integralImage, Ncv32u pixelStep, Ncv32f scaleArea); - +NCV_EXPORTS NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst, Ncv32u dstStride, Ncv32u dstWidth, @@ -419,7 +417,7 @@ NCVStatus ncvDrawRects_8u_device(Ncv8u *d_dst, Ncv8u color, cudaStream_t cuStream); - +NCV_EXPORTS NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst, Ncv32u dstStride, Ncv32u dstWidth, @@ -429,7 +427,7 @@ NCVStatus ncvDrawRects_32u_device(Ncv32u *d_dst, Ncv32u color, cudaStream_t cuStream); - +NCV_EXPORTS NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst, Ncv32u dstStride, Ncv32u dstWidth, @@ -438,7 +436,7 @@ NCVStatus ncvDrawRects_8u_host(Ncv8u *h_dst, Ncv32u numRects, Ncv8u color); - +NCV_EXPORTS NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst, Ncv32u dstStride, Ncv32u dstWidth, @@ -450,7 +448,7 @@ NCVStatus ncvDrawRects_32u_host(Ncv32u *h_dst, #define RECT_SIMILARITY_PROPORTION 0.2f - +NCV_EXPORTS NCVStatus ncvGrowDetectionsVector_device(NCVVector &pixelMask, Ncv32u numPixelMaskDetections, NCVVector &hypotheses, @@ -461,7 +459,7 @@ NCVStatus ncvGrowDetectionsVector_device(NCVVector &pixelMask, Ncv32f curScale, cudaStream_t cuStream); - +NCV_EXPORTS NCVStatus ncvGrowDetectionsVector_host(NCVVector &pixelMask, Ncv32u numPixelMaskDetections, NCVVector &hypotheses, @@ -471,18 +469,18 @@ NCVStatus ncvGrowDetectionsVector_host(NCVVector &pixelMask, Ncv32u rectHeight, Ncv32f curScale); - +NCV_EXPORTS NCVStatus ncvFilterHypotheses_host(NCVVector &hypotheses, Ncv32u &numHypotheses, Ncv32u 
minNeighbors, Ncv32f intersectEps, NCVVector *hypothesesWeights); - +NCV_EXPORTS NCVStatus ncvHaarGetClassifierSize(const std::string &filename, Ncv32u &numStages, Ncv32u &numNodes, Ncv32u &numFeatures); - +NCV_EXPORTS NCVStatus ncvHaarLoadFromFile_host(const std::string &filename, HaarClassifierCascadeDescriptor &haar, NCVVector &h_HaarStages, @@ -490,6 +488,7 @@ NCVStatus ncvHaarLoadFromFile_host(const std::string &filename, NCVVector &h_HaarFeatures); +NCV_EXPORTS NCVStatus ncvHaarStoreNVBIN_host(const std::string &filename, HaarClassifierCascadeDescriptor haar, NCVVector &h_HaarStages, diff --git a/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu new file mode 100644 index 0000000000..66f9dcdbbd --- /dev/null +++ b/modules/gpu/src/nvidia/NPP_staging/NPP_staging.cu @@ -0,0 +1,1704 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. 
+// +//M*/ + + +#include +#include +#include "NPP_staging.hpp" + +#if defined _SELF_TEST_ +#include +#endif + + +texture tex8u; +texture tex32u; +texture tex64u; + + +//============================================================================== +// +// CUDA streams handling +// +//============================================================================== + + +static cudaStream_t nppStream = 0; + + +cudaStream_t nppStGetActiveCUDAstream(void) +{ + return nppStream; +} + + + +cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream) +{ + cudaStream_t tmp = nppStream; + nppStream = cudaStream; + return tmp; +} + + +//============================================================================== +// +// BlockScan.cuh +// +//============================================================================== + + +//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads() +//assuming size <= WARP_SIZE and size is power of 2 +template +inline __device__ T warpScanInclusive(T idata, volatile T *s_Data) +{ + Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1)); + s_Data[pos] = 0; + pos += K_WARP_SIZE; + s_Data[pos] = idata; + + for(Ncv32u offset = 1; offset < K_WARP_SIZE; offset <<= 1) + { + s_Data[pos] += s_Data[pos - offset]; + } + + return s_Data[pos]; +} + + +template +inline __device__ T warpScanExclusive(T idata, volatile T *s_Data) +{ + return warpScanInclusive(idata, s_Data) - idata; +} + + +template +inline __device__ T blockScanInclusive(T idata, volatile T *s_Data) +{ + if (tiNumScanThreads > K_WARP_SIZE) + { + //Bottom-level inclusive warp scan + T warpResult = warpScanInclusive(idata, s_Data); + + //Save top elements of each warp for exclusive warp scan + //sync to wait for warp scans to complete (because s_Data is being overwritten) + __syncthreads(); + if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) ) + { + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult; + } + + //wait for warp scans to complete + __syncthreads(); + + if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) ) + { + //grab top warp elements + T val = s_Data[threadIdx.x]; + //calculate exclusive scan and write back to shared memory + s_Data[threadIdx.x] = warpScanExclusive(val, s_Data); + } + + //return updated warp scans with exclusive scan results + __syncthreads(); + return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE]; + } + else + { + return warpScanInclusive(idata, s_Data); + } +} + + +//============================================================================== +// +// IntegralImage.cu +// +//============================================================================== + + +const Ncv32u NUM_SCAN_THREADS = 256; +const Ncv32u LOG2_NUM_SCAN_THREADS = 8; + + +template +struct _scanElemOp +{ + template + static inline __host__ __device__ T_out scanElemOp(T_in elem); + + template<> + static inline __host__ __device__ T_out scanElemOp(T_in elem) + { + return (T_out)elem; + } + + template<> + static inline __host__ __device__ T_out scanElemOp(T_in elem) + { + return (T_out)(elem*elem); + } +}; + + +template +inline __device__ T readElem(T *d_src, Ncv32u srcStride, Ncv32u curElemOffs); + + +template<> +inline __device__ Ncv8u readElem(Ncv8u *d_src, Ncv32u srcStride, Ncv32u curElemOffs) +{ + return tex1Dfetch(tex8u, srcStride * blockIdx.x + curElemOffs); +} + + +template<> +inline __device__ Ncv32u readElem(Ncv32u *d_src, Ncv32u srcStride, Ncv32u curElemOffs) +{ + return d_src[curElemOffs]; +} + + +template<> +inline __device__ Ncv32f readElem(Ncv32f *d_src, Ncv32u 
srcStride, Ncv32u curElemOffs) +{ + return d_src[curElemOffs]; +} + + +/** +* \brief Segmented scan kernel +* +* Calculates per-row prefix scans of the input image. +* Out-of-bounds safe: reads 'size' elements, writes 'size+1' elements +* +* \tparam T_in Type of input image elements +* \tparam T_out Type of output image elements +* \tparam T_op Defines an operation to be performed on the input image pixels +* +* \param d_src [IN] Source image pointer +* \param srcWidth [IN] Source image width +* \param srcStride [IN] Source image stride +* \param d_II [OUT] Output image pointer +* \param IIstride [IN] Output image stride +* +* \return None +*/ +template +__global__ void scanRows(T_in *d_src, Ncv32u srcWidth, Ncv32u srcStride, + T_out *d_II, Ncv32u IIstride) +{ + //advance pointers to the current line + if (sizeof(T_in) != 1) + { + d_src += srcStride * blockIdx.x; + } + //for initial image 8bit source we use texref tex8u + d_II += IIstride * blockIdx.x; + + Ncv32u numBuckets = (srcWidth + NUM_SCAN_THREADS - 1) >> LOG2_NUM_SCAN_THREADS; + Ncv32u offsetX = 0; + + __shared__ T_out shmem[NUM_SCAN_THREADS * 2]; + __shared__ T_out carryElem; + carryElem = 0; + __syncthreads(); + + while (numBuckets--) + { + Ncv32u curElemOffs = offsetX + threadIdx.x; + T_out curScanElem; + + T_in curElem; + T_out curElemMod; + + if (curElemOffs < srcWidth) + { + //load elements + curElem = readElem(d_src, srcStride, curElemOffs); + } + curElemMod = _scanElemOp::scanElemOp(curElem); + + //inclusive scan + curScanElem = blockScanInclusive(curElemMod, shmem); + + if (curElemOffs <= srcWidth) + { + //make scan exclusive and write the bucket to the output buffer + d_II[curElemOffs] = carryElem + curScanElem - curElemMod; + offsetX += NUM_SCAN_THREADS; + } + + //remember last element for subsequent buckets adjustment + __syncthreads(); + if (threadIdx.x == NUM_SCAN_THREADS-1) + { + carryElem += curScanElem; + } + __syncthreads(); + } + + if (offsetX == srcWidth && !threadIdx.x) + { + d_II[offsetX] = carryElem; + } +} + + +template +NCVStatus scanRowsWrapperDevice(T_in *d_src, Ncv32u srcStride, + T_out *d_dst, Ncv32u dstStride, NcvSize32u roi) +{ + cudaChannelFormatDesc cfdTex; + if (sizeof(T_in) == 1) + { + cfdTex = cudaCreateChannelDesc(); + size_t alignmentOffset; + ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex8u, d_src, cfdTex, roi.height * srcStride), NPPST_TEXTURE_BIND_ERROR); + ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR); + } + scanRows + + <<>> + (d_src, roi.width, srcStride, d_dst, dstStride); + ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR); + +#if defined _SELF_TEST_ + T_in *h_src; + T_out *h_dst; + ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStride * roi.height * sizeof(T_in)), NPPST_MEM_ALLOC_ERR); + ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * roi.height * sizeof(T_out)), NPPST_MEM_ALLOC_ERR); + memset(h_src, 0, srcStride * roi.height * sizeof(T_in)); + memset(h_dst, 0, dstStride * roi.height * sizeof(T_out)); + ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStride * roi.height * sizeof(T_in), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR); + ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * roi.height * sizeof(T_out), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR); + NcvBool bPass = true; + for (Ncv32u i=0; i(h_src[i*srcStride+j]); + } + } + } + ncvAssertCUDAReturn(cudaFreeHost(h_src), NPPST_MEMFREE_ERR); + ncvAssertCUDAReturn(cudaFreeHost(h_dst), NPPST_MEMFREE_ERR); + printf("CIntegralImage::scanRowsWrapperDevice %s\n", 
+#endif
+
+    return NPPST_SUCCESS;
+}
+
+
+Ncv32u getPaddedDimension(Ncv32u dim, Ncv32u elemTypeSize, Ncv32u allocatorAlignment)
+{
+    Ncv32u alignMask = allocatorAlignment-1;
+    Ncv32u inverseAlignMask = ~alignMask;
+    Ncv32u dimBytes = dim * elemTypeSize;
+    Ncv32u pitch = (dimBytes + alignMask) & inverseAlignMask;
+    Ncv32u PaddedDim = pitch / elemTypeSize;
+    return PaddedDim;
+}
+
+
+template <class T_in, class T_out>
+NCVStatus ncvIntegralImage_device(T_in *d_src, Ncv32u srcStep,
+                                  T_out *d_dst, Ncv32u dstStep, NcvSize32u roi,
+                                  INCVMemAllocator &gpuAllocator)
+{
+    ncvAssertReturn(sizeof(T_out) == sizeof(Ncv32u), NPPST_MEM_INTERNAL_ERROR);
+    ncvAssertReturn(gpuAllocator.memType() == NCVMemoryTypeDevice ||
+                    gpuAllocator.memType() == NCVMemoryTypeNone, NPPST_MEM_RESIDENCE_ERROR);
+    ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
+    ncvAssertReturn((d_src != NULL && d_dst != NULL) || gpuAllocator.isCounting(), NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(roi.width > 0 && roi.height > 0, NPPST_INVALID_ROI);
+    ncvAssertReturn(srcStep >= roi.width * sizeof(T_in) &&
+                    dstStep >= (roi.width + 1) * sizeof(T_out) &&
+                    srcStep % sizeof(T_in) == 0 &&
+                    dstStep % sizeof(T_out) == 0, NPPST_INVALID_STEP);
+    srcStep /= sizeof(T_in);
+    dstStep /= sizeof(T_out);
+
+    Ncv32u WidthII = roi.width + 1;
+    Ncv32u HeightII = roi.height + 1;
+    Ncv32u PaddedWidthII32 = getPaddedDimension(WidthII, sizeof(Ncv32u), gpuAllocator.alignment());
+    Ncv32u PaddedHeightII32 = getPaddedDimension(HeightII, sizeof(Ncv32u), gpuAllocator.alignment());
+
+    NCVMatrixAlloc<Ncv32u> Tmp32_1(gpuAllocator, PaddedWidthII32, PaddedHeightII32);
+    ncvAssertReturn(gpuAllocator.isCounting() || Tmp32_1.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
+    NCVMatrixAlloc<Ncv32u> Tmp32_2(gpuAllocator, PaddedHeightII32, PaddedWidthII32);
+    ncvAssertReturn(gpuAllocator.isCounting() || Tmp32_2.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
+    ncvAssertReturn(Tmp32_1.pitch() * Tmp32_1.height() == Tmp32_2.pitch() * Tmp32_2.height(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCVStatus ncvStat;
+    NCV_SET_SKIP_COND(gpuAllocator.isCounting());
+
+    NCV_SKIP_COND_BEGIN
+
+    ncvStat = scanRowsWrapperDevice
+        <T_in, Ncv32u, false>
+        (d_src, srcStep, Tmp32_1.ptr(), PaddedWidthII32, roi);
+    ncvAssertReturnNcvStat(ncvStat);
+
+    ncvStat = nppiStTranspose_32u_C1R((Ncv32u *)Tmp32_1.ptr(), PaddedWidthII32*sizeof(Ncv32u),
+                                      (Ncv32u *)Tmp32_2.ptr(), PaddedHeightII32*sizeof(Ncv32u), NcvSize32u(WidthII, roi.height));
+    ncvAssertReturnNcvStat(ncvStat);
+
+    ncvStat = scanRowsWrapperDevice
+        <Ncv32u, Ncv32u, false>
+        (Tmp32_2.ptr(), PaddedHeightII32, Tmp32_1.ptr(), PaddedHeightII32, NcvSize32u(roi.height, WidthII));
+    ncvAssertReturnNcvStat(ncvStat);
+
+    ncvStat = nppiStTranspose_32u_C1R((Ncv32u *)Tmp32_1.ptr(), PaddedHeightII32*sizeof(Ncv32u),
+                                      (Ncv32u *)d_dst, dstStep*sizeof(Ncv32u), NcvSize32u(HeightII, WidthII));
+    ncvAssertReturnNcvStat(ncvStat);
+
+    NCV_SKIP_COND_END
+
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus ncvSquaredIntegralImage_device(Ncv8u *d_src, Ncv32u srcStep,
+                                         Ncv64u *d_dst, Ncv32u dstStep, NcvSize32u roi,
+                                         INCVMemAllocator &gpuAllocator)
+{
+    ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
+    ncvAssertReturn(gpuAllocator.memType() == NCVMemoryTypeDevice ||
+                    gpuAllocator.memType() == NCVMemoryTypeNone, NPPST_MEM_RESIDENCE_ERROR);
+    ncvAssertReturn((d_src != NULL && d_dst != NULL) || gpuAllocator.isCounting(), NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(roi.width > 0 && roi.height > 0, NPPST_INVALID_ROI);
+    ncvAssertReturn(srcStep >= roi.width &&
+                    dstStep >= (roi.width + 1) * sizeof(Ncv64u) &&
+                    dstStep % sizeof(Ncv64u) == 0, NPPST_INVALID_STEP);
+    dstStep /= sizeof(Ncv64u);
+
+    Ncv32u WidthII = roi.width + 1;
+    Ncv32u HeightII = roi.height + 1;
+    Ncv32u PaddedWidthII32 = getPaddedDimension(WidthII, sizeof(Ncv32u), gpuAllocator.alignment());
+    Ncv32u PaddedHeightII32 = getPaddedDimension(HeightII, sizeof(Ncv32u), gpuAllocator.alignment());
+    Ncv32u PaddedWidthII64 = getPaddedDimension(WidthII, sizeof(Ncv64u), gpuAllocator.alignment());
+    Ncv32u PaddedHeightII64 = getPaddedDimension(HeightII, sizeof(Ncv64u), gpuAllocator.alignment());
+    Ncv32u PaddedWidthMax = PaddedWidthII32 > PaddedWidthII64 ? PaddedWidthII32 : PaddedWidthII64;
+    Ncv32u PaddedHeightMax = PaddedHeightII32 > PaddedHeightII64 ? PaddedHeightII32 : PaddedHeightII64;
+
+    NCVMatrixAlloc<Ncv32u> Tmp32_1(gpuAllocator, PaddedWidthII32, PaddedHeightII32);
+    ncvAssertReturn(Tmp32_1.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
+    NCVMatrixAlloc<Ncv64u> Tmp64(gpuAllocator, PaddedWidthMax, PaddedHeightMax);
+    ncvAssertReturn(Tmp64.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCVMatrixReuse<Ncv32u> Tmp32_2(Tmp64.getSegment(), gpuAllocator.alignment(), PaddedWidthII32, PaddedHeightII32);
+    ncvAssertReturn(Tmp32_2.isMemReused(), NPPST_MEM_INTERNAL_ERROR);
+    NCVMatrixReuse<Ncv64u> Tmp64_2(Tmp64.getSegment(), gpuAllocator.alignment(), PaddedWidthII64, PaddedHeightII64);
+    ncvAssertReturn(Tmp64_2.isMemReused(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCVStatus ncvStat;
+    NCV_SET_SKIP_COND(gpuAllocator.isCounting());
+
+    NCV_SKIP_COND_BEGIN
+
+    ncvStat = scanRowsWrapperDevice
+        <Ncv8u, Ncv32u, true>
+        (d_src, srcStep, Tmp32_2.ptr(), PaddedWidthII32, roi);
+    ncvAssertReturnNcvStat(ncvStat);
+
+    ncvStat = nppiStTranspose_32u_C1R(Tmp32_2.ptr(), PaddedWidthII32*sizeof(Ncv32u),
+                                      Tmp32_1.ptr(), PaddedHeightII32*sizeof(Ncv32u), NcvSize32u(WidthII, roi.height));
+    ncvAssertReturnNcvStat(ncvStat);
+
+    ncvStat = scanRowsWrapperDevice
+        <Ncv32u, Ncv64u, false>
+        (Tmp32_1.ptr(), PaddedHeightII32, Tmp64_2.ptr(), PaddedHeightII64, NcvSize32u(roi.height, WidthII));
+    ncvAssertReturnNcvStat(ncvStat);
+
+    ncvStat = nppiStTranspose_64u_C1R(Tmp64_2.ptr(), PaddedHeightII64*sizeof(Ncv64u),
+                                      d_dst, dstStep*sizeof(Ncv64u), NcvSize32u(HeightII, WidthII));
+    ncvAssertReturnNcvStat(ncvStat);
+
+    NCV_SKIP_COND_END
+
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppiStIntegralGetSize_8u32u(NcvSize32u roiSize, Ncv32u *pBufsize, cudaDeviceProp &devProp)
+{
+    ncvAssertReturn(pBufsize != NULL, NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
+
+    NCVMemStackAllocator gpuCounter(devProp.textureAlignment);
+    ncvAssertReturn(gpuCounter.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCVStatus ncvStat = ncvIntegralImage_device((Ncv8u*)NULL, roiSize.width,
+                                                (Ncv32u*)NULL, (roiSize.width+1) * sizeof(Ncv32u),
+                                                roiSize, gpuCounter);
+    ncvAssertReturnNcvStat(ncvStat);
+
+    *pBufsize = (Ncv32u)gpuCounter.maxSize();
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppiStIntegralGetSize_32f32f(NcvSize32u roiSize, Ncv32u *pBufsize, cudaDeviceProp &devProp)
+{
+    ncvAssertReturn(pBufsize != NULL, NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
+
+    NCVMemStackAllocator gpuCounter(devProp.textureAlignment);
+    ncvAssertReturn(gpuCounter.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCVStatus ncvStat = ncvIntegralImage_device((Ncv32f*)NULL, roiSize.width * sizeof(Ncv32f),
+                                                (Ncv32f*)NULL, (roiSize.width+1) * sizeof(Ncv32f),
+                                                roiSize, gpuCounter);
+    ncvAssertReturnNcvStat(ncvStat);
+
+    *pBufsize = (Ncv32u)gpuCounter.maxSize();
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppiStSqrIntegralGetSize_8u64u(NcvSize32u roiSize, Ncv32u *pBufsize, cudaDeviceProp &devProp)
+{
+    ncvAssertReturn(pBufsize != NULL, NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
+
+    NCVMemStackAllocator gpuCounter(devProp.textureAlignment);
+    ncvAssertReturn(gpuCounter.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCVStatus ncvStat = ncvSquaredIntegralImage_device(NULL, roiSize.width,
+                                                       NULL, (roiSize.width+1) * sizeof(Ncv64u),
+                                                       roiSize, gpuCounter);
+    ncvAssertReturnNcvStat(ncvStat);
+
+    *pBufsize = (Ncv32u)gpuCounter.maxSize();
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppiStIntegral_8u32u_C1R(Ncv8u *d_src, Ncv32u srcStep,
+                                   Ncv32u *d_dst, Ncv32u dstStep,
+                                   NcvSize32u roiSize, Ncv8u *pBuffer,
+                                   Ncv32u bufSize, cudaDeviceProp &devProp)
+{
+    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, devProp.textureAlignment, pBuffer);
+    ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCVStatus ncvStat = ncvIntegralImage_device(d_src, srcStep, d_dst, dstStep, roiSize, gpuAllocator);
+    ncvAssertReturnNcvStat(ncvStat);
+
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppiStIntegral_32f32f_C1R(Ncv32f *d_src, Ncv32u srcStep,
+                                    Ncv32f *d_dst, Ncv32u dstStep,
+                                    NcvSize32u roiSize, Ncv8u *pBuffer,
+                                    Ncv32u bufSize, cudaDeviceProp &devProp)
+{
+    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, devProp.textureAlignment, pBuffer);
+    ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCVStatus ncvStat = ncvIntegralImage_device(d_src, srcStep, d_dst, dstStep, roiSize, gpuAllocator);
+    ncvAssertReturnNcvStat(ncvStat);
+
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppiStSqrIntegral_8u64u_C1R(Ncv8u *d_src, Ncv32u srcStep,
+                                      Ncv64u *d_dst, Ncv32u dstStep,
+                                      NcvSize32u roiSize, Ncv8u *pBuffer,
+                                      Ncv32u bufSize, cudaDeviceProp &devProp)
+{
+    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, devProp.textureAlignment, pBuffer);
+    ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR);
+
+    NCVStatus ncvStat = ncvSquaredIntegralImage_device(d_src, srcStep, d_dst, dstStep, roiSize, gpuAllocator);
+    ncvAssertReturnNcvStat(ncvStat);
+
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppiStIntegral_8u32u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
+                                        Ncv32u *h_dst, Ncv32u dstStep,
+                                        NcvSize32u roiSize)
+{
+    ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
+    ncvAssertReturn(srcStep >= roiSize.width &&
+                    dstStep >= (roiSize.width + 1) * sizeof(Ncv32u) &&
+                    dstStep % sizeof(Ncv32u) == 0, NPPST_INVALID_STEP);
+    dstStep /= sizeof(Ncv32u);
+
+    Ncv32u WidthII = roiSize.width + 1;
+    Ncv32u HeightII = roiSize.height + 1;
+
+    memset(h_dst, 0, WidthII * sizeof(Ncv32u));
+    for (Ncv32u i=1; i<HeightII; i++)
+    {
+        h_dst[i * dstStep] = 0;
+        for (Ncv32u j=1; j<WidthII; j++)
+        {
+            Ncv32u top = h_dst[(i-1) * dstStep + j];
+            Ncv32u left = h_dst[i * dstStep + (j-1)];
+            Ncv32u topleft = h_dst[(i-1) * dstStep + (j-1)];
+            Ncv32u elem = h_src[(i-1) * srcStep + (j-1)];
+            h_dst[i * dstStep + j] = elem + top + left - topleft;
+        }
+    }
+
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppiStIntegral_32f32f_C1R_host(Ncv32f *h_src, Ncv32u srcStep,
+                                         Ncv32f *h_dst, Ncv32u dstStep,
+                                         NcvSize32u roiSize)
+{
+    ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
+    ncvAssertReturn(srcStep >= roiSize.width * sizeof(Ncv32f) &&
+                    dstStep >= (roiSize.width + 1) * sizeof(Ncv32f) &&
+                    srcStep % sizeof(Ncv32f) == 0 &&
+                    dstStep % sizeof(Ncv32f) == 0, NPPST_INVALID_STEP);
+    srcStep /= sizeof(Ncv32f);
+    dstStep /= sizeof(Ncv32f);
+
+    Ncv32u WidthII = roiSize.width + 1;
+    Ncv32u HeightII = roiSize.height + 1;
+
+    memset(h_dst, 0, WidthII * sizeof(Ncv32u));
+    for (Ncv32u i=1; i<HeightII; i++)
+    {
+        h_dst[i * dstStep] = 0.0f;
+        for (Ncv32u j=1; j<WidthII; j++)
+        {
+            Ncv32f top = h_dst[(i-1) * dstStep + j];
+            Ncv32f left = h_dst[i * dstStep + (j-1)];
+            Ncv32f topleft = h_dst[(i-1) * dstStep + (j-1)];
+            Ncv32f elem = h_src[(i-1) * srcStep + (j-1)];
+            h_dst[i * dstStep + j] = elem + top + left - topleft;
+        }
+    }
+
+    return NPPST_SUCCESS;
+}
+
+
+NCVStatus nppiStSqrIntegral_8u64u_C1R_host(Ncv8u *h_src, Ncv32u srcStep,
+                                           Ncv64u *h_dst, Ncv32u dstStep,
+                                           NcvSize32u roiSize)
+{
+    ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(roiSize.width > 0 && roiSize.height > 0, NPPST_INVALID_ROI);
+    ncvAssertReturn(srcStep >= roiSize.width &&
+                    dstStep >= (roiSize.width + 1) * sizeof(Ncv64u) &&
+                    dstStep % sizeof(Ncv64u) == 0, NPPST_INVALID_STEP);
+    dstStep /= sizeof(Ncv64u);
+
+    Ncv32u WidthII = roiSize.width + 1;
+    Ncv32u HeightII = roiSize.height + 1;
+
+    memset(h_dst, 0, WidthII * sizeof(Ncv64u));
+    for (Ncv32u i=1; i<HeightII; i++)
+    {
+        h_dst[i * dstStep] = 0;
+        for (Ncv32u j=1; j<WidthII; j++)
+        {
+            Ncv64u top = h_dst[(i-1) * dstStep + j];
+            Ncv64u left = h_dst[i * dstStep + (j-1)];
+            Ncv64u topleft = h_dst[(i-1) * dstStep + (j-1)];
+            Ncv64u elem = h_src[(i-1) * srcStep + (j-1)];
+            h_dst[i * dstStep + j] = elem*elem + top + left - topleft;
+        }
+    }
+
+    return NPPST_SUCCESS;
+}
+
+
+//==============================================================================
+//
+// DownsampleNearest.cu
+//
+//==============================================================================
+
+
+const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_X = 32;
+const Ncv32u NUM_DOWNSAMPLE_NEAREST_THREADS_Y = 8;
+
+
+template <class T, NcvBool tbCacheTexture>
+__device__ T getElem_DownsampleNearest(Ncv32u x, T *d_src);
+
+
+template<>
+__device__ Ncv32u getElem_DownsampleNearest<Ncv32u, true>(Ncv32u x, Ncv32u *d_src)
+{
+    return tex1Dfetch(tex32u, x);
+}
+
+
+template<>
+__device__ Ncv32u getElem_DownsampleNearest<Ncv32u, false>(Ncv32u x, Ncv32u *d_src)
+{
+    return d_src[x];
+}
+
+
+template<>
+__device__ Ncv64u getElem_DownsampleNearest<Ncv64u, true>(Ncv32u x, Ncv64u *d_src)
+{
+    uint2 tmp = tex1Dfetch(tex64u, x);
+    Ncv64u res = (Ncv64u)tmp.y;
+    res <<= 32;
+    res |= tmp.x;
+    return res;
+}
+
+
+template<>
+__device__ Ncv64u getElem_DownsampleNearest<Ncv64u, false>(Ncv32u x, Ncv64u *d_src)
+{
+    return d_src[x];
+}
+
+
+template <class T, NcvBool tbCacheTexture>
+__global__ void downsampleNearest_C1R(T *d_src, Ncv32u srcStep, T *d_dst, Ncv32u dstStep,
+                                      NcvSize32u dstRoi, Ncv32u scale)
+{
+    int curX = blockIdx.x * blockDim.x + threadIdx.x;
+    int curY = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (curX >= dstRoi.width || curY >= dstRoi.height)
+    {
+        return;
+    }
+
+    d_dst[curY * dstStep + curX] = getElem_DownsampleNearest<T, tbCacheTexture>((curY * srcStep + curX) * scale, d_src);
+}
+
+
+template <class T>
+static NCVStatus downsampleNearestWrapperDevice(T *d_src, Ncv32u srcStep,
+                                                T *d_dst, Ncv32u dstStep,
+                                                NcvSize32u srcRoi, Ncv32u scale,
+                                                NcvBool readThruTexture)
+{
+    ncvAssertReturn(d_src != NULL && d_dst != NULL, NPPST_NULL_POINTER_ERROR);
+    ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0, NPPST_INVALID_ROI);
+    ncvAssertReturn(scale != 0, NPPST_INVALID_SCALE);
+    ncvAssertReturn(srcStep >= (Ncv32u)(srcRoi.width) * sizeof(T) &&
+                    dstStep >= (Ncv32u)(srcRoi.width * sizeof(T) / scale), NPPST_INVALID_STEP);
+    srcStep /= sizeof(T);
+    dstStep /= sizeof(T);
+
+    NcvSize32u dstRoi;
+    dstRoi.width = srcRoi.width / scale;
+    dstRoi.height = srcRoi.height / scale;
+
+    dim3 grid((dstRoi.width + NUM_DOWNSAMPLE_NEAREST_THREADS_X - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_X,
+              (dstRoi.height + NUM_DOWNSAMPLE_NEAREST_THREADS_Y - 1) / NUM_DOWNSAMPLE_NEAREST_THREADS_Y);
+    dim3 block(NUM_DOWNSAMPLE_NEAREST_THREADS_X, NUM_DOWNSAMPLE_NEAREST_THREADS_Y);
+
+    if (!readThruTexture)
+    {
+        downsampleNearest_C1R
+            <T, false>
+            <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
+            (d_src, srcStep, d_dst, dstStep, dstRoi, scale);
+    }
+    else
+    {
+        cudaChannelFormatDesc cfdTexSrc;
+
+        if (sizeof(T) == sizeof(Ncv32u))
+        {
+            cfdTexSrc = cudaCreateChannelDesc<Ncv32u>();
+
+            size_t alignmentOffset;
+            ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex32u, d_src, cfdTexSrc, srcRoi.height * srcStep * sizeof(T)), NPPST_TEXTURE_BIND_ERROR);
+            ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
+        }
+        else
+        {
+            cfdTexSrc = cudaCreateChannelDesc<uint2>();
+
+            size_t alignmentOffset;
+            ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex64u, d_src, cfdTexSrc, srcRoi.height * srcStep * sizeof(T)), NPPST_TEXTURE_BIND_ERROR);
+            ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR);
+        }
+
+        downsampleNearest_C1R
+            <T, true>
+            <<<grid, block, 0, nppStGetActiveCUDAstream()>>>
+            (d_src, srcStep, d_dst, dstStep, dstRoi, scale);
+    }
+
+    ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR);
+
+#if defined _SELF_TEST_
+    T *h_src;
+    T *h_dst;
+    ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStep * srcRoi.height * sizeof(T)), NPPST_MEM_ALLOC_ERR);
+    ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStep * dstRoi.height * sizeof(T)), NPPST_MEM_ALLOC_ERR);
+    ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStep * srcRoi.height * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR);
+    ncvAssertCUDAReturn(cudaMemcpy(h_dst,
d_dst, dstStep * dstRoi.height * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR); + + bool bPass = true; + + for (Ncv32u i=0; i +static NCVStatus downsampleNearestWrapperHost(T *h_src, Ncv32u srcStep, + T *h_dst, Ncv32u dstStep, + NcvSize32u srcRoi, Ncv32u scale) +{ + ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR); + ncvAssertReturn(srcRoi.width != 0 && srcRoi.height != 0, NPPST_INVALID_ROI); + ncvAssertReturn(scale != 0, NPPST_INVALID_SCALE); + ncvAssertReturn(srcStep >= (Ncv32u)(srcRoi.width) * sizeof(T) && + dstStep >= (Ncv32u)(srcRoi.width * sizeof(T) / scale) && + srcStep % sizeof(T) == 0 && dstStep % sizeof(T) == 0, NPPST_INVALID_STEP); + srcStep /= sizeof(T); + dstStep /= sizeof(T); + + NcvSize32u dstRoi; + dstRoi.width = srcRoi.width / scale; + dstRoi.height = srcRoi.height / scale; + + for (Ncv32u i=0; i((Ncv##bit##u *)d_src, srcStep, \ + (Ncv##bit##u *)d_dst, dstStep, \ + srcRoi, scale, readThruTexture); \ + } + + +#define implementNppDownsampleNearestHost(bit, typ) \ + NCVStatus nppiStDownsampleNearest_##bit##typ##_C1R_host(Ncv##bit##typ *h_src, Ncv32u srcStep, \ + Ncv##bit##typ *h_dst, Ncv32u dstStep, \ + NcvSize32u srcRoi, Ncv32u scale) \ + { \ + return downsampleNearestWrapperHost((Ncv##bit##u *)h_src, srcStep, \ + (Ncv##bit##u *)h_dst, dstStep, \ + srcRoi, scale); \ + } + + +implementNppDownsampleNearest(32, u) +implementNppDownsampleNearest(32, s) +implementNppDownsampleNearest(32, f) +implementNppDownsampleNearest(64, u) +implementNppDownsampleNearest(64, s) +implementNppDownsampleNearest(64, f) +implementNppDownsampleNearestHost(32, u) +implementNppDownsampleNearestHost(32, s) +implementNppDownsampleNearestHost(32, f) +implementNppDownsampleNearestHost(64, u) +implementNppDownsampleNearestHost(64, s) +implementNppDownsampleNearestHost(64, f) + + +//============================================================================== +// +// RectStdDev.cu +// +//============================================================================== + + +const Ncv32u NUM_RECTSTDDEV_THREADS = 128; + + +template +__device__ Ncv32u getElemSum(Ncv32u x, Ncv32u *d_sum) +{ + if (tbCacheTexture) + { + return tex1Dfetch(tex32u, x); + } + else + { + return d_sum[x]; + } +} + + +template +__device__ Ncv64u getElemSqSum(Ncv32u x, Ncv64u *d_sqsum) +{ + if (tbCacheTexture) + { + uint2 tmp = tex1Dfetch(tex64u, x); + Ncv64u res = (Ncv64u)tmp.y; + res <<= 32; + res |= tmp.x; + return res; + } + else + { + return d_sqsum[x]; + } +} + + +template +__global__ void rectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep, + Ncv64u *d_sqsum, Ncv32u sqsumStep, + Ncv32f *d_norm, Ncv32u normStep, + NcvSize32u roi, NcvRect32u rect, Ncv32f invRectArea) +{ + Ncv32u x_offs = blockIdx.x * NUM_RECTSTDDEV_THREADS + threadIdx.x; + if (x_offs >= roi.width) + { + return; + } + + Ncv32u sum_offset = blockIdx.y * sumStep + x_offs; + Ncv32u sqsum_offset = blockIdx.y * sqsumStep + x_offs; + + //OPT: try swapping order (could change cache hit/miss ratio) + Ncv32u sum_tl = getElemSum(sum_offset + rect.y * sumStep + rect.x, d_sum); + Ncv32u sum_bl = getElemSum(sum_offset + (rect.y + rect.height) * sumStep + rect.x, d_sum); + Ncv32u sum_tr = getElemSum(sum_offset + rect.y * sumStep + rect.x + rect.width, d_sum); + Ncv32u sum_br = getElemSum(sum_offset + (rect.y + rect.height) * sumStep + rect.x + rect.width, d_sum); + Ncv32u sum_val = sum_br + sum_tl - sum_tr - sum_bl; + + Ncv64u sqsum_tl, sqsum_bl, sqsum_tr, sqsum_br; + sqsum_tl = getElemSqSum(sqsum_offset + rect.y * sqsumStep + rect.x, d_sqsum); + 
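+    // These corner taps follow the standard integral-image identity: for a
+    // prefix-sum table S, the sum over the rectangle [x0,x1)x[y0,y1) equals
+    // S(x1,y1) - S(x0,y1) - S(x1,y0) + S(x0,y0), i.e. the br + tl - tr - bl
+    // combination evaluated here on both the sum and the squared-sum tables.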
sqsum_bl = getElemSqSum(sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x, d_sqsum); + sqsum_tr = getElemSqSum(sqsum_offset + rect.y * sqsumStep + rect.x + rect.width, d_sqsum); + sqsum_br = getElemSqSum(sqsum_offset + (rect.y + rect.height) * sqsumStep + rect.x + rect.width, d_sqsum); + Ncv64u sqsum_val = sqsum_br + sqsum_tl - sqsum_tr - sqsum_bl; + + Ncv32f mean = sum_val * invRectArea; + + ////////////////////////////////////////////////////////////////////////// + // sqsum_val_res = sqsum_val / rectArea + ////////////////////////////////////////////////////////////////////////// + + Ncv32f sqsum_val_1 = __ull2float_rz(sqsum_val); + Ncv64u sqsum_val_2 = __float2ull_rz(sqsum_val_1); + Ncv64u sqsum_val_3 = sqsum_val - sqsum_val_2; + Ncv32f sqsum_val_4 = __ull2float_rn(sqsum_val_3); + sqsum_val_1 *= invRectArea; + sqsum_val_4 *= invRectArea; + Ncv32f sqsum_val_res = sqsum_val_1 + sqsum_val_4; + + ////////////////////////////////////////////////////////////////////////// + // variance = sqsum_val_res - mean * mean + ////////////////////////////////////////////////////////////////////////// + +#if defined DISABLE_MAD_SELECTIVELY + Ncv32f variance = sqsum_val_2 - __fmul_rn(mean, mean); +#else + Ncv32f variance = sqsum_val_res - mean * mean; +#endif + + ////////////////////////////////////////////////////////////////////////// + // stddev = sqrtf(variance) + ////////////////////////////////////////////////////////////////////////// + + //Ncv32f stddev = sqrtf(variance); + Ncv32f stddev = __fsqrt_rn(variance); + + d_norm[blockIdx.y * normStep + x_offs] = stddev; +} + + +NCVStatus nppiStRectStdDev_32f_C1R(Ncv32u *d_sum, Ncv32u sumStep, + Ncv64u *d_sqsum, Ncv32u sqsumStep, + Ncv32f *d_norm, Ncv32u normStep, + NcvSize32u roi, NcvRect32u rect, + Ncv32f scaleArea, NcvBool readThruTexture) +{ + ncvAssertReturn(d_sum != NULL && d_sqsum != NULL && d_norm != NULL, NPPST_NULL_POINTER_ERROR); + ncvAssertReturn(roi.width > 0 && roi.height > 0, NPPST_INVALID_ROI); + ncvAssertReturn(sumStep >= (Ncv32u)(roi.width + rect.x + rect.width - 1) * sizeof(Ncv32u) && + sqsumStep >= (Ncv32u)(roi.width + rect.x + rect.width - 1) * sizeof(Ncv64u) && + normStep >= (Ncv32u)roi.width * sizeof(Ncv32f) && + sumStep % sizeof(Ncv32u) == 0 && + sqsumStep % sizeof(Ncv64u) == 0 && + normStep % sizeof(Ncv32f) == 0, NPPST_INVALID_STEP); + ncvAssertReturn(scaleArea >= 1.0f, NPPST_INVALID_SCALE); + sumStep /= sizeof(Ncv32u); + sqsumStep /= sizeof(Ncv64u); + normStep /= sizeof(Ncv32f); + + Ncv32f rectArea = rect.width * rect.height * scaleArea; + Ncv32f invRectArea = 1.0f / rectArea; + + dim3 grid(((roi.width + NUM_RECTSTDDEV_THREADS - 1) / NUM_RECTSTDDEV_THREADS), roi.height); + dim3 block(NUM_RECTSTDDEV_THREADS); + + if (!readThruTexture) + { + rectStdDev_32f_C1R + + <<>> + (d_sum, sumStep, d_sqsum, sqsumStep, d_norm, normStep, roi, rect, invRectArea); + } + else + { + cudaChannelFormatDesc cfdTexSrc; + cudaChannelFormatDesc cfdTexSqr; + cfdTexSrc = cudaCreateChannelDesc(); + cfdTexSqr = cudaCreateChannelDesc(); + + size_t alignmentOffset; + ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex32u, d_sum, cfdTexSrc, (roi.height + rect.y + rect.height) * sumStep * sizeof(Ncv32u)), NPPST_TEXTURE_BIND_ERROR); + ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR); + ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, tex64u, d_sqsum, cfdTexSqr, (roi.height + rect.y + rect.height) * sqsumStep * sizeof(Ncv64u)), NPPST_TEXTURE_BIND_ERROR); + ncvAssertReturn(alignmentOffset==0, NPPST_TEXTURE_BIND_ERROR); + + 
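+        // In this texture branch the kernel below is instantiated with
+        // tbCacheTexture=true and launched with NULL sum/sqsum pointers:
+        // getElemSum() and getElemSqSum() then fetch through tex32u/tex64u,
+        // which were bound to d_sum and d_sqsum just above, so the raw
+        // pointers are never dereferenced.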
rectStdDev_32f_C1R + + <<>> + (NULL, sumStep, NULL, sqsumStep, d_norm, normStep, roi, rect, invRectArea); + } + + ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR); + +#if defined _SELF_TEST_ + Ncv32u *h_sum; + Ncv64u *h_sqsum; + Ncv32f *h_norm_d; + Ncv32u ExtHeight = roi.height + rect.y + rect.height; + ncvAssertCUDAReturn(cudaMallocHost(&h_sum, sumStep * ExtHeight * sizeof(Ncv32u)), NPPST_MEM_ALLOC_ERR); + ncvAssertCUDAReturn(cudaMallocHost(&h_sqsum, sqsumStep * ExtHeight * sizeof(Ncv64u)), NPPST_MEM_ALLOC_ERR); + ncvAssertCUDAReturn(cudaMallocHost(&h_norm_d, normStep * roi.height * sizeof(Ncv32u)), NPPST_MEM_ALLOC_ERR); + ncvAssertCUDAReturn(cudaMemcpy(h_sum, d_sum, sumStep * ExtHeight * sizeof(Ncv32u), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR); + ncvAssertCUDAReturn(cudaMemcpy(h_sqsum, d_sqsum, sqsumStep * ExtHeight * sizeof(Ncv64u), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR); + ncvAssertCUDAReturn(cudaMemcpy(h_norm_d, d_norm, normStep * roi.height * sizeof(Ncv32f), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR); + + Ncv32f *h_norm_h; + ncvAssertCUDAReturn(cudaMallocHost(&h_norm_h, normStep * roi.height * sizeof(Ncv32u)), NPPST_MEM_ALLOC_ERR); + + ncvAssertReturnNcvStat(nppRectStdDev_32f_C1R_host(h_sum, sqsumStep, h_sqsum, sqsumStep, h_norm_h, normStep, roi, rect, scaleArea)); + + const Ncv64f relEPS = 0.005; + bool bPass = true; + for (Ncv32u i=0; i relEPS) + { + printf("::ncvRectStdDev_32f_C1R self test failed: i=%d, j=%d, cpu=%f, gpu=%f\n", i, j, h_norm_h[i * normStep + j], h_norm_d[i * normStep + j]); + bPass = false; + } + } + } + ncvAssertCUDAReturn(cudaFreeHost(h_sum), NPPST_MEMFREE_ERR); + ncvAssertCUDAReturn(cudaFreeHost(h_sqsum), NPPST_MEMFREE_ERR); + ncvAssertCUDAReturn(cudaFreeHost(h_norm_d), NPPST_MEMFREE_ERR); + ncvAssertCUDAReturn(cudaFreeHost(h_norm_h), NPPST_MEMFREE_ERR); + printf("::ncvRectStdDev_32f_C1R %s\n", bPass?"PASSED":"FAILED"); +#endif + + return NPPST_SUCCESS; +} + + +NCVStatus nppiStRectStdDev_32f_C1R_host(Ncv32u *h_sum, Ncv32u sumStep, + Ncv64u *h_sqsum, Ncv32u sqsumStep, + Ncv32f *h_norm, Ncv32u normStep, + NcvSize32u roi, NcvRect32u rect, + Ncv32f scaleArea) +{ + ncvAssertReturn(h_sum != NULL && h_sqsum != NULL && h_norm != NULL, NPPST_NULL_POINTER_ERROR); + ncvAssertReturn(roi.width > 0 && roi.height > 0, NPPST_INVALID_ROI); + ncvAssertReturn(sumStep >= (Ncv32u)(roi.width + rect.x + rect.width - 1) * sizeof(Ncv32u) && + sqsumStep >= (Ncv32u)(roi.width + rect.x + rect.width - 1) * sizeof(Ncv64u) && + normStep >= (Ncv32u)roi.width * sizeof(Ncv32f) && + sumStep % sizeof(Ncv32u) == 0 && + sqsumStep % sizeof(Ncv64u) == 0 && + normStep % sizeof(Ncv32f) == 0, NPPST_INVALID_STEP); + ncvAssertReturn(scaleArea >= 1.0f, NPPST_INVALID_SCALE); + sumStep /= sizeof(Ncv32u); + sqsumStep /= sizeof(Ncv64u); + normStep /= sizeof(Ncv32f); + + Ncv32f rectArea = rect.width * rect.height * scaleArea; + Ncv32f invRectArea = 1.0f / rectArea; + + for (Ncv32u i=0; i +__global__ void transpose(T *d_src, Ncv32u srcStride, + T *d_dst, Ncv32u dstStride, NcvSize32u srcRoi) +{ + __shared__ T tile[TRANSPOSE_TILE_DIM][TRANSPOSE_TILE_DIM+1]; + + Ncv32u blockIdx_x, blockIdx_y; + + // do diagonal reordering + if (gridDim.x == gridDim.y) + { + blockIdx_y = blockIdx.x; + blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x; + } + else + { + Ncv32u bid = blockIdx.x + gridDim.x * blockIdx.y; + blockIdx_y = bid % gridDim.y; + blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x; + } + + Ncv32u xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x; + Ncv32u yIndex 
= blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y; + Ncv32u index_in = xIndex + yIndex * srcStride; + + xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x; + yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y; + Ncv32u index_out = xIndex + yIndex * dstStride; + + for (Ncv32u i=0; i +NCVStatus transposeWrapperDevice(T *d_src, Ncv32u srcStride, + T *d_dst, Ncv32u dstStride, NcvSize32u srcRoi) +{ + ncvAssertReturn(d_src != NULL && d_dst != NULL, NPPST_NULL_POINTER_ERROR); + ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0, NPPST_INVALID_ROI); + ncvAssertReturn(srcStride >= srcRoi.width * sizeof(T) && + dstStride >= srcRoi.height * sizeof(T) && + srcStride % sizeof(T) == 0 && dstStride % sizeof(T) == 0, NPPST_INVALID_STEP); + srcStride /= sizeof(T); + dstStride /= sizeof(T); + + dim3 grid((srcRoi.width + TRANSPOSE_TILE_DIM - 1) / TRANSPOSE_TILE_DIM, + (srcRoi.height + TRANSPOSE_TILE_DIM - 1) / TRANSPOSE_TILE_DIM); + dim3 block(TRANSPOSE_TILE_DIM, TRANSPOSE_TILE_DIM); + transpose + + <<>> + (d_src, srcStride, d_dst, dstStride, srcRoi); + ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR); + +#if defined _SELF_TEST_ + Ncv32u widthExt = grid.x * TRANSPOSE_TILE_DIM; + Ncv32u heightExt = grid.y * TRANSPOSE_TILE_DIM; + T *h_src; + T *h_dst; + ncvAssertCUDAReturn(cudaMallocHost(&h_src, srcStride * heightExt * sizeof(T)), NPPST_MEM_ALLOC_ERR); + ncvAssertCUDAReturn(cudaMallocHost(&h_dst, dstStride * widthExt * sizeof(T)), NPPST_MEM_ALLOC_ERR); + memset(h_src, 0, srcStride * heightExt * sizeof(T)); + memset(h_dst, 0, dstStride * widthExt * sizeof(T)); + ncvAssertCUDAReturn(cudaMemcpy(h_src, d_src, srcStride * heightExt * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR); + ncvAssertCUDAReturn(cudaMemcpy(h_dst, d_dst, dstStride * widthExt * sizeof(T), cudaMemcpyDeviceToHost), NPPST_MEMCPY_ERROR); + NcvBool bPass = true; + for (Ncv32u i=0; i +static NCVStatus transposeWrapperHost(T *h_src, Ncv32u srcStride, + T *h_dst, Ncv32u dstStride, NcvSize32u srcRoi) +{ + ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR); + ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0, NPPST_INVALID_ROI); + ncvAssertReturn(srcStride >= srcRoi.width * sizeof(T) && + dstStride >= srcRoi.height * sizeof(T) && + srcStride % sizeof(T) == 0 && dstStride % sizeof(T) == 0, NPPST_INVALID_STEP); + srcStride /= sizeof(T); + dstStride /= sizeof(T); + + for (Ncv32u i=0; i((Ncv##bit##u *)d_src, srcStep, \ + (Ncv##bit##u *)d_dst, dstStep, srcRoi); \ + } + + +#define implementNppTransposeHost(bit, typ) \ + NCVStatus nppiStTranspose_##bit##typ##_C1R_host(Ncv##bit##typ *h_src, Ncv32u srcStep, \ + Ncv##bit##typ *h_dst, Ncv32u dstStep, \ + NcvSize32u srcRoi) \ + { \ + return transposeWrapperHost((Ncv##bit##u *)h_src, srcStep, \ + (Ncv##bit##u *)h_dst, dstStep, srcRoi); \ + } + + +implementNppTranspose(32,u) +implementNppTranspose(32,s) +implementNppTranspose(32,f) +implementNppTranspose(64,u) +implementNppTranspose(64,s) +implementNppTranspose(64,f) + +implementNppTransposeHost(32,u) +implementNppTransposeHost(32,s) +implementNppTransposeHost(32,f) +implementNppTransposeHost(64,u) +implementNppTransposeHost(64,s) +implementNppTransposeHost(64,f) + + +//============================================================================== +// +// Compact.cu +// +//============================================================================== + + +const Ncv32u NUM_REMOVE_THREADS = 256; + + +template +__global__ void removePass1Scan(Ncv32u *d_src, Ncv32u srcLen, + Ncv32u *d_offsets, Ncv32u 
*d_blockSums, + Ncv32u elemRemove) +{ + Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x; + Ncv32u elemAddrIn = blockId * NUM_REMOVE_THREADS + threadIdx.x; + + if (elemAddrIn > srcLen + blockDim.x) + { + return; + } + + __shared__ Ncv32u shmem[NUM_REMOVE_THREADS * 2]; + + Ncv32u scanElem = 0; + if (elemAddrIn < srcLen) + { + if (bRemove) + { + scanElem = (d_src[elemAddrIn] != elemRemove) ? 1 : 0; + } + else + { + scanElem = d_src[elemAddrIn]; + } + } + + Ncv32u localScanInc = blockScanInclusive(scanElem, shmem); + __syncthreads(); + + if (elemAddrIn < srcLen) + { + if (threadIdx.x == NUM_REMOVE_THREADS-1 && bWritePartial) + { + d_blockSums[blockId] = localScanInc; + } + + if (bRemove) + { + d_offsets[elemAddrIn] = localScanInc - scanElem; + } + else + { + d_src[elemAddrIn] = localScanInc - scanElem; + } + } +} + + +__global__ void removePass2Adjust(Ncv32u *d_offsets, Ncv32u srcLen, Ncv32u *d_blockSums) +{ + Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x; + Ncv32u elemAddrIn = blockId * NUM_REMOVE_THREADS + threadIdx.x; + if (elemAddrIn >= srcLen) + { + return; + } + + __shared__ Ncv32u valOffs; + valOffs = d_blockSums[blockId]; + __syncthreads(); + + d_offsets[elemAddrIn] += valOffs; +} + + +__global__ void removePass3Compact(Ncv32u *d_src, Ncv32u srcLen, + Ncv32u *d_offsets, Ncv32u *d_dst, + Ncv32u elemRemove, Ncv32u *dstLenValue) +{ + Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x; + Ncv32u elemAddrIn = blockId * NUM_REMOVE_THREADS + threadIdx.x; + if (elemAddrIn >= srcLen) + { + return; + } + + Ncv32u elem = d_src[elemAddrIn]; + Ncv32u elemAddrOut = d_offsets[elemAddrIn]; + if (elem != elemRemove) + { + d_dst[elemAddrOut] = elem; + } + + if (elemAddrIn == srcLen-1) + { + if (elem != elemRemove) + { + *dstLenValue = elemAddrOut + 1; + } + else + { + *dstLenValue = elemAddrOut; + } + } +} + + +NCVStatus compactVector_32u_device(Ncv32u *d_src, Ncv32u srcLen, + Ncv32u *d_dst, Ncv32u *dstLenPinned, + Ncv32u elemRemove, + INCVMemAllocator &gpuAllocator) +{ + ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR); + ncvAssertReturn((d_src != NULL && d_dst != NULL) || gpuAllocator.isCounting(), NPPST_NULL_POINTER_ERROR); + + if (srcLen == 0) + { + if (dstLenPinned != NULL) + { + *dstLenPinned = 0; + } + return NPPST_SUCCESS; + } + + std::vector partSumNums; + std::vector partSumOffsets; + Ncv32u partSumLastNum = srcLen; + Ncv32u partSumLastOffs = 0; + do + { + partSumNums.push_back(partSumLastNum); + partSumOffsets.push_back(partSumLastOffs); + + Ncv32u curPartSumAlignedLength = alignUp(partSumLastNum * sizeof(Ncv32u), + gpuAllocator.alignment()) / sizeof(Ncv32u); + partSumLastOffs += curPartSumAlignedLength; + + partSumLastNum = (partSumLastNum + NUM_REMOVE_THREADS - 1) / NUM_REMOVE_THREADS; + } + while (partSumLastNum>1); + partSumNums.push_back(partSumLastNum); + partSumOffsets.push_back(partSumLastOffs); + + NCVVectorAlloc d_hierSums(gpuAllocator, partSumLastOffs+1); + ncvAssertReturn(gpuAllocator.isCounting() || d_hierSums.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR); + NCVVectorAlloc d_numDstElements(gpuAllocator, 1); + ncvAssertReturn(gpuAllocator.isCounting() || d_numDstElements.isMemAllocated(), NPPST_MEM_INTERNAL_ERROR); + + NCV_SET_SKIP_COND(gpuAllocator.isCounting()); + NCV_SKIP_COND_BEGIN + + dim3 block(NUM_REMOVE_THREADS); + + //calculate zero-level partial sums for indices calculation + if (partSumNums.size() > 2) + { + dim3 grid(partSumNums[1]); + + if (grid.x > 65535) + { + grid.y = (grid.x + 65534) / 65535; + grid.x = 65535; + } + removePass1Scan + + <<>> + 
(d_src, srcLen, + d_hierSums.ptr(), + d_hierSums.ptr() + partSumOffsets[1], + elemRemove); + ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR); + + //calculate hierarchical partial sums + for (Ncv32u i=1; i 65535) + { + grid.y = (grid.x + 65534) / 65535; + grid.x = 65535; + } + if (grid.x != 1) + { + removePass1Scan + + <<>> + (d_hierSums.ptr() + partSumOffsets[i], + partSumNums[i], NULL, + d_hierSums.ptr() + partSumOffsets[i+1], + NULL); + } + else + { + removePass1Scan + + <<>> + (d_hierSums.ptr() + partSumOffsets[i], + partSumNums[i], NULL, + NULL, + NULL); + } + ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR); + } + + //adjust hierarchical partial sums + for (Ncv32s i=(Ncv32s)partSumNums.size()-3; i>=0; i--) + { + dim3 grid(partSumNums[i+1]); + if (grid.x > 65535) + { + grid.y = (grid.x + 65534) / 65535; + grid.x = 65535; + } + removePass2Adjust + <<>> + (d_hierSums.ptr() + partSumOffsets[i], partSumNums[i], + d_hierSums.ptr() + partSumOffsets[i+1]); + ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR); + } + } + else + { + dim3 grid(partSumNums[1]); + removePass1Scan + + <<>> + (d_src, srcLen, + d_hierSums.ptr(), + NULL, elemRemove); + ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR); + } + + //compact source vector using indices + dim3 grid(partSumNums[1]); + if (grid.x > 65535) + { + grid.y = (grid.x + 65534) / 65535; + grid.x = 65535; + } + removePass3Compact + <<>> + (d_src, srcLen, d_hierSums.ptr(), d_dst, + elemRemove, d_numDstElements.ptr()); + ncvAssertCUDAReturn(cudaGetLastError(), NPPST_CUDA_KERNEL_EXECUTION_ERROR); + + //get number of dst elements + if (dstLenPinned != NULL) + { + ncvAssertCUDAReturn(cudaMemcpyAsync(dstLenPinned, d_numDstElements.ptr(), sizeof(Ncv32u), + cudaMemcpyDeviceToHost, nppStGetActiveCUDAstream()), NPPST_MEM_RESIDENCE_ERROR); + ncvAssertCUDAReturn(cudaStreamSynchronize(nppStGetActiveCUDAstream()), NPPST_MEM_RESIDENCE_ERROR); + } + + NCV_SKIP_COND_END + + return NPPST_SUCCESS; +} + + +NCVStatus nppsStCompactGetSize_32u(Ncv32u srcLen, Ncv32u *pBufsize, cudaDeviceProp &devProp) +{ + ncvAssertReturn(pBufsize != NULL, NPPST_NULL_POINTER_ERROR); + + if (srcLen == 0) + { + *pBufsize = 0; + return NPPST_SUCCESS; + } + + NCVMemStackAllocator gpuCounter(devProp.textureAlignment); + ncvAssertReturn(gpuCounter.isInitialized(), NPPST_MEM_INTERNAL_ERROR); + + NCVStatus ncvStat = compactVector_32u_device(NULL, srcLen, NULL, NULL, 0xC001C0DE, + gpuCounter); + ncvAssertReturnNcvStat(ncvStat); + + *pBufsize = (Ncv32u)gpuCounter.maxSize(); + return NPPST_SUCCESS; +} + + +NCVStatus nppsStCompactGetSize_32s(Ncv32u srcLen, Ncv32u *pBufsize, cudaDeviceProp &devProp) +{ + return nppsStCompactGetSize_32u(srcLen, pBufsize, devProp); +} + + +NCVStatus nppsStCompactGetSize_32f(Ncv32u srcLen, Ncv32u *pBufsize, cudaDeviceProp &devProp) +{ + return nppsStCompactGetSize_32u(srcLen, pBufsize, devProp); +} + + +NCVStatus nppsStCompact_32u(Ncv32u *d_src, Ncv32u srcLen, + Ncv32u *d_dst, Ncv32u *p_dstLen, + Ncv32u elemRemove, Ncv8u *pBuffer, + Ncv32u bufSize, cudaDeviceProp &devProp) +{ + NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, devProp.textureAlignment, pBuffer); + ncvAssertReturn(gpuAllocator.isInitialized(), NPPST_MEM_INTERNAL_ERROR); + + NCVStatus ncvStat = compactVector_32u_device(d_src, srcLen, d_dst, p_dstLen, elemRemove, + gpuAllocator); + ncvAssertReturnNcvStat(ncvStat); + + return NPPST_SUCCESS; +} + + +NCVStatus nppsStCompact_32s(Ncv32s *d_src, Ncv32u 
srcLen, + Ncv32s *d_dst, Ncv32u *p_dstLen, + Ncv32s elemRemove, Ncv8u *pBuffer, + Ncv32u bufSize, cudaDeviceProp &devProp) +{ + return nppsStCompact_32u((Ncv32u *)d_src, srcLen, (Ncv32u *)d_dst, p_dstLen, + *(Ncv32u *)&elemRemove, pBuffer, bufSize, devProp); +} + + +NCVStatus nppsStCompact_32f(Ncv32f *d_src, Ncv32u srcLen, + Ncv32f *d_dst, Ncv32u *p_dstLen, + Ncv32f elemRemove, Ncv8u *pBuffer, + Ncv32u bufSize, cudaDeviceProp &devProp) +{ + return nppsStCompact_32u((Ncv32u *)d_src, srcLen, (Ncv32u *)d_dst, p_dstLen, + *(Ncv32u *)&elemRemove, pBuffer, bufSize, devProp); +} + + +NCVStatus nppsStCompact_32u_host(Ncv32u *h_src, Ncv32u srcLen, + Ncv32u *h_dst, Ncv32u *dstLen, Ncv32u elemRemove) +{ + ncvAssertReturn(h_src != NULL && h_dst != NULL, NPPST_NULL_POINTER_ERROR); + + if (srcLen == 0) + { + if (dstLen != NULL) + { + *dstLen = 0; + } + return NPPST_SUCCESS; + } + + Ncv32u dstIndex = 0; + for (Ncv32u srcIndex=0; srcIndex - - #if !defined (HAVE_CUDA) #else /* !defined (HAVE_CUDA) */ +#include #include #include "NCV.hpp" @@ -94,17 +92,6 @@ void ncvSetDebugOutputHandler(NCVDebugOutputHandler *func) //============================================================================== -NCVStatus GPUAlignmentValue(Ncv32u &alignment) -{ - int curDev; - cudaDeviceProp curProp; - ncvAssertCUDAReturn(cudaGetDevice(&curDev), NCV_CUDA_ERROR); - ncvAssertCUDAReturn(cudaGetDeviceProperties(&curProp, curDev), NCV_CUDA_ERROR); - alignment = curProp.textureAlignment; //GPUAlignmentValue(curProp.major); - return NCV_SUCCESS; -} - - Ncv32u alignUp(Ncv32u what, Ncv32u alignment) { Ncv32u alignMask = alignment-1; @@ -216,7 +203,7 @@ NCVMemStackAllocator::NCVMemStackAllocator(Ncv32u alignment) } -NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment) +NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment, void *reusePtr) : currentSize(0), _maxSize(0), @@ -229,17 +216,26 @@ NCVMemStackAllocator::NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, allocBegin = NULL; - switch (memT) + if (reusePtr == NULL) { - case NCVMemoryTypeDevice: - ncvAssertCUDAReturn(cudaMalloc(&allocBegin, capacity), ); - break; - case NCVMemoryTypeHostPinned: - ncvAssertCUDAReturn(cudaMallocHost(&allocBegin, capacity), ); - break; - case NCVMemoryTypeHostPageable: - allocBegin = (Ncv8u *)malloc(capacity); - break; + bReusesMemory = false; + switch (memT) + { + case NCVMemoryTypeDevice: + ncvAssertCUDAReturn(cudaMalloc(&allocBegin, capacity), ); + break; + case NCVMemoryTypeHostPinned: + ncvAssertCUDAReturn(cudaMallocHost(&allocBegin, capacity), ); + break; + case NCVMemoryTypeHostPageable: + allocBegin = (Ncv8u *)malloc(capacity); + break; + } + } + else + { + bReusesMemory = true; + allocBegin = (Ncv8u *)reusePtr; } if (capacity == 0) @@ -260,18 +256,23 @@ NCVMemStackAllocator::~NCVMemStackAllocator() if (allocBegin != NULL) { ncvAssertPrintCheck(currentSize == 0, "NCVMemStackAllocator dtor:: not all objects were deallocated properly, forcing destruction"); - switch (_memType) + + if (!bReusesMemory) { - case NCVMemoryTypeDevice: - ncvAssertCUDAReturn(cudaFree(allocBegin), ); - break; - case NCVMemoryTypeHostPinned: - ncvAssertCUDAReturn(cudaFreeHost(allocBegin), ); - break; - case NCVMemoryTypeHostPageable: - free(allocBegin); - break; + switch (_memType) + { + case NCVMemoryTypeDevice: + ncvAssertCUDAReturn(cudaFree(allocBegin), ); + break; + case NCVMemoryTypeHostPinned: + ncvAssertCUDAReturn(cudaFreeHost(allocBegin), ); + break; + case 
NCVMemoryTypeHostPageable: + free(allocBegin); + break; + } } + allocBegin = NULL; } } @@ -356,14 +357,14 @@ size_t NCVMemStackAllocator::maxSize(void) const //=================================================================== -NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT) +NCVMemNativeAllocator::NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment) : currentSize(0), _maxSize(0), - _memType(memT) + _memType(memT), + _alignment(alignment) { ncvAssertPrintReturn(memT != NCVMemoryTypeNone, "NCVMemNativeAllocator ctor:: counting not permitted for this allocator type", ); - ncvAssertPrintReturn(NCV_SUCCESS == GPUAlignmentValue(this->_alignment), "NCVMemNativeAllocator ctor:: couldn't get device _alignment", ); } diff --git a/modules/gpu/src/nvidia/NCV.hpp b/modules/gpu/src/nvidia/core/NCV.hpp similarity index 76% rename from modules/gpu/src/nvidia/NCV.hpp rename to modules/gpu/src/nvidia/core/NCV.hpp index a71f650252..81eb417fbe 100644 --- a/modules/gpu/src/nvidia/NCV.hpp +++ b/modules/gpu/src/nvidia/core/NCV.hpp @@ -42,8 +42,49 @@ #ifndef _ncv_hpp_ #define _ncv_hpp_ +#if (defined WIN32 || defined _WIN32 || defined WINCE) && defined CVAPI_EXPORTS //&& !defined(__CUDACC__) + #define NCV_EXPORTS __declspec(dllexport) +#else + #define NCV_EXPORTS +#endif + #include -#include "npp_staging.h" + + +//============================================================================== +// +// Compile-time assert functionality +// +//============================================================================== + + +/** +* Compile-time assert namespace +*/ +namespace NcvCTprep +{ + template + struct CT_ASSERT_FAILURE; + + template <> + struct CT_ASSERT_FAILURE {}; + + template + struct assertTest{}; +} + + +#define NCV_CT_PREP_PASTE_AUX(a,b) a##b ///< Concatenation indirection macro +#define NCV_CT_PREP_PASTE(a,b) NCV_CT_PREP_PASTE_AUX(a, b) ///< Concatenation macro + + +/** +* Performs compile-time assertion of a condition on the file scope +*/ +#define NCV_CT_ASSERT(X) \ + typedef NcvCTprep::assertTest)> \ + NCV_CT_PREP_PASTE(__ct_assert_typedef_, __LINE__) + //============================================================================== @@ -82,62 +123,72 @@ typedef float Ncv32f; typedef double Ncv64f; -typedef struct +struct NcvRect8u { Ncv8u x; Ncv8u y; Ncv8u width; Ncv8u height; -} NcvRect8u; + NcvRect8u() : x(0), y(0), width(0), height(0) {}; + NcvRect8u(Ncv8u x, Ncv8u y, Ncv8u width, Ncv8u height) : x(x), y(y), width(width), height(height) {} +}; -typedef struct +struct NcvRect32s { Ncv32s x; ///< x-coordinate of upper left corner. Ncv32s y; ///< y-coordinate of upper left corner. Ncv32s width; ///< Rectangle width. Ncv32s height; ///< Rectangle height. -} NcvRect32s; + NcvRect32s() : x(0), y(0), width(0), height(0) {}; + NcvRect32s(Ncv32s x, Ncv32s y, Ncv32s width, Ncv32s height) : x(x), y(y), width(width), height(height) {} +}; -typedef struct +struct NcvRect32u { Ncv32u x; ///< x-coordinate of upper left corner. Ncv32u y; ///< y-coordinate of upper left corner. Ncv32u width; ///< Rectangle width. Ncv32u height; ///< Rectangle height. -} NcvRect32u; + NcvRect32u() : x(0), y(0), width(0), height(0) {}; + NcvRect32u(Ncv32u x, Ncv32u y, Ncv32u width, Ncv32u height) : x(x), y(y), width(width), height(height) {} +}; -typedef struct +struct NcvSize32s { Ncv32s width; ///< Rectangle width. Ncv32s height; ///< Rectangle height. 
-} NcvSize32s; + NcvSize32s() : width(0), height(0) {}; + NcvSize32s(Ncv32s width, Ncv32s height) : width(width), height(height) {} +}; -typedef struct +struct NcvSize32u { Ncv32u width; ///< Rectangle width. Ncv32u height; ///< Rectangle height. -} NcvSize32u; - - -NPPST_CT_ASSERT(sizeof(NcvBool) <= 4); -NPPST_CT_ASSERT(sizeof(Ncv64s) == 8); -NPPST_CT_ASSERT(sizeof(Ncv64u) == 8); -NPPST_CT_ASSERT(sizeof(Ncv32s) == 4); -NPPST_CT_ASSERT(sizeof(Ncv32u) == 4); -NPPST_CT_ASSERT(sizeof(Ncv16s) == 2); -NPPST_CT_ASSERT(sizeof(Ncv16u) == 2); -NPPST_CT_ASSERT(sizeof(Ncv8s) == 1); -NPPST_CT_ASSERT(sizeof(Ncv8u) == 1); -NPPST_CT_ASSERT(sizeof(Ncv32f) == 4); -NPPST_CT_ASSERT(sizeof(Ncv64f) == 8); -NPPST_CT_ASSERT(sizeof(NcvRect8u) == sizeof(Ncv32u)); -NPPST_CT_ASSERT(sizeof(NcvRect32s) == 4 * sizeof(Ncv32s)); -NPPST_CT_ASSERT(sizeof(NcvRect32u) == 4 * sizeof(Ncv32u)); -NPPST_CT_ASSERT(sizeof(NcvSize32u) == 2 * sizeof(Ncv32u)); + NcvSize32u() : width(0), height(0) {}; + NcvSize32u(Ncv32u width, Ncv32u height) : width(width), height(height) {} +}; + + +NCV_CT_ASSERT(sizeof(NcvBool) <= 4); +NCV_CT_ASSERT(sizeof(Ncv64s) == 8); +NCV_CT_ASSERT(sizeof(Ncv64u) == 8); +NCV_CT_ASSERT(sizeof(Ncv32s) == 4); +NCV_CT_ASSERT(sizeof(Ncv32u) == 4); +NCV_CT_ASSERT(sizeof(Ncv16s) == 2); +NCV_CT_ASSERT(sizeof(Ncv16u) == 2); +NCV_CT_ASSERT(sizeof(Ncv8s) == 1); +NCV_CT_ASSERT(sizeof(Ncv8u) == 1); +NCV_CT_ASSERT(sizeof(Ncv32f) == 4); +NCV_CT_ASSERT(sizeof(Ncv64f) == 8); +NCV_CT_ASSERT(sizeof(NcvRect8u) == sizeof(Ncv32u)); +NCV_CT_ASSERT(sizeof(NcvRect32s) == 4 * sizeof(Ncv32s)); +NCV_CT_ASSERT(sizeof(NcvRect32u) == 4 * sizeof(Ncv32u)); +NCV_CT_ASSERT(sizeof(NcvSize32u) == 2 * sizeof(Ncv32u)); //============================================================================== @@ -162,13 +213,13 @@ const Ncv32u K_LOG2_WARP_SIZE = 5; #define NCV_CT_PREP_STRINGIZE(x) NCV_CT_PREP_STRINGIZE_AUX(x) -void ncvDebugOutput(const char *msg, ...); +NCV_EXPORTS void ncvDebugOutput(const char *msg, ...); typedef void NCVDebugOutputHandler(const char* msg); -void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func); +NCV_EXPORTS void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func); #define ncvAssertPrintCheck(pred, msg) \ @@ -222,6 +273,7 @@ void ncvSetDebugOutputHandler(NCVDebugOutputHandler* func); */ enum NCVStatus { + //NCV statuses NCV_SUCCESS, NCV_CUDA_ERROR, @@ -257,6 +309,24 @@ enum NCVStatus NCV_NOIMPL_HAAR_TILTED_FEATURES, NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW, + + //NPP statuses + NPPST_SUCCESS = NCV_SUCCESS, ///< Successful operation (same as NPP_NO_ERROR) + NPPST_ERROR, ///< Unknown error + NPPST_CUDA_KERNEL_EXECUTION_ERROR, ///< CUDA kernel execution error + NPPST_NULL_POINTER_ERROR, ///< NULL pointer argument error + NPPST_TEXTURE_BIND_ERROR, ///< CUDA texture binding error or non-zero offset returned + NPPST_MEMCPY_ERROR, ///< CUDA memory copy error + NPPST_MEM_ALLOC_ERR, ///< CUDA memory allocation error + NPPST_MEMFREE_ERR, ///< CUDA memory deallocation error + + //NPPST statuses + NPPST_INVALID_ROI, ///< Invalid region of interest argument + NPPST_INVALID_STEP, ///< Invalid image lines step argument (check sign, alignment, relation to image width) + NPPST_INVALID_SCALE, ///< Invalid scale parameter passed + NPPST_MEM_INSUFFICIENT_BUFFER, ///< Insufficient user-allocated buffer + NPPST_MEM_RESIDENCE_ERROR, ///< Memory residence error detected (check if pointers should be device or pinned) + NPPST_MEM_INTERNAL_ERROR, ///< Internal memory management error }; @@ -285,11 +355,11 @@ enum NCVStatus typedef struct 
_NcvTimer *NcvTimer; -NcvTimer ncvStartTimer(void); +NCV_EXPORTS NcvTimer ncvStartTimer(void); -double ncvEndQueryTimerUs(NcvTimer t); +NCV_EXPORTS double ncvEndQueryTimerUs(NcvTimer t); -double ncvEndQueryTimerMs(NcvTimer t); +NCV_EXPORTS double ncvEndQueryTimerMs(NcvTimer t); //============================================================================== @@ -299,16 +369,10 @@ double ncvEndQueryTimerMs(NcvTimer t); //============================================================================== -/** -* Alignment of GPU memory chunks in bytes -*/ -NCVStatus GPUAlignmentValue(Ncv32u &alignment); - - /** * Calculates the aligned top bound value */ -Ncv32u alignUp(Ncv32u what, Ncv32u alignment); +NCV_EXPORTS Ncv32u alignUp(Ncv32u what, Ncv32u alignment); /** @@ -326,7 +390,7 @@ enum NCVMemoryType /** * NCVMemPtr */ -struct NCVMemPtr +struct NCV_EXPORTS NCVMemPtr { void *ptr; NCVMemoryType memtype; @@ -337,7 +401,7 @@ struct NCVMemPtr /** * NCVMemSegment */ -struct NCVMemSegment +struct NCV_EXPORTS NCVMemSegment { NCVMemPtr begin; size_t size; @@ -348,7 +412,7 @@ struct NCVMemSegment /** * INCVMemAllocator (Interface) */ -class INCVMemAllocator +class NCV_EXPORTS INCVMemAllocator { public: virtual ~INCVMemAllocator() = 0; @@ -370,7 +434,7 @@ inline INCVMemAllocator::~INCVMemAllocator() {} /** * NCVMemStackAllocator */ -class NCVMemStackAllocator : public INCVMemAllocator +class NCV_EXPORTS NCVMemStackAllocator : public INCVMemAllocator { NCVMemStackAllocator(); NCVMemStackAllocator(const NCVMemStackAllocator &); @@ -378,7 +442,7 @@ class NCVMemStackAllocator : public INCVMemAllocator public: explicit NCVMemStackAllocator(Ncv32u alignment); - NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment); + NCVMemStackAllocator(NCVMemoryType memT, size_t capacity, Ncv32u alignment, void *reusePtr=NULL); virtual ~NCVMemStackAllocator(); virtual NCVStatus alloc(NCVMemSegment &seg, size_t size); @@ -400,17 +464,18 @@ private: Ncv8u *end; size_t currentSize; size_t _maxSize; + NcvBool bReusesMemory; }; /** * NCVMemNativeAllocator */ -class NCVMemNativeAllocator : public INCVMemAllocator +class NCV_EXPORTS NCVMemNativeAllocator : public INCVMemAllocator { public: - NCVMemNativeAllocator(NCVMemoryType memT); + NCVMemNativeAllocator(NCVMemoryType memT, Ncv32u alignment); virtual ~NCVMemNativeAllocator(); virtual NCVStatus alloc(NCVMemSegment &seg, size_t size); @@ -438,9 +503,9 @@ private: /** * Copy dispatcher */ -NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType, - const void *src, NCVMemoryType srcType, - size_t sz, cudaStream_t cuStream); +NCV_EXPORTS NCVStatus memSegCopyHelper(void *dst, NCVMemoryType dstType, + const void *src, NCVMemoryType srcType, + size_t sz, cudaStream_t cuStream); /** @@ -514,6 +579,7 @@ class NCVVectorAlloc : public NCVVector { NCVVectorAlloc(); NCVVectorAlloc(const NCVVectorAlloc &); + NCVVectorAlloc& operator=(const NCVVectorAlloc&); public: @@ -563,8 +629,7 @@ public: return allocatedMem; } -private: - +private: INCVMemAllocator &allocator; NCVMemSegment allocatedMem; }; @@ -707,7 +772,7 @@ class NCVMatrixAlloc : public NCVMatrix { NCVMatrixAlloc(); NCVMatrixAlloc(const NCVMatrixAlloc &); - + NCVMatrixAlloc& operator=(const NCVMatrixAlloc &); public: NCVMatrixAlloc(INCVMemAllocator &allocator, Ncv32u width, Ncv32u height, Ncv32u pitch=0) diff --git a/modules/gpu/src/nvidia/NCVRuntimeTemplates.hpp b/modules/gpu/src/nvidia/core/NCVRuntimeTemplates.hpp similarity index 73% rename from modules/gpu/src/nvidia/NCVRuntimeTemplates.hpp rename to 
modules/gpu/src/nvidia/core/NCVRuntimeTemplates.hpp index 14d16bb3b9..3d3f94f8b1 100644 --- a/modules/gpu/src/nvidia/NCVRuntimeTemplates.hpp +++ b/modules/gpu/src/nvidia/core/NCVRuntimeTemplates.hpp @@ -1,3 +1,51 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef _ncvruntimetemplates_hpp_ +#define _ncvruntimetemplates_hpp_ + +#include +#include + + //////////////////////////////////////////////////////////////////////////////// // The Loki Library // Copyright (c) 2001 by Andrei Alexandrescu @@ -14,13 +62,6 @@ // http://loki-lib.sourceforge.net/index.php?n=Main.License //////////////////////////////////////////////////////////////////////////////// -#ifndef _ncvruntimetemplates_hpp_ -#define _ncvruntimetemplates_hpp_ - -#include -#include - - namespace Loki { //============================================================================== diff --git a/modules/gpu/src/opencv2/gpu/device/transform.hpp b/modules/gpu/src/opencv2/gpu/device/transform.hpp index 959cca2350..79ed09439d 100644 --- a/modules/gpu/src/opencv2/gpu/device/transform.hpp +++ b/modules/gpu/src/opencv2/gpu/device/transform.hpp @@ -68,51 +68,51 @@ namespace cv { namespace gpu { namespace device //! 
Read Write Traits - template - struct UnReadWriteTraits_ - { - enum {shift=1}; - }; - template - struct UnReadWriteTraits_ - { - enum {shift=4}; - }; - template - struct UnReadWriteTraits_ - { - enum {shift=2}; + template + struct UnReadWriteTraits_ + { + enum {shift=1}; + }; + template + struct UnReadWriteTraits_ + { + enum {shift=4}; + }; + template + struct UnReadWriteTraits_ + { + enum {shift=2}; }; - template struct UnReadWriteTraits - { - enum {shift=UnReadWriteTraits_::shift}; - - typedef typename TypeVec::vec_t read_type; - typedef typename TypeVec::vec_t write_type; + template struct UnReadWriteTraits + { + enum {shift=UnReadWriteTraits_::shift}; + + typedef typename TypeVec::vec_t read_type; + typedef typename TypeVec::vec_t write_type; }; - template - struct BinReadWriteTraits_ - { - enum {shift=1}; + template + struct BinReadWriteTraits_ + { + enum {shift=1}; }; - template - struct BinReadWriteTraits_ - { - enum {shift=4}; + template + struct BinReadWriteTraits_ + { + enum {shift=4}; }; - template - struct BinReadWriteTraits_ - { - enum {shift=2}; + template + struct BinReadWriteTraits_ + { + enum {shift=2}; }; - template struct BinReadWriteTraits - { - enum {shift=BinReadWriteTraits_::shift}; - - typedef typename TypeVec::vec_t read_type1; - typedef typename TypeVec::vec_t read_type2; - typedef typename TypeVec::vec_t write_type; + template struct BinReadWriteTraits + { + enum {shift=BinReadWriteTraits_::shift}; + + typedef typename TypeVec::vec_t read_type1; + typedef typename TypeVec::vec_t read_type2; + typedef typename TypeVec::vec_t write_type; }; //! Transform kernels @@ -122,14 +122,14 @@ namespace cv { namespace gpu { namespace device { template static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) - { + { if (mask(y, x_shifted)) dst.x = op(src.x); } template static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) - { + { if (mask(y, x_shifted)) dst.x = op(src1.x, src2.x); } @@ -138,18 +138,18 @@ namespace cv { namespace gpu { namespace device { template static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) - { + { if (mask(y, x_shifted)) - dst.x = op(src.x); + dst.x = op(src.x); if (mask(y, x_shifted + 1)) dst.y = op(src.y); } template static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) - { + { if (mask(y, x_shifted)) - dst.x = op(src1.x, src2.x); + dst.x = op(src1.x, src2.x); if (mask(y, x_shifted + 1)) dst.y = op(src1.y, src2.y); } @@ -158,22 +158,22 @@ namespace cv { namespace gpu { namespace device { template static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) - { + { if (mask(y, x_shifted)) - dst.x = op(src.x); + dst.x = op(src.x); if (mask(y, x_shifted + 1)) - dst.y = op(src.y); + dst.y = op(src.y); if (mask(y, x_shifted + 2)) dst.z = op(src.z); } template static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) - { + { if (mask(y, x_shifted)) - dst.x = op(src1.x, src2.x); + dst.x = op(src1.x, src2.x); if (mask(y, x_shifted + 1)) - dst.y = op(src1.y, src2.y); + dst.y = op(src1.y, src2.y); if (mask(y, x_shifted + 2)) dst.z = op(src1.z, src2.z); } @@ -182,65 +182,65 @@ namespace cv { namespace gpu { namespace device { template static __device__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y) 
- { + { if (mask(y, x_shifted)) - dst.x = op(src.x); + dst.x = op(src.x); if (mask(y, x_shifted + 1)) - dst.y = op(src.y); + dst.y = op(src.y); if (mask(y, x_shifted + 2)) - dst.z = op(src.z); + dst.z = op(src.z); if (mask(y, x_shifted + 3)) dst.w = op(src.w); } template static __device__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y) - { + { if (mask(y, x_shifted)) - dst.x = op(src1.x, src2.x); + dst.x = op(src1.x, src2.x); if (mask(y, x_shifted + 1)) - dst.y = op(src1.y, src2.y); + dst.y = op(src1.y, src2.y); if (mask(y, x_shifted + 2)) - dst.z = op(src1.z, src2.z); + dst.z = op(src1.z, src2.z); if (mask(y, x_shifted + 3)) dst.w = op(src1.w, src2.w); } }; - template - __global__ static void transformSmart(const DevMem2D_ src_, PtrStep_ dst_, const Mask mask, UnOp op) - { - typedef typename UnReadWriteTraits::read_type read_type; - typedef typename UnReadWriteTraits::write_type write_type; - const int shift = UnReadWriteTraits::shift; - - const int x = threadIdx.x + blockIdx.x * blockDim.x; - const int y = threadIdx.y + blockIdx.y * blockDim.y; - const int x_shifted = x * shift; - - if (y < src_.rows) - { - const T* src = src_.ptr(y); - D* dst = dst_.ptr(y); - - if (x_shifted + shift - 1 < src_.cols) - { - read_type src_n_el = ((const read_type*)src)[x]; - write_type dst_n_el; - - OpUnroller::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y); - - ((write_type*)dst)[x] = dst_n_el; - } - else - { - for (int real_x = x_shifted; real_x < src_.cols; ++real_x) - { - if (mask(y, real_x)) - dst[real_x] = op(src[real_x]); - } - } - } + template + __global__ static void transformSmart(const DevMem2D_ src_, PtrStep_ dst_, const Mask mask, UnOp op) + { + typedef typename UnReadWriteTraits::read_type read_type; + typedef typename UnReadWriteTraits::write_type write_type; + const int shift = UnReadWriteTraits::shift; + + const int x = threadIdx.x + blockIdx.x * blockDim.x; + const int y = threadIdx.y + blockIdx.y * blockDim.y; + const int x_shifted = x * shift; + + if (y < src_.rows) + { + const T* src = src_.ptr(y); + D* dst = dst_.ptr(y); + + if (x_shifted + shift - 1 < src_.cols) + { + read_type src_n_el = ((const read_type*)src)[x]; + write_type dst_n_el; + + OpUnroller::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y); + + ((write_type*)dst)[x] = dst_n_el; + } + else + { + for (int real_x = x_shifted; real_x < src_.cols; ++real_x) + { + if (mask(y, real_x)) + dst[real_x] = op(src[real_x]); + } + } + } } template @@ -255,44 +255,44 @@ namespace cv { namespace gpu { namespace device } } - template - __global__ static void transformSmart(const DevMem2D_ src1_, const PtrStep_ src2_, PtrStep_ dst_, - const Mask mask, BinOp op) - { - typedef typename BinReadWriteTraits::read_type1 read_type1; - typedef typename BinReadWriteTraits::read_type2 read_type2; - typedef typename BinReadWriteTraits::write_type write_type; - const int shift = BinReadWriteTraits::shift; - - const int x = threadIdx.x + blockIdx.x * blockDim.x; - const int y = threadIdx.y + blockIdx.y * blockDim.y; - const int x_shifted = x * shift; - - if (y < src1_.rows) - { - const T1* src1 = src1_.ptr(y); - const T2* src2 = src2_.ptr(y); - D* dst = dst_.ptr(y); - - if (x_shifted + shift - 1 < src1_.cols) - { - read_type1 src1_n_el = ((const read_type1*)src1)[x]; - read_type2 src2_n_el = ((const read_type2*)src2)[x]; - write_type dst_n_el; - - OpUnroller::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y); - - ((write_type*)dst)[x] = dst_n_el; - } - else - { - for (int real_x 
= x_shifted; real_x < src1_.cols; ++real_x) - { - if (mask(y, real_x)) - dst[real_x] = op(src1[real_x], src2[real_x]); - } - } - } + template + __global__ static void transformSmart(const DevMem2D_ src1_, const PtrStep_ src2_, PtrStep_ dst_, + const Mask mask, BinOp op) + { + typedef typename BinReadWriteTraits::read_type1 read_type1; + typedef typename BinReadWriteTraits::read_type2 read_type2; + typedef typename BinReadWriteTraits::write_type write_type; + const int shift = BinReadWriteTraits::shift; + + const int x = threadIdx.x + blockIdx.x * blockDim.x; + const int y = threadIdx.y + blockIdx.y * blockDim.y; + const int x_shifted = x * shift; + + if (y < src1_.rows) + { + const T1* src1 = src1_.ptr(y); + const T2* src2 = src2_.ptr(y); + D* dst = dst_.ptr(y); + + if (x_shifted + shift - 1 < src1_.cols) + { + read_type1 src1_n_el = ((const read_type1*)src1)[x]; + read_type2 src2_n_el = ((const read_type2*)src2)[x]; + write_type dst_n_el; + + OpUnroller::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y); + + ((write_type*)dst)[x] = dst_n_el; + } + else + { + for (int real_x = x_shifted; real_x < src1_.cols; ++real_x) + { + if (mask(y, real_x)) + dst[real_x] = op(src1[real_x], src2[real_x]); + } + } + } } template @@ -355,11 +355,11 @@ namespace cv template static void call(const DevMem2D_& src, const DevMem2D_& dst, UnOp op, const Mask& mask, cudaStream_t stream = 0) - { + { const int shift = device::UnReadWriteTraits::shift; dim3 threads(16, 16, 1); - dim3 grid(1, 1, 1); + dim3 grid(1, 1, 1); grid.x = divUp(src.cols, threads.x * shift); grid.y = divUp(src.rows, threads.y); @@ -373,7 +373,7 @@ namespace cv template static void call(const DevMem2D_& src1, const DevMem2D_& src2, const DevMem2D_& dst, BinOp op, const Mask& mask, cudaStream_t stream = 0) - { + { const int shift = device::BinReadWriteTraits::shift; dim3 threads(16, 16, 1); @@ -392,7 +392,7 @@ namespace cv template static void transform_caller(const DevMem2D_& src, const DevMem2D_& dst, UnOp op, const Mask& mask, cudaStream_t stream = 0) - { + { TransformChooser::cn == 1 && device::VecTraits::cn == 1 && device::UnReadWriteTraits::shift != 1>::call(src, dst, op, mask, stream); } diff --git a/modules/gpu/src/precomp.hpp b/modules/gpu/src/precomp.hpp index d5ad3cc727..ca073119ee 100644 --- a/modules/gpu/src/precomp.hpp +++ b/modules/gpu/src/precomp.hpp @@ -69,9 +69,9 @@ #include "cufft.h" #include "opencv2/gpu/stream_accessor.hpp" #include "npp.h" - #include "npp_staging.h" - - #include "nvidia/NCV.hpp" + + #include "nvidia/core/NCV.hpp" + #include "nvidia/NPP_staging/npp_staging.hpp" #include "nvidia/NCVHaarObjectDetection.hpp" #define CUDART_MINIMUM_REQUIRED_VERSION 3020 diff --git a/modules/gtest/src/gtestcv.cpp b/modules/gtest/src/gtestcv.cpp index 25082ca1af..3b3b695235 100644 --- a/modules/gtest/src/gtestcv.cpp +++ b/modules/gtest/src/gtestcv.cpp @@ -1378,7 +1378,7 @@ cmpEpsFlt_(const _Tp* src1, const _Tp* src2, size_t total, int imaxdiff, size_t { _Tp a = src1[i], b = src2[i]; if( a < 0 ) a ^= C; if( b < 0 ) b ^= C; - _Tp d = std::abs(a - b); + _Tp d = std::abs(double(a - b)); if( d > imaxdiff ) { idx = i + startidx; diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 1a601589a9..079d3af271 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -3,8 +3,8 @@ # # ---------------------------------------------------------------------------- -add_subdirectory(c) -add_subdirectory(cpp) +#add_subdirectory(c) +#add_subdirectory(cpp) add_subdirectory(gpu) if(0) diff --git 
a/samples/gpu/CMakeLists.txt b/samples/gpu/CMakeLists.txt index de9fe6ef80..7b39e6ee6b 100644 --- a/samples/gpu/CMakeLists.txt +++ b/samples/gpu/CMakeLists.txt @@ -14,10 +14,12 @@ if (BUILD_EXAMPLES) "${CMAKE_SOURCE_DIR}/modules/legacy/include" "${CMAKE_SOURCE_DIR}/modules/contrib/include" "${CMAKE_SOURCE_DIR}/modules/gpu/include" + "${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia" + "${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/core" ) if(HAVE_CUDA) - include_directories(${CUDA_INCLUDE_DIRS}) + include_directories(${CUDA_INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/core) endif() if(CMAKE_COMPILER_IS_GNUCXX) diff --git a/modules/gpu/src/nvidia/FaceDetectionFeed.cpp_NvidiaAPI_sample b/samples/gpu/cascadeclassifier_nvidia_api.cpp similarity index 74% rename from modules/gpu/src/nvidia/FaceDetectionFeed.cpp_NvidiaAPI_sample rename to samples/gpu/cascadeclassifier_nvidia_api.cpp index c1926a38c2..ff99f2c361 100644 --- a/modules/gpu/src/nvidia/FaceDetectionFeed.cpp_NvidiaAPI_sample +++ b/samples/gpu/cascadeclassifier_nvidia_api.cpp @@ -1,60 +1,25 @@ -/*M/////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2009-2010, NVIDIA Corporation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. 
-// -//M*/ - +#pragma warning( disable : 4201 4408 4127 4100) #include -#include -#define CV_NO_BACKWARD_COMPATIBILITY +#include "cvconfig.h" +#if !defined(HAVE_CUDA) + int main( int argc, const char** argv ) { return printf("Please compile the library with CUDA support."), -1; } +#else -#include "opencv2/opencv.hpp" +#include +#include "opencv2/opencv.hpp" #include "NCVHaarObjectDetection.hpp" + + using namespace cv; -using namespace std; -const Size preferredVideoFrameSize(640, 480); -string preferredClassifier = "haarcascade_frontalface_alt.xml"; -string wndTitle = "NVIDIA Computer Vision SDK :: Face Detection in Video Feed"; +const Size2i preferredVideoFrameSize(640, 480); + +std::string preferredClassifier = "haarcascade_frontalface_alt.xml"; +std::string wndTitle = "NVIDIA Computer Vision SDK :: Face Detection in Video Feed"; void printSyntax(void) @@ -62,7 +27,6 @@ void printSyntax(void) printf("Syntax: FaceDetectionFeed.exe [-c cameranum | -v filename] classifier.xml\n"); } - void imagePrintf(Mat& img, int lineOffsY, Scalar color, const char *format, ...) { int fontFace = CV_FONT_HERSHEY_PLAIN; @@ -83,7 +47,6 @@ void imagePrintf(Mat& img, int lineOffsY, Scalar color, const char *format, ...) va_end(arg_ptr); } - NCVStatus process(Mat *srcdst, Ncv32u width, Ncv32u height, NcvBool bShowAllHypotheses, NcvBool bLargestFace, @@ -104,15 +67,16 @@ NCVStatus process(Mat *srcdst, ncvAssertReturn(d_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); NCVMatrixAlloc h_src(cpuAllocator, width, height); ncvAssertReturn(h_src.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); - NCVVectorAlloc d_rects(gpuAllocator, 100); + NCVVectorAlloc d_rects(gpuAllocator, 100); ncvAssertReturn(d_rects.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); - Mat h_src_hdr(Size(width, height), CV_8U, h_src.ptr(), h_src.stride()); + NCV_SKIP_COND_BEGIN + + for (Ncv32u i=0; i<(Ncv32u)srcdst->rows; i++) + { + memcpy(h_src.ptr() + i * h_src.stride(), srcdst->ptr(i), srcdst->cols); + } - NCV_SKIP_COND_BEGIN - - (*srcdst).copyTo(h_src_hdr); - ncvStat = h_src.copySolid(d_src, 0); ncvAssertReturnNcvStat(ncvStat); ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR); @@ -130,8 +94,9 @@ NCVStatus process(Mat *srcdst, haar.ClassifierSize, bShowAllHypotheses ? 0 : 4, 1.2f, 1, - (bLargestFace ? NCVPipeObjDet_FindLargestObject : 0) | NCVPipeObjDet_VisualizeInPlace, - gpuAllocator, cpuAllocator, devProp.major, devProp.minor, 0); + (bLargestFace ?
NCVPipeObjDet_FindLargestObject : 0) + | NCVPipeObjDet_VisualizeInPlace, + gpuAllocator, cpuAllocator, devProp, 0); ncvAssertReturnNcvStat(ncvStat); ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR); @@ -141,14 +106,16 @@ NCVStatus process(Mat *srcdst, ncvAssertReturnNcvStat(ncvStat); ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR); - h_src_hdr.copyTo(*srcdst); - + for (Ncv32u i=0; i<(Ncv32u)srcdst->rows; i++) + { + memcpy(srcdst->ptr(i), h_src.ptr() + i * h_src.stride(), srcdst->cols); + } + NCV_SKIP_COND_END return NCV_SUCCESS; } - int main( int argc, const char** argv ) { NCVStatus ncvStat; @@ -160,14 +127,19 @@ int main( int argc, const char** argv ) printf(" Space - Switch between NCV and OpenCV\n"); printf(" L - Switch between FullSearch and LargestFace modes\n"); printf(" U - Toggle unfiltered hypotheses visualization in FullSearch\n"); - - if (argc != 4 && argc != 1) - return printSyntax(), -1; - + VideoCapture capture; - Size frameSize; + bool bQuit = false; + + Size2i frameSize; - if (argc == 1 || strcmp(argv[1], "-c") == 0) + if (argc != 4 && argc != 1) + { + printSyntax(); + return -1; + } + + if (argc == 1 || strcmp(argv[1], "-c") == 0) { // Camera input is specified int camIdx = (argc == 3) ? atoi(argv[2]) : 0; @@ -192,14 +164,26 @@ int main( int argc, const char** argv ) return printSyntax(), -1; NcvBool bUseOpenCV = true; - NcvBool bLargestFace = true; - NcvBool bShowAllHypotheses = false; + NcvBool bLargestFace = false; //LargestFace=true is used usually during training + NcvBool bShowAllHypotheses = false; - string classifierFile = (argc == 1) ? preferredClassifier : argv[3]; - CascadeClassifier classifierOpenCV; + std::string classifierFile; + if (argc == 1) + { + classifierFile = preferredClassifier; + } + else + { + classifierFile.assign(argv[3]); + } + if (!classifierOpenCV.load(classifierFile)) - return printf("Error (in OpenCV) opening classifier\n"), printSyntax(), -1; + { + printf("Error (in OpenCV) opening classifier\n"); + printSyntax(); + return -1; + } int devId; ncvAssertCUDAReturn(cudaGetDevice(&devId), -1); @@ -214,9 +198,9 @@ int main( int argc, const char** argv ) // //============================================================================== - NCVMemNativeAllocator gpuCascadeAllocator(NCVMemoryTypeDevice); + NCVMemNativeAllocator gpuCascadeAllocator(NCVMemoryTypeDevice, devProp.textureAlignment); ncvAssertPrintReturn(gpuCascadeAllocator.isInitialized(), "Error creating cascade GPU allocator", -1); - NCVMemNativeAllocator cpuCascadeAllocator(NCVMemoryTypeHostPinned); + NCVMemNativeAllocator cpuCascadeAllocator(NCVMemoryTypeHostPinned, devProp.textureAlignment); ncvAssertPrintReturn(cpuCascadeAllocator.isInitialized(), "Error creating cascade CPU allocator", -1); Ncv32u haarNumStages, haarNumNodes, haarNumFeatures; @@ -278,32 +262,36 @@ int main( int argc, const char** argv ) // Main processing loop // //============================================================================== - - namedWindow(wndTitle, 1); + namedWindow(wndTitle, 1); Mat frame, gray, frameDisp; - for(;;) + do { - // For camera and video file, capture the next image + // For camera and video file, capture the next image capture >> frame; if (frame.empty()) break; - + + Mat gray; cvtColor(frame, gray, CV_BGR2GRAY); + // // process + // + NcvSize32u minSize = haar.ClassifierSize; if (bLargestFace) { Ncv32u ratioX = preferredVideoFrameSize.width / minSize.width; Ncv32u ratioY = preferredVideoFrameSize.height / minSize.height; Ncv32u ratioSmallest = 
std::min(ratioX, ratioY); - ratioSmallest = (Ncv32u)std::max(ratioSmallest / 2.5f, 1.f); + ratioSmallest = std::max((Ncv32u)(ratioSmallest / 2.5f), (Ncv32u)1); minSize.width *= ratioSmallest; minSize.height *= ratioSmallest; } - + + Ncv32f avgTime; NcvTimer timer = ncvStartTimer(); if (!bUseOpenCV) @@ -324,15 +312,16 @@ int main( int argc, const char** argv ) rectsOpenCV, 1.2f, bShowAllHypotheses && !bLargestFace ? 0 : 4, - (bLargestFace ? CV_HAAR_FIND_BIGGEST_OBJECT : 0) | CV_HAAR_SCALE_IMAGE, + (bLargestFace ? CV_HAAR_FIND_BIGGEST_OBJECT : 0) + | CV_HAAR_SCALE_IMAGE, Size(minSize.width, minSize.height)); for (size_t rt = 0; rt < rectsOpenCV.size(); ++rt) rectangle(gray, rectsOpenCV[rt], Scalar(255)); } - Ncv32f avgTime = (Ncv32f)ncvEndQueryTimerMs(timer); - + avgTime = (Ncv32f)ncvEndQueryTimerMs(timer); + cvtColor(gray, frameDisp, CV_GRAY2BGR); imagePrintf(frameDisp, 0, CV_RGB(255, 0,0), "Space - Switch NCV%s / OpenCV%s", bUseOpenCV?"":" (ON)", bUseOpenCV?" (ON)":""); @@ -347,16 +336,25 @@ int main( int argc, const char** argv ) case ' ': bUseOpenCV = !bUseOpenCV; break; - case 'L':case 'l': + case 'L': + case 'l': bLargestFace = !bLargestFace; break; - case 'U':case 'u': + case 'U': + case 'u': bShowAllHypotheses = !bShowAllHypotheses; break; case 27: - return 0; + bQuit = true; + break; } - } - + + } while (!bQuit); + + cvDestroyWindow(wndTitle.c_str()); + return 0; } + + +#endif \ No newline at end of file diff --git a/tests/gpu/CMakeLists.txt b/tests/gpu/CMakeLists.txt index 4a25f2f76d..ca0e601ef7 100644 --- a/tests/gpu/CMakeLists.txt +++ b/tests/gpu/CMakeLists.txt @@ -3,12 +3,15 @@ # ---------------------------------------------------------------------------- project(opencv_test_gpu) +set(the_target "opencv_test_gpu") + file(GLOB test_srcs "src/*.cpp") -source_group("Src" FILES ${test_srcs}) file(GLOB test_hdrs "src/*.h*") +source_group("Src" FILES ${test_srcs}) source_group("Include" FILES ${test_hdrs}) -set(the_target "opencv_test_gpu") + + include_directories ( "${CMAKE_SOURCE_DIR}/include/opencv" @@ -26,11 +29,21 @@ include_directories ( "${CMAKE_SOURCE_DIR}/modules/ml/include" "${CMAKE_CURRENT_SOURCE_DIR}/src" "${CMAKE_CURRENT_BINARY_DIR}" + "${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia" + "${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/core" ) include_directories(../cxts) -add_executable(${the_target} ${test_srcs} ${test_hdrs}) +if(HAVE_CUDA) + include_directories(${CUDA_INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/core ${CMAKE_SOURCE_DIR}/modules/gpu/src/nvidia/NPP_staging) + + file(GLOB nvidia "src/nvidia/*.*") + SET(ncv_cpp ../../modules/gpu/src/nvidia/core/NCV.cpp) + source_group("Src\\nvidia" FILES ${nvidia}) +endif() + +add_executable(${the_target} ${test_srcs} ${test_hdrs} ${nvidia} ${ncv_cpp}) # Additional target properties set_target_properties(${the_target} PROPERTIES diff --git a/tests/gpu/src/nvidia/NCVAutoTestLister.hpp b/tests/gpu/src/nvidia/NCVAutoTestLister.hpp new file mode 100644 index 0000000000..d9f4438b80 --- /dev/null +++ b/tests/gpu/src/nvidia/NCVAutoTestLister.hpp @@ -0,0 +1,127 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual + * property and proprietary rights in and to this software and + * related documentation and any modifications thereto. 
+ * Any use, reproduction, disclosure, or distribution of this + * software and related documentation without an express license + * agreement from NVIDIA Corporation is strictly prohibited. + */ +#ifndef _ncvautotestlister_hpp_ +#define _ncvautotestlister_hpp_ + +#include <vector> + +#include "NCVTest.hpp" + +class NCVAutoTestLister +{ +public: + + NCVAutoTestLister(std::string testSuiteName, NcvBool bStopOnFirstFail=false, NcvBool bCompactOutput=true) + : + testSuiteName(testSuiteName), + bStopOnFirstFail(bStopOnFirstFail), + bCompactOutput(bCompactOutput) + { + } + + void add(INCVTest *test) + { + this->tests.push_back(test); + } + + void invoke() + { + Ncv32u nPassed = 0; + Ncv32u nFailed = 0; + Ncv32u nFailedMem = 0; + + if (bCompactOutput) + { + printf("Test suite '%s' with %d tests\n", + testSuiteName.c_str(), + (int)(this->tests.size())); + } + + for (Ncv32u i=0; i<this->tests.size(); i++) + { + INCVTest &curTest = *tests[i]; + + NCVTestReport curReport; + bool res = curTest.executeTest(curReport); + + if (!bCompactOutput) + { + printf("Test %3i %16s; Consumed mem GPU = %8d, CPU = %8d; %s\n", + i, + curTest.getName().c_str(), + curReport.statsNums["MemGPU"], + curReport.statsNums["MemCPU"], + curReport.statsText["rcode"].c_str()); + } + + if (res) + { + nPassed++; + if (bCompactOutput) + { + printf("."); + } + } + else + { + if (!curReport.statsText["rcode"].compare("FAILED")) + { + nFailed++; + if (bCompactOutput) + { + printf("x"); + } + if (bStopOnFirstFail) + { + break; + } + } + else + { + nFailedMem++; + if (bCompactOutput) + { + printf("m"); + } + } + } + fflush(stdout); + } + if (bCompactOutput) + { + printf("\n"); + } + + printf("Test suite '%s' complete: %d total, %d passed, %d memory errors, %d failed\n\n", + testSuiteName.c_str(), + (int)(this->tests.size()), + nPassed, + nFailedMem, + nFailed); + } + + ~NCVAutoTestLister() + { + for (Ncv32u i=0; i<this->tests.size(); i++) + { + delete tests[i]; + } + } + +private: + + NcvBool bStopOnFirstFail; + NcvBool bCompactOutput; + std::string testSuiteName; + std::vector<INCVTest *> tests; +}; + +#endif // _ncvautotestlister_hpp_ diff --git a/tests/gpu/src/nvidia/NCVTest.hpp b/tests/gpu/src/nvidia/NCVTest.hpp new file mode 100644 index 0000000000..b8c2d9729e --- /dev/null +++ b/tests/gpu/src/nvidia/NCVTest.hpp @@ -0,0 +1,211 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual + * property and proprietary rights in and to this software and + * related documentation and any modifications thereto. + * Any use, reproduction, disclosure, or distribution of this + * software and related documentation without an express license + * agreement from NVIDIA Corporation is strictly prohibited.
+ */ +#ifndef _ncvtest_hpp_ +#define _ncvtest_hpp_ + +#pragma warning( disable : 4201 4408 4127 4100) + +#include +#include +#include +#include +#include +#include + +#include +#include "NPP_staging.hpp" + + +struct NCVTestReport +{ + std::map statsNums; + std::map statsText; +}; + + +class INCVTest +{ +public: + virtual bool executeTest(NCVTestReport &report) = 0; + virtual std::string getName() const = 0; +}; + + +class NCVTestProvider : public INCVTest +{ +public: + + NCVTestProvider(std::string testName) + : + testName(testName) + { + int devId; + ncvAssertPrintReturn(cudaSuccess == cudaGetDevice(&devId), "Error returned from cudaGetDevice", ); + ncvAssertPrintReturn(cudaSuccess == cudaGetDeviceProperties(&this->devProp, devId), "Error returned from cudaGetDeviceProperties", ); + } + + virtual bool init() = 0; + virtual bool process() = 0; + virtual bool deinit() = 0; + virtual bool toString(std::ofstream &strOut) = 0; + + virtual std::string getName() const + { + return this->testName; + } + + virtual ~NCVTestProvider() + { + deinitMemory(); + } + + virtual bool executeTest(NCVTestReport &report) + { + bool res; + report.statsText["rcode"] = "FAILED"; + + res = initMemory(report); + if (!res) + { + dumpToFile(report); + deinitMemory(); + return false; + } + + res = init(); + if (!res) + { + dumpToFile(report); + deinit(); + deinitMemory(); + return false; + } + + res = process(); + if (!res) + { + dumpToFile(report); + deinit(); + deinitMemory(); + return false; + } + + res = deinit(); + if (!res) + { + dumpToFile(report); + deinitMemory(); + return false; + } + + deinitMemory(); + + report.statsText["rcode"] = "Passed"; + return true; + } + +protected: + + cudaDeviceProp devProp; + std::auto_ptr allocatorGPU; + std::auto_ptr allocatorCPU; + +private: + + std::string testName; + + bool initMemory(NCVTestReport &report) + { + this->allocatorGPU.reset(new NCVMemStackAllocator(devProp.textureAlignment)); + this->allocatorCPU.reset(new NCVMemStackAllocator(devProp.textureAlignment)); + + if (!this->allocatorGPU.get()->isInitialized() || + !this->allocatorCPU.get()->isInitialized()) + { + report.statsText["rcode"] = "Memory FAILED"; + return false; + } + + if (!this->process()) + { + report.statsText["rcode"] = "Memory FAILED"; + return false; + } + + Ncv32u maxGPUsize = (Ncv32u)this->allocatorGPU.get()->maxSize(); + Ncv32u maxCPUsize = (Ncv32u)this->allocatorCPU.get()->maxSize(); + + report.statsNums["MemGPU"] = maxGPUsize; + report.statsNums["MemCPU"] = maxCPUsize; + + this->allocatorGPU.reset(new NCVMemStackAllocator(NCVMemoryTypeDevice, maxGPUsize, devProp.textureAlignment)); + + this->allocatorCPU.reset(new NCVMemStackAllocator(NCVMemoryTypeHostPinned, maxCPUsize, devProp.textureAlignment)); + + if (!this->allocatorGPU.get()->isInitialized() || + !this->allocatorCPU.get()->isInitialized()) + { + report.statsText["rcode"] = "Memory FAILED"; + return false; + } + + return true; + } + + void deinitMemory() + { + this->allocatorGPU.reset(); + this->allocatorCPU.reset(); + } + + void dumpToFile(NCVTestReport &report) + { + bool bReasonMem = (0 == report.statsText["rcode"].compare("Memory FAILED")); + std::string fname = "TestDump_"; + fname += (bReasonMem ? 
"m_" : "") + this->testName + ".log"; + std::ofstream stream(fname.c_str(), std::ios::trunc | std::ios::out); + if (!stream.is_open()) return; + + stream << "NCV Test Failure Log: " << this->testName << std::endl; + stream << "====================================================" << std::endl << std::endl; + stream << "Test initialization report: " << std::endl; + for (std::map::iterator it=report.statsText.begin(); + it != report.statsText.end(); it++) + { + stream << it->first << "=" << it->second << std::endl; + } + for (std::map::iterator it=report.statsNums.begin(); + it != report.statsNums.end(); it++) + { + stream << it->first << "=" << it->second << std::endl; + } + stream << std::endl; + + stream << "Test initialization parameters: " << std::endl; + bool bSerializeRes = false; + try + { + bSerializeRes = this->toString(stream); + } + catch (...) + { + } + + if (!bSerializeRes) + { + stream << "Couldn't retrieve object dump" << std::endl; + } + + stream.flush(); + } +}; + +#endif // _ncvtest_hpp_ diff --git a/tests/gpu/src/nvidia/NCVTestSourceProvider.hpp b/tests/gpu/src/nvidia/NCVTestSourceProvider.hpp new file mode 100644 index 0000000000..f4f9a392ec --- /dev/null +++ b/tests/gpu/src/nvidia/NCVTestSourceProvider.hpp @@ -0,0 +1,161 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual + * property and proprietary rights in and to this software and + * related documentation and any modifications thereto. + * Any use, reproduction, disclosure, or distribution of this + * software and related documentation without an express license + * agreement from NVIDIA Corporation is strictly prohibited. + */ +#ifndef _ncvtestsourceprovider_hpp_ +#define _ncvtestsourceprovider_hpp_ + +#include + +#include "NCV.hpp" +#include + + +template +class NCVTestSourceProvider +{ +public: + + NCVTestSourceProvider(Ncv32u seed, T rangeLow, T rangeHigh, Ncv32u maxWidth, Ncv32u maxHeight) + : + bInit(false) + { + ncvAssertPrintReturn(rangeLow < rangeHigh, "NCVTestSourceProvider ctor:: Invalid range", ); + + int devId; + cudaDeviceProp devProp; + ncvAssertPrintReturn(cudaSuccess == cudaGetDevice(&devId), "Error returned from cudaGetDevice", ); + ncvAssertPrintReturn(cudaSuccess == cudaGetDeviceProperties(&devProp, devId), "Error returned from cudaGetDeviceProperties", ); + + //Ncv32u maxWpitch = alignUp(maxWidth * sizeof(T), devProp.textureAlignment); + + allocatorCPU.reset(new NCVMemNativeAllocator(NCVMemoryTypeHostPinned, devProp.textureAlignment)); + data.reset(new NCVMatrixAlloc(*this->allocatorCPU.get(), maxWidth, maxHeight)); + ncvAssertPrintReturn(data.get()->isMemAllocated(), "NCVTestSourceProvider ctor:: Matrix not allocated", ); + + this->dataWidth = maxWidth; + this->dataHeight = maxHeight; + + srand(seed); + + for (Ncv32u i=0; istride(); j++) + { + data.get()->ptr()[i * data.get()->stride() + j] = + (T)(((1.0 * rand()) / RAND_MAX) * (rangeHigh - rangeLow) + rangeLow); + } + } + + this->bInit = true; + } + + NCVTestSourceProvider(std::string pgmFilename) + : + bInit(false) + { + ncvAssertPrintReturn(sizeof(T) == 1, "NCVTestSourceProvider ctor:: PGM constructor complies only with 8bit types", ); + + cv::Mat image = cv::imread(pgmFilename); + ncvAssertPrintReturn(!image.empty(), "NCVTestSourceProvider ctor:: PGM file error", ); + + int devId; + cudaDeviceProp devProp; + ncvAssertPrintReturn(cudaSuccess == cudaGetDevice(&devId), "Error returned from cudaGetDevice", ); + ncvAssertPrintReturn(cudaSuccess == 
cudaGetDeviceProperties(&devProp, devId), "Error returned from cudaGetDeviceProperties", ); + + allocatorCPU.reset(new NCVMemNativeAllocator(NCVMemoryTypeHostPinned, devProp.textureAlignment)); + data.reset(new NCVMatrixAlloc(*this->allocatorCPU.get(), image.cols, image.rows)); + ncvAssertPrintReturn(data.get()->isMemAllocated(), "NCVTestSourceProvider ctor:: Matrix not allocated", ); + + this->dataWidth = image.cols; + this->dataHeight = image.rows; + + cv::Mat hdr(image.size(), CV_8UC1, data.get()->ptr(), data.get()->pitch()); + image.copyTo(hdr); + + this->bInit = true; + } + + NcvBool fill(NCVMatrix &dst) + { + ncvAssertReturn(this->isInit() && + dst.memType() == allocatorCPU.get()->memType(), false); + + if (dst.width() == 0 || dst.height() == 0) + { + return true; + } + + for (Ncv32u i=0; i<dst.height(); i++) + { + Ncv32u srcLine = i % this->dataHeight; + + Ncv32u srcFullChunks = dst.width() / this->dataWidth; + for (Ncv32u j=0; j<srcFullChunks; j++) + { + memcpy(dst.ptr() + i * dst.stride() + j * this->dataWidth, + this->data.get()->ptr() + this->data.get()->stride() * srcLine, + this->dataWidth * sizeof(T)); + } + + Ncv32u srcLastChunk = dst.width() % this->dataWidth; + memcpy(dst.ptr() + i * dst.stride() + srcFullChunks * this->dataWidth, + this->data.get()->ptr() + this->data.get()->stride() * srcLine, + srcLastChunk * sizeof(T)); + } + + return true; + } + + NcvBool fill(NCVVector &dst) + { + ncvAssertReturn(this->isInit() && + dst.memType() == allocatorCPU.get()->memType(), false); + + if (dst.length() == 0) + { + return true; + } + + Ncv32u srcLen = this->dataWidth * this->dataHeight; + + Ncv32u srcFullChunks = (Ncv32u)dst.length() / srcLen; + for (Ncv32u j=0; j<srcFullChunks; j++) + { + memcpy(dst.ptr() + j * srcLen, this->data.get()->ptr(), srcLen * sizeof(T)); + } + + Ncv32u srcLastChunk = dst.length() % srcLen; + memcpy(dst.ptr() + srcFullChunks * srcLen, this->data.get()->ptr(), srcLastChunk * sizeof(T)); + + return true; + } + + ~NCVTestSourceProvider() + { + data.reset(); + allocatorCPU.reset(); + } + +private: + + NcvBool isInit(void) + { + return this->bInit; + } + + NcvBool bInit; + std::auto_ptr< INCVMemAllocator > allocatorCPU; + std::auto_ptr< NCVMatrixAlloc > data; + Ncv32u dataWidth; + Ncv32u dataHeight; +}; + +#endif // _ncvtestsourceprovider_hpp_ diff --git a/tests/gpu/src/nvidia/TestCompact.cpp b/tests/gpu/src/nvidia/TestCompact.cpp new file mode 100644 index 0000000000..2882f7cad2 --- /dev/null +++ b/tests/gpu/src/nvidia/TestCompact.cpp @@ -0,0 +1,129 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual + * property and proprietary rights in and to this software and + * related documentation and any modifications thereto. + * Any use, reproduction, disclosure, or distribution of this + * software and related documentation without an express license + * agreement from NVIDIA Corporation is strictly prohibited. + */ + +#include "TestCompact.h" + + +TestCompact::TestCompact(std::string testName, NCVTestSourceProvider &src, + Ncv32u length, Ncv32u badElem, Ncv32u badElemPercentage) + : + NCVTestProvider(testName), + src(src), + length(length), + badElem(badElem), + badElemPercentage(badElemPercentage > 100 ?
100 : badElemPercentage) +{ +} + + +bool TestCompact::toString(std::ofstream &strOut) +{ + strOut << "length=" << length << std::endl; + strOut << "badElem=" << badElem << std::endl; + strOut << "badElemPercentage=" << badElemPercentage << std::endl; + return true; +} + + +bool TestCompact::init() +{ + return true; +} + + +bool TestCompact::process() +{ + NCVStatus ncvStat; + bool rcode = false; + + NCVVectorAlloc h_vecSrc(*this->allocatorCPU.get(), this->length); + ncvAssertReturn(h_vecSrc.isMemAllocated(), false); + NCVVectorAlloc d_vecSrc(*this->allocatorGPU.get(), this->length); + ncvAssertReturn(d_vecSrc.isMemAllocated(), false); + + NCVVectorAlloc h_vecDst(*this->allocatorCPU.get(), this->length); + ncvAssertReturn(h_vecDst.isMemAllocated(), false); + NCVVectorAlloc d_vecDst(*this->allocatorGPU.get(), this->length); + ncvAssertReturn(d_vecDst.isMemAllocated(), false); + NCVVectorAlloc h_vecDst_d(*this->allocatorCPU.get(), this->length); + ncvAssertReturn(h_vecDst_d.isMemAllocated(), false); + + NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting()); + NCV_SKIP_COND_BEGIN + ncvAssertReturn(this->src.fill(h_vecSrc), false); + for (Ncv32u i=0; ilength; i++) + { + Ncv32u tmp = (h_vecSrc.ptr()[i]) & 0xFF; + tmp = tmp * 99 / 255; + if (tmp < this->badElemPercentage) + { + h_vecSrc.ptr()[i] = this->badElem; + } + } + NCV_SKIP_COND_END + + NCVVectorAlloc h_dstLen(*this->allocatorCPU.get(), 1); + ncvAssertReturn(h_dstLen.isMemAllocated(), false); + Ncv32u bufSize; + ncvStat = nppsStCompactGetSize_32u(this->length, &bufSize, this->devProp); + ncvAssertReturn(NPPST_SUCCESS == ncvStat, false); + NCVVectorAlloc d_tmpBuf(*this->allocatorGPU.get(), bufSize); + ncvAssertReturn(d_tmpBuf.isMemAllocated(), false); + + Ncv32u h_outElemNum_h = 0; + + NCV_SKIP_COND_BEGIN + ncvStat = h_vecSrc.copySolid(d_vecSrc, 0); + ncvAssertReturn(ncvStat == NPPST_SUCCESS, false); + ncvStat = nppsStCompact_32u(d_vecSrc.ptr(), this->length, + d_vecDst.ptr(), h_dstLen.ptr(), this->badElem, + d_tmpBuf.ptr(), bufSize, this->devProp); + ncvAssertReturn(ncvStat == NPPST_SUCCESS, false); + ncvStat = d_vecDst.copySolid(h_vecDst_d, 0); + ncvAssertReturn(ncvStat == NPPST_SUCCESS, false); + + ncvStat = nppsStCompact_32u_host(h_vecSrc.ptr(), this->length, h_vecDst.ptr(), &h_outElemNum_h, this->badElem); + ncvAssertReturn(ncvStat == NPPST_SUCCESS, false); + NCV_SKIP_COND_END + + //bit-to-bit check + bool bLoopVirgin = true; + + NCV_SKIP_COND_BEGIN + if (h_dstLen.ptr()[0] != h_outElemNum_h) + { + bLoopVirgin = false; + } + else + { + for (Ncv32u i=0; bLoopVirgin && i < h_outElemNum_h; i++) + { + if (h_vecDst.ptr()[i] != h_vecDst_d.ptr()[i]) + { + bLoopVirgin = false; + } + } + } + NCV_SKIP_COND_END + + if (bLoopVirgin) + { + rcode = true; + } + + return rcode; +} + + +bool TestCompact::deinit() +{ + return true; +} diff --git a/tests/gpu/src/nvidia/TestCompact.h b/tests/gpu/src/nvidia/TestCompact.h new file mode 100644 index 0000000000..ba4f93aa10 --- /dev/null +++ b/tests/gpu/src/nvidia/TestCompact.h @@ -0,0 +1,41 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual + * property and proprietary rights in and to this software and + * related documentation and any modifications thereto. + * Any use, reproduction, disclosure, or distribution of this + * software and related documentation without an express license + * agreement from NVIDIA Corporation is strictly prohibited. 
+ */ +#ifndef _testhypothesescompact_h_ +#define _testhypothesescompact_h_ + +#include "NCVTest.hpp" +#include "NCVTestSourceProvider.hpp" + + +class TestCompact : public NCVTestProvider +{ +public: + + TestCompact(std::string testName, NCVTestSourceProvider &src, + Ncv32u length, Ncv32u badElem, Ncv32u badElemPercentage); + + virtual bool init(); + virtual bool process(); + virtual bool deinit(); + virtual bool toString(std::ofstream &strOut); + +private: + TestCompact(const TestCompact&); + TestCompact& operator=(const TestCompact&); + + + NCVTestSourceProvider &src; + Ncv32u length; + Ncv32u badElem; + Ncv32u badElemPercentage; +}; + +#endif // _testhypothesescompact_h_ diff --git a/tests/gpu/src/nvidia/TestDrawRects.cpp b/tests/gpu/src/nvidia/TestDrawRects.cpp new file mode 100644 index 0000000000..b86ac58b15 --- /dev/null +++ b/tests/gpu/src/nvidia/TestDrawRects.cpp @@ -0,0 +1,163 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual + * property and proprietary rights in and to this software and + * related documentation and any modifications thereto. + * Any use, reproduction, disclosure, or distribution of this + * software and related documentation without an express license + * agreement from NVIDIA Corporation is strictly prohibited. + */ + +#include "TestDrawRects.h" +#include "NCVHaarObjectDetection.hpp" + + +template +TestDrawRects::TestDrawRects(std::string testName, NCVTestSourceProvider &src, NCVTestSourceProvider &src32u, + Ncv32u width, Ncv32u height, Ncv32u numRects, T color) + : + NCVTestProvider(testName), + src(src), + src32u(src32u), + width(width), + height(height), + numRects(numRects), + color(color) +{ +} + + +template +bool TestDrawRects::toString(std::ofstream &strOut) +{ + strOut << "sizeof(T)=" << sizeof(T) << std::endl; + strOut << "width=" << width << std::endl; + strOut << "height=" << height << std::endl; + strOut << "numRects=" << numRects << std::endl; + strOut << "color=" << color << std::endl; + return true; +} + + +template +bool TestDrawRects::init() +{ + return true; +} + + +template +bool TestDrawRects::process() +{ + NCVStatus ncvStat; + bool rcode = false; + + NCVMatrixAlloc d_img(*this->allocatorGPU.get(), this->width, this->height); + ncvAssertReturn(d_img.isMemAllocated(), false); + NCVMatrixAlloc h_img(*this->allocatorCPU.get(), this->width, this->height); + ncvAssertReturn(h_img.isMemAllocated(), false); + NCVMatrixAlloc h_img_d(*this->allocatorCPU.get(), this->width, this->height); + ncvAssertReturn(h_img_d.isMemAllocated(), false); + + NCVVectorAlloc d_rects(*this->allocatorGPU.get(), this->numRects); + ncvAssertReturn(d_rects.isMemAllocated(), false); + NCVVectorAlloc h_rects(*this->allocatorCPU.get(), this->numRects); + ncvAssertReturn(h_rects.isMemAllocated(), false); + + NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting()); + NCV_SKIP_COND_BEGIN + ncvAssertReturn(this->src.fill(h_img), false); + ncvStat = h_img.copySolid(d_img, 0); + ncvAssertReturn(ncvStat == NCV_SUCCESS, false); + ncvAssertCUDAReturn(cudaStreamSynchronize(0), false); + + //fill vector of rectangles with random rects covering the input + NCVVectorReuse h_rects_as32u(h_rects.getSegment()); + ncvAssertReturn(h_rects_as32u.isMemReused(), false); + ncvAssertReturn(this->src32u.fill(h_rects_as32u), false); + for (Ncv32u i=0; inumRects; i++) + { + h_rects.ptr()[i].x = (Ncv32u)(((1.0 * h_rects.ptr()[i].x) / RAND_MAX) * (this->width-2)); + h_rects.ptr()[i].y = (Ncv32u)(((1.0 * 
h_rects.ptr()[i].y) / RAND_MAX) * (this->height-2)); + h_rects.ptr()[i].width = (Ncv32u)(((1.0 * h_rects.ptr()[i].width) / RAND_MAX) * (this->width+10 - h_rects.ptr()[i].x)); + h_rects.ptr()[i].height = (Ncv32u)(((1.0 * h_rects.ptr()[i].height) / RAND_MAX) * (this->height+10 - h_rects.ptr()[i].y)); + } + ncvStat = h_rects.copySolid(d_rects, 0); + ncvAssertReturn(ncvStat == NCV_SUCCESS, false); + ncvAssertCUDAReturn(cudaStreamSynchronize(0), false); + + if (sizeof(T) == sizeof(Ncv32u)) + { + ncvStat = ncvDrawRects_32u_device((Ncv32u *)d_img.ptr(), d_img.stride(), this->width, this->height, + (NcvRect32u *)d_rects.ptr(), this->numRects, this->color, 0); + } + else if (sizeof(T) == sizeof(Ncv8u)) + { + ncvStat = ncvDrawRects_8u_device((Ncv8u *)d_img.ptr(), d_img.stride(), this->width, this->height, + (NcvRect32u *)d_rects.ptr(), this->numRects, (Ncv8u)this->color, 0); + } + else + { + ncvAssertPrintReturn(false, "Incorrect drawrects test instance", false); + } + ncvAssertReturn(ncvStat == NCV_SUCCESS, false); + NCV_SKIP_COND_END + + ncvStat = d_img.copySolid(h_img_d, 0); + ncvAssertReturn(ncvStat == NCV_SUCCESS, false); + ncvAssertCUDAReturn(cudaStreamSynchronize(0), false); + + NCV_SKIP_COND_BEGIN + if (sizeof(T) == sizeof(Ncv32u)) + { + ncvStat = ncvDrawRects_32u_host((Ncv32u *)h_img.ptr(), h_img.stride(), this->width, this->height, + (NcvRect32u *)h_rects.ptr(), this->numRects, this->color); + } + else if (sizeof(T) == sizeof(Ncv8u)) + { + ncvStat = ncvDrawRects_8u_host((Ncv8u *)h_img.ptr(), h_img.stride(), this->width, this->height, + (NcvRect32u *)h_rects.ptr(), this->numRects, (Ncv8u)this->color); + } + else + { + ncvAssertPrintReturn(false, "Incorrect drawrects test instance", false); + } + ncvAssertReturn(ncvStat == NCV_SUCCESS, false); + NCV_SKIP_COND_END + + //bit-to-bit check + bool bLoopVirgin = true; + + NCV_SKIP_COND_BEGIN + //const Ncv64f relEPS = 0.005; + for (Ncv32u i=0; bLoopVirgin && i < h_img.height(); i++) + { + for (Ncv32u j=0; bLoopVirgin && j < h_img.width(); j++) + { + if (h_img.ptr()[h_img.stride()*i+j] != h_img_d.ptr()[h_img_d.stride()*i+j]) + { + bLoopVirgin = false; + } + } + } + NCV_SKIP_COND_END + + if (bLoopVirgin) + { + rcode = true; + } + + return rcode; +} + + +template +bool TestDrawRects::deinit() +{ + return true; +} + + +template class TestDrawRects; +template class TestDrawRects; diff --git a/tests/gpu/src/nvidia/TestDrawRects.h b/tests/gpu/src/nvidia/TestDrawRects.h new file mode 100644 index 0000000000..bde80fe6c6 --- /dev/null +++ b/tests/gpu/src/nvidia/TestDrawRects.h @@ -0,0 +1,44 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual + * property and proprietary rights in and to this software and + * related documentation and any modifications thereto. + * Any use, reproduction, disclosure, or distribution of this + * software and related documentation without an express license + * agreement from NVIDIA Corporation is strictly prohibited. 
+ */ +#ifndef _testdrawrects_h_ +#define _testdrawrects_h_ + +#include "NCVTest.hpp" +#include "NCVTestSourceProvider.hpp" + + +template +class TestDrawRects : public NCVTestProvider +{ +public: + + TestDrawRects(std::string testName, NCVTestSourceProvider &src, NCVTestSourceProvider &src32u, + Ncv32u width, Ncv32u height, Ncv32u numRects, T color); + + virtual bool init(); + virtual bool process(); + virtual bool deinit(); + virtual bool toString(std::ofstream &strOut); + +private: + + TestDrawRects(const TestDrawRects&); + TestDrawRects& operator=(const TestDrawRects&); + + NCVTestSourceProvider &src; + NCVTestSourceProvider &src32u; + Ncv32u width; + Ncv32u height; + Ncv32u numRects; + T color; +}; + +#endif // _testdrawrects_h_ diff --git a/tests/gpu/src/nvidia/TestHaarCascadeApplication.cpp b/tests/gpu/src/nvidia/TestHaarCascadeApplication.cpp new file mode 100644 index 0000000000..e3a13fc320 --- /dev/null +++ b/tests/gpu/src/nvidia/TestHaarCascadeApplication.cpp @@ -0,0 +1,267 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual + * property and proprietary rights in and to this software and + * related documentation and any modifications thereto. + * Any use, reproduction, disclosure, or distribution of this + * software and related documentation without an express license + * agreement from NVIDIA Corporation is strictly prohibited. + */ + +#include +#include "TestHaarCascadeApplication.h" +#include "NCVHaarObjectDetection.hpp" + + +TestHaarCascadeApplication::TestHaarCascadeApplication(std::string testName, NCVTestSourceProvider &src, + std::string cascadeName, Ncv32u width, Ncv32u height) + : + NCVTestProvider(testName), + src(src), + cascadeName(cascadeName), + width(width), + height(height) +{ +} + + +bool TestHaarCascadeApplication::toString(std::ofstream &strOut) +{ + strOut << "cascadeName=" << cascadeName << std::endl; + strOut << "width=" << width << std::endl; + strOut << "height=" << height << std::endl; + return true; +} + + +bool TestHaarCascadeApplication::init() +{ + return true; +} + + +bool TestHaarCascadeApplication::process() +{ + NCVStatus ncvStat; + bool rcode = false; + + Ncv32u numStages, numNodes, numFeatures; + + ncvStat = ncvHaarGetClassifierSize(this->cascadeName, numStages, numNodes, numFeatures); + ncvAssertReturn(ncvStat == NCV_SUCCESS, false); + + NCVVectorAlloc h_HaarStages(*this->allocatorCPU.get(), numStages); + ncvAssertReturn(h_HaarStages.isMemAllocated(), false); + NCVVectorAlloc h_HaarNodes(*this->allocatorCPU.get(), numNodes); + ncvAssertReturn(h_HaarNodes.isMemAllocated(), false); + NCVVectorAlloc h_HaarFeatures(*this->allocatorCPU.get(), numFeatures); + ncvAssertReturn(h_HaarFeatures.isMemAllocated(), false); + + NCVVectorAlloc d_HaarStages(*this->allocatorGPU.get(), numStages); + ncvAssertReturn(d_HaarStages.isMemAllocated(), false); + NCVVectorAlloc d_HaarNodes(*this->allocatorGPU.get(), numNodes); + ncvAssertReturn(d_HaarNodes.isMemAllocated(), false); + NCVVectorAlloc d_HaarFeatures(*this->allocatorGPU.get(), numFeatures); + ncvAssertReturn(d_HaarFeatures.isMemAllocated(), false); + + HaarClassifierCascadeDescriptor haar; + haar.ClassifierSize.width = haar.ClassifierSize.height = 1; + haar.bNeedsTiltedII = false; + haar.NumClassifierRootNodes = numNodes; + haar.NumClassifierTotalNodes = numNodes; + haar.NumFeatures = numFeatures; + haar.NumStages = numStages; + + NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting()); + NCV_SKIP_COND_BEGIN + + 
ncvStat = ncvHaarLoadFromFile_host(this->cascadeName, haar, h_HaarStages, h_HaarNodes, h_HaarFeatures); + ncvAssertReturn(ncvStat == NCV_SUCCESS, false); + + ncvAssertReturn(NCV_SUCCESS == h_HaarStages.copySolid(d_HaarStages, 0), false); + ncvAssertReturn(NCV_SUCCESS == h_HaarNodes.copySolid(d_HaarNodes, 0), false); + ncvAssertReturn(NCV_SUCCESS == h_HaarFeatures.copySolid(d_HaarFeatures, 0), false); + ncvAssertCUDAReturn(cudaStreamSynchronize(0), false); + + NCV_SKIP_COND_END + + NcvSize32s srcRoi, srcIIRoi, searchRoi; + srcRoi.width = this->width; + srcRoi.height = this->height; + srcIIRoi.width = srcRoi.width + 1; + srcIIRoi.height = srcRoi.height + 1; + searchRoi.width = srcIIRoi.width - haar.ClassifierSize.width; + searchRoi.height = srcIIRoi.height - haar.ClassifierSize.height; + if (searchRoi.width <= 0 || searchRoi.height <= 0) + { + return false; + } + NcvSize32u searchRoiU(searchRoi.width, searchRoi.height); + + NCVMatrixAlloc d_img(*this->allocatorGPU.get(), this->width, this->height); + ncvAssertReturn(d_img.isMemAllocated(), false); + NCVMatrixAlloc h_img(*this->allocatorCPU.get(), this->width, this->height); + ncvAssertReturn(h_img.isMemAllocated(), false); + + Ncv32u integralWidth = this->width + 1; + Ncv32u integralHeight = this->height + 1; + + NCVMatrixAlloc d_integralImage(*this->allocatorGPU.get(), integralWidth, integralHeight); + ncvAssertReturn(d_integralImage.isMemAllocated(), false); + NCVMatrixAlloc d_sqIntegralImage(*this->allocatorGPU.get(), integralWidth, integralHeight); + ncvAssertReturn(d_sqIntegralImage.isMemAllocated(), false); + NCVMatrixAlloc h_integralImage(*this->allocatorCPU.get(), integralWidth, integralHeight); + ncvAssertReturn(h_integralImage.isMemAllocated(), false); + NCVMatrixAlloc h_sqIntegralImage(*this->allocatorCPU.get(), integralWidth, integralHeight); + ncvAssertReturn(h_sqIntegralImage.isMemAllocated(), false); + + NCVMatrixAlloc d_rectStdDev(*this->allocatorGPU.get(), this->width, this->height); + ncvAssertReturn(d_rectStdDev.isMemAllocated(), false); + NCVMatrixAlloc d_pixelMask(*this->allocatorGPU.get(), this->width, this->height); + ncvAssertReturn(d_pixelMask.isMemAllocated(), false); + NCVMatrixAlloc h_rectStdDev(*this->allocatorCPU.get(), this->width, this->height); + ncvAssertReturn(h_rectStdDev.isMemAllocated(), false); + NCVMatrixAlloc h_pixelMask(*this->allocatorCPU.get(), this->width, this->height); + ncvAssertReturn(h_pixelMask.isMemAllocated(), false); + + NCVVectorAlloc d_hypotheses(*this->allocatorGPU.get(), this->width * this->height); + ncvAssertReturn(d_hypotheses.isMemAllocated(), false); + NCVVectorAlloc h_hypotheses(*this->allocatorCPU.get(), this->width * this->height); + ncvAssertReturn(h_hypotheses.isMemAllocated(), false); + + NCVStatus nppStat; + Ncv32u szTmpBufIntegral, szTmpBufSqIntegral; + nppStat = nppiStIntegralGetSize_8u32u(NcvSize32u(this->width, this->height), &szTmpBufIntegral, this->devProp); + ncvAssertReturn(nppStat == NPPST_SUCCESS, false); + nppStat = nppiStSqrIntegralGetSize_8u64u(NcvSize32u(this->width, this->height), &szTmpBufSqIntegral, this->devProp); + ncvAssertReturn(nppStat == NPPST_SUCCESS, false); + NCVVectorAlloc d_tmpIIbuf(*this->allocatorGPU.get(), std::max(szTmpBufIntegral, szTmpBufSqIntegral)); + ncvAssertReturn(d_tmpIIbuf.isMemAllocated(), false); + + Ncv32u detectionsOnThisScale_d = 0; + Ncv32u detectionsOnThisScale_h = 0; + + NCV_SKIP_COND_BEGIN + + ncvAssertReturn(this->src.fill(h_img), false); + ncvStat = h_img.copySolid(d_img, 0); + ncvAssertReturn(ncvStat == NCV_SUCCESS, 
false); + ncvAssertCUDAReturn(cudaStreamSynchronize(0), false); + + nppStat = nppiStIntegral_8u32u_C1R(d_img.ptr(), d_img.pitch(), + d_integralImage.ptr(), d_integralImage.pitch(), + NcvSize32u(d_img.width(), d_img.height()), + d_tmpIIbuf.ptr(), szTmpBufIntegral, this->devProp); + ncvAssertReturn(nppStat == NPPST_SUCCESS, false); + + nppStat = nppiStSqrIntegral_8u64u_C1R(d_img.ptr(), d_img.pitch(), + d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(), + NcvSize32u(d_img.width(), d_img.height()), + d_tmpIIbuf.ptr(), szTmpBufSqIntegral, this->devProp); + ncvAssertReturn(nppStat == NPPST_SUCCESS, false); + + const NcvRect32u rect( + HAAR_STDDEV_BORDER, + HAAR_STDDEV_BORDER, + haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER, + haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER); + nppStat = nppiStRectStdDev_32f_C1R( + d_integralImage.ptr(), d_integralImage.pitch(), + d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(), + d_rectStdDev.ptr(), d_rectStdDev.pitch(), + NcvSize32u(searchRoi.width, searchRoi.height), rect, + 1.0f, true); + ncvAssertReturn(nppStat == NPPST_SUCCESS, false); + + ncvStat = d_integralImage.copySolid(h_integralImage, 0); + ncvAssertReturn(ncvStat == NCV_SUCCESS, false); + ncvStat = d_rectStdDev.copySolid(h_rectStdDev, 0); + ncvAssertReturn(ncvStat == NCV_SUCCESS, false); + + for (Ncv32u i=0; iallocatorGPU.get(), *this->allocatorCPU.get(), + devProp, 0); + ncvAssertReturn(ncvStat == NCV_SUCCESS, false); + + NCVMatrixAlloc h_pixelMask_d(*this->allocatorCPU.get(), this->width, this->height); + ncvAssertReturn(h_pixelMask_d.isMemAllocated(), false); + + //bit-to-bit check + bool bLoopVirgin = true; + + NCV_SKIP_COND_BEGIN + + ncvStat = d_pixelMask.copySolid(h_pixelMask_d, 0); + ncvAssertReturn(ncvStat == NCV_SUCCESS, false); + + if (detectionsOnThisScale_d != detectionsOnThisScale_h) + { + bLoopVirgin = false; + } + else + { + std::sort(h_pixelMask_d.ptr(), h_pixelMask_d.ptr() + detectionsOnThisScale_d); + for (Ncv32u i=0; i &src, + std::string cascadeName, Ncv32u width, Ncv32u height); + + virtual bool init(); + virtual bool process(); + virtual bool deinit(); + virtual bool toString(std::ofstream &strOut); + +private: + TestHaarCascadeApplication(const TestHaarCascadeApplication&); + TestHaarCascadeApplication& operator=(const TestHaarCascadeApplication&); + + + NCVTestSourceProvider &src; + std::string cascadeName; + Ncv32u width; + Ncv32u height; +}; + +#endif // _testhaarcascadeapplication_h_ diff --git a/tests/gpu/src/nvidia/TestHaarCascadeLoader.cpp b/tests/gpu/src/nvidia/TestHaarCascadeLoader.cpp new file mode 100644 index 0000000000..8991e69cd8 --- /dev/null +++ b/tests/gpu/src/nvidia/TestHaarCascadeLoader.cpp @@ -0,0 +1,123 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual + * property and proprietary rights in and to this software and + * related documentation and any modifications thereto. + * Any use, reproduction, disclosure, or distribution of this + * software and related documentation without an express license + * agreement from NVIDIA Corporation is strictly prohibited. 
+ */ + +#include "TestHaarCascadeLoader.h" +#include "NCVHaarObjectDetection.hpp" + + +TestHaarCascadeLoader::TestHaarCascadeLoader(std::string testName, std::string cascadeName) + : + NCVTestProvider(testName), + cascadeName(cascadeName) +{ +} + + +bool TestHaarCascadeLoader::toString(std::ofstream &strOut) +{ + strOut << "cascadeName=" << cascadeName << std::endl; + return true; +} + + +bool TestHaarCascadeLoader::init() +{ + return true; +} + + +bool TestHaarCascadeLoader::process() +{ + NCVStatus ncvStat; + bool rcode = false; + + Ncv32u numStages, numNodes, numFeatures; + Ncv32u numStages_2 = 0, numNodes_2 = 0, numFeatures_2 = 0; + + ncvStat = ncvHaarGetClassifierSize(this->cascadeName, numStages, numNodes, numFeatures); + ncvAssertReturn(ncvStat == NCV_SUCCESS, false); + + NCVVectorAlloc h_HaarStages(*this->allocatorCPU.get(), numStages); + ncvAssertReturn(h_HaarStages.isMemAllocated(), false); + NCVVectorAlloc h_HaarNodes(*this->allocatorCPU.get(), numNodes); + ncvAssertReturn(h_HaarNodes.isMemAllocated(), false); + NCVVectorAlloc h_HaarFeatures(*this->allocatorCPU.get(), numFeatures); + ncvAssertReturn(h_HaarFeatures.isMemAllocated(), false); + + NCVVectorAlloc h_HaarStages_2(*this->allocatorCPU.get(), numStages); + ncvAssertReturn(h_HaarStages_2.isMemAllocated(), false); + NCVVectorAlloc h_HaarNodes_2(*this->allocatorCPU.get(), numNodes); + ncvAssertReturn(h_HaarNodes_2.isMemAllocated(), false); + NCVVectorAlloc h_HaarFeatures_2(*this->allocatorCPU.get(), numFeatures); + ncvAssertReturn(h_HaarFeatures_2.isMemAllocated(), false); + + HaarClassifierCascadeDescriptor haar; + HaarClassifierCascadeDescriptor haar_2; + + NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting()); + NCV_SKIP_COND_BEGIN + + const std::string testNvbinName = "test.nvbin"; + ncvStat = ncvHaarLoadFromFile_host(this->cascadeName, haar, h_HaarStages, h_HaarNodes, h_HaarFeatures); + ncvAssertReturn(ncvStat == NCV_SUCCESS, false); + + ncvStat = ncvHaarStoreNVBIN_host(testNvbinName, haar, h_HaarStages, h_HaarNodes, h_HaarFeatures); + ncvAssertReturn(ncvStat == NCV_SUCCESS, false); + + ncvStat = ncvHaarGetClassifierSize(testNvbinName, numStages_2, numNodes_2, numFeatures_2); + ncvAssertReturn(ncvStat == NCV_SUCCESS, false); + + ncvStat = ncvHaarLoadFromFile_host(testNvbinName, haar_2, h_HaarStages_2, h_HaarNodes_2, h_HaarFeatures_2); + ncvAssertReturn(ncvStat == NCV_SUCCESS, false); + + NCV_SKIP_COND_END + + //bit-to-bit check + bool bLoopVirgin = true; + + NCV_SKIP_COND_BEGIN + + if ( + numStages_2 != numStages || + numNodes_2 != numNodes || + numFeatures_2 != numFeatures || + haar.NumStages != haar_2.NumStages || + haar.NumClassifierRootNodes != haar_2.NumClassifierRootNodes || + haar.NumClassifierTotalNodes != haar_2.NumClassifierTotalNodes || + haar.NumFeatures != haar_2.NumFeatures || + haar.ClassifierSize.width != haar_2.ClassifierSize.width || + haar.ClassifierSize.height != haar_2.ClassifierSize.height || + haar.bNeedsTiltedII != haar_2.bNeedsTiltedII || + haar.bHasStumpsOnly != haar_2.bHasStumpsOnly ) + { + bLoopVirgin = false; + } + if (memcmp(h_HaarStages.ptr(), h_HaarStages_2.ptr(), haar.NumStages * sizeof(HaarStage64)) || + memcmp(h_HaarNodes.ptr(), h_HaarNodes_2.ptr(), haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128)) || + memcmp(h_HaarFeatures.ptr(), h_HaarFeatures_2.ptr(), haar.NumFeatures * sizeof(HaarFeature64)) ) + { + bLoopVirgin = false; + } + NCV_SKIP_COND_END + + if (bLoopVirgin) + { + rcode = true; + } + + return rcode; +} + + +bool TestHaarCascadeLoader::deinit() +{ + return 
true; +} diff --git a/tests/gpu/src/nvidia/TestHaarCascadeLoader.h b/tests/gpu/src/nvidia/TestHaarCascadeLoader.h new file mode 100644 index 0000000000..717a38e2e0 --- /dev/null +++ b/tests/gpu/src/nvidia/TestHaarCascadeLoader.h @@ -0,0 +1,34 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual + * property and proprietary rights in and to this software and + * related documentation and any modifications thereto. + * Any use, reproduction, disclosure, or distribution of this + * software and related documentation without an express license + * agreement from NVIDIA Corporation is strictly prohibited. + */ +#ifndef _testhaarcascadeloader_h_ +#define _testhaarcascadeloader_h_ + +#include "NCVTest.hpp" +#include "NCVTestSourceProvider.hpp" + + +class TestHaarCascadeLoader : public NCVTestProvider +{ +public: + + TestHaarCascadeLoader(std::string testName, std::string cascadeName); + + virtual bool init(); + virtual bool process(); + virtual bool deinit(); + virtual bool toString(std::ofstream &strOut); + +private: + + std::string cascadeName; +}; + +#endif // _testhaarcascadeloader_h_ diff --git a/tests/gpu/src/nvidia/TestHypothesesFilter.cpp b/tests/gpu/src/nvidia/TestHypothesesFilter.cpp new file mode 100644 index 0000000000..c41ca0025f --- /dev/null +++ b/tests/gpu/src/nvidia/TestHypothesesFilter.cpp @@ -0,0 +1,176 @@ +/* + * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. + * + * NVIDIA Corporation and its licensors retain all intellectual + * property and proprietary rights in and to this software and + * related documentation and any modifications thereto. + * Any use, reproduction, disclosure, or distribution of this + * software and related documentation without an express license + * agreement from NVIDIA Corporation is strictly prohibited. 
+ */ + +#include "TestHypothesesFilter.h" +#include "NCVHaarObjectDetection.hpp" + + +TestHypothesesFilter::TestHypothesesFilter(std::string testName, NCVTestSourceProvider &src, + Ncv32u numDstRects, Ncv32u minNeighbors, Ncv32f eps) + : + NCVTestProvider(testName), + src(src), + numDstRects(numDstRects), + minNeighbors(minNeighbors), + eps(eps) +{ +} + + +bool TestHypothesesFilter::toString(std::ofstream &strOut) +{ + strOut << "numDstRects=" << numDstRects << std::endl; + strOut << "minNeighbors=" << minNeighbors << std::endl; + strOut << "eps=" << eps << std::endl; + return true; +} + + +bool TestHypothesesFilter::init() +{ + this->canvasWidth = 4096; + this->canvasHeight = 4096; + return true; +} + + +bool compareRects(const NcvRect32u &r1, const NcvRect32u &r2, Ncv32f eps) +{ + double delta = eps*(std::min(r1.width, r2.width) + std::min(r1.height, r2.height))*0.5; + return std::abs((Ncv32s)r1.x - (Ncv32s)r2.x) <= delta && + std::abs((Ncv32s)r1.y - (Ncv32s)r2.y) <= delta && + std::abs((Ncv32s)r1.x + (Ncv32s)r1.width - (Ncv32s)r2.x - (Ncv32s)r2.width) <= delta && + std::abs((Ncv32s)r1.y + (Ncv32s)r1.height - (Ncv32s)r2.y - (Ncv32s)r2.height) <= delta; +} + + +inline bool operator < (const NcvRect32u &a, const NcvRect32u &b) +{ + return a.x < b.x; +} + + +bool TestHypothesesFilter::process() +{ + NCVStatus ncvStat; + bool rcode = false; + + NCVVectorAlloc h_random32u(*this->allocatorCPU.get(), this->numDstRects * sizeof(NcvRect32u) / sizeof(Ncv32u)); + ncvAssertReturn(h_random32u.isMemAllocated(), false); + + Ncv32u srcSlotSize = 2 * this->minNeighbors + 1; + + NCVVectorAlloc h_vecSrc(*this->allocatorCPU.get(), this->numDstRects*srcSlotSize); + ncvAssertReturn(h_vecSrc.isMemAllocated(), false); + NCVVectorAlloc h_vecDst_groundTruth(*this->allocatorCPU.get(), this->numDstRects); + ncvAssertReturn(h_vecDst_groundTruth.isMemAllocated(), false); + + NCV_SET_SKIP_COND(this->allocatorCPU.get()->isCounting()); + + NCV_SKIP_COND_BEGIN + ncvAssertReturn(this->src.fill(h_random32u), false); + Ncv32u randCnt = 0; + Ncv64f randVal; + + for (Ncv32u i=0; inumDstRects; i++) + { + h_vecDst_groundTruth.ptr()[i].x = i * this->canvasWidth / this->numDstRects + this->canvasWidth / (this->numDstRects * 4); + h_vecDst_groundTruth.ptr()[i].y = i * this->canvasHeight / this->numDstRects + this->canvasHeight / (this->numDstRects * 4); + h_vecDst_groundTruth.ptr()[i].width = this->canvasWidth / (this->numDstRects * 2); + h_vecDst_groundTruth.ptr()[i].height = this->canvasHeight / (this->numDstRects * 2); + + Ncv32u numNeighbors = this->minNeighbors + 1 + (Ncv32u)(((1.0 * h_random32u.ptr()[i]) * (this->minNeighbors + 1)) / 0xFFFFFFFF); + numNeighbors = (numNeighbors > srcSlotSize) ? 
+
+        //fill in strong hypotheses                           (2 * ((1.0 * randVal) / 0xFFFFFFFF) - 1)
+        for (Ncv32u j=0; j<numNeighbors; j++)
+        {
+            randVal = (1.0 * h_random32u.ptr()[randCnt++]) / 0xFFFFFFFF; randCnt = randCnt % h_random32u.length();
+            h_vecSrc.ptr()[srcSlotSize * i + j].x =
+                h_vecDst_groundTruth.ptr()[i].x +
+                (Ncv32s)(h_vecDst_groundTruth.ptr()[i].width * this->eps * (randVal - 0.5));
+            randVal = (1.0 * h_random32u.ptr()[randCnt++]) / 0xFFFFFFFF; randCnt = randCnt % h_random32u.length();
+            h_vecSrc.ptr()[srcSlotSize * i + j].y =
+                h_vecDst_groundTruth.ptr()[i].y +
+                (Ncv32s)(h_vecDst_groundTruth.ptr()[i].height * this->eps * (randVal - 0.5));
+            h_vecSrc.ptr()[srcSlotSize * i + j].width = h_vecDst_groundTruth.ptr()[i].width;
+            h_vecSrc.ptr()[srcSlotSize * i + j].height = h_vecDst_groundTruth.ptr()[i].height;
+        }
+
+        //generate weak hypotheses (to be removed in processing)
+        for (Ncv32u j=numNeighbors; j<srcSlotSize; j++)
+        {
+            randVal = (1.0 * h_random32u.ptr()[randCnt++]) / 0xFFFFFFFF; randCnt = randCnt % h_random32u.length();
+            h_vecSrc.ptr()[srcSlotSize * i + j].x =
+                this->canvasWidth + h_vecDst_groundTruth.ptr()[i].x +
+                (Ncv32s)(h_vecDst_groundTruth.ptr()[i].width * this->eps * (randVal - 0.5));
+            randVal = (1.0 * h_random32u.ptr()[randCnt++]) / 0xFFFFFFFF; randCnt = randCnt % h_random32u.length();
+            h_vecSrc.ptr()[srcSlotSize * i + j].y =
+                this->canvasHeight + h_vecDst_groundTruth.ptr()[i].y +
+                (Ncv32s)(h_vecDst_groundTruth.ptr()[i].height * this->eps * (randVal - 0.5));
+            h_vecSrc.ptr()[srcSlotSize * i + j].width = h_vecDst_groundTruth.ptr()[i].width;
+            h_vecSrc.ptr()[srcSlotSize * i + j].height = h_vecDst_groundTruth.ptr()[i].height;
+        }
+    }
+
+    //shuffle
+    for (Ncv32u i=0; i<this->numDstRects*srcSlotSize-1; i++)
+    {
+        Ncv32u randVal = h_random32u.ptr()[randCnt++]; randCnt = randCnt % h_random32u.length();
+        Ncv32u secondSwap = randVal % (this->numDstRects*srcSlotSize-1 - i);
+        NcvRect32u tmp = h_vecSrc.ptr()[i + secondSwap];
+        h_vecSrc.ptr()[i + secondSwap] = h_vecSrc.ptr()[i];
+        h_vecSrc.ptr()[i] = tmp;
+    }
+    NCV_SKIP_COND_END
+
+    Ncv32u numHypothesesSrc = h_vecSrc.length();
+    NCV_SKIP_COND_BEGIN
+    ncvStat = ncvFilterHypotheses_host(h_vecSrc, numHypothesesSrc, this->minNeighbors, this->eps, NULL);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+    NCV_SKIP_COND_END
+
+    //verification
+    bool bLoopVirgin = true;
+
+    NCV_SKIP_COND_BEGIN
+    if (numHypothesesSrc != this->numDstRects)
+    {
+        bLoopVirgin = false;
+    }
+    else
+    {
+        std::vector<NcvRect32u> tmpRects(numHypothesesSrc);
+        memcpy(&tmpRects[0], h_vecSrc.ptr(), numHypothesesSrc * sizeof(NcvRect32u));
+        std::sort(tmpRects.begin(), tmpRects.end());
+        for (Ncv32u i=0; i<numHypothesesSrc && bLoopVirgin; i++)
+        {
+            if (!compareRects(tmpRects[i], h_vecDst_groundTruth.ptr()[i], this->eps))
+            {
+                bLoopVirgin = false;
+            }
+        }
+    }
+    NCV_SKIP_COND_END
+
+    if (bLoopVirgin)
+    {
+        rcode = true;
+    }
+
+    return rcode;
+}
+
+
+bool TestHypothesesFilter::deinit()
+{
+    return true;
+}
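TestHypothesesFilter seeds each expected output rectangle with at least minNeighbors + 1 jittered copies (jitter bounded by eps), pushes the disposable weak hypotheses off-canvas, shuffles everything, and then expects ncvFilterHypotheses_host to return exactly numDstRects survivors matching the ground truth within compareRects' eps tolerance. The semantics appear to mirror OpenCV's CPU-side rectangle grouping; for comparison, the analogous call there (illustrative only, not used by this test):

    // CPU OpenCV analogue of the filtering step exercised above.
    #include <opencv2/objdetect/objdetect.hpp>
    #include <vector>

    void groupLikeTheTest(std::vector<cv::Rect> &hypotheses, int minNeighbors, double eps)
    {
        // Clusters rectangles whose relative deviation is within eps and
        // keeps only clusters with more than minNeighbors members.
        cv::groupRectangles(hypotheses, minNeighbors, eps);
    }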
diff --git a/tests/gpu/src/nvidia/TestHypothesesFilter.h b/tests/gpu/src/nvidia/TestHypothesesFilter.h
new file mode 100644
index 0000000000..63894f82d5
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestHypothesesFilter.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual
+ * property and proprietary rights in and to this software and
+ * related documentation and any modifications thereto.
+ * Any use, reproduction, disclosure, or distribution of this
+ * software and related documentation without an express license
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _testhypothesesfilter_h_
+#define _testhypothesesfilter_h_
+
+#include "NCVTest.hpp"
+#include "NCVTestSourceProvider.hpp"
+
+
+class TestHypothesesFilter : public NCVTestProvider
+{
+public:
+
+    TestHypothesesFilter(std::string testName, NCVTestSourceProvider<Ncv32u> &src,
+                         Ncv32u numDstRects, Ncv32u minNeighbors, Ncv32f eps);
+
+    virtual bool init();
+    virtual bool process();
+    virtual bool deinit();
+    virtual bool toString(std::ofstream &strOut);
+
+private:
+
+    TestHypothesesFilter(const TestHypothesesFilter&);
+    TestHypothesesFilter& operator=(const TestHypothesesFilter&);
+
+    NCVTestSourceProvider<Ncv32u> &src;
+    Ncv32u numDstRects;
+    Ncv32u minNeighbors;
+    Ncv32f eps;
+
+    Ncv32u canvasWidth;
+    Ncv32u canvasHeight;
+};
+
+#endif // _testhypothesesfilter_h_
diff --git a/tests/gpu/src/nvidia/TestHypothesesGrow.cpp b/tests/gpu/src/nvidia/TestHypothesesGrow.cpp
new file mode 100644
index 0000000000..3ca076c973
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestHypothesesGrow.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual
+ * property and proprietary rights in and to this software and
+ * related documentation and any modifications thereto.
+ * Any use, reproduction, disclosure, or distribution of this
+ * software and related documentation without an express license
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+
+#include "TestHypothesesGrow.h"
+#include "NCVHaarObjectDetection.hpp"
+
+
+TestHypothesesGrow::TestHypothesesGrow(std::string testName, NCVTestSourceProvider<Ncv32u> &src,
+                                       Ncv32u rectWidth, Ncv32u rectHeight, Ncv32f rectScale,
+                                       Ncv32u maxLenSrc, Ncv32u lenSrc, Ncv32u maxLenDst, Ncv32u lenDst)
+    :
+    NCVTestProvider(testName),
+    src(src),
+    rectWidth(rectWidth),
+    rectHeight(rectHeight),
+    rectScale(rectScale),
+    maxLenSrc(maxLenSrc),
+    lenSrc(lenSrc),
+    maxLenDst(maxLenDst),
+    lenDst(lenDst)
+{
+}
+
+
+bool TestHypothesesGrow::toString(std::ofstream &strOut)
+{
+    strOut << "rectWidth=" << rectWidth << std::endl;
+    strOut << "rectHeight=" << rectHeight << std::endl;
+    strOut << "rectScale=" << rectScale << std::endl;
+    strOut << "maxLenSrc=" << maxLenSrc << std::endl;
+    strOut << "lenSrc=" << lenSrc << std::endl;
+    strOut << "maxLenDst=" << maxLenDst << std::endl;
+    strOut << "lenDst=" << lenDst << std::endl;
+    return true;
+}
+
+
+bool TestHypothesesGrow::init()
+{
+    return true;
+}
+
+
+bool TestHypothesesGrow::process()
+{
+    NCVStatus ncvStat;
+    bool rcode = false;
+
+    NCVVectorAlloc<Ncv32u> h_vecSrc(*this->allocatorCPU.get(), this->maxLenSrc);
+    ncvAssertReturn(h_vecSrc.isMemAllocated(), false);
+    NCVVectorAlloc<Ncv32u> d_vecSrc(*this->allocatorGPU.get(), this->maxLenSrc);
+    ncvAssertReturn(d_vecSrc.isMemAllocated(), false);
+
+    NCVVectorAlloc<NcvRect32u> h_vecDst(*this->allocatorCPU.get(), this->maxLenDst);
+    ncvAssertReturn(h_vecDst.isMemAllocated(), false);
+    NCVVectorAlloc<NcvRect32u> d_vecDst(*this->allocatorGPU.get(), this->maxLenDst);
+    ncvAssertReturn(d_vecDst.isMemAllocated(), false);
+    NCVVectorAlloc<NcvRect32u> h_vecDst_d(*this->allocatorCPU.get(), this->maxLenDst);
+    ncvAssertReturn(h_vecDst_d.isMemAllocated(), false);
+
+    NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting());
+
+    NCV_SKIP_COND_BEGIN
+    ncvAssertReturn(this->src.fill(h_vecSrc), false);
+    memset(h_vecDst.ptr(), 0, h_vecDst.length() * sizeof(NcvRect32u));
+    NCVVectorReuse<Ncv32u> h_vecDst_as32u(h_vecDst.getSegment(), lenDst * sizeof(NcvRect32u) / sizeof(Ncv32u));
+    ncvAssertReturn(h_vecDst_as32u.isMemReused(), false);
+    ncvAssertReturn(this->src.fill(h_vecDst_as32u), false);
+    memcpy(h_vecDst_d.ptr(), h_vecDst.ptr(), h_vecDst.length() * sizeof(NcvRect32u));
+    NCV_SKIP_COND_END
+
+    ncvStat = h_vecSrc.copySolid(d_vecSrc, 0);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+    ncvStat = h_vecDst.copySolid(d_vecDst, 0);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+    ncvAssertCUDAReturn(cudaStreamSynchronize(0), false);
+
+    Ncv32u h_outElemNum_d = 0;
+    Ncv32u h_outElemNum_h = 0;
+    NCV_SKIP_COND_BEGIN
+    h_outElemNum_d = this->lenDst;
+    ncvStat = ncvGrowDetectionsVector_device(d_vecSrc, this->lenSrc,
+                                             d_vecDst, h_outElemNum_d, this->maxLenDst,
+                                             this->rectWidth, this->rectHeight, this->rectScale, 0);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+    ncvStat = d_vecDst.copySolid(h_vecDst_d, 0);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+    ncvAssertCUDAReturn(cudaStreamSynchronize(0), false);
+
+    h_outElemNum_h = this->lenDst;
+    ncvStat = ncvGrowDetectionsVector_host(h_vecSrc, this->lenSrc,
+                                           h_vecDst, h_outElemNum_h, this->maxLenDst,
+                                           this->rectWidth, this->rectHeight, this->rectScale);
+    ncvAssertReturn(ncvStat == NCV_SUCCESS, false);
+    NCV_SKIP_COND_END
+
+    //bit-to-bit check
+    bool bLoopVirgin = true;
+
+    NCV_SKIP_COND_BEGIN
+    if (h_outElemNum_d != h_outElemNum_h)
+    {
+        bLoopVirgin = false;
+    }
+    else
+    {
+        if (memcmp(h_vecDst.ptr(), h_vecDst_d.ptr(), this->maxLenDst * sizeof(NcvRect32u)))
+        {
+            bLoopVirgin = false;
+        }
+    }
+    NCV_SKIP_COND_END
+
+    if (bLoopVirgin)
+    {
+        rcode = true;
+    }
+
+    return rcode;
+}
+
+
+bool TestHypothesesGrow::deinit()
+{
+    return true;
+}
diff --git a/tests/gpu/src/nvidia/TestHypothesesGrow.h b/tests/gpu/src/nvidia/TestHypothesesGrow.h
new file mode 100644
index 0000000000..c8358ec782
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestHypothesesGrow.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual
+ * property and proprietary rights in and to this software and
+ * related documentation and any modifications thereto.
+ * Any use, reproduction, disclosure, or distribution of this
+ * software and related documentation without an express license
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _testhypothesesgrow_h_
+#define _testhypothesesgrow_h_
+
+#include "NCVTest.hpp"
+#include "NCVTestSourceProvider.hpp"
+
+
+class TestHypothesesGrow : public NCVTestProvider
+{
+public:
+
+    TestHypothesesGrow(std::string testName, NCVTestSourceProvider<Ncv32u> &src,
+                       Ncv32u rectWidth, Ncv32u rectHeight, Ncv32f rectScale,
+                       Ncv32u maxLenSrc, Ncv32u lenSrc, Ncv32u maxLenDst, Ncv32u lenDst);
+
+    virtual bool init();
+    virtual bool process();
+    virtual bool deinit();
+    virtual bool toString(std::ofstream &strOut);
+
+private:
+    TestHypothesesGrow(const TestHypothesesGrow&);
+    TestHypothesesGrow& operator=(const TestHypothesesGrow&);
+
+
+    NCVTestSourceProvider<Ncv32u> &src;
+    Ncv32u rectWidth;
+    Ncv32u rectHeight;
+    Ncv32f rectScale;
+    Ncv32u maxLenSrc;
+    Ncv32u lenSrc;
+    Ncv32u maxLenDst;
+    Ncv32u lenDst;
+};
+
+#endif // _testhypothesesgrow_h_
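ncvGrowDetectionsVector_host/_device convert a vector of 32-bit pixel-mask entries into scaled detection rectangles appended to the destination vector, and the test above checks the two implementations bit-for-bit. The idea, sketched on the CPU with an assumed packing of one detection per word (x in the low half-word, y in the high half-word; the authoritative encoding lives in NCVHaarObjectDetection.hpp):

    // Hypothetical CPU sketch of "growing" a detections vector. The field
    // packing here is assumed for illustration, not taken from the library.
    #include <vector>

    struct Rect32u { unsigned x, y, width, height; };

    static void growDetections(const std::vector<unsigned> &pixelMask,
                               std::vector<Rect32u> &out,
                               unsigned rectWidth, unsigned rectHeight, float scale)
    {
        for (size_t i = 0; i < pixelMask.size(); i++)
        {
            Rect32u r;
            r.x      = (unsigned)(scale * (pixelMask[i] & 0xFFFF));  // assumed x field
            r.y      = (unsigned)(scale * (pixelMask[i] >> 16));     // assumed y field
            r.width  = (unsigned)(scale * rectWidth);
            r.height = (unsigned)(scale * rectHeight);
            out.push_back(r);   // append to the destination vector
        }
    }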
diff --git a/tests/gpu/src/nvidia/TestIntegralImage.cpp b/tests/gpu/src/nvidia/TestIntegralImage.cpp
new file mode 100644
index 0000000000..47de70cb13
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestIntegralImage.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual
+ * property and proprietary rights in and to this software and
+ * related documentation and any modifications thereto.
+ * Any use, reproduction, disclosure, or distribution of this
+ * software and related documentation without an express license
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+
+#include <math.h>
+#include "TestIntegralImage.h"
+
+
+template <class T_in, class T_out>
+TestIntegralImage<T_in, T_out>::TestIntegralImage(std::string testName, NCVTestSourceProvider<T_in> &src,
+                                                  Ncv32u width, Ncv32u height)
+    :
+    NCVTestProvider(testName),
+    src(src),
+    width(width),
+    height(height)
+{
+}
+
+
+template <class T_in, class T_out>
+bool TestIntegralImage<T_in, T_out>::toString(std::ofstream &strOut)
+{
+    strOut << "sizeof(T_in)=" << sizeof(T_in) << std::endl;
+    strOut << "sizeof(T_out)=" << sizeof(T_out) << std::endl;
+    strOut << "width=" << width << std::endl;
+    strOut << "height=" << height << std::endl;
+    return true;
+}
+
+
+template <class T_in, class T_out>
+bool TestIntegralImage<T_in, T_out>::init()
+{
+    return true;
+}
+
+
+template <class T_in, class T_out>
+bool TestIntegralImage<T_in, T_out>::process()
+{
+    NCVStatus ncvStat;
+    bool rcode = false;
+
+    Ncv32u widthII = this->width + 1;
+    Ncv32u heightII = this->height + 1;
+
+    NCVMatrixAlloc<T_in> d_img(*this->allocatorGPU.get(), this->width, this->height);
+    ncvAssertReturn(d_img.isMemAllocated(), false);
+    NCVMatrixAlloc<T_in> h_img(*this->allocatorCPU.get(), this->width, this->height);
+    ncvAssertReturn(h_img.isMemAllocated(), false);
+    NCVMatrixAlloc<T_out> d_imgII(*this->allocatorGPU.get(), widthII, heightII);
+    ncvAssertReturn(d_imgII.isMemAllocated(), false);
+    NCVMatrixAlloc<T_out> h_imgII(*this->allocatorCPU.get(), widthII, heightII);
+    ncvAssertReturn(h_imgII.isMemAllocated(), false);
+    NCVMatrixAlloc<T_out> h_imgII_d(*this->allocatorCPU.get(), widthII, heightII);
+    ncvAssertReturn(h_imgII_d.isMemAllocated(), false);
+
+    Ncv32u bufSize;
+    if (sizeof(T_in) == sizeof(Ncv8u))
+    {
+        ncvStat = nppiStIntegralGetSize_8u32u(NcvSize32u(this->width, this->height), &bufSize, this->devProp);
+        ncvAssertReturn(NPPST_SUCCESS == ncvStat, false);
+    }
+    else if (sizeof(T_in) == sizeof(Ncv32f))
+    {
+        ncvStat = nppiStIntegralGetSize_32f32f(NcvSize32u(this->width, this->height), &bufSize, this->devProp);
+        ncvAssertReturn(NPPST_SUCCESS == ncvStat, false);
+    }
+    else
+    {
+        ncvAssertPrintReturn(false, "Incorrect integral image test instance", false);
+    }
+
+    NCVVectorAlloc<Ncv8u> d_tmpBuf(*this->allocatorGPU.get(), bufSize);
+    ncvAssertReturn(d_tmpBuf.isMemAllocated(), false);
+
+    NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting());
+    NCV_SKIP_COND_BEGIN
+
+    ncvAssertReturn(this->src.fill(h_img), false);
+
+    ncvStat = h_img.copySolid(d_img, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    if (sizeof(T_in) == sizeof(Ncv8u))
+    {
+        ncvStat = nppiStIntegral_8u32u_C1R((Ncv8u *)d_img.ptr(), d_img.pitch(),
+                                           (Ncv32u *)d_imgII.ptr(), d_imgII.pitch(),
+                                           NcvSize32u(this->width, this->height),
+                                           d_tmpBuf.ptr(), bufSize, this->devProp);
+        ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    }
+    else if (sizeof(T_in) == sizeof(Ncv32f))
+    {
+        ncvStat = nppiStIntegral_32f32f_C1R((Ncv32f *)d_img.ptr(), d_img.pitch(),
+                                            (Ncv32f *)d_imgII.ptr(), d_imgII.pitch(),
+                                            NcvSize32u(this->width, this->height),
+                                            d_tmpBuf.ptr(), bufSize, this->devProp);
+        ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    }
+    else
+    {
+        ncvAssertPrintReturn(false, "Incorrect integral image test instance", false);
+    }
+
+    ncvStat = d_imgII.copySolid(h_imgII_d, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    if (sizeof(T_in) == sizeof(Ncv8u))
+    {
+        ncvStat = nppiStIntegral_8u32u_C1R_host((Ncv8u *)h_img.ptr(), h_img.pitch(),
+                                                (Ncv32u *)h_imgII.ptr(), h_imgII.pitch(),
+                                                NcvSize32u(this->width, this->height));
+        ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    }
+    else if (sizeof(T_in) == sizeof(Ncv32f))
+    {
+        ncvStat = nppiStIntegral_32f32f_C1R_host((Ncv32f *)h_img.ptr(), h_img.pitch(),
+                                                 (Ncv32f *)h_imgII.ptr(), h_imgII.pitch(),
+                                                 NcvSize32u(this->width, this->height));
+        ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    }
+    else
+    {
+        ncvAssertPrintReturn(false, "Incorrect integral image test instance", false);
+    }
+
+    NCV_SKIP_COND_END
+
+    //bit-to-bit check
+    bool bLoopVirgin = true;
+
+    NCV_SKIP_COND_BEGIN
+    for (Ncv32u i=0; bLoopVirgin && i < h_img.height() + 1; i++)
+    {
+        for (Ncv32u j=0; bLoopVirgin && j < h_img.width() + 1; j++)
+        {
+            if (sizeof(T_in) == sizeof(Ncv8u))
+            {
+                if (h_imgII.ptr()[h_imgII.stride()*i+j] != h_imgII_d.ptr()[h_imgII_d.stride()*i+j])
+                {
+                    bLoopVirgin = false;
+                }
+            }
+            else if (sizeof(T_in) == sizeof(Ncv32f))
+            {
+                if (fabsf((float)h_imgII.ptr()[h_imgII.stride()*i+j] - (float)h_imgII_d.ptr()[h_imgII_d.stride()*i+j]) > 0.01f)
+                {
+                    bLoopVirgin = false;
+                }
+            }
+            else
+            {
+                ncvAssertPrintReturn(false, "Incorrect integral image test instance", false);
+            }
+        }
+    }
+    NCV_SKIP_COND_END
+
+    if (bLoopVirgin)
+    {
+        rcode = true;
+    }
+
+    return rcode;
+}
+
+
+template <class T_in, class T_out>
+bool TestIntegralImage<T_in, T_out>::deinit()
+{
+    return true;
+}
+
+
+template class TestIntegralImage<Ncv8u, Ncv32u>;
+template class TestIntegralImage<Ncv32f, Ncv32f>;
diff --git a/tests/gpu/src/nvidia/TestIntegralImage.h b/tests/gpu/src/nvidia/TestIntegralImage.h
new file mode 100644
index 0000000000..22677766b6
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestIntegralImage.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual
+ * property and proprietary rights in and to this software and
+ * related documentation and any modifications thereto.
+ * Any use, reproduction, disclosure, or distribution of this
+ * software and related documentation without an express license
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _testintegralimage_h_
+#define _testintegralimage_h_
+
+#include "NCVTest.hpp"
+#include "NCVTestSourceProvider.hpp"
+
+
+template <class T_in, class T_out>
+class TestIntegralImage : public NCVTestProvider
+{
+public:
+
+    TestIntegralImage(std::string testName, NCVTestSourceProvider<T_in> &src,
+                      Ncv32u width, Ncv32u height);
+
+    virtual bool init();
+    virtual bool process();
+    virtual bool deinit();
+    virtual bool toString(std::ofstream &strOut);
+
+private:
+    TestIntegralImage(const TestIntegralImage&);
+    TestIntegralImage& operator=(const TestIntegralImage&);
+
+    NCVTestSourceProvider<T_in> &src;
+    Ncv32u width;
+    Ncv32u height;
+};
+
+#endif // _testintegralimage_h_
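Both integral-image tests compare a device result against a host reference over an output that is one row and one column larger than the input (widthII = width + 1), with a zero top row and left column. The recurrence the references implement, as a self-contained sketch assuming row-major, unpadded storage (the patch's actual references are the nppiStIntegral_*_host functions):

    #include <vector>

    // out is (h+1) x (w+1); row 0 and column 0 stay zero.
    static void integralReference(const std::vector<unsigned char> &in, int w, int h,
                                  std::vector<unsigned> &out)
    {
        out.assign((w + 1) * (h + 1), 0);
        for (int y = 1; y <= h; y++)
        {
            for (int x = 1; x <= w; x++)
            {
                // II(x,y) = I(x-1,y-1) + II(x-1,y) + II(x,y-1) - II(x-1,y-1)
                out[y * (w + 1) + x] = in[(y - 1) * w + (x - 1)]
                                     + out[y * (w + 1) + (x - 1)]
                                     + out[(y - 1) * (w + 1) + x]
                                     - out[(y - 1) * (w + 1) + (x - 1)];
            }
        }
    }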
diff --git a/tests/gpu/src/nvidia/TestIntegralImageSquared.cpp b/tests/gpu/src/nvidia/TestIntegralImageSquared.cpp
new file mode 100644
index 0000000000..ec245934fd
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestIntegralImageSquared.cpp
@@ -0,0 +1,117 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual
+ * property and proprietary rights in and to this software and
+ * related documentation and any modifications thereto.
+ * Any use, reproduction, disclosure, or distribution of this
+ * software and related documentation without an express license
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+
+#include "TestIntegralImageSquared.h"
+
+
+TestIntegralImageSquared::TestIntegralImageSquared(std::string testName, NCVTestSourceProvider<Ncv8u> &src,
+                                                   Ncv32u width, Ncv32u height)
+    :
+    NCVTestProvider(testName),
+    src(src),
+    width(width),
+    height(height)
+{
+}
+
+
+bool TestIntegralImageSquared::toString(std::ofstream &strOut)
+{
+    strOut << "width=" << width << std::endl;
+    strOut << "height=" << height << std::endl;
+    return true;
+}
+
+
+bool TestIntegralImageSquared::init()
+{
+    return true;
+}
+
+
+bool TestIntegralImageSquared::process()
+{
+    NCVStatus ncvStat;
+    bool rcode = false;
+
+    Ncv32u widthSII = this->width + 1;
+    Ncv32u heightSII = this->height + 1;
+
+    NCVMatrixAlloc<Ncv8u> d_img(*this->allocatorGPU.get(), this->width, this->height);
+    ncvAssertReturn(d_img.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv8u> h_img(*this->allocatorCPU.get(), this->width, this->height);
+    ncvAssertReturn(h_img.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv64u> d_imgSII(*this->allocatorGPU.get(), widthSII, heightSII);
+    ncvAssertReturn(d_imgSII.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv64u> h_imgSII(*this->allocatorCPU.get(), widthSII, heightSII);
+    ncvAssertReturn(h_imgSII.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv64u> h_imgSII_d(*this->allocatorCPU.get(), widthSII, heightSII);
+    ncvAssertReturn(h_imgSII_d.isMemAllocated(), false);
+
+    Ncv32u bufSize;
+    ncvStat = nppiStSqrIntegralGetSize_8u64u(NcvSize32u(this->width, this->height), &bufSize, this->devProp);
+    ncvAssertReturn(NPPST_SUCCESS == ncvStat, false);
+    NCVVectorAlloc<Ncv8u> d_tmpBuf(*this->allocatorGPU.get(), bufSize);
+    ncvAssertReturn(d_tmpBuf.isMemAllocated(), false);
+
+    NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting());
+    NCV_SKIP_COND_BEGIN
+
+    ncvAssertReturn(this->src.fill(h_img), false);
+
+    ncvStat = h_img.copySolid(d_img, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    ncvStat = nppiStSqrIntegral_8u64u_C1R(d_img.ptr(), d_img.pitch(),
+                                          d_imgSII.ptr(), d_imgSII.pitch(),
+                                          NcvSize32u(this->width, this->height),
+                                          d_tmpBuf.ptr(), bufSize, this->devProp);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    ncvStat = d_imgSII.copySolid(h_imgSII_d, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    ncvStat = nppiStSqrIntegral_8u64u_C1R_host(h_img.ptr(), h_img.pitch(),
+                                               h_imgSII.ptr(), h_imgSII.pitch(),
+                                               NcvSize32u(this->width, this->height));
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    NCV_SKIP_COND_END
+
+    //bit-to-bit check
+    bool bLoopVirgin = true;
+
+    NCV_SKIP_COND_BEGIN
+    for (Ncv32u i=0; bLoopVirgin && i < h_img.height() + 1; i++)
+    {
+        for (Ncv32u j=0; bLoopVirgin && j < h_img.width() + 1; j++)
+        {
+            if (h_imgSII.ptr()[h_imgSII.stride()*i+j] != h_imgSII_d.ptr()[h_imgSII_d.stride()*i+j])
+            {
+                bLoopVirgin = false;
+            }
+        }
+    }
+    NCV_SKIP_COND_END
+
+    if (bLoopVirgin)
+    {
+        rcode = true;
+    }
+
+    return rcode;
+}
+
+
+bool TestIntegralImageSquared::deinit()
+{
+    return true;
+}
diff --git a/tests/gpu/src/nvidia/TestIntegralImageSquared.h b/tests/gpu/src/nvidia/TestIntegralImageSquared.h
new file mode 100644
index 0000000000..b1aaf28af5
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestIntegralImageSquared.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual
+ * property and proprietary rights in and to this software and
+ * related documentation and any modifications thereto.
+ * Any use, reproduction, disclosure, or distribution of this
+ * software and related documentation without an express license
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _testintegralimagesquared_h_
+#define _testintegralimagesquared_h_
+
+#include "NCVTest.hpp"
+#include "NCVTestSourceProvider.hpp"
+
+
+class TestIntegralImageSquared : public NCVTestProvider
+{
+public:
+
+    TestIntegralImageSquared(std::string testName, NCVTestSourceProvider<Ncv8u> &src,
+                             Ncv32u width, Ncv32u height);
+
+    virtual bool init();
+    virtual bool process();
+    virtual bool deinit();
+    virtual bool toString(std::ofstream &strOut);
+
+private:
+    TestIntegralImageSquared(const TestIntegralImageSquared&);
+    TestIntegralImageSquared& operator=(const TestIntegralImageSquared&);
+
+    NCVTestSourceProvider<Ncv8u> &src;
+    Ncv32u width;
+    Ncv32u height;
+};
+
+#endif // _testintegralimagesquared_h_
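With the plain and squared integral images in hand, the sum over any axis-aligned window is four lookups, and the per-window mean and standard deviation used by the next test (nppiStRectStdDev) follow directly: mean = S/area, var = SQ/area - mean^2. A host-side sketch under the same layout as the integralReference sketch above:

    #include <math.h>

    // Sum over [x, x+w) x [y, y+h) from an integral image with row stride `stride`:
    // S = II(x+w, y+h) - II(x, y+h) - II(x+w, y) + II(x, y)
    static double rectSum(const unsigned *ii, int stride, int x, int y, int w, int h)
    {
        return (double)ii[(y + h) * stride + (x + w)] - ii[(y + h) * stride + x]
             - (double)ii[y * stride + (x + w)] + ii[y * stride + x];
    }

    static double rectStdDev(const unsigned *ii, const unsigned long long *sii,
                             int stride, int x, int y, int w, int h)
    {
        double area = (double)w * h;
        double mean = rectSum(ii, stride, x, y, w, h) / area;
        // Same four-corner lookup on the squared integral image.
        double sq   = (double)sii[(y + h) * stride + (x + w)] - sii[(y + h) * stride + x]
                    - (double)sii[y * stride + (x + w)] + sii[y * stride + x];
        double variance = sq / area - mean * mean;
        return variance > 0 ? sqrt(variance) : 0;
    }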
diff --git a/tests/gpu/src/nvidia/TestRectStdDev.cpp b/tests/gpu/src/nvidia/TestRectStdDev.cpp
new file mode 100644
index 0000000000..40a1ad7dac
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestRectStdDev.cpp
@@ -0,0 +1,180 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual
+ * property and proprietary rights in and to this software and
+ * related documentation and any modifications thereto.
+ * Any use, reproduction, disclosure, or distribution of this
+ * software and related documentation without an express license
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+
+#include <math.h>
+
+#include "TestRectStdDev.h"
+
+
+TestRectStdDev::TestRectStdDev(std::string testName, NCVTestSourceProvider<Ncv8u> &src,
+                               Ncv32u width, Ncv32u height, NcvRect32u rect, Ncv32f scaleFactor,
+                               NcvBool bTextureCache)
+    :
+    NCVTestProvider(testName),
+    src(src),
+    width(width),
+    height(height),
+    rect(rect),
+    scaleFactor(scaleFactor),
+    bTextureCache(bTextureCache)
+{
+}
+
+
+bool TestRectStdDev::toString(std::ofstream &strOut)
+{
+    strOut << "width=" << width << std::endl;
+    strOut << "height=" << height << std::endl;
+    strOut << "rect=[" << rect.x << ", " << rect.y << ", " << rect.width << ", " << rect.height << "]\n";
+    strOut << "scaleFactor=" << scaleFactor << std::endl;
+    strOut << "bTextureCache=" << bTextureCache << std::endl;
+    return true;
+}
+
+
+bool TestRectStdDev::init()
+{
+    return true;
+}
+
+
+bool TestRectStdDev::process()
+{
+    NCVStatus ncvStat;
+    bool rcode = false;
+
+    Ncv32s _normWidth = (Ncv32s)this->width - this->rect.x - this->rect.width + 1;
+    Ncv32s _normHeight = (Ncv32s)this->height - this->rect.y - this->rect.height + 1;
+    if (_normWidth <= 0 || _normHeight <= 0)
+    {
+        return true;
+    }
+    Ncv32u normWidth = (Ncv32u)_normWidth;
+    Ncv32u normHeight = (Ncv32u)_normHeight;
+    NcvSize32u szNormRoi(normWidth, normHeight);
+
+    Ncv32u widthII = this->width + 1;
+    Ncv32u heightII = this->height + 1;
+    Ncv32u widthSII = this->width + 1;
+    Ncv32u heightSII = this->height + 1;
+
+    NCVMatrixAlloc<Ncv8u> d_img(*this->allocatorGPU.get(), this->width, this->height);
+    ncvAssertReturn(d_img.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv8u> h_img(*this->allocatorCPU.get(), this->width, this->height);
+    ncvAssertReturn(h_img.isMemAllocated(), false);
+
+    NCVMatrixAlloc<Ncv32u> d_imgII(*this->allocatorGPU.get(), widthII, heightII);
+    ncvAssertReturn(d_imgII.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv32u> h_imgII(*this->allocatorCPU.get(), widthII, heightII);
+    ncvAssertReturn(h_imgII.isMemAllocated(), false);
+
+    NCVMatrixAlloc<Ncv64u> d_imgSII(*this->allocatorGPU.get(), widthSII, heightSII);
+    ncvAssertReturn(d_imgSII.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv64u> h_imgSII(*this->allocatorCPU.get(), widthSII, heightSII);
+    ncvAssertReturn(h_imgSII.isMemAllocated(), false);
+
+    NCVMatrixAlloc<Ncv32f> d_norm(*this->allocatorGPU.get(), normWidth, normHeight);
+    ncvAssertReturn(d_norm.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv32f> h_norm(*this->allocatorCPU.get(), normWidth, normHeight);
+    ncvAssertReturn(h_norm.isMemAllocated(), false);
+    NCVMatrixAlloc<Ncv32f> h_norm_d(*this->allocatorCPU.get(), normWidth, normHeight);
+    ncvAssertReturn(h_norm_d.isMemAllocated(), false);
+
+    Ncv32u bufSizeII, bufSizeSII;
+    ncvStat = nppiStIntegralGetSize_8u32u(NcvSize32u(this->width, this->height), &bufSizeII, this->devProp);
+    ncvAssertReturn(NPPST_SUCCESS == ncvStat, false);
+    ncvStat = nppiStSqrIntegralGetSize_8u64u(NcvSize32u(this->width, this->height), &bufSizeSII, this->devProp);
+    ncvAssertReturn(NPPST_SUCCESS == ncvStat, false);
+    Ncv32u bufSize = bufSizeII > bufSizeSII ? bufSizeII : bufSizeSII;
+    NCVVectorAlloc<Ncv8u> d_tmpBuf(*this->allocatorGPU.get(), bufSize);
+    ncvAssertReturn(d_tmpBuf.isMemAllocated(), false);
+
+    NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting());
+    NCV_SKIP_COND_BEGIN
+    ncvAssertReturn(this->src.fill(h_img), false);
+
+    ncvStat = h_img.copySolid(d_img, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    ncvStat = nppiStIntegral_8u32u_C1R(d_img.ptr(), d_img.pitch(),
+                                       d_imgII.ptr(), d_imgII.pitch(),
+                                       NcvSize32u(this->width, this->height),
+                                       d_tmpBuf.ptr(), bufSize, this->devProp);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    ncvStat = nppiStSqrIntegral_8u64u_C1R(d_img.ptr(), d_img.pitch(),
+                                          d_imgSII.ptr(), d_imgSII.pitch(),
+                                          NcvSize32u(this->width, this->height),
+                                          d_tmpBuf.ptr(), bufSize, this->devProp);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    ncvStat = nppiStRectStdDev_32f_C1R(d_imgII.ptr(), d_imgII.pitch(),
+                                       d_imgSII.ptr(), d_imgSII.pitch(),
+                                       d_norm.ptr(), d_norm.pitch(),
+                                       szNormRoi, this->rect,
+                                       this->scaleFactor,
+                                       this->bTextureCache);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    ncvStat = d_norm.copySolid(h_norm_d, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    ncvStat = nppiStIntegral_8u32u_C1R_host(h_img.ptr(), h_img.pitch(),
+                                            h_imgII.ptr(), h_imgII.pitch(),
+                                            NcvSize32u(this->width, this->height));
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    ncvStat = nppiStSqrIntegral_8u64u_C1R_host(h_img.ptr(), h_img.pitch(),
+                                               h_imgSII.ptr(), h_imgSII.pitch(),
+                                               NcvSize32u(this->width, this->height));
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    ncvStat = nppiStRectStdDev_32f_C1R_host(h_imgII.ptr(), h_imgII.pitch(),
+                                            h_imgSII.ptr(), h_imgSII.pitch(),
+                                            h_norm.ptr(), h_norm.pitch(),
+                                            szNormRoi, this->rect,
+                                            this->scaleFactor);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    NCV_SKIP_COND_END
+
+    //bit-to-bit check
+    bool bLoopVirgin = true;
+
+    NCV_SKIP_COND_BEGIN
+    const Ncv64f relEPS = 0.005;
+    for (Ncv32u i=0; bLoopVirgin && i < h_norm.height(); i++)
+    {
+        for (Ncv32u j=0; bLoopVirgin && j < h_norm.width(); j++)
+        {
+            Ncv64f absErr = fabs(h_norm.ptr()[h_norm.stride()*i+j] - h_norm_d.ptr()[h_norm_d.stride()*i+j]);
+            Ncv64f relErr = absErr / h_norm.ptr()[h_norm.stride()*i+j];
+
+            if (relErr > relEPS)
+            {
+                bLoopVirgin = false;
+            }
+        }
+    }
+    NCV_SKIP_COND_END
+
+    if (bLoopVirgin)
+    {
+        rcode = true;
+    }
+
+    return rcode;
+}
+
+
+bool TestRectStdDev::deinit()
+{
+    return true;
+}
diff --git a/tests/gpu/src/nvidia/TestRectStdDev.h b/tests/gpu/src/nvidia/TestRectStdDev.h
new file mode 100644
index 0000000000..7c0473e296
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestRectStdDev.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual
+ * property and proprietary rights in and to this software and
+ * related documentation and any modifications thereto.
+ * Any use, reproduction, disclosure, or distribution of this
+ * software and related documentation without an express license
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _testrectstddev_h_
+#define _testrectstddev_h_
+
+#include "NCVTest.hpp"
+#include "NCVTestSourceProvider.hpp"
+
+
+class TestRectStdDev : public NCVTestProvider
+{
+public:
+
+    TestRectStdDev(std::string testName, NCVTestSourceProvider<Ncv8u> &src,
+                   Ncv32u width, Ncv32u height, NcvRect32u rect, Ncv32f scaleFactor,
+                   NcvBool bTextureCache);
+
+    virtual bool init();
+    virtual bool process();
+    virtual bool deinit();
+    virtual bool toString(std::ofstream &strOut);
+
+private:
+    TestRectStdDev(const TestRectStdDev&);
+    TestRectStdDev& operator=(const TestRectStdDev&);
+
+    NCVTestSourceProvider<Ncv8u> &src;
+    NcvRect32u rect;
+    Ncv32u width;
+    Ncv32u height;
+    Ncv32f scaleFactor;
+
+    NcvBool bTextureCache;
+};
+
+#endif // _testrectstddev_h_
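Unlike the integer tests, TestRectStdDev accepts the GPU result when the relative error against the CPU reference stays within relEPS = 0.005, since 32-bit float arithmetic on the device need not match the host bit-for-bit. A guarded form of that comparison (the zero-denominator guard is our addition; the test above divides by the reference value directly):

    #include <math.h>

    // Relative-error acceptance test for float results.
    static bool closeEnough(float reference, float actual, double relEPS)
    {
        double absErr = fabs((double)reference - (double)actual);
        double denom  = fabs((double)reference) > 1e-12 ? fabs((double)reference) : 1e-12;
        return absErr / denom <= relEPS;
    }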
diff --git a/tests/gpu/src/nvidia/TestResize.cpp b/tests/gpu/src/nvidia/TestResize.cpp
new file mode 100644
index 0000000000..02108997b3
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestResize.cpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual
+ * property and proprietary rights in and to this software and
+ * related documentation and any modifications thereto.
+ * Any use, reproduction, disclosure, or distribution of this
+ * software and related documentation without an express license
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+
+#include <math.h>
+
+#include "TestResize.h"
+
+
+template <class T>
+TestResize<T>::TestResize(std::string testName, NCVTestSourceProvider<T> &src,
+                          Ncv32u width, Ncv32u height, Ncv32u scaleFactor, NcvBool bTextureCache)
+    :
+    NCVTestProvider(testName),
+    src(src),
+    width(width),
+    height(height),
+    scaleFactor(scaleFactor),
+    bTextureCache(bTextureCache)
+{
+}
+
+
+template <class T>
+bool TestResize<T>::toString(std::ofstream &strOut)
+{
+    strOut << "sizeof(T)=" << sizeof(T) << std::endl;
+    strOut << "width=" << width << std::endl;
+    strOut << "scaleFactor=" << scaleFactor << std::endl;
+    strOut << "bTextureCache=" << bTextureCache << std::endl;
+    return true;
+}
+
+
+template <class T>
+bool TestResize<T>::init()
+{
+    return true;
+}
+
+
+template <class T>
+bool TestResize<T>::process()
+{
+    NCVStatus ncvStat;
+    bool rcode = false;
+
+    Ncv32s smallWidth = this->width / this->scaleFactor;
+    Ncv32s smallHeight = this->height / this->scaleFactor;
+    if (smallWidth == 0 || smallHeight == 0)
+    {
+        return true;
+    }
+
+    NcvSize32u srcSize(this->width, this->height);
+
+    NCVMatrixAlloc<T> d_img(*this->allocatorGPU.get(), this->width, this->height);
+    ncvAssertReturn(d_img.isMemAllocated(), false);
+    NCVMatrixAlloc<T> h_img(*this->allocatorCPU.get(), this->width, this->height);
+    ncvAssertReturn(h_img.isMemAllocated(), false);
+
+    NCVMatrixAlloc<T> d_small(*this->allocatorGPU.get(), smallWidth, smallHeight);
+    ncvAssertReturn(d_small.isMemAllocated(), false);
+    NCVMatrixAlloc<T> h_small(*this->allocatorCPU.get(), smallWidth, smallHeight);
+    ncvAssertReturn(h_small.isMemAllocated(), false);
+    NCVMatrixAlloc<T> h_small_d(*this->allocatorCPU.get(), smallWidth, smallHeight);
+    ncvAssertReturn(h_small_d.isMemAllocated(), false);
+
+    NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting());
+    NCV_SKIP_COND_BEGIN
+    ncvAssertReturn(this->src.fill(h_img), false);
+    NCV_SKIP_COND_END
+
+    ncvStat = h_img.copySolid(d_img, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    NCV_SKIP_COND_BEGIN
+    if (sizeof(T) == sizeof(Ncv32u))
+    {
+        ncvStat = nppiStDownsampleNearest_32u_C1R((Ncv32u *)d_img.ptr(), d_img.pitch(),
+                                                  (Ncv32u *)d_small.ptr(), d_small.pitch(),
+                                                  srcSize, this->scaleFactor,
+                                                  this->bTextureCache);
+    }
+    else if (sizeof(T) == sizeof(Ncv64u))
+    {
+        ncvStat = nppiStDownsampleNearest_64u_C1R((Ncv64u *)d_img.ptr(), d_img.pitch(),
+                                                  (Ncv64u *)d_small.ptr(), d_small.pitch(),
+                                                  srcSize, this->scaleFactor,
+                                                  this->bTextureCache);
+    }
+    else
+    {
+        ncvAssertPrintReturn(false, "Incorrect downsample test instance", false);
+    }
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    NCV_SKIP_COND_END
+    ncvStat = d_small.copySolid(h_small_d, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    NCV_SKIP_COND_BEGIN
+    if (sizeof(T) == sizeof(Ncv32u))
+    {
+        ncvStat = nppiStDownsampleNearest_32u_C1R_host((Ncv32u *)h_img.ptr(), h_img.pitch(),
+                                                       (Ncv32u *)h_small.ptr(), h_small.pitch(),
+                                                       srcSize, this->scaleFactor);
+    }
+    else if (sizeof(T) == sizeof(Ncv64u))
+    {
+        ncvStat = nppiStDownsampleNearest_64u_C1R_host((Ncv64u *)h_img.ptr(), h_img.pitch(),
+                                                       (Ncv64u *)h_small.ptr(), h_small.pitch(),
+                                                       srcSize, this->scaleFactor);
+    }
+    else
+    {
+        ncvAssertPrintReturn(false, "Incorrect downsample test instance", false);
+    }
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    NCV_SKIP_COND_END
+
+    //bit-to-bit check
+    bool bLoopVirgin = true;
+
+    NCV_SKIP_COND_BEGIN
+    //const Ncv64f relEPS = 0.005;
+    for (Ncv32u i=0; bLoopVirgin && i < h_small.height(); i++)
+    {
+        for (Ncv32u j=0; bLoopVirgin && j < h_small.width(); j++)
+        {
+            if (h_small.ptr()[h_small.stride()*i+j] != h_small_d.ptr()[h_small_d.stride()*i+j])
+            {
+                bLoopVirgin = false;
+            }
+        }
+    }
+    NCV_SKIP_COND_END
+
+    if (bLoopVirgin)
+    {
+        rcode = true;
+    }
+
+    return rcode;
+}
+
+
+template <class T>
+bool TestResize<T>::deinit()
+{
+    return true;
+}
+
+
+template class TestResize<Ncv32u>;
+template class TestResize<Ncv64u>;
diff --git a/tests/gpu/src/nvidia/TestResize.h b/tests/gpu/src/nvidia/TestResize.h
new file mode 100644
index 0000000000..1bd57a8007
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestResize.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual
+ * property and proprietary rights in and to this software and
+ * related documentation and any modifications thereto.
+ * Any use, reproduction, disclosure, or distribution of this
+ * software and related documentation without an express license
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _testresize_h_
+#define _testresize_h_
+
+#include "NCVTest.hpp"
+#include "NCVTestSourceProvider.hpp"
+
+template <class T>
+class TestResize : public NCVTestProvider
+{
+public:
+
+    TestResize(std::string testName, NCVTestSourceProvider<T> &src,
+               Ncv32u width, Ncv32u height, Ncv32u scaleFactor, NcvBool bTextureCache);
+
+    virtual bool init();
+    virtual bool process();
+    virtual bool deinit();
+    virtual bool toString(std::ofstream &strOut);
+
+private:
+    TestResize(const TestResize&);
+    TestResize& operator=(const TestResize&);
+
+    NCVTestSourceProvider<T> &src;
+    Ncv32u width;
+    Ncv32u height;
+    Ncv32u scaleFactor;
+
+    NcvBool bTextureCache;
+};
+
+#endif // _testresize_h_
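nppiStDownsampleNearest_* decimates an image by an integer factor; the test sizes the small image as width/scaleFactor by height/scaleFactor and again requires host and device to agree exactly. A host-side sketch, with the sampling convention (which source pixel each destination pixel takes) assumed rather than taken from the library:

    // Nearest-neighbor decimation by an integer factor; steps are in elements.
    template <class T>
    static void downsampleNearest(const T *src, int srcStep, T *dst, int dstStep,
                                  int dstWidth, int dstHeight, int factor)
    {
        for (int i = 0; i < dstHeight; i++)
            for (int j = 0; j < dstWidth; j++)
                dst[i * dstStep + j] = src[(i * factor) * srcStep + (j * factor)];
    }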
diff --git a/tests/gpu/src/nvidia/TestTranspose.cpp b/tests/gpu/src/nvidia/TestTranspose.cpp
new file mode 100644
index 0000000000..aa131f867b
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestTranspose.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual
+ * property and proprietary rights in and to this software and
+ * related documentation and any modifications thereto.
+ * Any use, reproduction, disclosure, or distribution of this
+ * software and related documentation without an express license
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+
+#include <math.h>
+
+#include "TestTranspose.h"
+
+
+template <class T>
+TestTranspose<T>::TestTranspose(std::string testName, NCVTestSourceProvider<T> &src,
+                                Ncv32u width, Ncv32u height)
+    :
+    NCVTestProvider(testName),
+    src(src),
+    width(width),
+    height(height)
+{
+}
+
+
+template <class T>
+bool TestTranspose<T>::toString(std::ofstream &strOut)
+{
+    strOut << "sizeof(T)=" << sizeof(T) << std::endl;
+    strOut << "width=" << width << std::endl;
+    return true;
+}
+
+
+template <class T>
+bool TestTranspose<T>::init()
+{
+    return true;
+}
+
+
+template <class T>
+bool TestTranspose<T>::process()
+{
+    NCVStatus ncvStat;
+    bool rcode = false;
+
+    NcvSize32u srcSize(this->width, this->height);
+
+    NCVMatrixAlloc<T> d_img(*this->allocatorGPU.get(), this->width, this->height);
+    ncvAssertReturn(d_img.isMemAllocated(), false);
+    NCVMatrixAlloc<T> h_img(*this->allocatorCPU.get(), this->width, this->height);
+    ncvAssertReturn(h_img.isMemAllocated(), false);
+
+    NCVMatrixAlloc<T> d_dst(*this->allocatorGPU.get(), this->height, this->width);
+    ncvAssertReturn(d_dst.isMemAllocated(), false);
+    NCVMatrixAlloc<T> h_dst(*this->allocatorCPU.get(), this->height, this->width);
+    ncvAssertReturn(h_dst.isMemAllocated(), false);
+    NCVMatrixAlloc<T> h_dst_d(*this->allocatorCPU.get(), this->height, this->width);
+    ncvAssertReturn(h_dst_d.isMemAllocated(), false);
+
+    NCV_SET_SKIP_COND(this->allocatorGPU.get()->isCounting());
+    NCV_SKIP_COND_BEGIN
+    ncvAssertReturn(this->src.fill(h_img), false);
+    NCV_SKIP_COND_END
+
+    ncvStat = h_img.copySolid(d_img, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    NCV_SKIP_COND_BEGIN
+    if (sizeof(T) == sizeof(Ncv32u))
+    {
+        ncvStat = nppiStTranspose_32u_C1R((Ncv32u *)d_img.ptr(), d_img.pitch(),
+                                          (Ncv32u *)d_dst.ptr(), d_dst.pitch(),
+                                          NcvSize32u(this->width, this->height));
+    }
+    else if (sizeof(T) == sizeof(Ncv64u))
+    {
+        ncvStat = nppiStTranspose_64u_C1R((Ncv64u *)d_img.ptr(), d_img.pitch(),
+                                          (Ncv64u *)d_dst.ptr(), d_dst.pitch(),
+                                          NcvSize32u(this->width, this->height));
+    }
+    else
+    {
+        ncvAssertPrintReturn(false, "Incorrect transpose test instance", false);
+    }
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    NCV_SKIP_COND_END
+    ncvStat = d_dst.copySolid(h_dst_d, 0);
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+
+    NCV_SKIP_COND_BEGIN
+    if (sizeof(T) == sizeof(Ncv32u))
+    {
+        ncvStat = nppiStTranspose_32u_C1R_host((Ncv32u *)h_img.ptr(), h_img.pitch(),
+                                               (Ncv32u *)h_dst.ptr(), h_dst.pitch(),
+                                               NcvSize32u(this->width, this->height));
+    }
+    else if (sizeof(T) == sizeof(Ncv64u))
+    {
+        ncvStat = nppiStTranspose_64u_C1R_host((Ncv64u *)h_img.ptr(), h_img.pitch(),
+                                               (Ncv64u *)h_dst.ptr(), h_dst.pitch(),
+                                               NcvSize32u(this->width, this->height));
+    }
+    else
+    {
+        ncvAssertPrintReturn(false, "Incorrect downsample test instance", false);
+    }
+    ncvAssertReturn(ncvStat == NPPST_SUCCESS, false);
+    NCV_SKIP_COND_END
+
+    //bit-to-bit check
+    bool bLoopVirgin = true;
+
+    NCV_SKIP_COND_BEGIN
+    //const Ncv64f relEPS = 0.005;
+    for (Ncv32u i=0; bLoopVirgin && i < this->width; i++)
+    {
+        for (Ncv32u j=0; bLoopVirgin && j < this->height; j++)
+        {
+            if (h_dst.ptr()[h_dst.stride()*i+j] != h_dst_d.ptr()[h_dst_d.stride()*i+j])
+            {
+                bLoopVirgin = false;
+            }
+        }
+    }
+    NCV_SKIP_COND_END
+
+    if (bLoopVirgin)
+    {
+        rcode = true;
+    }
+
+    return rcode;
+}
+
+
+template <class T>
+bool TestTranspose<T>::deinit()
+{
+    return true;
+}
+
+
+template class TestTranspose<Ncv32u>;
+template class TestTranspose<Ncv64u>;
diff --git a/tests/gpu/src/nvidia/TestTranspose.h b/tests/gpu/src/nvidia/TestTranspose.h
new file mode 100644
index 0000000000..d865c3c8f2
--- /dev/null
+++ b/tests/gpu/src/nvidia/TestTranspose.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
+ *
+ * NVIDIA Corporation and its licensors retain all intellectual
+ * property and proprietary rights in and to this software and
+ * related documentation and any modifications thereto.
+ * Any use, reproduction, disclosure, or distribution of this
+ * software and related documentation without an express license
+ * agreement from NVIDIA Corporation is strictly prohibited.
+ */
+#ifndef _testtranspose_h_
+#define _testtranspose_h_
+
+#include "NCVTest.hpp"
+#include "NCVTestSourceProvider.hpp"
+
+
+template <class T>
+class TestTranspose : public NCVTestProvider
+{
+public:
+
+    TestTranspose(std::string testName, NCVTestSourceProvider<T> &src,
+                  Ncv32u width, Ncv32u height);
+
+    virtual bool init();
+    virtual bool process();
+    virtual bool deinit();
+    virtual bool toString(std::ofstream &strOut);
+
+private:
+
+    TestTranspose(const TestTranspose&);
+    TestTranspose& operator=(const TestTranspose&);
+
+    NCVTestSourceProvider<T> &src;
+    Ncv32u width;
+    Ncv32u height;
+};
+
+#endif // _testtranspose_h_
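main_nvidia.cpp below assembles every suite with small generate*Tests helpers: each NCVAutoTestLister appears to take ownership of a group of NCVTestProvider instances, and invoke() runs them and reports the results. Registration and execution for a single suite looks roughly like this (a sketch; constructor arguments beyond the suite name are omitted, and the suite/test names are ours):

    #include "NCVAutoTestLister.hpp"
    #include "NCVTestSourceProvider.hpp"
    #include "TestIntegralImage.h"

    void runOneSuite()
    {
        NCVAutoTestLister lister("Example suite");
        // Seeded random 8-bit source on a 4096x4096 canvas, same arguments as in main below.
        NCVTestSourceProvider<Ncv8u> src(2010, 0, 255, 4096, 4096);
        lister.add(new TestIntegralImage<Ncv8u, Ncv32u>("example", src, 640, 480));
        lister.invoke();   // runs every registered test
    }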
diff --git a/tests/gpu/src/nvidia/main_nvidia.cpp b/tests/gpu/src/nvidia/main_nvidia.cpp
new file mode 100644
index 0000000000..e4f2119c30
--- /dev/null
+++ b/tests/gpu/src/nvidia/main_nvidia.cpp
@@ -0,0 +1,346 @@
+#pragma warning (disable : 4408 4201 4100)
+
+#include <cstdio>
+
+#include "NCV.hpp"
+#include "NCVHaarObjectDetection.hpp"
+
+#include "TestIntegralImage.h"
+#include "TestIntegralImageSquared.h"
+#include "TestRectStdDev.h"
+#include "TestResize.h"
+#include "TestCompact.h"
+#include "TestTranspose.h"
+
+#include "TestDrawRects.h"
+#include "TestHypothesesGrow.h"
+#include "TestHypothesesFilter.h"
+#include "TestHaarCascadeLoader.h"
+#include "TestHaarCascadeApplication.h"
+
+#include "NCVAutoTestLister.hpp"
+#include "NCVTestSourceProvider.hpp"
+
+
+template <class T_in, class T_out>
+void generateIntegralTests(NCVAutoTestLister &testLister, NCVTestSourceProvider<T_in> &src,
+                           Ncv32u maxWidth, Ncv32u maxHeight)
+{
+    for (Ncv32f _i=1.0; _i<maxWidth; _i*=1.2f)
+    {
+        Ncv32u i = (Ncv32u)_i;
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "LinIntImgW%dH%d", i, 2);
+        testLister.add(new TestIntegralImage<T_in, T_out>(testName, src, i, 2));
+    }
+    for (Ncv32f _i=1.0; _i<maxHeight; _i*=1.2f)
+    {
+        Ncv32u i = (Ncv32u)_i;
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "LinIntImgW%dH%d", 2, i);
+        testLister.add(new TestIntegralImage<T_in, T_out>(testName, src, 2, i));
+    }
+
+    //test VGA
+    testLister.add(new TestIntegralImage<T_in, T_out>("LinIntImg_VGA", src, 640, 480));
+
+    //TODO: add tests of various resolutions up to 4096x4096
+}
+
+
+void generateSquaredIntegralTests(NCVAutoTestLister &testLister, NCVTestSourceProvider<Ncv8u> &src,
+                                  Ncv32u maxWidth, Ncv32u maxHeight)
+{
+    for (Ncv32f _i=1.0; _i<maxWidth; _i*=1.2f)
+    {
+        Ncv32u i = (Ncv32u)_i;
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "SqIntImgW%dH%d", i, 32);
+        testLister.add(new TestIntegralImageSquared(testName, src, i, 32));
+    }
+    for (Ncv32f _i=1.0; _i<maxHeight; _i*=1.2f)
+    {
+        Ncv32u i = (Ncv32u)_i;
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "SqIntImgW%dH%d", 32, i);
+        testLister.add(new TestIntegralImageSquared(testName, src, 32, i));
+    }
+
+    //test VGA
+    testLister.add(new TestIntegralImageSquared("SqLinIntImg_VGA", src, 640, 480));
+
+    //TODO: add tests of various resolutions up to 4096x4096
+}
+
+
+void generateRectStdDevTests(NCVAutoTestLister &testLister, NCVTestSourceProvider<Ncv8u> &src,
+                             Ncv32u maxWidth, Ncv32u maxHeight)
+{
+    NcvRect32u rect(1,1,18,18);
+
+    for (Ncv32f _i=32; _i<maxHeight && _i<maxWidth; _i*=1.2f)
+    {
+        Ncv32u i = (Ncv32u)_i;
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "RectStdDevW%dH%d", i*2, i);
+        testLister.add(new TestRectStdDev(testName, src, i*2, i, rect, 1, true));
+        testLister.add(new TestRectStdDev(testName, src, i-1, i*2-1, rect, 1.5, false));
+        testLister.add(new TestRectStdDev(testName, src, i-1, i*2-1, rect, -0.5, true));
+    }
+
+    //test VGA
+    testLister.add(new TestRectStdDev("RectStdDev_VGA", src, 640, 480, rect, 1, true));
+
+    //TODO: add tests of various resolutions up to 4096x4096
+}
+
+
+template <class T>
+void generateResizeTests(NCVAutoTestLister &testLister, NCVTestSourceProvider<T> &src)
+{
+    //test VGA
+    for (Ncv32u i=1; i<480; i+=3)
+    {
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "TestResize_VGA_s%d", i);
+        testLister.add(new TestResize<T>(testName, src, 640, 480, i, true));
+        testLister.add(new TestResize<T>(testName, src, 640, 480, i, false));
+    }
+
+    //test HD
+    for (Ncv32u i=1; i<1080; i+=5)
+    {
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "TestResize_1080_s%d", i);
+        testLister.add(new TestResize<T>(testName, src, 1920, 1080, i, true));
+        testLister.add(new TestResize<T>(testName, src, 1920, 1080, i, false));
+    }
+
+    //TODO: add tests of various resolutions up to 4096x4096
+}
+
+
+void generateNPPSTVectorTests(NCVAutoTestLister &testLister, NCVTestSourceProvider<Ncv32u> &src, Ncv32u maxLength)
+{
+    //compaction
+    for (Ncv32f _i=256.0; _i<maxLength; _i*=1.1f)
+    {
+        Ncv32u i = (Ncv32u)_i;
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "Compaction%d", i);
+        testLister.add(new TestCompact(testName, src, i, 0xFFFFFFFF, 30));
+    }
+}
+
+
+template <class T>
+void generateTransposeTests(NCVAutoTestLister &testLister, NCVTestSourceProvider<T> &src)
+{
+    for (int i=2; i<64; i+=4)
+    {
+        for (int j=2; j<64; j+=4)
+        {
+            char testName[80];
+            sprintf_s(testName, sizeof(testName), "TestTranspose_%dx%d", i, j);
+            testLister.add(new TestTranspose<T>(testName, src, i, j));
+        }
+    }
+
+    for (int i=1; i<128; i+=1)
+    {
+        for (int j=1; j<2; j+=1)
+        {
+            char testName[80];
+            sprintf_s(testName, sizeof(testName), "TestTranspose_%dx%d", i, j);
+            testLister.add(new TestTranspose<T>(testName, src, i, j));
+        }
+    }
+
+    testLister.add(new TestTranspose<T>("TestTranspose_VGA", src, 640, 480));
+    testLister.add(new TestTranspose<T>("TestTranspose_HD1080", src, 1920, 1080));
+}
+
+
+template <class T>
+void generateDrawRectsTests(NCVAutoTestLister &testLister, NCVTestSourceProvider<T> &src, NCVTestSourceProvider<Ncv32u> &src32u,
+                            Ncv32u maxWidth, Ncv32u maxHeight)
+{
+    for (Ncv32f _i=16.0; _i<maxWidth; _i*=1.1f)
+    {
+        Ncv32u i = (Ncv32u)_i;
+        Ncv32u j = maxHeight * i / maxWidth;
+        if (!j) continue;
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "DrawRectsW%dH%d", i, j);
+        if (sizeof(T) == sizeof(Ncv32u))
+        {
+            testLister.add(new TestDrawRects<T>(testName, src, src32u, i, j, i*j/1000+1, (T)0xFFFFFFFF));
+        }
+        else if (sizeof(T) == sizeof(Ncv8u))
+        {
+            testLister.add(new TestDrawRects<T>(testName, src, src32u, i, j, i*j/1000+1, (T)0xFF));
+        }
+        else
+        {
+            ncvAssertPrintCheck(false, "Attempted to instantiate non-existing DrawRects test suite");
+        }
+    }
+
+    //test VGA
+    testLister.add(new TestDrawRects<T>("DrawRects_VGA", src, src32u, 640, 480, 640*480/1000, (T)0xFF));
+
+    //TODO: add tests of various resolutions up to 4096x4096
+}
+
+
+void generateVectorTests(NCVAutoTestLister &testLister, NCVTestSourceProvider<Ncv32u> &src, Ncv32u maxLength)
+{
+    //growth
+    for (Ncv32f _i=10.0; _i<maxLength; _i*=1.1f)
+    {
+        Ncv32u i = (Ncv32u)_i;
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "VectorGrow%d", i);
+        testLister.add(new TestHypothesesGrow(testName, src, 20, 20, 2.2f, i, i/2, i, i/4));
+        testLister.add(new TestHypothesesGrow(testName, src, 10, 42, 1.2f, i, i, i, 0));
+    }
+    testLister.add(new TestHypothesesGrow("VectorGrow01b", src, 10, 42, 1.2f, 10, 0, 10, 1));
+    testLister.add(new TestHypothesesGrow("VectorGrow11b", src, 10, 42, 1.2f, 10, 1, 10, 1));
+}
+
+
+void generateHypothesesFiltrationTests(NCVAutoTestLister &testLister, NCVTestSourceProvider<Ncv32u> &src, Ncv32u maxLength)
+{
+    for (Ncv32f _i=1.0; _i<maxLength; _i*=1.1f)
+    {
+        Ncv32u i = (Ncv32u)_i;
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "HypFilter%d", i);
+        testLister.add(new TestHypothesesFilter(testName, src, i, 3, 0.2f));
+        testLister.add(new TestHypothesesFilter(testName, src, i, 0, 0.2f));
+        testLister.add(new TestHypothesesFilter(testName, src, i, 1, 0.1f));
+    }
+}
+
+
+void generateHaarLoaderTests(NCVAutoTestLister &testLister)
+{
+    testLister.add(new TestHaarCascadeLoader("haarcascade_eye.xml", "haarcascade_eye.xml"));
+    testLister.add(new TestHaarCascadeLoader("haarcascade_frontalface_alt.xml", "haarcascade_frontalface_alt.xml"));
+    testLister.add(new TestHaarCascadeLoader("haarcascade_frontalface_alt2.xml", "haarcascade_frontalface_alt2.xml"));
+    testLister.add(new TestHaarCascadeLoader("haarcascade_frontalface_alt_tree.xml", "haarcascade_frontalface_alt_tree.xml"));
+    testLister.add(new TestHaarCascadeLoader("haarcascade_eye_tree_eyeglasses.xml", "haarcascade_eye_tree_eyeglasses.xml"));
+}
+
+
+void generateHaarApplicationTests(NCVAutoTestLister &testLister, NCVTestSourceProvider<Ncv8u> &src,
+                                  Ncv32u maxWidth, Ncv32u maxHeight)
+{
+    for (Ncv32u i=20; i<512; i+=11)
+    {
+        for (Ncv32u j=20; j<128; j+=5)
+        {
+            char testName[80];
+            sprintf_s(testName, sizeof(testName), "HaarAppl%d_%d", i, j);
+            testLister.add(new TestHaarCascadeApplication(testName, src, "haarcascade_frontalface_alt.xml", j, i));
+        }
+    }
+    for (Ncv32f _i=20.0; _i<maxWidth; _i*=1.1f)
+    {
+        Ncv32u i = (Ncv32u)_i;
+        char testName[80];
+        sprintf_s(testName, sizeof(testName), "HaarAppl%d", i);
+        testLister.add(new TestHaarCascadeApplication(testName, src, "haarcascade_frontalface_alt.xml", i, i));
+    }
+}
+
+
+int main_nvidia()
+{
+    NCVAutoTestLister testListerII("NPPST Integral Image");
+    NCVAutoTestLister testListerSII("NPPST Squared Integral Image");
+    NCVAutoTestLister testListerRStdDev("NPPST RectStdDev");
+    NCVAutoTestLister testListerResize("NPPST Resize");
+    NCVAutoTestLister testListerNPPSTVectorOperations("NPPST Vector Operations");
+    NCVAutoTestLister testListerTranspose("NPPST Transpose");
+
+    NCVAutoTestLister testListerVisualize("NCV Visualization");
+    NCVAutoTestLister testListerVectorOperations("NCV Vector Operations");
+    NCVAutoTestLister testListerHypFiltration("NCV Hypotheses Filtration");
+    NCVAutoTestLister testListerHaarLoader("NCV Haar Cascade Loader");
+    NCVAutoTestLister testListerHaarAppl("NCV Haar Cascade Application");
+
+    NCVTestSourceProvider<Ncv32u> testSrcRandom_32u(2010, 0, 0xFFFFFFFF, 4096, 4096);
+    NCVTestSourceProvider<Ncv8u> testSrcRandom_8u(2010, 0, 255, 4096, 4096);
+    NCVTestSourceProvider<Ncv64u> testSrcRandom_64u(2010, 0, 0xFFFFFFFFFFFFFFFF, 4096, 4096);
+    NCVTestSourceProvider<Ncv8u> testSrcFacesVGA_8u("../../data/group_1_640x480_VGA.pgm");
+    NCVTestSourceProvider<Ncv32f> testSrcRandom_32f(2010, -1.0f, 1.0f, 4096, 4096);
+
+    printf("Generating NPPST test suites\n");
+    generateIntegralTests<Ncv8u, Ncv32u>(testListerII, testSrcRandom_8u, 4096, 4096);
+    generateIntegralTests<Ncv32f, Ncv32f>(testListerII, testSrcRandom_32f, 4096, 4096);
+    generateSquaredIntegralTests(testListerSII, testSrcRandom_8u, 4096, 4096);
+    generateRectStdDevTests(testListerRStdDev, testSrcRandom_8u, 4096, 4096);
+    generateResizeTests(testListerResize, testSrcRandom_32u);
+    generateResizeTests(testListerResize, testSrcRandom_64u);
+    generateNPPSTVectorTests(testListerNPPSTVectorOperations, testSrcRandom_32u, 4096*4096);
+    generateTransposeTests(testListerTranspose, testSrcRandom_32u);
+    generateTransposeTests(testListerTranspose, testSrcRandom_64u);
+
+    printf("Generating NCV test suites\n");
+    generateDrawRectsTests(testListerVisualize, testSrcRandom_8u, testSrcRandom_32u, 4096, 4096);
+    generateDrawRectsTests(testListerVisualize, testSrcRandom_32u, testSrcRandom_32u, 4096, 4096);
+    generateVectorTests(testListerVectorOperations, testSrcRandom_32u, 4096*4096);
+    generateHypothesesFiltrationTests(testListerHypFiltration, testSrcRandom_32u, 1024);
+    generateHaarLoaderTests(testListerHaarLoader);
+    generateHaarApplicationTests(testListerHaarAppl, testSrcFacesVGA_8u, 1280, 720);
+
+    testListerII.invoke();
+    testListerSII.invoke();
+    testListerRStdDev.invoke();
+    testListerResize.invoke();
+    testListerNPPSTVectorOperations.invoke();
+    testListerTranspose.invoke();
+    testListerVisualize.invoke();
+    testListerVectorOperations.invoke();
+    testListerHypFiltration.invoke();
+    testListerHaarLoader.invoke();
+    testListerHaarAppl.invoke();
+
+    return 0;
+}
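A portability note: the generators above format test names with sprintf_s, which is an MSVC CRT function (consistent with the MSVC-only #pragma warning at the top of the file). On other toolchains a shim would be needed before this file compiles; one possible form (ours, not part of the patch):

    // Possible portability shim for non-MSVC builds (illustrative only).
    #include <stdio.h>

    #ifndef _MSC_VER
    #define sprintf_s snprintf   // same (buffer, size, format, ...) argument order
    #endif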
diff --git a/tests/gpu/src/nvidia_tests.cpp b/tests/gpu/src/nvidia_tests.cpp
new file mode 100644
index 0000000000..04d6436829
--- /dev/null
+++ b/tests/gpu/src/nvidia_tests.cpp
@@ -0,0 +1,63 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                        Intel License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000, Intel Corporation, all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of Intel Corporation may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "gputest.hpp"
+#include "cvconfig.h"
+
+class CV_NVidiaTestsCaller : public CvTest
+{
+public:
+    CV_NVidiaTestsCaller() : CvTest("GPU-NVidia", "NVidia") {}
+    virtual ~CV_NVidiaTestsCaller() {}
+
+protected:
+
+    void run( int )
+    {
+#if defined(HAVE_CUDA)
+        int main_nvidia();
+        main_nvidia();
+        ts->set_failed_test_info(CvTS::OK);
+#else
+        ts->set_failed_test_info(CvTS::SKIPPED);
+#endif
+    }
+} CV_NVidiaTestsCaller_test;
\ No newline at end of file