moved GpuMat's operations implementation to core module

pull/13383/head
Vladislav Vinogradov 13 years ago
parent 0f53f2993e
commit 2695039a79
  1. 130  modules/core/CMakeLists.txt
  2. 22   modules/core/include/opencv2/core/gpumat.hpp
  3. 50   modules/core/src/cuda/matrix_operations.cu
  4. 499  modules/core/src/gpumat.cpp
  5. 4    modules/gpu/include/opencv2/gpu/gpu.hpp
  6. 32   modules/gpu/src/cuda/internal_shared.hpp
  7. 32   modules/gpu/src/cuda/safe_call.hpp
  8. 50   modules/gpu/src/cudastream.cpp
  9. 16   modules/gpu/src/error.cpp
  10. 374 modules/gpu/src/initialization.cpp
  11. 1   modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp
  12. 1   modules/gpu/src/opencv2/gpu/device/color.hpp
  13. 100 modules/gpu/src/opencv2/gpu/device/common.hpp
  14. 2   modules/gpu/src/opencv2/gpu/device/datamov_utils.hpp
  15. 2   modules/gpu/src/opencv2/gpu/device/detail/color_detail.hpp
  16. 2   modules/gpu/src/opencv2/gpu/device/detail/transform_detail.hpp
  17. 2   modules/gpu/src/opencv2/gpu/device/detail/type_traits_detail.hpp
  18. 2   modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp
  19. 1   modules/gpu/src/opencv2/gpu/device/detail/vec_distance_detail.hpp
  20. 21  modules/gpu/src/opencv2/gpu/device/dynamic_smem.hpp
  21. 1   modules/gpu/src/opencv2/gpu/device/emulation.hpp
  22. 1   modules/gpu/src/opencv2/gpu/device/filters.hpp
  23. 1   modules/gpu/src/opencv2/gpu/device/funcattrib.hpp
  24. 1   modules/gpu/src/opencv2/gpu/device/functional.hpp
  25. 2   modules/gpu/src/opencv2/gpu/device/limits.hpp
  26. 2   modules/gpu/src/opencv2/gpu/device/saturate_cast.hpp
  27. 2   modules/gpu/src/opencv2/gpu/device/transform.hpp
  28. 1   modules/gpu/src/opencv2/gpu/device/type_traits.hpp
  29. 1   modules/gpu/src/opencv2/gpu/device/utility.hpp
  30. 1   modules/gpu/src/opencv2/gpu/device/vec_distance.hpp
  31. 1   modules/gpu/src/opencv2/gpu/device/vec_math.hpp
  32. 2   modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
  33. 2   modules/gpu/src/opencv2/gpu/device/warp.hpp
  34. 2   modules/gpu/src/opencv2/gpu/device/warp_reduce.hpp
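For orientation, a minimal usage sketch (not part of the commit) of the GpuMat operations whose implementation this change relocates into modules/core. The include paths and the build-time selection between the CUDA-backed CudaFuncTable and the stub EmptyFuncTable are assumptions read off the diff below.

// Hypothetical example, not part of this commit: exercises the GpuMat
// operations now implemented in modules/core/src/gpumat.cpp.
#include <opencv2/core/core.hpp>
#include <opencv2/core/gpumat.hpp>

int main()
{
    cv::Mat host(480, 640, CV_8UC1, cv::Scalar(0));

    cv::gpu::GpuMat dev;
    dev.upload(host);                 // copy() entry of the active GpuFuncTable
    dev.setTo(cv::Scalar(128));       // setTo() entry (NPP call or custom kernel)

    cv::gpu::GpuMat dev32f;
    dev.convertTo(dev32f, CV_32F);    // convert() entry
    dev32f.download(host);            // copy() back to host memory

    return 0;                         // in a CUDA-less build the stub EmptyFuncTable throws instead
}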

@@ -3,4 +3,132 @@ if(ZLIB_FOUND)
else()
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../../3rdparty/zlib")
endif()
define_opencv_module(core ${ZLIB_LIBRARY})
#define_opencv_module(core ${ZLIB_LIBRARY})
set(name "core")
project(opencv_${name})
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include"
"${CMAKE_CURRENT_SOURCE_DIR}/src"
"${CMAKE_CURRENT_BINARY_DIR}")
file(GLOB lib_srcs "src/*.cpp")
file(GLOB lib_int_hdrs "src/*.h*")
file(GLOB lib_hdrs "include/opencv2/${name}/*.h*")
file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.h*")
if(COMMAND get_module_external_sources)
get_module_external_sources(${name})
endif()
source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs})
source_group("Include" FILES ${lib_hdrs})
source_group("Include\\detail" FILES ${lib_hdrs_detail})
list(APPEND lib_hdrs ${lib_hdrs_detail})
if (HAVE_CUDA)
file(GLOB lib_cuda "src/cuda/*.cu")
source_group("Cuda" FILES "${lib_cuda}")
include_directories(${CUDA_INCLUDE_DIRS})
include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/src")
include_directories("${OpenCV_SOURCE_DIR}/modules/gpu/src/cuda")
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode arch=compute_10,code=sm_10
-gencode arch=compute_11,code=sm_11
-gencode arch=compute_12,code=sm_12
-gencode arch=compute_13,code=sm_13
-gencode arch=compute_20,code=sm_20
-gencode arch=compute_20,code=sm_21)
if (UNIX OR APPLE)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fPIC;")
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" "-fPIC")
endif()
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-keep")
#set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;/EHsc-;")
if (APPLE)
set (CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fno-finite-math-only;")
endif()
CUDA_COMPILE(cuda_objs ${lib_cuda})
#CUDA_BUILD_CLEAN_TARGET()
endif()
set(the_target "opencv_${name}")
add_library(${the_target} ${lib_srcs} ${lib_hdrs} ${lib_int_hdrs} ${lib_cuda} ${cuda_objs})
# For dynamic link numbering conventions
if(NOT ANDROID)
# Android SDK build scripts can include only .so files into final .apk
# As a result we should not set version properties for Android
set_target_properties(${the_target} PROPERTIES
VERSION ${OPENCV_VERSION}
SOVERSION ${OPENCV_SOVERSION}
)
endif()
set_target_properties(${the_target} PROPERTIES OUTPUT_NAME "${the_target}${OPENCV_DLLVERSION}" )
if(ENABLE_SOLUTION_FOLDERS)
set_target_properties(${the_target} PROPERTIES FOLDER "modules")
endif()
if (BUILD_SHARED_LIBS)
if(MSVC)
set_target_properties(${the_target} PROPERTIES DEFINE_SYMBOL CVAPI_EXPORTS)
else()
add_definitions(-DCVAPI_EXPORTS)
endif()
endif()
# Additional target properties
set_target_properties(${the_target} PROPERTIES
DEBUG_POSTFIX "${OPENCV_DEBUG_POSTFIX}"
ARCHIVE_OUTPUT_DIRECTORY ${LIBRARY_OUTPUT_PATH}
RUNTIME_OUTPUT_DIRECTORY ${EXECUTABLE_OUTPUT_PATH}
INSTALL_NAME_DIR lib
)
# Add the required libraries for linking:
target_link_libraries(${the_target} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${ZLIB_LIBRARY})
if (HAVE_CUDA)
target_link_libraries(${the_target} ${CUDA_LIBRARIES})
unset(CUDA_npp_LIBRARY CACHE)
find_cuda_helper_libs(npp)
target_link_libraries(${the_target} ${CUDA_npp_LIBRARY})
endif()
if(MSVC)
if(CMAKE_CROSSCOMPILING)
set_target_properties(${the_target} PROPERTIES
LINK_FLAGS "/NODEFAULTLIB:secchk"
)
endif()
set_target_properties(${the_target} PROPERTIES
LINK_FLAGS "/NODEFAULTLIB:libc /DEBUG"
)
endif()
# Dependencies of this target:
add_dependencies(${the_target} ${ZLIB_LIBRARY})
install(TARGETS ${the_target}
RUNTIME DESTINATION bin COMPONENT main
LIBRARY DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT main
ARCHIVE DESTINATION ${OPENCV_LIB_INSTALL_PATH} COMPONENT main)
install(FILES ${lib_hdrs}
DESTINATION ${OPENCV_INCLUDE_PREFIX}/opencv2/${name}
COMPONENT main)
add_opencv_precompiled_headers(${the_target})
define_opencv_test(${name})
define_opencv_perf_test(${name})

@@ -212,27 +212,9 @@ namespace cv { namespace gpu
CV_EXPORTS void ensureSizeIsEnough(int rows, int cols, int type, GpuMat& m);
CV_EXPORTS void ensureSizeIsEnough(Size size, int type, GpuMat& m);
class CV_EXPORTS GpuFuncTable
{
public:
virtual ~GpuFuncTable() {}
virtual void copy(const Mat& src, GpuMat& dst) const = 0;
virtual void copy(const GpuMat& src, Mat& dst) const = 0;
virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;
virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;
virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const = 0;
virtual void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const = 0;
virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
virtual void free(void* devPtr) const = 0;
};
CV_EXPORTS void setGpuFuncTable(const GpuFuncTable* funcTbl);
//////////////////////////////// Error handling ////////////////////////
CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func);
////////////////////////////////////////////////////////////////////////

@@ -40,7 +40,6 @@
//
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
@@ -75,7 +74,7 @@ namespace cv { namespace gpu { namespace device
}
template<typename T>
void copy_to_with_mask_run(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream)
void copy_to_with_mask_run(DevMem2Db mat_src, DevMem2Db mat_dst, DevMem2Db mask, int channels, cudaStream_t stream)
{
dim3 threadsPerBlock(16,16, 1);
dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
@@ -88,9 +87,9 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall ( cudaDeviceSynchronize() );
}
void copy_to_with_mask(const DevMem2Db& mat_src, DevMem2Db mat_dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream)
void copy_to_with_mask(DevMem2Db mat_src, DevMem2Db mat_dst, int depth, DevMem2Db mask, int channels, cudaStream_t stream)
{
typedef void (*CopyToFunc)(const DevMem2Db& mat_src, const DevMem2Db& mat_dst, const DevMem2Db& mask, int channels, const cudaStream_t & stream);
typedef void (*CopyToFunc)(DevMem2Db mat_src, DevMem2Db mat_dst, DevMem2Db mask, int channels, cudaStream_t stream);
static CopyToFunc tab[8] =
{
@@ -106,7 +105,8 @@ namespace cv { namespace gpu { namespace device
CopyToFunc func = tab[depth];
if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);
if (func == 0)
cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);
func(mat_src, mat_dst, mask, channels, stream);
}
@@ -188,7 +188,7 @@ namespace cv { namespace gpu { namespace device
}
}
template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream)
void set_to_gpu(DevMem2Db mat, const T* scalar, DevMem2Db mask, int channels, cudaStream_t stream)
{
writeScalar(scalar);
@@ -202,16 +202,16 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall ( cudaDeviceSynchronize() );
}
template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<uchar >(DevMem2Db mat, const uchar* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<schar >(DevMem2Db mat, const schar* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<ushort>(DevMem2Db mat, const ushort* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<short >(DevMem2Db mat, const short* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
template void set_to_gpu<int >(const DevMem2Db& mat, const int* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<int >(DevMem2Db mat, const int* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<float >(DevMem2Db mat, const float* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
template void set_to_gpu<double>(DevMem2Db mat, const double* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream)
void set_to_gpu(DevMem2Db mat, const T* scalar, int channels, cudaStream_t stream)
{
writeScalar(scalar);
@@ -225,13 +225,13 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall ( cudaDeviceSynchronize() );
}
template void set_to_gpu<uchar >(const DevMem2Db& mat, const uchar* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<uchar >(DevMem2Db mat, const uchar* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<schar >(const DevMem2Db& mat, const schar* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<schar >(DevMem2Db mat, const schar* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<ushort>(const DevMem2Db& mat, const ushort* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<ushort>(DevMem2Db mat, const ushort* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<short >(const DevMem2Db& mat, const short* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<short >(DevMem2Db mat, const short* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<int >(const DevMem2Db& mat, const int* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<int >(DevMem2Db mat, const int* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<float >(const DevMem2Db& mat, const float* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<float >(DevMem2Db mat, const float* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<double>(const DevMem2Db& mat, const double* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<double>(DevMem2Db mat, const double* scalar, int channels, cudaStream_t stream);
///////////////////////////////////////////////////////////////////////////
//////////////////////////////// ConvertTo ////////////////////////////////
@@ -298,7 +298,7 @@ namespace cv { namespace gpu { namespace device
};
template<typename T, typename D>
void cvt_(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta, cudaStream_t stream)
void cvt_(DevMem2Db src, DevMem2Db dst, double alpha, double beta, cudaStream_t stream)
{
cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
cudaSafeCall( cudaSetDoubleForDevice(&beta) );
@@ -306,11 +306,9 @@ namespace cv { namespace gpu { namespace device
::cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream);
}
void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta,
cudaStream_t stream = 0)
void convert_gpu(DevMem2Db src, int sdepth, DevMem2Db dst, int ddepth, double alpha, double beta, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2Db& src, const DevMem2Db& dst, double alpha, double beta,
cudaStream_t stream);
typedef void (*caller_t)(DevMem2Db src, DevMem2Db dst, double alpha, double beta, cudaStream_t stream);
static const caller_t tab[8][8] =
{

@@ -43,6 +43,14 @@
#include "precomp.hpp"
#include "opencv2/core/gpumat.hpp"
#include <iostream>
#include <sstream>
#ifdef HAVE_CUDA
#include <cuda_runtime.h>
#include <npp.h>
#endif
using namespace std;
using namespace cv;
using namespace cv::gpu;
@@ -283,6 +291,31 @@ cv::Mat::Mat(const GpuMat& m) : flags(0), dims(0), rows(0), cols(0), data(0), re
m.download(*this);
}
namespace
{
class CV_EXPORTS GpuFuncTable
{
public:
virtual ~GpuFuncTable() {}
virtual void copy(const Mat& src, GpuMat& dst) const = 0;
virtual void copy(const GpuMat& src, Mat& dst) const = 0;
virtual void copy(const GpuMat& src, GpuMat& dst) const = 0;
virtual void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const = 0;
virtual void convert(const GpuMat& src, GpuMat& dst) const = 0;
virtual void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const = 0;
virtual void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const = 0;
virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
virtual void free(void* devPtr) const = 0;
};
}
#ifndef HAVE_CUDA
namespace
{
void throw_nogpu()
@@ -308,20 +341,460 @@ namespace
void free(void*) const {}
};
const GpuFuncTable* g_funcTbl = 0;
const GpuFuncTable* gpuFuncTable()
{
static EmptyFuncTable empty;
return g_funcTbl ? g_funcTbl : &empty;
return &empty;
}
}
#else // HAVE_CUDA
namespace cv { namespace gpu { namespace device
{
void copy_to_with_mask(DevMem2Db src, DevMem2Db dst, int depth, DevMem2Db mask, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(DevMem2Db mat, const T* scalar, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(DevMem2Db mat, const T* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
void convert_gpu(DevMem2Db src, int sdepth, DevMem2Db dst, int ddepth, double alpha, double beta, cudaStream_t stream);
}}}
namespace
{
#if defined(__GNUC__)
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)
#define nppSafeCall(expr) ___nppSafeCall(expr, __FILE__, __LINE__)
#endif
inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
if (cudaSuccess != err)
cv::gpu::error(cudaGetErrorString(err), file, line, func);
}
inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (err < 0)
{
std::ostringstream msg;
msg << "NPP API Call Error: " << err;
cv::gpu::error(msg.str().c_str(), file, line, func);
}
}
}
void cv::gpu::setGpuFuncTable(const GpuFuncTable* funcTbl)
{
g_funcTbl = funcTbl;
}
namespace
{
template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, cudaStream_t stream)
{
Scalar_<T> sf = s;
::cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), stream);
}
template <typename T> void kernelSetCaller(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
{
Scalar_<T> sf = s;
::cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), stream);
}
}
namespace cv { namespace gpu
{
CV_EXPORTS void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0)
{
::cv::gpu::device::copy_to_with_mask(src, dst, src.depth(), mask, src.channels(), stream);
}
CV_EXPORTS void convertTo(const GpuMat& src, GpuMat& dst)
{
::cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0);
}
CV_EXPORTS void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0)
{
::cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream);
}
CV_EXPORTS void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
{
typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream);
static const caller_t callers[] =
{
kernelSetCaller<uchar>, kernelSetCaller<schar>, kernelSetCaller<ushort>, kernelSetCaller<short>, kernelSetCaller<int>,
kernelSetCaller<float>, kernelSetCaller<double>
};
callers[src.depth()](src, s, stream);
}
CV_EXPORTS void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
{
typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
static const caller_t callers[] =
{
kernelSetCaller<uchar>, kernelSetCaller<schar>, kernelSetCaller<ushort>, kernelSetCaller<short>, kernelSetCaller<int>,
kernelSetCaller<float>, kernelSetCaller<double>
};
callers[src.depth()](src, s, mask, stream);
}
CV_EXPORTS void setTo(GpuMat& src, Scalar s)
{
setTo(src, s, 0);
}
CV_EXPORTS void setTo(GpuMat& src, Scalar s, const GpuMat& mask)
{
setTo(src, s, mask, 0);
}
}}
namespace
{
//////////////////////////////////////////////////////////////////////////
// Convert
template<int n> struct NPPTypeTraits;
template<> struct NPPTypeTraits<CV_8U> { typedef Npp8u npp_type; };
template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };
template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };
template<int SDEPTH, int DDEPTH> struct NppConvertFunc
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);
};
template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>
{
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode);
};
template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
static void cvt(const GpuMat& src, GpuMat& dst)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
{
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
static void cvt(const GpuMat& src, GpuMat& dst)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
//////////////////////////////////////////////////////////////////////////
// Set
template<int SDEPTH, int SCN> struct NppSetFunc
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
};
template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
};
template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
static void set(GpuMat& src, Scalar s)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
static void set(GpuMat& src, Scalar s)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, int SCN> struct NppSetMaskFunc
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
};
template<int SDEPTH> struct NppSetMaskFunc<SDEPTH, 1>
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
};
template<int SDEPTH, int SCN, typename NppSetMaskFunc<SDEPTH, SCN>::func_ptr func> struct NppSetMask
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
static void set(GpuMat& src, Scalar s, const GpuMat& mask)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
static void set(GpuMat& src, Scalar s, const GpuMat& mask)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
class CudaFuncTable : public GpuFuncTable
{
public:
void copy(const Mat& src, GpuMat& dst) const
{
cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) );
}
void copy(const GpuMat& src, Mat& dst) const
{
cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) );
}
void copy(const GpuMat& src, GpuMat& dst) const
{
cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) );
}
void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const
{
::cv::gpu::copyWithMask(src, dst, mask);
}
void convert(const GpuMat& src, GpuMat& dst) const
{
typedef void (*caller_t)(const GpuMat& src, GpuMat& dst);
static const caller_t callers[7][7][7] =
{
{
/* 8U -> 8U */ {0, 0, 0, 0},
/* 8U -> 8S */ {::cv::gpu::convertTo, ::cv::gpu::convertTo, ::cv::gpu::convertTo, ::cv::gpu::convertTo},
/* 8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::cvt},
/* 8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::cvt},
/* 8U -> 32S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 8U -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
},
{
/* 8S -> 8U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 8S -> 8S */ {0,0,0,0},
/* 8S -> 16U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 8S -> 16S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 8S -> 32S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 8S -> 32F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 8S -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
},
{
/* 16U -> 8U */ {NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::cvt},
/* 16U -> 8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 16U -> 16U */ {0,0,0,0},
/* 16U -> 16S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 16U -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
},
{
/* 16S -> 8U */ {NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::cvt},
/* 16S -> 8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 16S -> 16U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 16S -> 16S */ {0,0,0,0},
/* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 16S -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
},
{
/* 32S -> 8U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 32S -> 8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 32S -> 16U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 32S -> 16S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 32S -> 32S */ {0,0,0,0},
/* 32S -> 32F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 32S -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
},
{
/* 32F -> 8U */ {NppCvt<CV_32F, CV_8U, nppiConvert_32f8u_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 32F -> 8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::cvt,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 32F -> 32S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 32F -> 32F */ {0,0,0,0},
/* 32F -> 64F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo}
},
{
/* 64F -> 8U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 64F -> 8S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 64F -> 16U */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 64F -> 16S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 64F -> 32S */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 64F -> 32F */ {::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo,::cv::gpu::convertTo},
/* 64F -> 64F */ {0,0,0,0}
}
};
caller_t func = callers[src.depth()][dst.depth()][src.channels() - 1];
CV_DbgAssert(func != 0);
func(src, dst);
}
void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const
{
::cv::gpu::convertTo(src, dst, alpha, beta);
}
void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const
{
NppiSize sz;
sz.width = m.cols;
sz.height = m.rows;
if (mask.empty())
{
if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
{
cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) );
return;
}
if (m.depth() == CV_8U)
{
int cn = m.channels();
if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
{
int val = saturate_cast<uchar>(s[0]);
cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );
return;
}
}
typedef void (*caller_t)(GpuMat& src, Scalar s);
static const caller_t callers[7][4] =
{
{NppSet<CV_8U, 1, nppiSet_8u_C1R>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
{::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo},
{NppSet<CV_16U, 1, nppiSet_16u_C1R>::set, NppSet<CV_16U, 2, nppiSet_16u_C2R>::set, ::cv::gpu::setTo, NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
{NppSet<CV_16S, 1, nppiSet_16s_C1R>::set, NppSet<CV_16S, 2, nppiSet_16s_C2R>::set, ::cv::gpu::setTo, NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
{NppSet<CV_32S, 1, nppiSet_32s_C1R>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
{NppSet<CV_32F, 1, nppiSet_32f_C1R>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
{::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo}
};
callers[m.depth()][m.channels() - 1](m, s);
}
else
{
typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask);
static const caller_t callers[7][4] =
{
{NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},
{::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo},
{NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},
{NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},
{NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},
{NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set, ::cv::gpu::setTo, ::cv::gpu::setTo, NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},
{::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo, ::cv::gpu::setTo}
};
callers[m.depth()][m.channels() - 1](m, s, mask);
}
}
void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
{
cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
}
void free(void* devPtr) const
{
cudaFree(devPtr);
}
};
const GpuFuncTable* gpuFuncTable()
{
static CudaFuncTable funcTable;
return &funcTable;
}
}
#endif // HAVE_CUDA
void cv::gpu::GpuMat::upload(const Mat& m)
{
CV_DbgAssert(!m.empty());
@@ -458,3 +931,19 @@ void cv::gpu::GpuMat::release()
step = rows = cols = 0;
refcount = 0;
}
void cv::gpu::error(const char *error_string, const char *file, const int line, const char *func)
{
int code = CV_GpuApiCallError;
if (uncaught_exception())
{
const char* errorStr = cvErrorStr(code);
const char* function = func ? func : "unknown function";
cerr << "OpenCV Error: " << errorStr << "(" << error_string << ") in " << function << ", file " << file << ", line " << line;
cerr.flush();
}
else
cv::error( cv::Exception(code, error_string, func, file, line) );
}

@@ -139,10 +139,6 @@ private:
int minorVersion_;
};
//////////////////////////////// Error handling ////////////////////////
CV_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func);
//////////////////////////////// CudaMem ////////////////////////////////
// CudaMem is limited cv::Mat with page locked memory allocation.
// Page locked memory is only needed for async and faster copying to GPU.

@@ -49,36 +49,6 @@
#include "opencv2/gpu/devmem2d.hpp"
#include "safe_call.hpp"
#ifndef CV_PI
#define CV_PI 3.1415926535897932384626433832795
#endif
#ifndef CV_PI_F
#ifndef CV_PI
#define CV_PI_F 3.14159265f
#else
#define CV_PI_F ((float)CV_PI)
#endif
#endif
#ifdef __CUDACC__
namespace cv { namespace gpu { namespace device
{
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef signed char schar;
typedef unsigned int uint;
template<class T> static inline void bindTexture(const textureReference* tex, const DevMem2D_<T>& img)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
}
}}}
#endif
namespace cv { namespace gpu
{
enum
@@ -94,8 +64,6 @@ namespace cv { namespace gpu
// Returns true if the GPU analogue exists, false otherwise.
bool tryConvertToGpuBorderType(int cpuBorderType, int& gpuBorderType);
static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }
class NppStreamHandler
{
public:

@@ -69,36 +69,36 @@ namespace cv { namespace gpu
void ncvError(int err, const char *file, const int line, const char *func = "");
void cufftError(int err, const char *file, const int line, const char *func = "");
void cublasError(int err, const char *file, const int line, const char *func = "");
}}
static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
if (cudaSuccess != err)
cv::gpu::error(cudaGetErrorString(err), file, line, func);
}
static inline void ___nppSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (err < 0)
cv::gpu::nppError(err, file, line, func);
}
static inline void ___ncvSafeCall(int err, const char *file, const int line, const char *func = "")
{
if (NCV_SUCCESS != err)
cv::gpu::ncvError(err, file, line, func);
}
static inline void ___cufftSafeCall(cufftResult_t err, const char *file, const int line, const char *func = "")
{
if (CUFFT_SUCCESS != err)
cv::gpu::cufftError(err, file, line, func);
}
static inline void ___cublasSafeCall(cublasStatus_t err, const char *file, const int line, const char *func = "")
{
if (CUBLAS_STATUS_SUCCESS != err)
cv::gpu::cublasError(err, file, line, func);
}
}}
#endif /* __OPENCV_CUDA_SAFE_CALL_HPP__ */

@@ -71,19 +71,13 @@ cv::gpu::Stream::operator bool() const { throw_nogpu(); return false; }
#include "opencv2/gpu/stream_accessor.hpp"
namespace cv { namespace gpu { namespace device
{
void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t & stream = 0);
template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
}}}
using namespace ::cv::gpu::device;
namespace cv { namespace gpu
{
void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream);
void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream);
void setTo(GpuMat& src, Scalar s, cudaStream_t stream);
void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
}}
struct Stream::Impl
{
@@ -99,20 +93,6 @@ namespace
size_t bwidth = src.cols * src.elemSize();
cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, k, s) );
};
template <typename T>
void kernelSet(GpuMat& src, const Scalar& s, cudaStream_t stream)
{
Scalar_<T> sf = s;
set_to_gpu(src, sf.val, src.channels(), stream);
}
template <typename T>
void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream)
{
Scalar_<T> sf = s;
set_to_gpu(src, sf.val, mask, src.channels(), stream);
}
}
CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream) { return stream.impl ? stream.impl->stream : 0; };
@@ -208,13 +188,7 @@ void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar s)
}
}
typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, cudaStream_t stream);
static const set_caller_t set_callers[] =
{
kernelSet<uchar>, kernelSet<schar>, kernelSet<ushort>, kernelSet<short>,
kernelSet<int>, kernelSet<float>, kernelSet<double>
};
set_callers[src.depth()](src, s, impl->stream);
setTo(src, s, impl->stream);
}
void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
@@ -224,13 +198,7 @@ void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
CV_Assert(mask.type() == CV_8UC1);
typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream);
static const set_caller_t set_callers[] =
{
kernelSetMask<uchar>, kernelSetMask<schar>, kernelSetMask<ushort>, kernelSetMask<short>,
kernelSetMask<int>, kernelSetMask<float>, kernelSetMask<double>
};
set_callers[src.depth()](src, val, mask, impl->stream);
setTo(src, val, mask, impl->stream);
}
void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta)
@@ -258,7 +226,7 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype,
psrc = &(temp = src);
dst.create( src.size(), rtype );
convert_gpu(psrc->reshape(1), sdepth, dst.reshape(1), ddepth, alpha, beta, impl->stream);
convertTo(src, dst, alpha, beta, impl->stream);
}
cv::gpu::Stream::operator bool() const

@@ -220,22 +220,6 @@ namespace cv
{
namespace gpu
{
void error(const char *error_string, const char *file, const int line, const char *func)
{
int code = CV_GpuApiCallError;
if (uncaught_exception())
{
const char* errorStr = cvErrorStr(code);
const char* function = func ? func : "unknown function";
cerr << "OpenCV Error: " << errorStr << "(" << error_string << ") in " << function << ", file " << file << ", line " << line;
cerr.flush();
}
else
cv::error( cv::Exception(code, error_string, func, file, line) );
}
void nppError(int code, const char *file, const int line, const char *func)
{
string msg = getErrorString(code, npp_errors, npp_error_num);

@@ -271,379 +271,5 @@ void cv::gpu::DeviceInfo::queryMemory(size_t& free_memory, size_t& total_memory)
setDevice(prev_device_id);
}
////////////////////////////////////////////////////////////////////
// GpuFuncTable
namespace cv { namespace gpu { namespace device
{
void copy_to_with_mask(const DevMem2Db& src, DevMem2Db dst, int depth, const DevMem2Db& mask, int channels, const cudaStream_t& stream = 0);
template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(const DevMem2Db& mat, const T* scalar, const DevMem2Db& mask, int channels, cudaStream_t stream);
void convert_gpu(const DevMem2Db& src, int sdepth, const DevMem2Db& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
}}}
namespace
{
//////////////////////////////////////////////////////////////////////////
// Convert
template<int n> struct NPPTypeTraits;
template<> struct NPPTypeTraits<CV_8U> { typedef Npp8u npp_type; };
template<> struct NPPTypeTraits<CV_16U> { typedef Npp16u npp_type; };
template<> struct NPPTypeTraits<CV_16S> { typedef Npp16s npp_type; };
template<> struct NPPTypeTraits<CV_32S> { typedef Npp32s npp_type; };
template<> struct NPPTypeTraits<CV_32F> { typedef Npp32f npp_type; };
template<int SDEPTH, int DDEPTH> struct NppConvertFunc
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
typedef NppStatus (*func_ptr)(const src_t* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI);
};
template<int DDEPTH> struct NppConvertFunc<CV_32F, DDEPTH>
{
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
typedef NppStatus (*func_ptr)(const Npp32f* pSrc, int nSrcStep, dst_t* pDst, int nDstStep, NppiSize oSizeROI, NppRoundMode eRoundMode);
};
template<int SDEPTH, int DDEPTH, typename NppConvertFunc<SDEPTH, DDEPTH>::func_ptr func> struct NppCvt
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
static void cvt(const GpuMat& src, GpuMat& dst)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<src_t>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int DDEPTH, typename NppConvertFunc<CV_32F, DDEPTH>::func_ptr func> struct NppCvt<CV_32F, DDEPTH, func>
{
typedef typename NPPTypeTraits<DDEPTH>::npp_type dst_t;
static void cvt(const GpuMat& src, GpuMat& dst)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
nppSafeCall( func(src.ptr<Npp32f>(), static_cast<int>(src.step), dst.ptr<dst_t>(), static_cast<int>(dst.step), sz, NPP_RND_NEAR) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
void convertToKernelCaller(const GpuMat& src, GpuMat& dst)
{
::cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0);
}
//////////////////////////////////////////////////////////////////////////
// Set
template<int SDEPTH, int SCN> struct NppSetFunc
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
};
template<int SDEPTH> struct NppSetFunc<SDEPTH, 1>
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI);
};
template<int SDEPTH, int SCN, typename NppSetFunc<SDEPTH, SCN>::func_ptr func> struct NppSet
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
static void set(GpuMat& src, Scalar s)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, typename NppSetFunc<SDEPTH, 1>::func_ptr func> struct NppSet<SDEPTH, 1, func>
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
static void set(GpuMat& src, Scalar s)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T>
void kernelSet(GpuMat& src, Scalar s)
{
Scalar_<T> sf = s;
::cv::gpu::device::set_to_gpu(src, sf.val, src.channels(), 0);
}
template<int SDEPTH, int SCN> struct NppSetMaskFunc
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef NppStatus (*func_ptr)(const src_t values[], src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
};
template<int SDEPTH> struct NppSetMaskFunc<SDEPTH, 1>
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
typedef NppStatus (*func_ptr)(src_t val, src_t* pSrc, int nSrcStep, NppiSize oSizeROI, const Npp8u* pMask, int nMaskStep);
};
template<int SDEPTH, int SCN, typename NppSetMaskFunc<SDEPTH, SCN>::func_ptr func> struct NppSetMask
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
static void set(GpuMat& src, Scalar s, const GpuMat& mask)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS.val, src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template<int SDEPTH, typename NppSetMaskFunc<SDEPTH, 1>::func_ptr func> struct NppSetMask<SDEPTH, 1, func>
{
typedef typename NPPTypeTraits<SDEPTH>::npp_type src_t;
static void set(GpuMat& src, Scalar s, const GpuMat& mask)
{
NppiSize sz;
sz.width = src.cols;
sz.height = src.rows;
Scalar_<src_t> nppS = s;
nppSafeCall( func(nppS[0], src.ptr<src_t>(), static_cast<int>(src.step), sz, mask.ptr<Npp8u>(), static_cast<int>(mask.step)) );
cudaSafeCall( cudaDeviceSynchronize() );
}
};
template <typename T>
void kernelSetMask(GpuMat& src, Scalar s, const GpuMat& mask)
{
Scalar_<T> sf = s;
::cv::gpu::device::set_to_gpu(src, sf.val, mask, src.channels(), 0);
}
class CudaFuncTable : public GpuFuncTable
{
public:
void copy(const Mat& src, GpuMat& dst) const
{
cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyHostToDevice) );
}
void copy(const GpuMat& src, Mat& dst) const
{
cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToHost) );
}
void copy(const GpuMat& src, GpuMat& dst) const
{
cudaSafeCall( cudaMemcpy2D(dst.data, dst.step, src.data, src.step, src.cols * src.elemSize(), src.rows, cudaMemcpyDeviceToDevice) );
}
void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask) const
{
::cv::gpu::device::copy_to_with_mask(src, dst, src.depth(), mask, src.channels());
}
void convert(const GpuMat& src, GpuMat& dst) const
{
typedef void (*caller_t)(const GpuMat& src, GpuMat& dst);
static const caller_t callers[7][7][7] =
{
{
/* 8U -> 8U */ {0, 0, 0, 0},
/* 8U -> 8S */ {convertToKernelCaller, convertToKernelCaller, convertToKernelCaller, convertToKernelCaller},
/* 8U -> 16U */ {NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_8U, CV_16U, nppiConvert_8u16u_C4R>::cvt},
/* 8U -> 16S */ {NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_8U, CV_16S, nppiConvert_8u16s_C4R>::cvt},
/* 8U -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8U -> 32F */ {NppCvt<CV_8U, CV_32F, nppiConvert_8u32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8U -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 8S -> 8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8S -> 8S */ {0,0,0,0},
/* 8S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8S -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8S -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8S -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 8S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 16U -> 8U */ {NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_16U, CV_8U, nppiConvert_16u8u_C4R>::cvt},
/* 16U -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16U -> 16U */ {0,0,0,0},
/* 16U -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16U -> 32S */ {NppCvt<CV_16U, CV_32S, nppiConvert_16u32s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16U -> 32F */ {NppCvt<CV_16U, CV_32F, nppiConvert_16u32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16U -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 16S -> 8U */ {NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,NppCvt<CV_16S, CV_8U, nppiConvert_16s8u_C4R>::cvt},
/* 16S -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16S -> 16S */ {0,0,0,0},
/* 16S -> 32S */ {NppCvt<CV_16S, CV_32S, nppiConvert_16s32s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16S -> 32F */ {NppCvt<CV_16S, CV_32F, nppiConvert_16s32f_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 16S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 32S -> 8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32S -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32S -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32S -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32S -> 32S */ {0,0,0,0},
/* 32S -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32S -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 32F -> 8U */ {NppCvt<CV_32F, CV_8U, nppiConvert_32f8u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32F -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32F -> 16U */ {NppCvt<CV_32F, CV_16U, nppiConvert_32f16u_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32F -> 16S */ {NppCvt<CV_32F, CV_16S, nppiConvert_32f16s_C1R>::cvt,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32F -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 32F -> 32F */ {0,0,0,0},
/* 32F -> 64F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller}
},
{
/* 64F -> 8U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 8S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 16U */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 16S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 32S */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 32F */ {convertToKernelCaller,convertToKernelCaller,convertToKernelCaller,convertToKernelCaller},
/* 64F -> 64F */ {0,0,0,0}
}
};
caller_t func = callers[src.depth()][dst.depth()][src.channels() - 1];
CV_DbgAssert(func != 0);
func(src, dst);
}
void convert(const GpuMat& src, GpuMat& dst, double alpha, double beta) const
{
::cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta);
}
void setTo(GpuMat& m, Scalar s, const GpuMat& mask) const
{
NppiSize sz;
sz.width = m.cols;
sz.height = m.rows;
if (mask.empty())
{
if (s[0] == 0.0 && s[1] == 0.0 && s[2] == 0.0 && s[3] == 0.0)
{
cudaSafeCall( cudaMemset2D(m.data, m.step, 0, m.cols * m.elemSize(), m.rows) );
return;
}
if (m.depth() == CV_8U)
{
int cn = m.channels();
if (cn == 1 || (cn == 2 && s[0] == s[1]) || (cn == 3 && s[0] == s[1] && s[0] == s[2]) || (cn == 4 && s[0] == s[1] && s[0] == s[2] && s[0] == s[3]))
{
int val = saturate_cast<uchar>(s[0]);
cudaSafeCall( cudaMemset2D(m.data, m.step, val, m.cols * m.elemSize(), m.rows) );
return;
}
}
typedef void (*caller_t)(GpuMat& src, Scalar s);
static const caller_t callers[7][4] =
{
{NppSet<CV_8U, 1, nppiSet_8u_C1R>::set,kernelSet<uchar>,kernelSet<uchar>,NppSet<CV_8U, 4, nppiSet_8u_C4R>::set},
{kernelSet<schar>,kernelSet<schar>,kernelSet<schar>,kernelSet<schar>},
{NppSet<CV_16U, 1, nppiSet_16u_C1R>::set,NppSet<CV_16U, 2, nppiSet_16u_C2R>::set,kernelSet<ushort>,NppSet<CV_16U, 4, nppiSet_16u_C4R>::set},
{NppSet<CV_16S, 1, nppiSet_16s_C1R>::set,NppSet<CV_16S, 2, nppiSet_16s_C2R>::set,kernelSet<short>,NppSet<CV_16S, 4, nppiSet_16s_C4R>::set},
{NppSet<CV_32S, 1, nppiSet_32s_C1R>::set,kernelSet<int>,kernelSet<int>,NppSet<CV_32S, 4, nppiSet_32s_C4R>::set},
{NppSet<CV_32F, 1, nppiSet_32f_C1R>::set,kernelSet<float>,kernelSet<float>,NppSet<CV_32F, 4, nppiSet_32f_C4R>::set},
{kernelSet<double>,kernelSet<double>,kernelSet<double>,kernelSet<double>}
};
callers[m.depth()][m.channels() - 1](m, s);
}
else
{
typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask);
static const caller_t callers[7][4] =
{
{NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set,kernelSetMask<uchar>,kernelSetMask<uchar>,NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set},
{kernelSetMask<schar>,kernelSetMask<schar>,kernelSetMask<schar>,kernelSetMask<schar>},
{NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set,kernelSetMask<ushort>,kernelSetMask<ushort>,NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set},
{NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set,kernelSetMask<short>,kernelSetMask<short>,NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set},
{NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set,kernelSetMask<int>,kernelSetMask<int>,NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set},
{NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set,kernelSetMask<float>,kernelSetMask<float>,NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set},
{kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>,kernelSetMask<double>}
};
callers[m.depth()][m.channels() - 1](m, s, mask);
}
}
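The branches above mean a uniform fill of an 8-bit GpuMat never leaves the CUDA runtime: an all-zero or repeated scalar takes the cudaMemset2D shortcut, anything else goes through the NPP/kernel tables, and a non-empty mask always uses the masked table. A hedged usage sketch showing which path each call ends up on (header path assumed):

#include "opencv2/core/gpumat.hpp" // assumed header location

using namespace cv;
using namespace cv::gpu;

void fillExamples()
{
    GpuMat image(480, 640, CV_8UC3);

    image.setTo(Scalar::all(0));          // all-zero scalar: cudaMemset2D shortcut
    image.setTo(Scalar::all(17));         // uniform 8U scalar: cudaMemset2D shortcut
    image.setTo(Scalar(10, 20, 30));      // non-uniform scalar: NppSet/kernelSet table

    GpuMat mask(image.size(), CV_8UC1, Scalar::all(1));
    image.setTo(Scalar(255, 0, 0), mask); // masked fill: NppSetMask/kernelSetMask table
}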
void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
{
cudaSafeCall( cudaMallocPitch(devPtr, step, width, height) );
}
void free(void* devPtr) const
{
cudaFree(devPtr);
}
};
class Initializer
{
public:
Initializer()
{
static CudaFuncTable funcTable;
setGpuFuncTable(&funcTable);
}
};
Initializer init;
}
#endif
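The Initializer/init pair relies on static initialization: constructing the file-scope object installs the CUDA-backed function table before any user code runs, so the CUDA-agnostic GpuMat code can simply call through whatever table is registered. A simplified sketch of that registration pattern (the GpuFuncTable shape here is a stand-in, not the real interface):

#include <cstddef>
#include <cstdio>
#include <cstdlib>

// Simplified stand-in for the backend interface the core code calls through.
struct GpuFuncTable
{
    virtual void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const = 0;
    virtual void free(void* devPtr) const = 0;
    virtual ~GpuFuncTable() {}
};

static const GpuFuncTable* g_funcTable = 0;

void setGpuFuncTable(const GpuFuncTable* table) { g_funcTable = table; }

const GpuFuncTable& gpuFuncTable()
{
    if (!g_funcTable)
    {
        std::fprintf(stderr, "The library was built without GPU support\n");
        std::abort();
    }
    return *g_funcTable;
}

// CUDA-backed implementation (the real bodies would call cudaMallocPitch / cudaFree).
struct CudaFuncTable : GpuFuncTable
{
    void mallocPitch(void** devPtr, size_t* step, size_t width, size_t height) const
    { *devPtr = 0; *step = width; (void)height; }
    void free(void* devPtr) const { (void)devPtr; }
};

// File-scope object: its constructor runs during static initialization
// and registers the CUDA table, mirroring Initializer/init above.
struct Initializer
{
    Initializer()
    {
        static CudaFuncTable funcTable;
        setGpuFuncTable(&funcTable);
    }
};
static Initializer init;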

@ -43,7 +43,6 @@
#ifndef __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
#define __OPENCV_GPU_BORDER_INTERPOLATE_HPP__
#include "internal_shared.hpp"
#include "saturate_cast.hpp"
#include "vec_traits.hpp"
#include "vec_math.hpp"

@ -43,7 +43,6 @@
#ifndef __OPENCV_GPU_COLOR_HPP__
#define __OPENCV_GPU_COLOR_HPP__
#include "internal_shared.hpp"
#include "detail/color_detail.hpp"
namespace cv { namespace gpu { namespace device

@ -0,0 +1,100 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_COMMON_HPP__
#define __OPENCV_GPU_COMMON_HPP__
#include <cuda_runtime.h>
#include "opencv2/core/devmem2d.hpp"
#ifndef CV_PI
#define CV_PI 3.1415926535897932384626433832795
#endif
#ifndef CV_PI_F
#ifndef CV_PI
#define CV_PI_F 3.14159265f
#else
#define CV_PI_F ((float)CV_PI)
#endif
#endif
namespace cv { namespace gpu
{
__host__ __device__ __forceinline__ int divUp(int total, int grain)
{
return (total + grain - 1) / grain;
}
namespace device
{
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef signed char schar;
typedef unsigned int uint;
template<class T> inline void bindTexture(const textureReference* tex, const DevMem2D_<T>& img)
{
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
}
}
}}
#if defined(__GNUC__)
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__)
#else /* defined(__CUDACC__) || defined(__MSVC__) */
#define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__)
#endif
namespace cv { namespace gpu
{
void error(const char *error_string, const char *file, const int line, const char *func = "");
}}
static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "")
{
if (cudaSuccess != err)
cv::gpu::error(cudaGetErrorString(err), file, line, func);
}
#endif // __OPENCV_GPU_COMMON_HPP__
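Two of these helpers carry most of the weight in the device code: divUp rounds a grid dimension up to cover a whole image, and cudaSafeCall routes any failing cudaError_t into cv::gpu::error with file/line context. A hedged CUDA sketch of a typical kernel caller built on them (the fill kernel itself is illustrative, not part of this commit):

#include <cuda_runtime.h>
#include "common.hpp" // divUp, cudaSafeCall, DevMem2D_ (path as used inside the module)

using namespace cv::gpu;

// Illustrative kernel: write a constant into every element of a 2D buffer.
__global__ void fillKernel(DevMem2D_<float> dst, float value)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < dst.cols && y < dst.rows)
        dst.ptr(y)[x] = value;
}

void fillCaller(DevMem2D_<float> dst, float value, cudaStream_t stream)
{
    const dim3 block(32, 8);
    const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

    fillKernel<<<grid, block, 0, stream>>>(dst, value);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}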

@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_DATAMOV_UTILS_HPP__
#define __OPENCV_GPU_DATAMOV_UTILS_HPP__
#include "internal_shared.hpp"
#include "common.hpp"
namespace cv { namespace gpu { namespace device
{

@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_COLOR_DETAIL_HPP__
#define __OPENCV_GPU_COLOR_DETAIL_HPP__
#include "internal_shared.hpp"
#include "../common.hpp"
#include "../vec_traits.hpp"
#include "../saturate_cast.hpp"
#include "../limits.hpp"

@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
#define __OPENCV_GPU_TRANSFORM_DETAIL_HPP__
#include "internal_shared.hpp"
#include "../common.hpp"
#include "../vec_traits.hpp"
#include "../functional.hpp"

@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
#define __OPENCV_GPU_TYPE_TRAITS_DETAIL_HPP__
#include "internal_shared.hpp"
#include "../common.hpp"
#include "../vec_traits.hpp"
namespace cv { namespace gpu { namespace device

@ -43,8 +43,6 @@
#ifndef __OPENCV_GPU_UTILITY_DETAIL_HPP__
#define __OPENCV_GPU_UTILITY_DETAIL_HPP__
#include "internal_shared.hpp"
namespace cv { namespace gpu { namespace device
{
namespace utility_detail

@ -43,7 +43,6 @@
#ifndef __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_DETAIL_HPP__
#include "internal_shared.hpp"
#include "../datamov_utils.hpp"
namespace cv { namespace gpu { namespace device

@ -43,12 +43,10 @@
#ifndef __OPENCV_GPU_DYNAMIC_SMEM_HPP__
#define __OPENCV_GPU_DYNAMIC_SMEM_HPP__
#include "internal_shared.hpp"
BEGIN_OPENCV_DEVICE_NAMESPACE
namespace cv { namespace gpu { namespace device
{
template<class T> struct DynamicSharedMem
{
__device__ __forceinline__ operator T*()
{
extern __shared__ int __smem[];
@ -60,11 +58,11 @@ template<class T> struct DynamicSharedMem
extern __shared__ int __smem[];
return (T*)__smem;
}
};
// specialize for double to avoid unaligned memory access compile errors
template<> struct DynamicSharedMem<double>
{
__device__ __forceinline__ operator double*()
{
extern __shared__ double __smem_d[];
@ -76,8 +74,7 @@ template<> struct DynamicSharedMem<double>
extern __shared__ double __smem_d[];
return (double*)__smem_d;
}
};
}}}
END_OPENCV_DEVICE_NAMESPACE
#endif // __OPENCV_GPU_DYNAMIC_SMEM_HPP__
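DynamicSharedMem exists because CUDA does not allow two extern __shared__ arrays of different types in one translation unit; every kernel instead declares this struct and lets the conversion operator cast the single int-backed (or double-backed) buffer to the element type it needs, with the size supplied at launch time. A hedged CUDA sketch of a kernel using it (the reduction itself is illustrative):

#include <cuda_runtime.h>
#include "dynamic_smem.hpp" // DynamicSharedMem, as included from the device headers

using namespace cv::gpu::device;

// Illustrative kernel: per-block sum using dynamically sized shared memory.
__global__ void blockSum(const float* src, float* dst, int n)
{
    DynamicSharedMem<float> smem;   // casts the extern __shared__ buffer to float*
    float* sdata = smem;

    const int tid = threadIdx.x;
    const int i = blockIdx.x * blockDim.x + tid;

    sdata[tid] = (i < n) ? src[i] : 0.0f;
    __syncthreads();

    for (int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
            sdata[tid] += sdata[tid + s];
        __syncthreads();
    }

    if (tid == 0)
        dst[blockIdx.x] = sdata[0];
}

void blockSumCaller(const float* src, float* dst, int n)
{
    const int block = 256;
    const int grid = (n + block - 1) / block;

    // The third launch parameter is the dynamic shared-memory size per block.
    blockSum<<<grid, block, block * sizeof(float)>>>(src, dst, n);
}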

@ -43,7 +43,6 @@
#ifndef OPENCV_GPU_EMULATION_HPP_
#define OPENCV_GPU_EMULATION_HPP_
#include "internal_shared.hpp"
#include "warp_reduce.hpp"
namespace cv { namespace gpu { namespace device

@ -43,7 +43,6 @@
#ifndef __OPENCV_GPU_FILTERS_HPP__
#define __OPENCV_GPU_FILTERS_HPP__
#include "internal_shared.hpp"
#include "saturate_cast.hpp"
#include "vec_traits.hpp"
#include "vec_math.hpp"

@ -45,7 +45,6 @@
#define __OPENCV_GPU_DEVICE_FUNCATTRIB_HPP_
#include <cstdio>
#include "internal_shared.hpp"
namespace cv { namespace gpu { namespace device
{

@ -44,7 +44,6 @@
#define __OPENCV_GPU_FUNCTIONAL_HPP__
#include <thrust/functional.h>
#include "internal_shared.hpp"
#include "saturate_cast.hpp"
#include "vec_traits.hpp"
#include "type_traits.hpp"

@ -44,7 +44,7 @@
#define __OPENCV_GPU_LIMITS_GPU_HPP__
#include <limits>
#include "internal_shared.hpp"
#include "common.hpp"
namespace cv { namespace gpu { namespace device
{

@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_SATURATE_CAST_HPP__
#define __OPENCV_GPU_SATURATE_CAST_HPP__
#include "internal_shared.hpp"
#include "common.hpp"
namespace cv { namespace gpu { namespace device
{

@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_TRANSFORM_HPP__
#define __OPENCV_GPU_TRANSFORM_HPP__
#include "internal_shared.hpp"
#include "common.hpp"
#include "utility.hpp"
#include "detail/transform_detail.hpp"

@ -43,7 +43,6 @@
#ifndef __OPENCV_GPU_TYPE_TRAITS_HPP__
#define __OPENCV_GPU_TYPE_TRAITS_HPP__
#include "internal_shared.hpp"
#include "detail/type_traits_detail.hpp"
namespace cv { namespace gpu { namespace device

@ -43,7 +43,6 @@
#ifndef __OPENCV_GPU_UTILITY_HPP__
#define __OPENCV_GPU_UTILITY_HPP__
#include "internal_shared.hpp"
#include "saturate_cast.hpp"
#include "datamov_utils.hpp"
#include "detail/utility_detail.hpp"

@ -43,7 +43,6 @@
#ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_HPP__
#include "internal_shared.hpp"
#include "utility.hpp"
#include "functional.hpp"
#include "detail/vec_distance_detail.hpp"

@ -43,7 +43,6 @@
#ifndef __OPENCV_GPU_VECMATH_HPP__
#define __OPENCV_GPU_VECMATH_HPP__
#include "internal_shared.hpp"
#include "saturate_cast.hpp"
#include "vec_traits.hpp"
#include "functional.hpp"

@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_VEC_TRAITS_HPP__
#define __OPENCV_GPU_VEC_TRAITS_HPP__
#include "internal_shared.hpp"
#include "common.hpp"
namespace cv { namespace gpu { namespace device
{

@ -43,8 +43,6 @@
#ifndef __OPENCV_GPU_DEVICE_WARP_HPP__
#define __OPENCV_GPU_DEVICE_WARP_HPP__
#include "internal_shared.hpp"
namespace cv { namespace gpu { namespace device
{
struct Warp

@ -44,8 +44,6 @@
#ifndef OPENCV_GPU_WARP_REDUCE_HPP__
#define OPENCV_GPU_WARP_REDUCE_HPP__
#include "internal_shared.hpp"
namespace cv { namespace gpu { namespace device
{
template <class T>
