Merge branch 'master' of code.opencv.org:opencv

pull/8/head
Alexander Mordvintsev 13 years ago
commit 2ea3dd391a
  1. android/android.toolchain.cmake (21 changed lines)
  2. cmake/OpenCVCompilerOptions.cmake (2 changed lines)
  3. modules/core/include/opencv2/core/core.hpp (12 changed lines)
  4. modules/core/include/opencv2/core/internal.hpp (12 changed lines)
  5. modules/core/src/lapack.cpp (164 changed lines)
  6. modules/core/src/parallel.cpp (129 changed lines)
  7. modules/gpu/src/cascadeclassifier.cpp (936 changed lines)
  8. modules/imgproc/perf/perf_bilateral.cpp (38 changed lines)
  9. modules/imgproc/src/shapedescr.cpp (192 changed lines)
  10. modules/imgproc/src/smooth.cpp (253 changed lines)
  11. modules/imgproc/test/test_bilateral_filter.cpp (290 changed lines)
  12. modules/objdetect/src/haar.cpp (29 changed lines)
  13. modules/ocl/cl2cpp.py (2 changed lines)
  14. modules/ocl/include/opencv2/ocl/matrix_operations.hpp (2 changed lines)
  15. modules/ocl/include/opencv2/ocl/ocl.hpp (1 changed line)
  16. modules/ocl/src/arithm.cpp (102 changed lines)
  17. modules/ocl/src/imgproc.cpp (34 changed lines)
  18. modules/ocl/src/initialization.cpp (26 changed lines)
  19. modules/ocl/src/kernels/arithm_addWeighted.cl (36 changed lines)
  20. modules/ocl/src/kernels/arithm_cartToPolar.cl (4 changed lines)
  21. modules/ocl/src/kernels/arithm_div.cl (131 changed lines)
  22. modules/ocl/src/kernels/arithm_exp.cl (4 changed lines)
  23. modules/ocl/src/kernels/arithm_log.cl (4 changed lines)
  24. modules/ocl/src/kernels/convertC3C4.cl (171 changed lines)
  25. modules/ocl/src/kernels/imgproc_resize.cl (209 changed lines)
  26. modules/ocl/src/kernels/operator_setTo.cl (60 changed lines)
  27. modules/ocl/src/kernels/operator_setToM.cl (114 changed lines)
  28. modules/ocl/src/matrix_operations.cpp (519 changed lines)
  29. modules/ocl/src/precomp.hpp (6 changed lines)
  30. modules/ocl/test/test_imgproc.cpp (2 changed lines)
  31. modules/ocl/test/test_matrix_operation.cpp (100 changed lines)
  32. modules/python/test/test.py (2 changed lines)

@@ -180,6 +180,8 @@
# - modified May 2012 # - modified May 2012
# [+] updated for NDK r8 # [+] updated for NDK r8
# [+] added mips architecture support # [+] added mips architecture support
# - modified August 2012
# [+] updated for NDK r8b
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
cmake_minimum_required( VERSION 2.6.3 ) cmake_minimum_required( VERSION 2.6.3 )
@@ -199,7 +201,7 @@ set( CMAKE_SYSTEM_NAME Linux )
#this one not so much #this one not so much
set( CMAKE_SYSTEM_VERSION 1 ) set( CMAKE_SYSTEM_VERSION 1 )
set( ANDROID_SUPPORTED_NDK_VERSIONS ${ANDROID_EXTRA_NDK_VERSIONS} -r8 -r7c -r7b -r7 -r6b -r6 -r5c -r5b -r5 "" ) set( ANDROID_SUPPORTED_NDK_VERSIONS ${ANDROID_EXTRA_NDK_VERSIONS} -r8b -r8 -r7c -r7b -r7 -r6b -r6 -r5c -r5b -r5 "" )
if(NOT DEFINED ANDROID_NDK_SEARCH_PATHS) if(NOT DEFINED ANDROID_NDK_SEARCH_PATHS)
if( CMAKE_HOST_WIN32 ) if( CMAKE_HOST_WIN32 )
file( TO_CMAKE_PATH "$ENV{PROGRAMFILES}" ANDROID_NDK_SEARCH_PATHS ) file( TO_CMAKE_PATH "$ENV{PROGRAMFILES}" ANDROID_NDK_SEARCH_PATHS )
@@ -473,11 +475,11 @@ if( BUILD_WITH_ANDROID_NDK )
foreach( __toolchain ${__availableToolchains} ) foreach( __toolchain ${__availableToolchains} )
__DETECT_TOOLCHAIN_MACHINE_NAME( __machine "${ANDROID_NDK}/toolchains/${__toolchain}/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}" ) __DETECT_TOOLCHAIN_MACHINE_NAME( __machine "${ANDROID_NDK}/toolchains/${__toolchain}/prebuilt/${ANDROID_NDK_HOST_SYSTEM_NAME}" )
if( __machine ) if( __machine )
string( REGEX MATCH "[0-9]+.[0-9]+.[0-9]+$" __version "${__toolchain}" ) string( REGEX MATCH "[0-9]+[.][0-9]+[.]*[0-9]*$" __version "${__toolchain}" )
string( REGEX MATCH "^[^-]+" __arch "${__toolchain}" ) string( REGEX MATCH "^[^-]+" __arch "${__toolchain}" )
list( APPEND __availableToolchainMachines ${__machine} ) list( APPEND __availableToolchainMachines "${__machine}" )
list( APPEND __availableToolchainArchs ${__arch} ) list( APPEND __availableToolchainArchs "${__arch}" )
list( APPEND __availableToolchainCompilerVersions ${__version} ) list( APPEND __availableToolchainCompilerVersions "${__version}" )
else() else()
list( REMOVE_ITEM __availableToolchains "${__toolchain}" ) list( REMOVE_ITEM __availableToolchains "${__toolchain}" )
endif() endif()
@@ -669,8 +671,13 @@ if( BUILD_WITH_ANDROID_NDK )
set( __stlIncludePath "${ANDROID_NDK}/sources/cxx-stl/stlport/stlport" ) set( __stlIncludePath "${ANDROID_NDK}/sources/cxx-stl/stlport/stlport" )
set( __stlLibPath "${ANDROID_NDK}/sources/cxx-stl/stlport/libs/${ANDROID_NDK_ABI_NAME}" ) set( __stlLibPath "${ANDROID_NDK}/sources/cxx-stl/stlport/libs/${ANDROID_NDK_ABI_NAME}" )
else() else()
set( __stlIncludePath "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/include" ) if( EXISTS "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}" )
set( __stlLibPath "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/libs/${ANDROID_NDK_ABI_NAME}" ) set( __stlIncludePath "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}/include" )
set( __stlLibPath "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/${ANDROID_COMPILER_VERSION}/libs/${ANDROID_NDK_ABI_NAME}" )
else()
set( __stlIncludePath "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/include" )
set( __stlLibPath "${ANDROID_NDK}/sources/cxx-stl/gnu-libstdc++/libs/${ANDROID_NDK_ABI_NAME}" )
endif()
endif() endif()
endif() endif()

@@ -139,7 +139,7 @@ if(CMAKE_COMPILER_IS_GNUCXX)
if(ENABLE_SSSE3) if(ENABLE_SSSE3)
add_extra_compiler_option(-mssse3) add_extra_compiler_option(-mssse3)
endif() endif()
if(HAVE_GCC43_OR_NEWER) if(HAVE_GCC43_OR_NEWER OR APPLE)
if(ENABLE_SSE41) if(ENABLE_SSE41)
add_extra_compiler_option(-msse4.1) add_extra_compiler_option(-msse4.1)
endif() endif()

@@ -4608,6 +4608,18 @@ float CommandLineParser::analyzeValue<float>(const std::string& str, bool space_delete);
template<> CV_EXPORTS template<> CV_EXPORTS
double CommandLineParser::analyzeValue<double>(const std::string& str, bool space_delete); double CommandLineParser::analyzeValue<double>(const std::string& str, bool space_delete);
/////////////////////////////// Parallel Primitives //////////////////////////////////
// a base body class
class CV_EXPORTS ParallelLoopBody
{
public:
virtual void operator() (const Range& range) const = 0;
virtual ~ParallelLoopBody();
};
CV_EXPORTS void parallel_for_(const Range& range, const ParallelLoopBody& body);
} }
#endif // __cplusplus #endif // __cplusplus
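The ParallelLoopBody / parallel_for_ pair declared above is the new primitive that the bilateral filter changes later in this commit build on. A minimal usage sketch (the SquareBody class and square_all function are illustrative names, not part of the commit):

    #include "opencv2/core/core.hpp"

    // Illustrative body: squares each element of a buffer over the sub-range it receives.
    struct SquareBody : public cv::ParallelLoopBody
    {
        explicit SquareBody(float* _data) : data(_data) {}
        virtual void operator() (const cv::Range& range) const
        {
            for (int i = range.start; i < range.end; i++)
                data[i] = data[i] * data[i];
        }
        float* data;
    };

    void square_all(float* data, int n)
    {
        // parallel_for_ splits [0, n) into sub-ranges and dispatches them to the
        // available backend (TBB, Concurrency, OpenMP, GCD), or runs the whole
        // range serially when no backend is compiled in.
        cv::parallel_for_(cv::Range(0, n), SquareBody(data));
    }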

@@ -120,17 +120,23 @@ CV_INLINE IppiSize ippiSize(int width, int height)
# else # else
# define CV_SSSE3 0 # define CV_SSSE3 0
# endif # endif
# if defined __SSE4_1__ || (defined _MSC_VER && _MSC_VER >= 1600) # if defined __SSE4_1__ || (defined _MSC_VER && _MSC_VER >= 1500)
# include <smmintrin.h> # include <smmintrin.h>
# define CV_SSE4_1 1 # define CV_SSE4_1 1
# else
# define CV_SSE4_1 0
# endif # endif
# if defined __SSE4_2__ || (defined _MSC_VER && _MSC_VER >= 1600) # if defined __SSE4_2__ || (defined _MSC_VER && _MSC_VER >= 1500)
# include <nmmintrin.h> # include <nmmintrin.h>
# define CV_SSE4_2 1 # define CV_SSE4_2 1
# else
# define CV_SSE4_2 0
# endif # endif
# if defined __AVX__ || (defined _MSC_VER && _MSC_VER >= 1600) # if defined __AVX__ || (defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219)
# include <immintrin.h> # include <immintrin.h>
# define CV_AVX 1 # define CV_AVX 1
# else
# define CV_AVX 0
# endif # endif
# else # else
# define CV_SSE 0 # define CV_SSE 0
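With the change above, CV_SSE4_1, CV_SSE4_2 and CV_AVX are always defined to either 0 or 1, so code can test them with a plain #if and pair that with a run-time CPU check, which is the pattern the shapedescr.cpp change below uses. A small sketch under that assumption (max4 is a hypothetical helper; checkHardwareSupport and CV_CPU_SSE4_1 are the existing core API):

    #include "opencv2/core/core.hpp"
    #include "opencv2/core/internal.hpp"
    #include <algorithm>

    // Element-wise max of four ints: take the SSE4.1 path only if the intrinsics were
    // compiled in (CV_SSE4_1 == 1) and the CPU reports SSE4.1 support at run time.
    static void max4(const int* a, const int* b, int* dst)
    {
    #if CV_SSE4_1
        if (cv::checkHardwareSupport(CV_CPU_SSE4_1))
        {
            __m128i va = _mm_loadu_si128((const __m128i*)a);
            __m128i vb = _mm_loadu_si128((const __m128i*)b);
            _mm_storeu_si128((__m128i*)dst, _mm_max_epi32(va, vb));
            return;
        }
    #endif
        for (int i = 0; i < 4; i++)   // portable fallback
            dst[i] = std::max(a[i], b[i]);
    }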

@@ -954,7 +954,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
size_t esz = CV_ELEM_SIZE(type); size_t esz = CV_ELEM_SIZE(type);
int m = src.rows, n = src.cols; int m = src.rows, n = src.cols;
if( method == DECOMP_SVD ) if( method == DECOMP_SVD )
{ {
int nm = std::min(m, n); int nm = std::min(m, n);
@@ -1010,82 +1010,84 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
if( type == CV_32FC1 ) if( type == CV_32FC1 )
{ {
double d = det2(Sf); double d = det2(Sf);
#if CV_SSE4_2 if( d != 0. )
if(USE_SSE4_2)
{
__m128 zero = _mm_setzero_ps();
__m128 t0 = _mm_loadl_pi(zero, (const __m64*)srcdata); //t0 = sf(0,0) sf(0,1)
__m128 t1 = _mm_loadh_pi(zero,(const __m64*)((const float*)(srcdata+srcstep))); //t1 = sf(1,0) sf(1,1)
__m128 s0 = _mm_blend_ps(t0,t1,12);
d = 1./d;
result = true;
__m128 det =_mm_set1_ps((float)d);
s0 = _mm_mul_ps(s0, det);
const uchar CV_DECL_ALIGNED(16) inv[16] = {0,0,0,0,0,0,0,0x80,0,0,0,0x80,0,0,0,0};
__m128 pattern = _mm_load_ps((const float*)inv);
s0 = _mm_xor_ps(s0, pattern);//==-1*s0
s0 = _mm_shuffle_ps(s0, s0, _MM_SHUFFLE(0,2,1,3));
_mm_storel_pi((__m64*)dstdata, s0);
_mm_storeh_pi((__m64*)((float*)(dstdata+dststep)), s0);
}
#else
if( d != 0. )
{ {
double t0, t1; result = true;
result = true; d = 1./d;
d = 1./d;
t0 = Sf(0,0)*d; #if CV_SSE2
t1 = Sf(1,1)*d; if(USE_SSE2)
Df(1,1) = (float)t0; {
Df(0,0) = (float)t1; __m128 zero = _mm_setzero_ps();
t0 = -Sf(0,1)*d; __m128 t0 = _mm_loadl_pi(zero, (const __m64*)srcdata); //t0 = sf(0,0) sf(0,1)
t1 = -Sf(1,0)*d; __m128 t1 = _mm_loadh_pi(zero, (const __m64*)(srcdata+srcstep)); //t1 = sf(1,0) sf(1,1)
Df(0,1) = (float)t0; __m128 s0 = _mm_or_ps(t0, t1);
Df(1,0) = (float)t1; __m128 det =_mm_set1_ps((float)d);
} s0 = _mm_mul_ps(s0, det);
#endif const uchar CV_DECL_ALIGNED(16) inv[16] = {0,0,0,0,0,0,0,0x80,0,0,0,0x80,0,0,0,0};
__m128 pattern = _mm_load_ps((const float*)inv);
s0 = _mm_xor_ps(s0, pattern);//==-1*s0
s0 = _mm_shuffle_ps(s0, s0, _MM_SHUFFLE(0,2,1,3));
_mm_storel_pi((__m64*)dstdata, s0);
_mm_storeh_pi((__m64*)((float*)(dstdata+dststep)), s0);
}
else
#endif
{
double t0, t1;
t0 = Sf(0,0)*d;
t1 = Sf(1,1)*d;
Df(1,1) = (float)t0;
Df(0,0) = (float)t1;
t0 = -Sf(0,1)*d;
t1 = -Sf(1,0)*d;
Df(0,1) = (float)t0;
Df(1,0) = (float)t1;
}
}
} }
else else
{ {
double d = det2(Sd); double d = det2(Sd);
#if CV_SSE2 if( d != 0. )
if(USE_SSE2)
{
__m128d s0 = _mm_loadu_pd((const double*)srcdata); //s0 = sf(0,0) sf(0,1)
__m128d s1 = _mm_loadu_pd ((const double*)(srcdata+srcstep));//s1 = sf(1,0) sf(1,1)
__m128d sm = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(1,0)); //sm = sf(0,0) sf(1,1) - main diagonal
__m128d ss = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(0,1)); //sm = sf(0,1) sf(1,0) - secondary diagonal
result = true;
d = 1./d;
__m128d det = _mm_load1_pd((const double*)&d);
sm = _mm_mul_pd(sm, det);
//__m128d pattern = _mm_set1_pd(-1.);
static const uchar CV_DECL_ALIGNED(16) inv[8] = {0,0,0,0,0,0,0,0x80};
__m128d pattern = _mm_load1_pd((double*)inv);
ss = _mm_mul_pd(ss, det);
ss = _mm_xor_pd(ss, pattern);//==-1*ss
//ss = _mm_mul_pd(ss,pattern);
s0 = _mm_shuffle_pd(sm, ss, _MM_SHUFFLE2(0,1));
s1 = _mm_shuffle_pd(ss, sm, _MM_SHUFFLE2(0,1));
_mm_store_pd((double*)dstdata, s0);
_mm_store_pd((double*)(dstdata+dststep), s1);
}
#else
if( d != 0. )
{ {
double t0, t1; result = true;
result = true; d = 1./d;
d = 1./d; #if CV_SSE2
t0 = Sd(0,0)*d; if(USE_SSE2)
t1 = Sd(1,1)*d; {
Dd(1,1) = t0; __m128d s0 = _mm_loadu_pd((const double*)srcdata); //s0 = sf(0,0) sf(0,1)
Dd(0,0) = t1; __m128d s1 = _mm_loadu_pd ((const double*)(srcdata+srcstep));//s1 = sf(1,0) sf(1,1)
t0 = -Sd(0,1)*d; __m128d sm = _mm_unpacklo_pd(s0, _mm_load_sd((const double*)(srcdata+srcstep)+1)); //sm = sf(0,0) sf(1,1) - main diagonal
t1 = -Sd(1,0)*d; __m128d ss = _mm_shuffle_pd(s0, s1, _MM_SHUFFLE2(0,1)); //ss = sf(0,1) sf(1,0) - secondary diagonal
Dd(0,1) = t0; __m128d det = _mm_load1_pd((const double*)&d);
Dd(1,0) = t1; sm = _mm_mul_pd(sm, det);
}
#endif uchar CV_DECL_ALIGNED(16) inv[8] = {0,0,0,0,0,0,0,0x80};
__m128d pattern = _mm_load1_pd((double*)inv);
ss = _mm_mul_pd(ss, det);
ss = _mm_xor_pd(ss, pattern);//==-1*ss
s0 = _mm_shuffle_pd(sm, ss, _MM_SHUFFLE2(0,1));
s1 = _mm_shuffle_pd(ss, sm, _MM_SHUFFLE2(0,1));
_mm_storeu_pd((double*)dstdata, s0);
_mm_storeu_pd((double*)(dstdata+dststep), s1);
}
else
#endif
{
double t0, t1;
t0 = Sd(0,0)*d;
t1 = Sd(1,1)*d;
Dd(1,1) = t0;
Dd(0,0) = t1;
t0 = -Sd(0,1)*d;
t1 = -Sd(1,0)*d;
Dd(0,1) = t0;
Dd(1,0) = t1;
}
}
} }
} }
else if( n == 3 ) else if( n == 3 )
@@ -1095,18 +1097,17 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
double d = det3(Sf); double d = det3(Sf);
if( d != 0. ) if( d != 0. )
{ {
float t[9];
result = true; result = true;
d = 1./d; d = 1./d;
float t[9];
t[0] = (float)(((double)Sf(1,1) * Sf(2,2) - (double)Sf(1,2) * Sf(2,1)) * d); t[0] = (float)(((double)Sf(1,1) * Sf(2,2) - (double)Sf(1,2) * Sf(2,1)) * d);
t[1] = (float)(((double)Sf(0,2) * Sf(2,1) - (double)Sf(0,1) * Sf(2,2)) * d); t[1] = (float)(((double)Sf(0,2) * Sf(2,1) - (double)Sf(0,1) * Sf(2,2)) * d);
t[2] = (float)(((double)Sf(0,1) * Sf(1,2) - (double)Sf(0,2) * Sf(1,1)) * d); t[2] = (float)(((double)Sf(0,1) * Sf(1,2) - (double)Sf(0,2) * Sf(1,1)) * d);
t[3] = (float)(((double)Sf(1,2) * Sf(2,0) - (double)Sf(1,0) * Sf(2,2)) * d); t[3] = (float)(((double)Sf(1,2) * Sf(2,0) - (double)Sf(1,0) * Sf(2,2)) * d);
t[4] = (float)(((double)Sf(0,0) * Sf(2,2) - (double)Sf(0,2) * Sf(2,0)) * d); t[4] = (float)(((double)Sf(0,0) * Sf(2,2) - (double)Sf(0,2) * Sf(2,0)) * d);
t[5] = (float)(((double)Sf(0,2) * Sf(1,0) - (double)Sf(0,0) * Sf(1,2)) * d); t[5] = (float)(((double)Sf(0,2) * Sf(1,0) - (double)Sf(0,0) * Sf(1,2)) * d);
t[6] = (float)(((double)Sf(1,0) * Sf(2,1) - (double)Sf(1,1) * Sf(2,0)) * d); t[6] = (float)(((double)Sf(1,0) * Sf(2,1) - (double)Sf(1,1) * Sf(2,0)) * d);
t[7] = (float)(((double)Sf(0,1) * Sf(2,0) - (double)Sf(0,0) * Sf(2,1)) * d); t[7] = (float)(((double)Sf(0,1) * Sf(2,0) - (double)Sf(0,0) * Sf(2,1)) * d);
t[8] = (float)(((double)Sf(0,0) * Sf(1,1) - (double)Sf(0,1) * Sf(1,0)) * d); t[8] = (float)(((double)Sf(0,0) * Sf(1,1) - (double)Sf(0,1) * Sf(1,0)) * d);
@@ -1121,18 +1122,18 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
double d = det3(Sd); double d = det3(Sd);
if( d != 0. ) if( d != 0. )
{ {
result = true;
d = 1./d;
double t[9]; double t[9];
result = true;
d = 1./d;
t[0] = (Sd(1,1) * Sd(2,2) - Sd(1,2) * Sd(2,1)) * d; t[0] = (Sd(1,1) * Sd(2,2) - Sd(1,2) * Sd(2,1)) * d;
t[1] = (Sd(0,2) * Sd(2,1) - Sd(0,1) * Sd(2,2)) * d; t[1] = (Sd(0,2) * Sd(2,1) - Sd(0,1) * Sd(2,2)) * d;
t[2] = (Sd(0,1) * Sd(1,2) - Sd(0,2) * Sd(1,1)) * d; t[2] = (Sd(0,1) * Sd(1,2) - Sd(0,2) * Sd(1,1)) * d;
t[3] = (Sd(1,2) * Sd(2,0) - Sd(1,0) * Sd(2,2)) * d; t[3] = (Sd(1,2) * Sd(2,0) - Sd(1,0) * Sd(2,2)) * d;
t[4] = (Sd(0,0) * Sd(2,2) - Sd(0,2) * Sd(2,0)) * d; t[4] = (Sd(0,0) * Sd(2,2) - Sd(0,2) * Sd(2,0)) * d;
t[5] = (Sd(0,2) * Sd(1,0) - Sd(0,0) * Sd(1,2)) * d; t[5] = (Sd(0,2) * Sd(1,0) - Sd(0,0) * Sd(1,2)) * d;
t[6] = (Sd(1,0) * Sd(2,1) - Sd(1,1) * Sd(2,0)) * d; t[6] = (Sd(1,0) * Sd(2,1) - Sd(1,1) * Sd(2,0)) * d;
t[7] = (Sd(0,1) * Sd(2,0) - Sd(0,0) * Sd(2,1)) * d; t[7] = (Sd(0,1) * Sd(2,0) - Sd(0,0) * Sd(2,1)) * d;
t[8] = (Sd(0,0) * Sd(1,1) - Sd(0,1) * Sd(1,0)) * d; t[8] = (Sd(0,0) * Sd(1,1) - Sd(0,1) * Sd(1,0)) * d;
@@ -1171,7 +1172,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
return result; return result;
} }
int elem_size = CV_ELEM_SIZE(type); int elem_size = CV_ELEM_SIZE(type);
AutoBuffer<uchar> buf(n*n*elem_size); AutoBuffer<uchar> buf(n*n*elem_size);
Mat src1(n, n, type, (uchar*)buf); Mat src1(n, n, type, (uchar*)buf);
src.copyTo(src1); src.copyTo(src1);
@@ -1193,6 +1194,7 @@ double cv::invert( InputArray _src, OutputArray _dst, int method )
} }
/****************************************************************************************\ /****************************************************************************************\
* Solving a linear system * * Solving a linear system *
\****************************************************************************************/ \****************************************************************************************/
@@ -1603,7 +1605,7 @@ void SVD::backSubst( InputArray _w, InputArray _u, InputArray _vt,
Mat w = _w.getMat(), u = _u.getMat(), vt = _vt.getMat(), rhs = _rhs.getMat(); Mat w = _w.getMat(), u = _u.getMat(), vt = _vt.getMat(), rhs = _rhs.getMat();
int type = w.type(), esz = (int)w.elemSize(); int type = w.type(), esz = (int)w.elemSize();
int m = u.rows, n = vt.cols, nb = rhs.data ? rhs.cols : m, nm = std::min(m, n); int m = u.rows, n = vt.cols, nb = rhs.data ? rhs.cols : m, nm = std::min(m, n);
size_t wstep = w.rows == 1 ? esz : w.cols == 1 ? (size_t)w.step : (size_t)w.step + esz; size_t wstep = w.rows == 1 ? (size_t)esz : w.cols == 1 ? (size_t)w.step : (size_t)w.step + esz;
AutoBuffer<uchar> buffer(nb*sizeof(double) + 16); AutoBuffer<uchar> buffer(nb*sizeof(double) + 16);
CV_Assert( w.type() == u.type() && u.type() == vt.type() && u.data && vt.data && w.data ); CV_Assert( w.type() == u.type() && u.type() == vt.type() && u.data && vt.data && w.data );
CV_Assert( u.cols >= nm && vt.rows >= nm && CV_Assert( u.cols >= nm && vt.rows >= nm &&

@@ -0,0 +1,129 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#ifdef HAVE_CONCURRENCY
# include <ppl.h>
#elif defined HAVE_OPENMP
# include <omp.h>
#elif defined HAVE_GCD
# include <dispatch/dispatch.h>
#elif defined HAVE_TBB
# include "tbb/tbb_stddef.h"
# if TBB_VERSION_MAJOR*100 + TBB_VERSION_MINOR >= 202
# include "tbb/tbb.h"
# include "tbb/task.h"
# undef min
# undef max
# else
# undef HAVE_TBB
# endif // end TBB version
#endif // HAVE_CONCURRENCY
/*
HAVE_TBB - using TBB
HAVE_GCD - using GCD
HAVE_OPENMP - using OpenMP
HAVE_CONCURRENCY - using visual studio 2010 concurrency
*/
namespace cv
{
ParallelLoopBody::~ParallelLoopBody() { }
#ifdef HAVE_TBB
class TbbProxyLoopBody
{
public:
TbbProxyLoopBody(const ParallelLoopBody& _body) :
body(&_body)
{ }
void operator ()(const tbb::blocked_range<int>& range) const
{
body->operator()(Range(range.begin(), range.end()));
}
private:
const ParallelLoopBody* body;
};
#endif // end HAVE_TBB
#ifdef HAVE_GCD
static
void block_function(void* context, size_t index)
{
ParallelLoopBody* ptr_body = static_cast<ParallelLoopBody*>(context);
ptr_body->operator()(Range(index, index + 1));
}
#endif // HAVE_GCD
void parallel_for_(const Range& range, const ParallelLoopBody& body)
{
#ifdef HAVE_TBB
tbb::parallel_for(tbb::blocked_range<int>(range.start, range.end), TbbProxyLoopBody(body));
#elif defined HAVE_CONCURRENCY
Concurrency::parallel_for(range.start, range.end, body);
#elif defined HAVE_OPENMP
#pragma omp parallel for schedule(dynamic)
for (int i = range.start; i < range.end; ++i)
body(Range(i, i + 1));
#elif defined (HAVE_GCD)
dispatch_queue_t concurrent_queue = dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
dispatch_apply_f(range.end - range.start, concurrent_queue, &const_cast<ParallelLoopBody&>(body), block_function);
#else
body(range);
#endif // end HAVE_TBB
}
} // namespace cv

File diff suppressed because it is too large (modules/gpu/src/cascadeclassifier.cpp, 936 changed lines).

@@ -0,0 +1,38 @@
#include "perf_precomp.hpp"
using namespace std;
using namespace cv;
using namespace perf;
using namespace testing;
using std::tr1::make_tuple;
using std::tr1::get;
CV_ENUM(Mat_Type, CV_8UC1, CV_8UC3, CV_32FC1, CV_32FC3)
typedef TestBaseWithParam< tr1::tuple<Size, int, Mat_Type> > TestBilateralFilter;
PERF_TEST_P( TestBilateralFilter, BilateralFilter,
Combine(
Values( szVGA, sz1080p ), // image size
Values( 3, 5 ), // d
ValuesIn( Mat_Type::all() ) // image type
)
)
{
Size sz;
int d, type;
const double sigmaColor = 1., sigmaSpace = 1.;
sz = get<0>(GetParam());
d = get<1>(GetParam());
type = get<2>(GetParam());
Mat src(sz, type);
Mat dst(sz, type);
declare.in(src, WARMUP_RNG).out(dst).time(20);
TEST_CYCLE() bilateralFilter(src, dst, d, sigmaColor, sigmaSpace, BORDER_DEFAULT);
SANITY_CHECK(dst);
}

@@ -951,9 +951,6 @@ cvBoundingRect( CvArr* array, int update )
if( ptseq->header_size < (int)sizeof(CvContour)) if( ptseq->header_size < (int)sizeof(CvContour))
{ {
/*if( update == 1 )
CV_Error( CV_StsBadArg, "The header is too small to fit the rectangle, "
"so it could not be updated" );*/
update = 0; update = 0;
calculate = 1; calculate = 1;
} }
@@ -1067,86 +1064,123 @@ cvBoundingRect( CvArr* array, int update )
if( xmin >= size.width ) if( xmin >= size.width )
xmin = ymin = 0; xmin = ymin = 0;
} }
else if( ptseq->total ) else if( ptseq->total )
{ {
int is_float = CV_SEQ_ELTYPE(ptseq) == CV_32FC2; int is_float = CV_SEQ_ELTYPE(ptseq) == CV_32FC2;
cvStartReadSeq( ptseq, &reader, 0 ); cvStartReadSeq( ptseq, &reader, 0 );
CvPoint pt;
if( !is_float ) CV_READ_SEQ_ELEM( pt, reader );
{ #if CV_SSE4_2
CvPoint pt; if(cv::checkHardwareSupport(CV_CPU_SSE4_2))
/* init values */ {
CV_READ_SEQ_ELEM( pt, reader ); if( !is_float )
xmin = xmax = pt.x; {
ymin = ymax = pt.y; __m128i minval, maxval;
minval = maxval = _mm_loadl_epi64((const __m128i*)(&pt)); //min[0]=pt.x, min[1]=pt.y
for( i = 1; i < ptseq->total; i++ )
{ for( i = 1; i < ptseq->total; i++)
CV_READ_SEQ_ELEM( pt, reader ); {
__m128i ptXY = _mm_loadl_epi64((const __m128i*)(reader.ptr));
if( xmin > pt.x ) CV_NEXT_SEQ_ELEM(sizeof(pt), reader);
xmin = pt.x; minval = _mm_min_epi32(ptXY, minval);
maxval = _mm_max_epi32(ptXY, maxval);
if( xmax < pt.x ) }
xmax = pt.x; xmin = _mm_cvtsi128_si32(minval);
ymin = _mm_cvtsi128_si32(_mm_srli_si128(minval, 4));
if( ymin > pt.y ) xmax = _mm_cvtsi128_si32(maxval);
ymin = pt.y; ymax = _mm_cvtsi128_si32(_mm_srli_si128(maxval, 4));
}
if( ymax < pt.y ) else
ymax = pt.y; {
__m128 minvalf, maxvalf, z = _mm_setzero_ps(), ptXY = _mm_setzero_ps();
minvalf = maxvalf = _mm_loadl_pi(z, (const __m64*)(&pt));
for( i = 1; i < ptseq->total; i++ )
{
ptXY = _mm_loadl_pi(ptXY, (const __m64*)reader.ptr);
CV_NEXT_SEQ_ELEM(sizeof(pt), reader);
minvalf = _mm_min_ps(minvalf, ptXY);
maxvalf = _mm_max_ps(maxvalf, ptXY);
}
float xyminf[2], xymaxf[2];
_mm_storel_pi((__m64*)xyminf, minvalf);
_mm_storel_pi((__m64*)xymaxf, maxvalf);
xmin = cvFloor(xyminf[0]);
ymin = cvFloor(xyminf[1]);
xmax = cvFloor(xymaxf[0]);
ymax = cvFloor(xymaxf[1]);
} }
} }
else else
{ #endif
CvPoint pt; {
Cv32suf v; if( !is_float )
/* init values */ {
CV_READ_SEQ_ELEM( pt, reader ); xmin = xmax = pt.x;
xmin = xmax = CV_TOGGLE_FLT(pt.x); ymin = ymax = pt.y;
ymin = ymax = CV_TOGGLE_FLT(pt.y);
for( i = 1; i < ptseq->total; i++ )
for( i = 1; i < ptseq->total; i++ ) {
{ CV_READ_SEQ_ELEM( pt, reader );
CV_READ_SEQ_ELEM( pt, reader );
pt.x = CV_TOGGLE_FLT(pt.x); if( xmin > pt.x )
pt.y = CV_TOGGLE_FLT(pt.y); xmin = pt.x;
if( xmin > pt.x ) if( xmax < pt.x )
xmin = pt.x; xmax = pt.x;
if( xmax < pt.x ) if( ymin > pt.y )
xmax = pt.x; ymin = pt.y;
if( ymin > pt.y ) if( ymax < pt.y )
ymin = pt.y; ymax = pt.y;
}
if( ymax < pt.y ) }
ymax = pt.y; else
} {
Cv32suf v;
v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f); // init values
v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f); xmin = xmax = CV_TOGGLE_FLT(pt.x);
/* because right and bottom sides of ymin = ymax = CV_TOGGLE_FLT(pt.y);
the bounding rectangle are not inclusive
(note +1 in width and height calculation below), for( i = 1; i < ptseq->total; i++ )
cvFloor is used here instead of cvCeil */ {
v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f); CV_READ_SEQ_ELEM( pt, reader );
v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f); pt.x = CV_TOGGLE_FLT(pt.x);
} pt.y = CV_TOGGLE_FLT(pt.y);
}
if( xmin > pt.x )
rect.x = xmin; xmin = pt.x;
rect.y = ymin;
rect.width = xmax - xmin + 1; if( xmax < pt.x )
rect.height = ymax - ymin + 1; xmax = pt.x;
if( update ) if( ymin > pt.y )
ymin = pt.y;
if( ymax < pt.y )
ymax = pt.y;
}
v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f);
v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f);
// because right and bottom sides of the bounding rectangle are not inclusive
// (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil
v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f);
v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f);
}
}
rect.x = xmin;
rect.y = ymin;
rect.width = xmax - xmin + 1;
rect.height = ymax - ymin + 1;
}
if( update )
((CvContour*)ptseq)->rect = rect; ((CvContour*)ptseq)->rect = rect;
return rect; return rect;
} }
/* End of file. */ /* End of file. */

@@ -1288,48 +1288,119 @@ void cv::medianBlur( InputArray _src0, OutputArray _dst, int ksize )
namespace cv namespace cv
{ {
class BilateralFilter_8u_Invoker :
public ParallelLoopBody
{
public:
BilateralFilter_8u_Invoker(Mat& _dest, const Mat& _temp, int _radius, int _maxk,
int* _space_ofs, float *_space_weight, float *_color_weight) :
ParallelLoopBody(), dest(&_dest), temp(&_temp), radius(_radius),
maxk(_maxk), space_ofs(_space_ofs), space_weight(_space_weight), color_weight(_color_weight)
{
}
virtual void operator() (const Range& range) const
{
int i, j, cn = dest->channels(), k;
Size size = dest->size();
for( i = range.start; i < range.end; i++ )
{
const uchar* sptr = temp->ptr(i+radius) + radius*cn;
uchar* dptr = dest->ptr(i);
if( cn == 1 )
{
for( j = 0; j < size.width; j++ )
{
float sum = 0, wsum = 0;
int val0 = sptr[j];
for( k = 0; k < maxk; k++ )
{
int val = sptr[j + space_ofs[k]];
float w = space_weight[k]*color_weight[std::abs(val - val0)];
sum += val*w;
wsum += w;
}
// overflow is not possible here => there is no need to use CV_CAST_8U
dptr[j] = (uchar)cvRound(sum/wsum);
}
}
else
{
assert( cn == 3 );
for( j = 0; j < size.width*3; j += 3 )
{
float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
for( k = 0; k < maxk; k++ )
{
const uchar* sptr_k = sptr + j + space_ofs[k];
int b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
float w = space_weight[k]*color_weight[std::abs(b - b0) +
std::abs(g - g0) + std::abs(r - r0)];
sum_b += b*w; sum_g += g*w; sum_r += r*w;
wsum += w;
}
wsum = 1.f/wsum;
b0 = cvRound(sum_b*wsum);
g0 = cvRound(sum_g*wsum);
r0 = cvRound(sum_r*wsum);
dptr[j] = (uchar)b0; dptr[j+1] = (uchar)g0; dptr[j+2] = (uchar)r0;
}
}
}
}
private:
const Mat *temp;
Mat *dest;
int radius, maxk, *space_ofs;
float *space_weight, *color_weight;
};
static void static void
bilateralFilter_8u( const Mat& src, Mat& dst, int d, bilateralFilter_8u( const Mat& src, Mat& dst, int d,
double sigma_color, double sigma_space, double sigma_color, double sigma_space,
int borderType ) int borderType )
{ {
int cn = src.channels(); int cn = src.channels();
int i, j, k, maxk, radius; int i, j, maxk, radius;
Size size = src.size(); Size size = src.size();
CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) && CV_Assert( (src.type() == CV_8UC1 || src.type() == CV_8UC3) &&
src.type() == dst.type() && src.size() == dst.size() && src.type() == dst.type() && src.size() == dst.size() &&
src.data != dst.data ); src.data != dst.data );
if( sigma_color <= 0 ) if( sigma_color <= 0 )
sigma_color = 1; sigma_color = 1;
if( sigma_space <= 0 ) if( sigma_space <= 0 )
sigma_space = 1; sigma_space = 1;
double gauss_color_coeff = -0.5/(sigma_color*sigma_color); double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
double gauss_space_coeff = -0.5/(sigma_space*sigma_space); double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
if( d <= 0 ) if( d <= 0 )
radius = cvRound(sigma_space*1.5); radius = cvRound(sigma_space*1.5);
else else
radius = d/2; radius = d/2;
radius = MAX(radius, 1); radius = MAX(radius, 1);
d = radius*2 + 1; d = radius*2 + 1;
Mat temp; Mat temp;
copyMakeBorder( src, temp, radius, radius, radius, radius, borderType ); copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
vector<float> _color_weight(cn*256); vector<float> _color_weight(cn*256);
vector<float> _space_weight(d*d); vector<float> _space_weight(d*d);
vector<int> _space_ofs(d*d); vector<int> _space_ofs(d*d);
float* color_weight = &_color_weight[0]; float* color_weight = &_color_weight[0];
float* space_weight = &_space_weight[0]; float* space_weight = &_space_weight[0];
int* space_ofs = &_space_ofs[0]; int* space_ofs = &_space_ofs[0];
// initialize color-related bilateral filter coefficients // initialize color-related bilateral filter coefficients
for( i = 0; i < 256*cn; i++ ) for( i = 0; i < 256*cn; i++ )
color_weight[i] = (float)std::exp(i*i*gauss_color_coeff); color_weight[i] = (float)std::exp(i*i*gauss_color_coeff);
// initialize space-related bilateral filter coefficients // initialize space-related bilateral filter coefficients
for( i = -radius, maxk = 0; i <= radius; i++ ) for( i = -radius, maxk = 0; i <= radius; i++ )
for( j = -radius; j <= radius; j++ ) for( j = -radius; j <= radius; j++ )
@@ -1340,55 +1411,88 @@ bilateralFilter_8u( const Mat& src, Mat& dst, int d,
space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff); space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
space_ofs[maxk++] = (int)(i*temp.step + j*cn); space_ofs[maxk++] = (int)(i*temp.step + j*cn);
} }
BilateralFilter_8u_Invoker body(dst, temp, radius, maxk, space_ofs, space_weight, color_weight);
parallel_for_(Range(0, size.height), body);
}
for( i = 0; i < size.height; i++ )
class BilateralFilter_32f_Invoker :
public ParallelLoopBody
{
public:
BilateralFilter_32f_Invoker(int _cn, int _radius, int _maxk, int *_space_ofs,
const Mat& _temp, Mat& _dest, float _scale_index, float *_space_weight, float *_expLUT) :
ParallelLoopBody(), cn(_cn), radius(_radius), maxk(_maxk), space_ofs(_space_ofs),
temp(&_temp), dest(&_dest), scale_index(_scale_index), space_weight(_space_weight), expLUT(_expLUT)
{ {
const uchar* sptr = temp.data + (i+radius)*temp.step + radius*cn; }
uchar* dptr = dst.data + i*dst.step;
if( cn == 1 ) virtual void operator() (const Range& range) const
{
int i, j, k;
Size size = dest->size();
for( i = range.start; i < range.end; i++ )
{ {
for( j = 0; j < size.width; j++ ) const float* sptr = temp->ptr<float>(i+radius) + radius*cn;
float* dptr = dest->ptr<float>(i);
if( cn == 1 )
{ {
float sum = 0, wsum = 0; for( j = 0; j < size.width; j++ )
int val0 = sptr[j];
for( k = 0; k < maxk; k++ )
{ {
int val = sptr[j + space_ofs[k]]; float sum = 0, wsum = 0;
float w = space_weight[k]*color_weight[std::abs(val - val0)]; float val0 = sptr[j];
sum += val*w; for( k = 0; k < maxk; k++ )
wsum += w; {
float val = sptr[j + space_ofs[k]];
float alpha = (float)(std::abs(val - val0)*scale_index);
int idx = cvFloor(alpha);
alpha -= idx;
float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
sum += val*w;
wsum += w;
}
dptr[j] = (float)(sum/wsum);
} }
// overflow is not possible here => there is no need to use CV_CAST_8U
dptr[j] = (uchar)cvRound(sum/wsum);
} }
} else
else
{
assert( cn == 3 );
for( j = 0; j < size.width*3; j += 3 )
{ {
float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0; assert( cn == 3 );
int b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2]; for( j = 0; j < size.width*3; j += 3 )
for( k = 0; k < maxk; k++ )
{ {
const uchar* sptr_k = sptr + j + space_ofs[k]; float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
int b = sptr_k[0], g = sptr_k[1], r = sptr_k[2]; float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
float w = space_weight[k]*color_weight[std::abs(b - b0) + for( k = 0; k < maxk; k++ )
std::abs(g - g0) + std::abs(r - r0)]; {
sum_b += b*w; sum_g += g*w; sum_r += r*w; const float* sptr_k = sptr + j + space_ofs[k];
wsum += w; float b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
float alpha = (float)((std::abs(b - b0) +
std::abs(g - g0) + std::abs(r - r0))*scale_index);
int idx = cvFloor(alpha);
alpha -= idx;
float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
sum_b += b*w; sum_g += g*w; sum_r += r*w;
wsum += w;
}
wsum = 1.f/wsum;
b0 = sum_b*wsum;
g0 = sum_g*wsum;
r0 = sum_r*wsum;
dptr[j] = b0; dptr[j+1] = g0; dptr[j+2] = r0;
} }
wsum = 1.f/wsum;
b0 = cvRound(sum_b*wsum);
g0 = cvRound(sum_g*wsum);
r0 = cvRound(sum_r*wsum);
dptr[j] = (uchar)b0; dptr[j+1] = (uchar)g0; dptr[j+2] = (uchar)r0;
} }
} }
} }
}
private:
int cn, radius, maxk, *space_ofs;
const Mat* temp;
Mat *dest;
float scale_index, *space_weight, *expLUT;
};
static void static void
bilateralFilter_32f( const Mat& src, Mat& dst, int d, bilateralFilter_32f( const Mat& src, Mat& dst, int d,
@@ -1396,7 +1500,7 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d,
int borderType ) int borderType )
{ {
int cn = src.channels(); int cn = src.channels();
int i, j, k, maxk, radius; int i, j, maxk, radius;
double minValSrc=-1, maxValSrc=1; double minValSrc=-1, maxValSrc=1;
const int kExpNumBinsPerChannel = 1 << 12; const int kExpNumBinsPerChannel = 1 << 12;
int kExpNumBins = 0; int kExpNumBins = 0;
@@ -1474,57 +1578,10 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d,
space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn); space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn);
} }
for( i = 0; i < size.height; i++ ) // parallel_for usage
{
const float* sptr = (const float*)(temp.data + (i+radius)*temp.step) + radius*cn;
float* dptr = (float*)(dst.data + i*dst.step);
if( cn == 1 ) BilateralFilter_32f_Invoker body(cn, radius, maxk, space_ofs, temp, dst, scale_index, space_weight, expLUT);
{ parallel_for_(Range(0, size.height), body);
for( j = 0; j < size.width; j++ )
{
float sum = 0, wsum = 0;
float val0 = sptr[j];
for( k = 0; k < maxk; k++ )
{
float val = sptr[j + space_ofs[k]];
float alpha = (float)(std::abs(val - val0)*scale_index);
int idx = cvFloor(alpha);
alpha -= idx;
float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
sum += val*w;
wsum += w;
}
dptr[j] = (float)(sum/wsum);
}
}
else
{
assert( cn == 3 );
for( j = 0; j < size.width*3; j += 3 )
{
float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
for( k = 0; k < maxk; k++ )
{
const float* sptr_k = sptr + j + space_ofs[k];
float b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
float alpha = (float)((std::abs(b - b0) +
std::abs(g - g0) + std::abs(r - r0))*scale_index);
int idx = cvFloor(alpha);
alpha -= idx;
float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
sum_b += b*w; sum_g += g*w; sum_r += r*w;
wsum += w;
}
wsum = 1.f/wsum;
b0 = sum_b*wsum;
g0 = sum_g*wsum;
r0 = sum_r*wsum;
dptr[j] = b0; dptr[j+1] = g0; dptr[j+2] = r0;
}
}
}
} }
} }
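The smooth.cpp change above only moves the existing 8-bit and 32-bit bilateral filter loops into ParallelLoopBody invokers dispatched with parallel_for_; the public interface is unchanged. For reference, a minimal call that exercises the parallelized path (the parameter values are illustrative):

    #include "opencv2/core/core.hpp"
    #include "opencv2/imgproc/imgproc.hpp"

    void smooth_example(const cv::Mat& src, cv::Mat& dst)
    {
        // src must be CV_8UC1/CV_8UC3 or CV_32FC1/CV_32FC3; d = 9 and
        // sigmaColor = sigmaSpace = 75 are common demonstration values.
        cv::bilateralFilter(src, dst, 9, 75, 75, cv::BORDER_DEFAULT);
    }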

@@ -0,0 +1,290 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "test_precomp.hpp"
using namespace cv;
namespace cvtest
{
class CV_BilateralFilterTest :
public cvtest::BaseTest
{
public:
enum
{
MAX_WIDTH = 1920, MIN_WIDTH = 1,
MAX_HEIGHT = 1080, MIN_HEIGHT = 1
};
CV_BilateralFilterTest();
~CV_BilateralFilterTest();
protected:
virtual void run_func();
virtual int prepare_test_case(int test_case_index);
virtual int validate_test_results(int test_case_index);
private:
void reference_bilateral_filter(const Mat& src, Mat& dst, int d, double sigma_color,
double sigma_space, int borderType = BORDER_DEFAULT);
int getRandInt(RNG& rng, int min_value, int max_value) const;
double _sigma_color;
double _sigma_space;
Mat _src;
Mat _parallel_dst;
int _d;
};
CV_BilateralFilterTest::CV_BilateralFilterTest() :
cvtest::BaseTest(), _src(), _parallel_dst(), _d()
{
test_case_count = 1000;
}
CV_BilateralFilterTest::~CV_BilateralFilterTest()
{
}
int CV_BilateralFilterTest::getRandInt(RNG& rng, int min_value, int max_value) const
{
double rand_value = rng.uniform(log((double)min_value), log((double)max_value + 1));
return cvRound(exp((double)rand_value));
}
void CV_BilateralFilterTest::reference_bilateral_filter(const Mat &src, Mat &dst, int d,
double sigma_color, double sigma_space, int borderType)
{
int cn = src.channels();
int i, j, k, maxk, radius;
double minValSrc = -1, maxValSrc = 1;
const int kExpNumBinsPerChannel = 1 << 12;
int kExpNumBins = 0;
float lastExpVal = 1.f;
float len, scale_index;
Size size = src.size();
dst.create(size, src.type());
CV_Assert( (src.type() == CV_32FC1 || src.type() == CV_32FC3) &&
src.type() == dst.type() && src.size() == dst.size() &&
src.data != dst.data );
if( sigma_color <= 0 )
sigma_color = 1;
if( sigma_space <= 0 )
sigma_space = 1;
double gauss_color_coeff = -0.5/(sigma_color*sigma_color);
double gauss_space_coeff = -0.5/(sigma_space*sigma_space);
if( d <= 0 )
radius = cvRound(sigma_space*1.5);
else
radius = d/2;
radius = MAX(radius, 1);
d = radius*2 + 1;
// compute the min/max range for the input image (even if multichannel)
minMaxLoc( src.reshape(1), &minValSrc, &maxValSrc );
if(std::abs(minValSrc - maxValSrc) < FLT_EPSILON)
{
src.copyTo(dst);
return;
}
// temporary copy of the image with borders for easy processing
Mat temp;
copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
patchNaNs(temp);
// allocate lookup tables
vector<float> _space_weight(d*d);
vector<int> _space_ofs(d*d);
float* space_weight = &_space_weight[0];
int* space_ofs = &_space_ofs[0];
// assign a length which is slightly more than needed
len = (float)(maxValSrc - minValSrc) * cn;
kExpNumBins = kExpNumBinsPerChannel * cn;
vector<float> _expLUT(kExpNumBins+2);
float* expLUT = &_expLUT[0];
scale_index = kExpNumBins/len;
// initialize the exp LUT
for( i = 0; i < kExpNumBins+2; i++ )
{
if( lastExpVal > 0.f )
{
double val = i / scale_index;
expLUT[i] = (float)std::exp(val * val * gauss_color_coeff);
lastExpVal = expLUT[i];
}
else
expLUT[i] = 0.f;
}
// initialize space-related bilateral filter coefficients
for( i = -radius, maxk = 0; i <= radius; i++ )
for( j = -radius; j <= radius; j++ )
{
double r = std::sqrt((double)i*i + (double)j*j);
if( r > radius )
continue;
space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn);
}
for( i = 0; i < size.height; i++ )
{
const float* sptr = (const float*)(temp.data + (i+radius)*temp.step) + radius*cn;
float* dptr = (float*)(dst.data + i*dst.step);
if( cn == 1 )
{
for( j = 0; j < size.width; j++ )
{
float sum = 0, wsum = 0;
float val0 = sptr[j];
for( k = 0; k < maxk; k++ )
{
float val = sptr[j + space_ofs[k]];
float alpha = (float)(std::abs(val - val0)*scale_index);
int idx = cvFloor(alpha);
alpha -= idx;
float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
sum += val*w;
wsum += w;
}
dptr[j] = (float)(sum/wsum);
}
}
else
{
assert( cn == 3 );
for( j = 0; j < size.width*3; j += 3 )
{
float sum_b = 0, sum_g = 0, sum_r = 0, wsum = 0;
float b0 = sptr[j], g0 = sptr[j+1], r0 = sptr[j+2];
for( k = 0; k < maxk; k++ )
{
const float* sptr_k = sptr + j + space_ofs[k];
float b = sptr_k[0], g = sptr_k[1], r = sptr_k[2];
float alpha = (float)((std::abs(b - b0) +
std::abs(g - g0) + std::abs(r - r0))*scale_index);
int idx = cvFloor(alpha);
alpha -= idx;
float w = space_weight[k]*(expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
sum_b += b*w; sum_g += g*w; sum_r += r*w;
wsum += w;
}
wsum = 1.f/wsum;
b0 = sum_b*wsum;
g0 = sum_g*wsum;
r0 = sum_r*wsum;
dptr[j] = b0; dptr[j+1] = g0; dptr[j+2] = r0;
}
}
}
}
int CV_BilateralFilterTest::prepare_test_case(int /* test_case_index */)
{
const static int types[] = { CV_32FC1, CV_32FC3, CV_8UC1, CV_8UC3 };
RNG& rng = ts->get_rng();
Size size(getRandInt(rng, MIN_WIDTH, MAX_WIDTH), getRandInt(rng, MIN_HEIGHT, MAX_HEIGHT));
int type = types[rng(sizeof(types) / sizeof(types[0]))];
_d = rng.uniform(0., 1.) > 0.5 ? 5 : 3;
_src.create(size, type);
rng.fill(_src, RNG::UNIFORM, 0, 256);
_sigma_color = _sigma_space = 1.;
return 1;
}
int CV_BilateralFilterTest::validate_test_results(int test_case_index)
{
static const double eps = 1;
Mat reference_dst, reference_src;
if (_src.depth() == CV_32F)
reference_bilateral_filter(_src, reference_dst, _d, _sigma_color, _sigma_space);
else
{
int type = _src.type();
_src.convertTo(reference_src, CV_32F);
reference_bilateral_filter(reference_src, reference_dst, _d, _sigma_color, _sigma_space);
reference_dst.convertTo(reference_dst, type);
}
double e = norm(reference_dst, _parallel_dst);
if (e > eps)
{
ts->printf(cvtest::TS::CONSOLE, "actual error: %g, expected: %g", e, eps);
ts->set_failed_test_info(cvtest::TS::FAIL_BAD_ACCURACY);
}
else
ts->set_failed_test_info(cvtest::TS::OK);
return BaseTest::validate_test_results(test_case_index);
}
void CV_BilateralFilterTest::run_func()
{
bilateralFilter(_src, _parallel_dst, _d, _sigma_color, _sigma_space);
}
TEST(Imgproc_BilateralFilter, accuracy)
{
CV_BilateralFilterTest test;
test.safe_run();
}
} // end of namespace cvtest

@@ -43,19 +43,26 @@
#include "precomp.hpp" #include "precomp.hpp"
#include <stdio.h> #include <stdio.h>
/*
/*#if CV_SSE2 #if CV_SSE2
# if CV_SSE4 || defined __SSE4__ # if !CV_SSE4_1 && !CV_SSE4_2
# include <smmintrin.h> # define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m))
# else # define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m))
# define _mm_blendv_pd(a, b, m) _mm_xor_pd(a, _mm_and_pd(_mm_xor_pd(b, a), m))
# define _mm_blendv_ps(a, b, m) _mm_xor_ps(a, _mm_and_ps(_mm_xor_ps(b, a), m))
# endif # endif
#if defined CV_ICC
# define CV_HAAR_USE_SSE 1
#endif #endif
#endif*/
#if defined CV_ICC
# if defined CV_AVX
# define CV_HAAR_USE_AVX 1
# else
# if defined CV_SSE2 || defined CV_SSE4_1 || defined CV_SSE4_2
# define CV_HAAR_USE_SSE 1
# else
# define CV_HAAR_NO_SIMD 1
# endif
# endif
#endif
*/
/* these settings affect the quality of detection: change with care */ /* these settings affect the quality of detection: change with care */
#define CV_ADJUST_FEATURES 1 #define CV_ADJUST_FEATURES 1
#define CV_ADJUST_WEIGHTS 0 #define CV_ADJUST_WEIGHTS 0
@@ -730,6 +737,7 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
{ {
CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j; CvHidHaarClassifier* classifier = cascade->stage_classifier[i].classifier + j;
CvHidHaarTreeNode* node = classifier->node; CvHidHaarTreeNode* node = classifier->node;
#ifndef CV_HAAR_USE_SSE #ifndef CV_HAAR_USE_SSE
double t = node->threshold*variance_norm_factor; double t = node->threshold*variance_norm_factor;
double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight; double sum = calc_sum(node->feature.rect[0],p_offset) * node->feature.rect[0].weight;
@@ -745,6 +753,7 @@ cvRunHaarClassifierCascadeSum( const CvHaarClassifierCascade* _cascade,
t = _mm_cmpgt_sd(t, sum); t = _mm_cmpgt_sd(t, sum);
stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t)); stage_sum = _mm_add_sd(stage_sum, _mm_blendv_pd(b, a, t));
#endif #endif
} }
} }
else else

@@ -14,7 +14,7 @@ cl_list = glob.glob(os.path.join(indir, "*.cl"))
kfile = open(outname, "wt") kfile = open(outname, "wt")
kfile.write("""// This file is auto-generated. Do not edit! kfile.write("""// This file is auto-generated. Do not edit!
//#include "precomp.hpp"
namespace cv namespace cv
{ {
namespace ocl namespace ocl

@@ -49,7 +49,7 @@ namespace cv
namespace ocl namespace ocl
{ {
////////////////////////////////////OpenCL kernel strings////////////////////////// ////////////////////////////////////OpenCL kernel strings//////////////////////////
extern const char *convertC3C4; //extern const char *convertC3C4;
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
//////////////////////////////// oclMat //////////////////////////////// //////////////////////////////// oclMat ////////////////////////////////

@@ -49,6 +49,7 @@
#include "opencv2/core/core.hpp" #include "opencv2/core/core.hpp"
#include "opencv2/imgproc/imgproc.hpp" #include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/objdetect/objdetect.hpp" #include "opencv2/objdetect/objdetect.hpp"
#include "opencv2/features2d/features2d.hpp"
namespace cv namespace cv
{ {

@@ -455,13 +455,12 @@ void cv::ocl::multiply(const oclMat &src1, const oclMat &src2, oclMat &dst, doub
} }
void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar) void cv::ocl::divide(const oclMat &src1, const oclMat &src2, oclMat &dst, double scalar)
{ {
if(src1.clCxt -> impl -> double_support ==0)
{
CV_Error(-217,"Selected device don't support double\r\n");
return;
}
arithmetic_run<double>(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar)); if(src1.clCxt -> impl -> double_support !=0)
arithmetic_run<double>(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar));
else
arithmetic_run<float>(src1, src2, dst, "arithm_div", &arithm_div, (void *)(&scalar));
} }
template <typename WT ,typename CL_WT> template <typename WT ,typename CL_WT>
void arithmetic_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString, int isMatSubScalar) void arithmetic_scalar_run(const oclMat &src1, const Scalar &src2, oclMat &dst, const oclMat &mask, string kernelName, const char **kernelString, int isMatSubScalar)
@@ -579,7 +578,14 @@ void arithmetic_scalar_run(const oclMat &src, oclMat &dst, string kernelName, co
args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&src.rows ));
args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
args.push_back( make_pair( sizeof(cl_double), (void *)&scalar ));
if(src.clCxt -> impl -> double_support !=0)
args.push_back( make_pair( sizeof(cl_double), (void *)&scalar ));
else
{
float f_scalar = (float)scalar;
args.push_back( make_pair( sizeof(cl_float), (void *)&f_scalar));
}
openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth); openCLExecuteKernel(clCxt, kernelString, kernelName, globalThreads, localThreads, args, -1, depth);
} }
@@ -670,9 +676,9 @@ void compare_run(const oclMat &src1, const oclMat &src2, oclMat &dst, string ker
int cols = divUp(dst.cols + offset_cols, vector_length); int cols = divUp(dst.cols + offset_cols, vector_length);
size_t localThreads[3] = { 64, 4, 1 }; size_t localThreads[3] = { 64, 4, 1 };
size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0], size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
divUp(dst.rows, localThreads[1]) * localThreads[1], divUp(dst.rows, localThreads[1]) * localThreads[1],
1 1
}; };
int dst_step1 = dst.cols * dst.elemSize(); int dst_step1 = dst.cols * dst.elemSize();
vector<pair<size_t , const void *> > args; vector<pair<size_t , const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data )); args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
@@ -1253,7 +1259,11 @@ void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, string kernelName, c
CV_Assert( src.type() == CV_32F || src.type() == CV_64F); CV_Assert( src.type() == CV_32F || src.type() == CV_64F);
Context *clCxt = src.clCxt; Context *clCxt = src.clCxt;
if(clCxt -> impl -> double_support ==0 && src.type() == CV_64F)
{
CV_Error(-217,"Selected device don't support double\r\n");
return;
}
//int channels = dst.channels(); //int channels = dst.channels();
int depth = dst.depth(); int depth = dst.depth();
@@ -2193,56 +2203,46 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2,
size_t localThreads[3] = { 256, 1, 1 }; size_t localThreads[3] = { 256, 1, 1 };
size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0], size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
divUp(dst.rows, localThreads[1]) * localThreads[1], divUp(dst.rows, localThreads[1]) * localThreads[1],
1 1
}; };
int dst_step1 = dst.cols * dst.elemSize(); int dst_step1 = dst.cols * dst.elemSize();
vector<pair<size_t , const void *> > args; vector<pair<size_t , const void *> > args;
if(sizeof(double) == 8) args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset));
args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset));
if(src1.clCxt -> impl -> double_support != 0)
{ {
args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
args.push_back( make_pair( sizeof(cl_double), (void *)&alpha )); args.push_back( make_pair( sizeof(cl_double), (void *)&alpha ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset));
args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
args.push_back( make_pair( sizeof(cl_double), (void *)&beta )); args.push_back( make_pair( sizeof(cl_double), (void *)&beta ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset));
args.push_back( make_pair( sizeof(cl_double), (void *)&gama )); args.push_back( make_pair( sizeof(cl_double), (void *)&gama ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset));
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
} }
else else
{ {
args.push_back( make_pair( sizeof(cl_mem), (void *)&src1.data ));
args.push_back( make_pair( sizeof(cl_float), (void *)&alpha )); args.push_back( make_pair( sizeof(cl_float), (void *)&alpha ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.offset));
args.push_back( make_pair( sizeof(cl_mem), (void *)&src2.data ));
args.push_back( make_pair( sizeof(cl_float), (void *)&beta )); args.push_back( make_pair( sizeof(cl_float), (void *)&beta ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src2.offset));
args.push_back( make_pair( sizeof(cl_float), (void *)&gama )); args.push_back( make_pair( sizeof(cl_float), (void *)&gama ));
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data )); }
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset)); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data ));
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.step ));
args.push_back( make_pair( sizeof(cl_int), (void *)&cols )); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.offset));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 )); args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows ));
} args.push_back( make_pair( sizeof(cl_int), (void *)&cols ));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst_step1 ));
openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads, args, -1, depth); openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads, args, -1, depth);
} }
void cv::ocl::magnitudeSqr(const oclMat &src1, const oclMat &src2, oclMat &dst) void cv::ocl::magnitudeSqr(const oclMat &src1, const oclMat &src2, oclMat &dst)
{ {
CV_Assert(src1.type() == src2.type() && src1.size() == src2.size() && CV_Assert(src1.type() == src2.type() && src1.size() == src2.size() &&
(src1.depth() == CV_32F )); (src1.depth() == CV_32F ));
dst.create(src1.size(), src1.type()); dst.create(src1.size(), src1.type());
@@ -2265,9 +2265,9 @@ void cv::ocl::magnitudeSqr(const oclMat &src1, const oclMat &src2, oclMat &dst)
size_t localThreads[3] = { 256, 1, 1 }; size_t localThreads[3] = { 256, 1, 1 };
size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0], size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
divUp(dst.rows, localThreads[1]) * localThreads[1], divUp(dst.rows, localThreads[1]) * localThreads[1],
1 1
}; };
int dst_step1 = dst.cols * dst.elemSize(); int dst_step1 = dst.cols * dst.elemSize();
vector<pair<size_t , const void *> > args; vector<pair<size_t , const void *> > args;
@@ -2313,9 +2313,9 @@ void cv::ocl::magnitudeSqr(const oclMat &src1, oclMat &dst)
size_t localThreads[3] = { 256, 1, 1 }; size_t localThreads[3] = { 256, 1, 1 };
size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0], size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
divUp(dst.rows, localThreads[1]) * localThreads[1], divUp(dst.rows, localThreads[1]) * localThreads[1],
1 1
}; };
int dst_step1 = dst.cols * dst.elemSize(); int dst_step1 = dst.cols * dst.elemSize();
vector<pair<size_t , const void *> > args; vector<pair<size_t , const void *> > args;
@@ -2348,9 +2348,9 @@ void arithmetic_pow_run(const oclMat &src1, double p, oclMat &dst, string kernel
size_t localThreads[3] = { 64, 4, 1 }; size_t localThreads[3] = { 64, 4, 1 };
size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0], size_t globalThreads[3] = { divUp(cols, localThreads[0]) * localThreads[0],
divUp(rows, localThreads[1]) * localThreads[1], divUp(rows, localThreads[1]) * localThreads[1],
1 1
}; };
int dst_step1 = dst.cols * dst.elemSize(); int dst_step1 = dst.cols * dst.elemSize();
vector<pair<size_t , const void *> > args; vector<pair<size_t , const void *> > args;

@@ -410,7 +410,11 @@ namespace cv
float ify = 1. / fy; float ify = 1. / fy;
double ifx_d = 1. / fx; double ifx_d = 1. / fx;
double ify_d = 1. / fy; double ify_d = 1. / fy;
int srcStep_in_pixel = src.step1() / src.channels();
int srcoffset_in_pixel = src.offset / src.elemSize();
int dstStep_in_pixel = dst.step1() / dst.channels();
int dstoffset_in_pixel = dst.offset / dst.elemSize();
//printf("%d %d\n",src.step1() , dst.elemSize());
string kernelName; string kernelName;
if(interpolation == INTER_LINEAR) if(interpolation == INTER_LINEAR)
kernelName = "resizeLN"; kernelName = "resizeLN";
@ -438,25 +442,33 @@ namespace cv
{ {
args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data)); args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data)); args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset)); args.push_back( make_pair(sizeof(cl_int), (void *)&dstoffset_in_pixel));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset)); args.push_back( make_pair(sizeof(cl_int), (void *)&srcoffset_in_pixel));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step)); args.push_back( make_pair(sizeof(cl_int), (void *)&dstStep_in_pixel));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.step)); args.push_back( make_pair(sizeof(cl_int), (void *)&srcStep_in_pixel));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst.rows));
args.push_back( make_pair(sizeof(cl_double), (void *)&ifx_d)); if(src.clCxt -> impl -> double_support != 0)
args.push_back( make_pair(sizeof(cl_double), (void *)&ify_d)); {
args.push_back( make_pair(sizeof(cl_double), (void *)&ifx_d));
args.push_back( make_pair(sizeof(cl_double), (void *)&ify_d));
}
else
{
args.push_back( make_pair(sizeof(cl_float), (void *)&ifx));
args.push_back( make_pair(sizeof(cl_float), (void *)&ify));
}
} }
else else
{ {
args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data)); args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data)); args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset)); args.push_back( make_pair(sizeof(cl_int), (void *)&dstoffset_in_pixel));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset)); args.push_back( make_pair(sizeof(cl_int), (void *)&srcoffset_in_pixel));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.step)); args.push_back( make_pair(sizeof(cl_int), (void *)&dstStep_in_pixel));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.step)); args.push_back( make_pair(sizeof(cl_int), (void *)&srcStep_in_pixel));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.cols));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.rows));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst.cols));
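The new branch in this hunk passes the inverse scale factors as cl_double only when the context reports double support and falls back to cl_float otherwise, so the argument types match what the kernel was built to expect. A rough sketch of the underlying capability check (the real double_support flag is an implementation detail of the module's Context and may be derived differently):

    #include <CL/cl.h>
    #include <cstring>
    #include <vector>

    // Returns true when the device advertises cl_khr_fp64; approximates what
    // clCxt->impl->double_support is assumed to cache.
    static bool deviceHasFp64(cl_device_id dev)
    {
        size_t sz = 0;
        if (clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, 0, 0, &sz) != CL_SUCCESS)
            return false;
        std::vector<char> ext(sz + 1, 0);
        if (clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, sz, &ext[0], 0) != CL_SUCCESS)
            return false;
        return std::strstr(&ext[0], "cl_khr_fp64") != 0;
    }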

@ -378,20 +378,36 @@ namespace cv
void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch, void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
const void *src, size_t spitch, const void *src, size_t spitch,
size_t width, size_t height, enum openCLMemcpyKind kind) size_t width, size_t height, enum openCLMemcpyKind kind, int channels)
{ {
size_t buffer_origin[3] = {0, 0, 0}; size_t buffer_origin[3] = {0, 0, 0};
size_t host_origin[3] = {0, 0, 0}; size_t host_origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1}; size_t region[3] = {width, height, 1};
if(kind == clMemcpyHostToDevice) if(kind == clMemcpyHostToDevice)
{ {
openCLSafeCall(clEnqueueWriteBufferRect(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE, if(dpitch == width || channels==3)
buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0)); {
openCLSafeCall(clEnqueueWriteBuffer(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE,
0, width*height, src, 0, NULL, NULL));
}
else
{
openCLSafeCall(clEnqueueWriteBufferRect(clCxt->impl->clCmdQueue, (cl_mem)dst, CL_TRUE,
buffer_origin, host_origin, region, dpitch, 0, spitch, 0, src, 0, 0, 0));
}
} }
else if(kind == clMemcpyDeviceToHost) else if(kind == clMemcpyDeviceToHost)
{ {
openCLSafeCall(clEnqueueReadBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE, if(spitch == width || channels==3)
buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0)); {
openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE,
0, width*height, dst, 0, NULL, NULL));
}
else
{
openCLSafeCall(clEnqueueReadBufferRect(clCxt->impl->clCmdQueue, (cl_mem)src, CL_TRUE,
buffer_origin, host_origin, region, spitch, 0, dpitch, 0, dst, 0, 0, 0));
}
} }
} }
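The rewritten copy helper above takes the cheaper clEnqueueWriteBuffer / clEnqueueReadBuffer path when the rows are contiguous (pitch equals the row width in bytes) and keeps the rect variants only for strided data; the fast path relies on width*height being the total byte count of the block. A condensed sketch of the host-to-device direction under that assumption (the channels==3 special case is left out here):

    #include <CL/cl.h>

    // Copy a width x height byte block to a device buffer. When every row is
    // packed back-to-back, a single linear write suffices; otherwise fall back
    // to the rectangular copy. Sketch only; errors are just returned.
    static cl_int write2D(cl_command_queue q, cl_mem dst, size_t dpitch,
                          const void *src, size_t spitch,
                          size_t width, size_t height)
    {
        if (dpitch == width && spitch == width)
            return clEnqueueWriteBuffer(q, dst, CL_TRUE, 0, width * height,
                                        src, 0, 0, 0);

        size_t buf_origin[3]  = {0, 0, 0};
        size_t host_origin[3] = {0, 0, 0};
        size_t region[3]      = {width, height, 1};
        return clEnqueueWriteBufferRect(q, dst, CL_TRUE, buf_origin, host_origin,
                                        region, dpitch, 0, spitch, 0, src, 0, 0, 0);
    }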

@ -51,9 +51,9 @@ typedef float F;
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////addWeighted////////////////////////////////////////////// /////////////////////////////////////////////addWeighted//////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void addWeighted_D0 (__global uchar *src1, F alpha,int src1_step,int src1_offset, __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset,
__global uchar *src2, F beta, int src2_step,int src2_offset, __global uchar *src2, int src2_step,int src2_offset,
F gama, F alpha,F beta,F gama,
__global uchar *dst, int dst_step,int dst_offset, __global uchar *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1) int rows, int cols,int dst_step1)
{ {
@ -99,9 +99,9 @@ __kernel void addWeighted_D0 (__global uchar *src1, F alpha,int src1_step,int sr
__kernel void addWeighted_D2 (__global ushort *src1, F alpha,int src1_step,int src1_offset, __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offset,
__global ushort *src2, F beta, int src2_step,int src2_offset, __global ushort *src2, int src2_step,int src2_offset,
F gama, F alpha,F beta,F gama,
__global ushort *dst, int dst_step,int dst_offset, __global ushort *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1) int rows, int cols,int dst_step1)
{ {
@ -145,9 +145,9 @@ __kernel void addWeighted_D2 (__global ushort *src1, F alpha,int src1_step,int s
} }
__kernel void addWeighted_D3 (__global short *src1, F alpha,int src1_step,int src1_offset, __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offset,
__global short *src2, F beta, int src2_step,int src2_offset, __global short *src2, int src2_step,int src2_offset,
F gama, F alpha,F beta,F gama,
__global short *dst, int dst_step,int dst_offset, __global short *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1) int rows, int cols,int dst_step1)
{ {
@ -190,9 +190,9 @@ __kernel void addWeighted_D3 (__global short *src1, F alpha,int src1_step,int sr
} }
__kernel void addWeighted_D4 (__global int *src1, F alpha,int src1_step,int src1_offset, __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
__global int *src2, F beta, int src2_step,int src2_offset, __global int *src2, int src2_step,int src2_offset,
F gama, F alpha,F beta, F gama,
__global int *dst, int dst_step,int dst_offset, __global int *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1) int rows, int cols,int dst_step1)
{ {
@ -238,9 +238,9 @@ __kernel void addWeighted_D4 (__global int *src1, F alpha,int src1_step,int src1
} }
__kernel void addWeighted_D5 (__global float *src1, F alpha,int src1_step,int src1_offset, __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset,
__global float *src2, F beta, int src2_step,int src2_offset, __global float *src2, int src2_step,int src2_offset,
F gama, F alpha,F beta, F gama,
__global float *dst, int dst_step,int dst_offset, __global float *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1) int rows, int cols,int dst_step1)
{ {
@ -286,9 +286,9 @@ __kernel void addWeighted_D5 (__global float *src1, F alpha,int src1_step,int sr
} }
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
__kernel void addWeighted_D6 (__global double *src1, F alpha,int src1_step,int src1_offset, __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offset,
__global double *src2, F beta, int src2_step,int src2_offset, __global double *src2, int src2_step,int src2_offset,
F gama, F alpha,F beta, F gama,
__global double *dst, int dst_step,int dst_offset, __global double *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1) int rows, int cols,int dst_step1)
{ {

@ -49,6 +49,10 @@
#define CV_PI 3.1415926535897932384626433832795 #define CV_PI 3.1415926535897932384626433832795
#ifndef DBL_EPSILON
#define DBL_EPSILON 0x1.0p-52
#endif
__kernel void arithm_cartToPolar_D5 (__global float *src1, int src1_step, int src1_offset, __kernel void arithm_cartToPolar_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset, __global float *src2, int src2_step, int src2_offset,
__global float *dst1, int dst1_step, int dst1_offset, //magnitude __global float *dst1, int dst1_step, int dst1_offset, //magnitude

@ -45,36 +45,45 @@
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
typedef double F ;
typedef double4 F4;
#define convert_F4 convert_double4
#define convert_F convert_double
#else
typedef float F;
typedef float4 F4;
#define convert_F4 convert_float4
#define convert_F convert_float
#endif #endif
uchar round2_uchar(double v){ uchar round2_uchar(F v){
uchar v1 = convert_uchar_sat(v); uchar v1 = convert_uchar_sat(round(v));
uchar v2 = convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5)); //uchar v2 = convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5));
return (((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
} }
ushort round2_ushort(double v){ ushort round2_ushort(F v){
ushort v1 = convert_ushort_sat(v); ushort v1 = convert_ushort_sat(round(v));
ushort v2 = convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5)); //ushort v2 = convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5));
return (((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
} }
short round2_short(double v){ short round2_short(F v){
short v1 = convert_short_sat(v); short v1 = convert_short_sat(round(v));
short v2 = convert_short_sat(v+(v>=0 ? 0.5 : -0.5)); //short v2 = convert_short_sat(v+(v>=0 ? 0.5 : -0.5));
return (((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
} }
int round2_int(double v){ int round2_int(F v){
int v1 = convert_int_sat(v); int v1 = convert_int_sat(round(v));
int v2 = convert_int_sat(v+(v>=0 ? 0.5 : -0.5)); //int v2 = convert_int_sat(v+(v>=0 ? 0.5 : -0.5));
return (((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
} }
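The round2_* helpers above drop the hand-rolled round-half-to-even logic and defer to the built-in round(), which sends halfway cases away from zero; for non-negative inputs that means 2.5 now becomes 3 where the old code produced 2. A tiny C++ check of the new convention, using std::round as a stand-in for OpenCL's round():

    #include <cassert>
    #include <cmath>

    int main()
    {
        // round() / std::round: halfway cases go away from zero.
        assert((int)std::round(2.5)  == 3);   // the removed helpers returned 2 here
        assert((int)std::round(3.5)  == 4);   // both conventions agree here
        assert((int)std::round(-2.5) == -3);
        return 0;
    }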
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
////////////////////////////divide/////////////////////////////////////////////////// ////////////////////////////divide///////////////////////////////////////////////////
@ -83,7 +92,7 @@ int round2_int(double v){
__kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offset, __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset, __global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset, __global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar) int rows, int cols, int dst_step1, F scalar)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
@ -104,13 +113,13 @@ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offse
uchar4 src2_data = vload4(0, src2 + src2_index); uchar4 src2_data = vload4(0, src2 + src2_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
double4 tmp = convert_double4(src1_data) * scalar; F4 tmp = convert_F4(src1_data) * scalar;
uchar4 tmp_data; uchar4 tmp_data;
tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / (double)src2_data.x); tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / (F)src2_data.x);
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / (double)src2_data.y); tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / (F)src2_data.y);
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / (double)src2_data.z); tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / (F)src2_data.z);
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / (double)src2_data.w); tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / (F)src2_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
@ -124,7 +133,7 @@ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offse
__kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offset, __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset, __global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset, __global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar) int rows, int cols, int dst_step1, F scalar)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
@ -145,13 +154,13 @@ __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offs
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
double4 tmp = convert_double4(src1_data) * scalar; F4 tmp = convert_F4(src1_data) * scalar;
ushort4 tmp_data; ushort4 tmp_data;
tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_ushort(tmp.x / (double)src2_data.x); tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_ushort(tmp.x / (F)src2_data.x);
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_ushort(tmp.y / (double)src2_data.y); tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_ushort(tmp.y / (F)src2_data.y);
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_ushort(tmp.z / (double)src2_data.z); tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_ushort(tmp.z / (F)src2_data.z);
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_ushort(tmp.w / (double)src2_data.w); tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_ushort(tmp.w / (F)src2_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
@ -164,7 +173,7 @@ __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offs
__kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offset, __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset, __global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset, __global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar) int rows, int cols, int dst_step1, F scalar)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
@ -185,13 +194,13 @@ __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offse
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
double4 tmp = convert_double4(src1_data) * scalar; F4 tmp = convert_F4(src1_data) * scalar;
short4 tmp_data; short4 tmp_data;
tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_short(tmp.x / (double)src2_data.x); tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_short(tmp.x / (F)src2_data.x);
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_short(tmp.y / (double)src2_data.y); tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_short(tmp.y / (F)src2_data.y);
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_short(tmp.z / (double)src2_data.z); tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_short(tmp.z / (F)src2_data.z);
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_short(tmp.w / (double)src2_data.w); tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_short(tmp.w / (F)src2_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@ -206,7 +215,7 @@ __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offse
__kernel void arithm_div_D4 (__global int *src1, int src1_step, int src1_offset, __kernel void arithm_div_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset, __global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset, __global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar) int rows, int cols, int dst_step1, F scalar)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
@ -220,8 +229,8 @@ __kernel void arithm_div_D4 (__global int *src1, int src1_step, int src1_offset,
int data1 = *((__global int *)((__global char *)src1 + src1_index)); int data1 = *((__global int *)((__global char *)src1 + src1_index));
int data2 = *((__global int *)((__global char *)src2 + src2_index)); int data2 = *((__global int *)((__global char *)src2 + src2_index));
double tmp = convert_double(data1) * scalar; F tmp = convert_F(data1) * scalar;
int tmp_data = (tmp == 0 || data2 == 0) ? 0 : round2_int(tmp / (convert_double)(data2)); int tmp_data = (tmp == 0 || data2 == 0) ? 0 : round2_int(tmp / (convert_F)(data2));
*((__global int *)((__global char *)dst + dst_index)) =tmp_data; *((__global int *)((__global char *)dst + dst_index)) =tmp_data;
} }
@ -230,7 +239,7 @@ __kernel void arithm_div_D4 (__global int *src1, int src1_step, int src1_offset,
__kernel void arithm_div_D5 (__global float *src1, int src1_step, int src1_offset, __kernel void arithm_div_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset, __global float *src2, int src2_step, int src2_offset,
__global float *dst, int dst_step, int dst_offset, __global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar) int rows, int cols, int dst_step1, F scalar)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
@ -244,13 +253,14 @@ __kernel void arithm_div_D5 (__global float *src1, int src1_step, int src1_offse
float data1 = *((__global float *)((__global char *)src1 + src1_index)); float data1 = *((__global float *)((__global char *)src1 + src1_index));
float data2 = *((__global float *)((__global char *)src2 + src2_index)); float data2 = *((__global float *)((__global char *)src2 + src2_index));
double tmp = convert_double(data1) * scalar; F tmp = convert_F(data1) * scalar;
float tmp_data = (tmp == 0 || data2 == 0) ? 0 : convert_float(tmp / (convert_double)(data2)); float tmp_data = (tmp == 0 || data2 == 0) ? 0 : convert_float(tmp / (convert_F)(data2));
*((__global float *)((__global char *)dst + dst_index)) = tmp_data; *((__global float *)((__global char *)dst + dst_index)) = tmp_data;
} }
} }
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_div_D6 (__global double *src1, int src1_step, int src1_offset, __kernel void arithm_div_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset, __global double *src2, int src2_step, int src2_offset,
__global double *dst, int dst_step, int dst_offset, __global double *dst, int dst_step, int dst_offset,
@ -274,10 +284,11 @@ __kernel void arithm_div_D6 (__global double *src1, int src1_step, int src1_offs
*((__global double *)((__global char *)dst + dst_index)) = tmp_data; *((__global double *)((__global char *)dst + dst_index)) = tmp_data;
} }
} }
#endif
/************************************div with scalar************************************/ /************************************div with scalar************************************/
__kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset, __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset,
__global uchar *dst, int dst_step, int dst_offset, __global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar) int rows, int cols, int dst_step1, F scalar)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
@ -297,10 +308,10 @@ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data; uchar4 tmp_data;
tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_uchar(scalar / (double)src_data.x); tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_uchar(scalar / (F)src_data.x);
tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_uchar(scalar / (double)src_data.y); tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_uchar(scalar / (F)src_data.y);
tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_uchar(scalar / (double)src_data.z); tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_uchar(scalar / (F)src_data.z);
tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_uchar(scalar / (double)src_data.w); tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_uchar(scalar / (F)src_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
@ -313,7 +324,7 @@ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset
__kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offset, __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offset,
__global ushort *dst, int dst_step, int dst_offset, __global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar) int rows, int cols, int dst_step1, F scalar)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
@ -333,10 +344,10 @@ __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offse
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 tmp_data; ushort4 tmp_data;
tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_ushort(scalar / (double)src_data.x); tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_ushort(scalar / (F)src_data.x);
tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_ushort(scalar / (double)src_data.y); tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_ushort(scalar / (F)src_data.y);
tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_ushort(scalar / (double)src_data.z); tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_ushort(scalar / (F)src_data.z);
tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_ushort(scalar / (double)src_data.w); tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_ushort(scalar / (F)src_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y; dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
@ -348,7 +359,7 @@ __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offse
} }
__kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset, __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset,
__global short *dst, int dst_step, int dst_offset, __global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar) int rows, int cols, int dst_step1, F scalar)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
@ -368,10 +379,10 @@ __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 tmp_data; short4 tmp_data;
tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_short(scalar / (double)src_data.x); tmp_data.x = ((scalar == 0) || (src_data.x == 0)) ? 0 : round2_short(scalar / (F)src_data.x);
tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_short(scalar / (double)src_data.y); tmp_data.y = ((scalar == 0) || (src_data.y == 0)) ? 0 : round2_short(scalar / (F)src_data.y);
tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_short(scalar / (double)src_data.z); tmp_data.z = ((scalar == 0) || (src_data.z == 0)) ? 0 : round2_short(scalar / (F)src_data.z);
tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_short(scalar / (double)src_data.w); tmp_data.w = ((scalar == 0) || (src_data.w == 0)) ? 0 : round2_short(scalar / (F)src_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@ -385,7 +396,7 @@ __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset
__kernel void arithm_s_div_D4 (__global int *src, int src_step, int src_offset, __kernel void arithm_s_div_D4 (__global int *src, int src_step, int src_offset,
__global int *dst, int dst_step, int dst_offset, __global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar) int rows, int cols, int dst_step1, F scalar)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
@ -397,7 +408,7 @@ __kernel void arithm_s_div_D4 (__global int *src, int src_step, int src_offset,
int data = *((__global int *)((__global char *)src + src_index)); int data = *((__global int *)((__global char *)src + src_index));
int tmp_data = (scalar == 0 || data == 0) ? 0 : round2_int(scalar / (convert_double)(data)); int tmp_data = (scalar == 0 || data == 0) ? 0 : round2_int(scalar / (convert_F)(data));
*((__global int *)((__global char *)dst + dst_index)) =tmp_data; *((__global int *)((__global char *)dst + dst_index)) =tmp_data;
} }
@ -405,7 +416,7 @@ __kernel void arithm_s_div_D4 (__global int *src, int src_step, int src_offset,
__kernel void arithm_s_div_D5 (__global float *src, int src_step, int src_offset, __kernel void arithm_s_div_D5 (__global float *src, int src_step, int src_offset,
__global float *dst, int dst_step, int dst_offset, __global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar) int rows, int cols, int dst_step1, F scalar)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
@ -417,12 +428,13 @@ __kernel void arithm_s_div_D5 (__global float *src, int src_step, int src_offset
float data = *((__global float *)((__global char *)src + src_index)); float data = *((__global float *)((__global char *)src + src_index));
float tmp_data = (scalar == 0 || data == 0) ? 0 : convert_float(scalar / (convert_double)(data)); float tmp_data = (scalar == 0 || data == 0) ? 0 : convert_float(scalar / (convert_F)(data));
*((__global float *)((__global char *)dst + dst_index)) = tmp_data; *((__global float *)((__global char *)dst + dst_index)) = tmp_data;
} }
} }
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offset, __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offset,
__global double *dst, int dst_step, int dst_offset, __global double *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, double scalar) int rows, int cols, int dst_step1, double scalar)
@ -442,5 +454,6 @@ __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offse
*((__global double *)((__global char *)dst + dst_index)) = tmp_data; *((__global double *)((__global char *)dst + dst_index)) = tmp_data;
} }
} }
#endif

@ -70,6 +70,8 @@ __kernel void arithm_exp_D5(int rows, int cols, int srcStep, int dstStep, int sr
} }
} }
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_exp_D6(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global double *src, __global double *dst) __kernel void arithm_exp_D6(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global double *src, __global double *dst)
{ {
int x = get_global_id(0); int x = get_global_id(0);
@ -87,3 +89,5 @@ __kernel void arithm_exp_D6(int rows, int cols, int srcStep, int dstStep, int sr
// dst[dstIdx] = exp(src[srcIdx]); // dst[dstIdx] = exp(src[srcIdx]);
} }
} }
#endif

@ -73,7 +73,7 @@ __kernel void arithm_log_D5(int rows, int cols, int srcStep, int dstStep, int sr
} }
} }
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_log_D6(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global double *src, __global double *dst) __kernel void arithm_log_D6(int rows, int cols, int srcStep, int dstStep, int srcOffset, int dstOffset, __global double *src, __global double *dst)
{ {
int x = get_global_id(0); int x = get_global_id(0);
@ -91,4 +91,4 @@ __kernel void arithm_log_D6(int rows, int cols, int srcStep, int dstStep, int sr
} }
} }
#endif
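The D6 (double) variants of div, exp and log are now wrapped in #if defined (DOUBLE_SUPPORT) so the programs still compile on devices without cl_khr_fp64; the host side is expected to add that define to the build options only when doubles are available. A hedged sketch of how such an option string could be assembled (the module's actual option handling may differ):

    #include <string>

    // Build the -D options for clBuildProgram based on device capability.
    // Without DOUBLE_SUPPORT the *_D6 kernels above are simply compiled out.
    static std::string kernelBuildOptions(bool deviceHasFp64)
    {
        std::string opts;
        if (deviceHasFp64)
            opts += "-D DOUBLE_SUPPORT ";
        return opts;
    }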

@ -6,7 +6,7 @@
// Third party copyrights are property of their respective owners. // Third party copyrights are property of their respective owners.
// //
// @Authors // @Authors
// Zero Lin, zero.lin@amd.com // Niko Li, newlife20080214@gmail.com
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met: // are permitted provided that the following conditions are met:
// //
@ -32,106 +32,107 @@
// the use of this software, even if advised of the possibility of such damage. // the use of this software, even if advised of the possibility of such damage.
// //
// //
//#pragma OPENCL EXTENSION cl_amd_printf : enable
__kernel void convertC3C4_D0(__global const char4 * restrict src, __global char4 *dst, int cols, int rows, __kernel void convertC3C4(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
int srcStep, int dstStep) int dstStep_in_pixel,int pixel_end)
{ {
int id = get_global_id(0); int id = get_global_id(0);
int y = id / cols; //int pixel_end = mul24(cols -1 , rows -1);
int x = id % cols; int3 pixelid = (int3)(mul24(id,3),mad24(id,3,1),mad24(id,3,2));
pixelid = clamp(pixelid,0,pixel_end);
GENTYPE4 pixel0, pixel1, pixel2, outpix0,outpix1,outpix2,outpix3;
pixel0 = src[pixelid.x];
pixel1 = src[pixelid.y];
pixel2 = src[pixelid.z];
int d = y * srcStep + x * 3;
char8 data = (char8)(src[d>>2], src[(d>>2) + 1]);
char temp[8] = {data.s0, data.s1, data.s2, data.s3, data.s4, data.s5, data.s6, data.s7};
int start = d & 3;
char4 ndata = (char4)(temp[start], temp[start + 1], temp[start + 2], 0);
if(y < rows)
dst[y * dstStep + x] = ndata;
}
__kernel void convertC3C4_D1(__global const short* restrict src, __global short4 *dst, int cols, int rows, outpix0 = (GENTYPE4)(pixel0.x,pixel0.y,pixel0.z,0);
int srcStep, int dstStep) outpix1 = (GENTYPE4)(pixel0.w,pixel1.x,pixel1.y,0);
{ outpix2 = (GENTYPE4)(pixel1.z,pixel1.w,pixel2.x,0);
int id = get_global_id(0); outpix3 = (GENTYPE4)(pixel2.y,pixel2.z,pixel2.w,0);
int y = id / cols;
int x = id % cols;
int d = (y * srcStep + x * 6)>>1; int4 outy = (id<<2)/cols;
short4 data = *(__global short4 *)(src + ((d>>1)<<1)); int4 outx = (id<<2)%cols;
short temp[4] = {data.s0, data.s1, data.s2, data.s3}; outx.y++;
outx.z+=2;
int start = d & 1; outx.w+=3;
short4 ndata = (short4)(temp[start], temp[start + 1], temp[start + 2], 0); outy = select(outy,outy+1,outx>=cols);
if(y < rows) outx = select(outx,outx-cols,outx>=cols);
dst[y * dstStep + x] = ndata; //outpix3 = select(outpix3, outpix0, (uchar4)(outy.w>=rows));
//outpix2 = select(outpix2, outpix0, (uchar4)(outy.z>=rows));
//outpix1 = select(outpix1, outpix0, (uchar4)(outy.y>=rows));
//outx = select(outx,(int4)outx.x,outy>=rows);
//outy = select(outy,(int4)outy.x,outy>=rows);
int4 addr = mad24(outy,dstStep_in_pixel,outx);
if(outx.w<cols && outy.w<rows)
{
dst[addr.x] = outpix0;
dst[addr.y] = outpix1;
dst[addr.z] = outpix2;
dst[addr.w] = outpix3;
}
else if(outx.z<cols && outy.z<rows)
{
dst[addr.x] = outpix0;
dst[addr.y] = outpix1;
dst[addr.z] = outpix2;
}
else if(outx.y<cols && outy.y<rows)
{
dst[addr.x] = outpix0;
dst[addr.y] = outpix1;
}
else if(outx.x<cols && outy.x<rows)
{
dst[addr.x] = outpix0;
}
} }
__kernel void convertC3C4_D2(__global const int * restrict src, __global int4 *dst, int cols, int rows,
int srcStep, int dstStep)
{
int id = get_global_id(0);
int y = id / cols;
int x = id % cols;
int d = (y * srcStep + x * 12)>>2;
int4 data = *(__global int4 *)(src + d);
data.z = 0;
if(y < rows)
dst[y * dstStep + x] = data;
}
__kernel void convertC4C3_D2(__global const int4 * restrict src, __global int *dst, int cols, int rows,
int srcStep, int dstStep)
{
int id = get_global_id(0);
int y = id / cols;
int x = id % cols;
int4 data = src[y * srcStep + x];
if(y < rows)
{
int d = y * dstStep + x * 3;
dst[d] = data.x;
dst[d + 1] = data.y;
dst[d + 2] = data.z;
}
}
__kernel void convertC4C3_D1(__global const short4 * restrict src, __global short *dst, int cols, int rows, __kernel void convertC4C3(__global const GENTYPE4 * restrict src, __global GENTYPE4 *dst, int cols, int rows,
int srcStep, int dstStep) int srcStep_in_pixel,int pixel_end)
{ {
int id = get_global_id(0); int id = get_global_id(0)<<2;
int y = id / cols; int y = id / cols;
int x = id % cols; int x = id % cols;
int4 x4 = (int4)(x,x+1,x+2,x+3);
int4 y4 = select((int4)y,(int4)(y+1),x4>=(int4)cols);
x4 = select(x4,x4-(int4)cols,x4>=(int4)cols);
int4 addr = mad24(y4,(int4)srcStep_in_pixel,x4);
GENTYPE4 pixel0,pixel1,pixel2,pixel3, outpixel1, outpixel2;
pixel0 = src[addr.x];
pixel1 = src[addr.y];
pixel2 = src[addr.z];
pixel3 = src[addr.w];
short4 data = src[y * srcStep + x]; pixel0.w = pixel1.x;
outpixel1.x = pixel1.y;
if(y < rows) outpixel1.y = pixel1.z;
outpixel1.z = pixel2.x;
outpixel1.w = pixel2.y;
outpixel2.x = pixel2.z;
outpixel2.y = pixel3.x;
outpixel2.z = pixel3.y;
outpixel2.w = pixel3.z;
int4 outaddr = mul24(id>>2 , 3);
outaddr.y++;
outaddr.z+=2;
//printf("%d ",outaddr.z);
if(outaddr.z <= pixel_end)
{ {
int d = y * dstStep + x * 3; dst[outaddr.x] = pixel0;
dst[d] = data.x; dst[outaddr.y] = outpixel1;
dst[d + 1] = data.y; dst[outaddr.z] = outpixel2;
dst[d + 2] = data.z;
} }
} else if(outaddr.y <= pixel_end)
__kernel void convertC4C3_D0(__global const char4 * restrict src, __global char *dst, int cols, int rows,
int srcStep, int dstStep)
{
int id = get_global_id(0);
int y = id / cols;
int x = id % cols;
char4 data = src[y * srcStep + x];
if(y < rows)
{ {
int d = y * dstStep + x * 3; dst[outaddr.x] = pixel0;
dst[d] = data.x; dst[outaddr.y] = outpixel1;
dst[d + 1] = data.y;
dst[d + 2] = data.z;
} }
else if(outaddr.x <= pixel_end)
{
dst[outaddr.x] = pixel0;
}
} }
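The rewritten convertC3C4 handles four pixels per work-item: it loads three GENTYPE4 vectors (twelve interleaved channel values), repacks them into four padded 4-channel pixels, and stores whichever of the four still fall inside the image. A scalar C++ reference of the same repacking, useful for checking the index arithmetic; it assumes a tightly packed 3-channel source, and names are illustrative only:

    // Repack a 3-channel row-major image into a 4-channel one, padding the
    // fourth channel with 0. Scalar equivalent of one convertC3C4 work-item's
    // four consecutive pixels.
    static void packC3toC4(const unsigned char *src, unsigned char *dst,
                           int cols, int rows, int dstStepInPixels)
    {
        for (int y = 0; y < rows; ++y)
            for (int x = 0; x < cols; ++x)
            {
                const unsigned char *s = src + (y * cols + x) * 3;
                unsigned char *d = dst + (y * dstStepInPixels + x) * 4;
                d[0] = s[0];
                d[1] = s[1];
                d[2] = s[2];
                d[3] = 0;   // padding channel
            }
    }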

@ -16,7 +16,7 @@
// //
// @Authors // @Authors
// Zhang Ying, zhangying913@gmail.com // Zhang Ying, zhangying913@gmail.com
// // Niko Li, newlife20080214@gmail.com
// Redistribution and use in source and binary forms, with or without modification, // Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met: // are permitted provided that the following conditions are met:
// //
@ -50,21 +50,11 @@
#if defined DOUBLE_SUPPORT #if defined DOUBLE_SUPPORT
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
typedef double F ; #define F double
#else #else
typedef float F; #define F float
#endif #endif
inline uint4 getPoint_8uc4(__global uchar4 * data, int offset, int x, int y, int step)
{
return convert_uint4(data[(offset>>2)+ y * (step>>2) + x]);
}
inline float getPoint_32fc1(__global float * data, int offset, int x, int y, int step)
{
return data[(offset>>2)+ y * (step>>2) + x];
}
#define INTER_RESIZE_COEF_BITS 11 #define INTER_RESIZE_COEF_BITS 11
#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS) #define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)
@ -72,8 +62,8 @@ inline float getPoint_32fc1(__global float * data, int offset, int x, int y, int
#define CAST_SCALE (1.0f/(1<<CAST_BITS)) #define CAST_SCALE (1.0f/(1<<CAST_BITS))
#define INC(x,l) ((x+1) >= (l) ? (x):((x)+1)) #define INC(x,l) ((x+1) >= (l) ? (x):((x)+1))
__kernel void resizeLN_C1_D0(__global unsigned char * dst, __global unsigned char const * restrict src, __kernel void resizeLN_C1_D0(__global uchar * dst, __global uchar const * restrict src,
int dst_offset, int src_offset,int dst_step, int src_step, int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify ) int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{ {
int gx = get_global_id(0); int gx = get_global_id(0);
@ -81,7 +71,7 @@ __kernel void resizeLN_C1_D0(__global unsigned char * dst, __global unsigned cha
float4 sx, u, xf; float4 sx, u, xf;
int4 x, DX; int4 x, DX;
gx = (gx<<2) - (dst_offset&3); gx = (gx<<2) - (dstoffset_in_pixel&3);
DX = (int4)(gx, gx+1, gx+2, gx+3); DX = (int4)(gx, gx+1, gx+2, gx+3);
sx = (convert_float4(DX) + 0.5f) * ifx - 0.5f; sx = (convert_float4(DX) + 0.5f) * ifx - 0.5f;
xf = floor(sx); xf = floor(sx);
@ -119,10 +109,10 @@ __kernel void resizeLN_C1_D0(__global unsigned char * dst, __global unsigned cha
int4 val1, val2, val; int4 val1, val2, val;
int4 sdata1, sdata2, sdata3, sdata4; int4 sdata1, sdata2, sdata3, sdata4;
int4 pos1 = src_offset + y * src_step + x; int4 pos1 = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
int4 pos2 = src_offset + y * src_step + x_; int4 pos2 = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
int4 pos3 = src_offset + y_ * src_step + x; int4 pos3 = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
int4 pos4 = src_offset + y_ * src_step + x_; int4 pos4 = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
sdata1.s0 = src[pos1.s0]; sdata1.s0 = src[pos1.s0];
sdata1.s1 = src[pos1.s1]; sdata1.s1 = src[pos1.s1];
@ -144,20 +134,44 @@ __kernel void resizeLN_C1_D0(__global unsigned char * dst, __global unsigned cha
sdata4.s2 = src[pos4.s2]; sdata4.s2 = src[pos4.s2];
sdata4.s3 = src[pos4.s3]; sdata4.s3 = src[pos4.s3];
val1 = U1 * sdata1 + U * sdata2; val1 = mul24(U1 , sdata1) + mul24(U , sdata2);
val2 = U1 * sdata3 + U * sdata4; val2 = mul24(U1 , sdata3) + mul24(U , sdata4);
val = V1 * val1 + V * val2; val = mul24(V1 , val1) + mul24(V , val2);
__global uchar4* d = (__global uchar4*)(dst + dst_offset + dy * dst_step + gx); //__global uchar4* d = (__global uchar4*)(dst + dstoffset_in_pixel + dy * dststep_in_pixel + gx);
uchar4 dVal = *d; //uchar4 dVal = *d;
int4 con = ( DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows); //int4 con = ( DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows);
val = ((val + (1<<(CAST_BITS-1))) >> CAST_BITS); val = ((val + (1<<(CAST_BITS-1))) >> CAST_BITS);
*d = convert_uchar4(con != 0) ? convert_uchar4_sat(val) : dVal; //*d = convert_uchar4(con != 0) ? convert_uchar4_sat(val) : dVal;
pos4 = mad24(dy, dststep_in_pixel, gx+dstoffset_in_pixel);
pos4.y++;
pos4.z+=2;
uchar4 uval = convert_uchar4_sat(val);
int con = (gx >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows);
if(con)
{
*(__global uchar4*)(dst + pos4.x)=uval;
}
else
{
if(gx >= 0 && gx < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.x]=uval.x;
}
if(gx+1 >= 0 && gx+1 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.y]=uval.y;
}
if(gx+2 >= 0 && gx+2 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos4.z]=uval.z;
}
}
} }
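resizeLN_C1_D0 now writes its four interpolated pixels with one uchar4 store when the whole quad lies inside the destination, and falls back to per-pixel stores near the right edge instead of the earlier masked read-modify-write of the destination (the kernel's scalar path patches the first three lanes). A reduced C++ model of that store pattern, written in its general per-lane form; dst here is a plain byte row, purely for illustration:

    // Store up to four consecutive pixels starting at column gx of one row.
    static void storeQuad(unsigned char *dstRow, int dstCols, int gx,
                          const unsigned char val[4])
    {
        if (gx >= 0 && gx + 3 < dstCols)
        {
            for (int i = 0; i < 4; ++i)          // stands in for the uchar4 store
                dstRow[gx + i] = val[i];
        }
        else
        {
            for (int i = 0; i < 4; ++i)          // boundary fallback, lane by lane
                if (gx + i >= 0 && gx + i < dstCols)
                    dstRow[gx + i] = val[i];
        }
    }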
__kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src, __kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
int dst_offset, int src_offset,int dst_step, int src_step, int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify ) int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
@ -182,18 +196,25 @@ __kernel void resizeLN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
int y_ = INC(y,src_rows); int y_ = INC(y,src_rows);
int x_ = INC(x,src_cols); int x_ = INC(x,src_cols);
int4 srcpos;
uint4 val = U1* V1 * getPoint_8uc4(src,src_offset,x,y,src_step) + srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
U1* V * getPoint_8uc4(src,src_offset,x,y_,src_step) + srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
U * V1 * getPoint_8uc4(src,src_offset,x_,y,src_step) + srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
U * V * getPoint_8uc4(src,src_offset,x_,y_,src_step); srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
int4 data0 = convert_int4(src[srcpos.x]);
int4 data1 = convert_int4(src[srcpos.y]);
int4 data2 = convert_int4(src[srcpos.z]);
int4 data3 = convert_int4(src[srcpos.w]);
int4 val = mul24(mul24(U1, V1) , data0) + mul24(mul24(U, V1) , data1)
+mul24(mul24(U1, V) , data2)+mul24(mul24(U, V) , data3);
int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel);
uchar4 uval = convert_uchar4((val + (1<<(CAST_BITS-1)))>>CAST_BITS);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows) if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[(dst_offset>>2) + dy * (dst_step>>2) + dx] = convert_uchar4((val + (1<<(CAST_BITS-1)))>>CAST_BITS); dst[dstpos] = uval;
} }
__kernel void resizeLN_C1_D5(__global float * dst, __global float * src, __kernel void resizeLN_C1_D5(__global float * dst, __global float * src,
int dst_offset, int src_offset,int dst_step, int src_step, int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify ) int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
@ -210,19 +231,29 @@ __kernel void resizeLN_C1_D5(__global float * dst, __global float * src,
int y_ = INC(y,src_rows); int y_ = INC(y,src_rows);
int x_ = INC(x,src_cols); int x_ = INC(x,src_cols);
float u1 = 1.f-u;
float val1 = (1.0f-u) * getPoint_32fc1(src,src_offset,x,y,src_step) + float v1 = 1.f-v;
u * getPoint_32fc1(src,src_offset,x_,y,src_step) ; int4 srcpos;
float val2 = (1.0f-u) * getPoint_32fc1(src,src_offset,x,y_,src_step) + srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
u * getPoint_32fc1(src,src_offset,x_,y_,src_step); srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
float val = (1.0f-v) * val1 + v * val2; srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
float data0 = src[srcpos.x];
float data1 = src[srcpos.y];
float data2 = src[srcpos.z];
float data3 = src[srcpos.w];
float val1 = u1 * data0 +
u * data1 ;
float val2 = u1 * data2 +
u * data3;
float val = v1 * val1 + v * val2;
int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows) if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[(dst_offset>>2) + dy * (dst_step>>2) + dx] = val; dst[dstpos] = val;
} }
__kernel void resizeLN_C4_D5(__global float4 * dst, __global float4 * src, __kernel void resizeLN_C4_D5(__global float4 * dst, __global float4 * src,
int dst_offset, int src_offset,int dst_step, int src_step, int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify ) int src_cols, int src_rows, int dst_cols, int dst_rows, float ifx, float ify )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
@ -239,31 +270,35 @@ __kernel void resizeLN_C4_D5(__global float4 * dst, __global float4 * src,
int y_ = INC(y,src_rows); int y_ = INC(y,src_rows);
int x_ = INC(x,src_cols); int x_ = INC(x,src_cols);
float u1 = 1.f-u;
float v1 = 1.f-v;
int4 srcpos;
srcpos.x = mad24(y, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.y = mad24(y, srcstep_in_pixel, x_+srcoffset_in_pixel);
srcpos.z = mad24(y_, srcstep_in_pixel, x+srcoffset_in_pixel);
srcpos.w = mad24(y_, srcstep_in_pixel, x_+srcoffset_in_pixel);
float4 s_data1, s_data2, s_data3, s_data4; float4 s_data1, s_data2, s_data3, s_data4;
src_offset = (src_offset >> 4); s_data1 = src[srcpos.x];
src_step = (src_step >> 4); s_data2 = src[srcpos.y];
s_data1 = src[src_offset + y*src_step + x]; s_data3 = src[srcpos.z];
s_data2 = src[src_offset + y*src_step + x_]; s_data4 = src[srcpos.w];
s_data3 = src[src_offset + y_*src_step + x]; float4 val = u1 * v1 * s_data1 + u * v1 * s_data2
s_data4 = src[src_offset + y_*src_step + x_]; +u1 * v *s_data3 + u * v *s_data4;
s_data1 = (1.0f-u) * s_data1 + u * s_data2; int dstpos = mad24(dy, dststep_in_pixel, dx+dstoffset_in_pixel);
s_data2 = (1.0f-u) * s_data3 + u * s_data4;
s_data3 = (1.0f-v) * s_data1 + v * s_data2;
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows) if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[(dst_offset>>4) + dy * (dst_step>>4) + dx] = s_data3; dst[dstpos] = val;
} }
__kernel void resizeNN_C1_D0(__global uchar * dst, __global uchar * src, __kernel void resizeNN_C1_D0(__global uchar * dst, __global uchar * src,
int dst_offset, int src_offset,int dst_step, int src_step, int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify ) int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify )
{ {
int gx = get_global_id(0); int gx = get_global_id(0);
int dy = get_global_id(1); int dy = get_global_id(1);
gx = (gx<<2) - (dst_offset&3); gx = (gx<<2) - (dstoffset_in_pixel&3);
int4 GX = (int4)(gx, gx+1, gx+2, gx+3); //int4 GX = (int4)(gx, gx+1, gx+2, gx+3);
int4 sx; int4 sx;
int sy; int sy;
@ -279,22 +314,42 @@ __kernel void resizeNN_C1_D0(__global uchar * dst, __global uchar * src,
sy = min((int)floor(s5), src_rows-1); sy = min((int)floor(s5), src_rows-1);
uchar4 val; uchar4 val;
int4 pos = src_offset + sy * src_step + sx; int4 pos = mad24(sy, srcstep_in_pixel, sx+srcoffset_in_pixel);
val.s0 = src[pos.s0]; val.s0 = src[pos.s0];
val.s1 = src[pos.s1]; val.s1 = src[pos.s1];
val.s2 = src[pos.s2]; val.s2 = src[pos.s2];
val.s3 = src[pos.s3]; val.s3 = src[pos.s3];
__global uchar4* d = (__global uchar4*)(dst + dst_offset + dy * dst_step + gx); //__global uchar4* d = (__global uchar4*)(dst + dstoffset_in_pixel + dy * dststep_in_pixel + gx);
uchar4 dVal = *d; //uchar4 dVal = *d;
int4 con = (GX >= 0 && GX < dst_cols && dy >= 0 && dy < dst_rows); pos = mad24(dy, dststep_in_pixel, gx+dstoffset_in_pixel);
val = convert_uchar4(con != 0) ? val : dVal; pos.y++;
pos.z+=2;
*d = val;
int con = (gx >= 0 && gx+3 < dst_cols && dy >= 0 && dy < dst_rows);
if(con)
{
*(__global uchar4*)(dst + pos.x)=val;
}
else
{
if(gx >= 0 && gx < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.x]=val.x;
}
if(gx+1 >= 0 && gx+1 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.y]=val.y;
}
if(gx+2 >= 0 && gx+2 < dst_cols && dy >= 0 && dy < dst_rows)
{
dst[pos.z]=val.z;
}
}
} }
__kernel void resizeNN_C4_D0(__global uchar4 * dst, __global uchar4 * src, __kernel void resizeNN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
int dst_offset, int src_offset,int dst_step, int src_step, int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify ) int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
@ -304,8 +359,8 @@ __kernel void resizeNN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
F s2 = dy*ify; F s2 = dy*ify;
int sx = fmin((float)floor(s1), (float)src_cols-1); int sx = fmin((float)floor(s1), (float)src_cols-1);
int sy = fmin((float)floor(s2), (float)src_rows-1); int sy = fmin((float)floor(s2), (float)src_rows-1);
int dpos = (dst_offset>>2) + dy * (dst_step>>2) + dx; int dpos = mad24(dy, dststep_in_pixel, dx + dstoffset_in_pixel);
int spos = (src_offset>>2) + sy * (src_step>>2) + sx; int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows) if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dpos] = src[spos]; dst[dpos] = src[spos];
@ -313,7 +368,7 @@ __kernel void resizeNN_C4_D0(__global uchar4 * dst, __global uchar4 * src,
} }
__kernel void resizeNN_C1_D5(__global float * dst, __global float * src, __kernel void resizeNN_C1_D5(__global float * dst, __global float * src,
int dst_offset, int src_offset,int dst_step, int src_step, int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify ) int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
@ -323,16 +378,16 @@ __kernel void resizeNN_C1_D5(__global float * dst, __global float * src,
F s2 = dy*ify; F s2 = dy*ify;
int sx = fmin((float)floor(s1), (float)src_cols-1); int sx = fmin((float)floor(s1), (float)src_cols-1);
int sy = fmin((float)floor(s2), (float)src_rows-1); int sy = fmin((float)floor(s2), (float)src_rows-1);
int dpos = (dst_offset>>2) + dy * (dst_step>>2) + dx;
int spos = (src_offset>>2) + sy * (src_step>>2) + sx; int dpos = mad24(dy, dststep_in_pixel, dx + dstoffset_in_pixel);
int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows) if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dpos] = src[spos]; dst[dpos] = src[spos];
} }
__kernel void resizeNN_C4_D5(__global float4 * dst, __global float4 * src, __kernel void resizeNN_C4_D5(__global float4 * dst, __global float4 * src,
int dst_offset, int src_offset,int dst_step, int src_step, int dstoffset_in_pixel, int srcoffset_in_pixel,int dststep_in_pixel, int srcstep_in_pixel,
int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify ) int src_cols, int src_rows, int dst_cols, int dst_rows, F ifx, F ify )
{ {
int dx = get_global_id(0); int dx = get_global_id(0);
@ -343,8 +398,8 @@ __kernel void resizeNN_C4_D5(__global float4 * dst, __global float4 * src,
int s_row = floor(s2); int s_row = floor(s2);
int sx = min(s_col, src_cols-1); int sx = min(s_col, src_cols-1);
int sy = min(s_row, src_rows-1); int sy = min(s_row, src_rows-1);
int dpos = (dst_offset>>4) + dy * (dst_step>>4) + dx; int dpos = mad24(dy, dststep_in_pixel, dx + dstoffset_in_pixel);
int spos = (src_offset>>4) + sy * (src_step>>4) + sx; int spos = mad24(sy, srcstep_in_pixel, sx + srcoffset_in_pixel);
if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows) if(dx>=0 && dx<dst_cols && dy>=0 && dy<dst_rows)
dst[dpos] = src[spos]; dst[dpos] = src[spos];
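Throughout the resize kernels the byte-offset arithmetic (offset>>2, step>>2, offset>>4, ...) is replaced by indices expressed in pixels and computed with mad24, which is the same y*step + x affine address without the per-type shifts. A small equivalence sketch; mad24 is an OpenCL built-in, and the C++ below just restates the arithmetic for operands that fit in 24 bits:

    // Pixel-addressed indexing as used by the updated resize kernels:
    //   index = y * step_in_pixels + (x + offset_in_pixels)
    // i.e. mad24(y, step_in_pixels, x + offset_in_pixels) in the kernels.
    static int pixelIndex(int y, int x, int stepInPixels, int offsetInPixels)
    {
        return y * stepInPixels + (x + offsetInPixels);
    }

    // Example: for a 32F single-channel image the old code used
    //   (src_offset >> 2) + y * (src_step >> 2) + x
    // which equals pixelIndex(y, x, src_step / 4, src_offset / 4).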

@ -34,13 +34,8 @@
// //
// //
/*
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
*/
__kernel void set_to_without_mask_C1_D0(float4 scalar,__global uchar * dstMat, __kernel void set_to_without_mask_C1_D0(uchar scalar,__global uchar * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel) int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{ {
int x=get_global_id(0)<<2; int x=get_global_id(0)<<2;
@ -49,7 +44,8 @@ __kernel void set_to_without_mask_C1_D0(float4 scalar,__global uchar * dstMat,
int addr_end = mad24(y,dstStep_in_pixel,cols+offset_in_pixel); int addr_end = mad24(y,dstStep_in_pixel,cols+offset_in_pixel);
int idx = mad24(y,dstStep_in_pixel,(int)(x+ offset_in_pixel & (int)0xfffffffc)); int idx = mad24(y,dstStep_in_pixel,(int)(x+ offset_in_pixel & (int)0xfffffffc));
uchar4 out; uchar4 out;
out.x = out.y = out.z = out.w = convert_uchar_sat(scalar.x); out.x = out.y = out.z = out.w = scalar;
if ( (idx>=addr_start)&(idx+3 < addr_end) & (y < rows)) if ( (idx>=addr_start)&(idx+3 < addr_end) & (y < rows))
{ {
*(__global uchar4*)(dstMat+idx) = out; *(__global uchar4*)(dstMat+idx) = out;
@ -65,7 +61,7 @@ __kernel void set_to_without_mask_C1_D0(float4 scalar,__global uchar * dstMat,
} }
} }
__kernel void set_to_without_mask_C4_D0(float4 scalar,__global uchar4 * dstMat, __kernel void set_to_without_mask(GENTYPE scalar,__global GENTYPE * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel) int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{ {
int x=get_global_id(0); int x=get_global_id(0);
@ -73,52 +69,6 @@ __kernel void set_to_without_mask_C4_D0(float4 scalar,__global uchar4 * dstMat,
if ( (x < cols) & (y < rows)) if ( (x < cols) & (y < rows))
{ {
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel); int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
dstMat[idx] = convert_uchar4_sat(scalar); dstMat[idx] = scalar;
} }
} }
__kernel void set_to_without_mask_C1_D4(float4 scalar,__global int * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{
int x=get_global_id(0);
int y=get_global_id(1);
if ( (x < cols) & (y < rows))
{
int idx = mad24(y, dstStep_in_pixel, x+offset_in_pixel);
dstMat[idx] = convert_int_sat(scalar.x);
}
}
__kernel void set_to_without_mask_C4_D4(float4 scalar,__global int4 * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{
int x=get_global_id(0);
int y=get_global_id(1);
if ( (x < cols) & (y < rows))
{
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
dstMat[idx] = convert_int4_sat(scalar);
}
}
__kernel void set_to_without_mask_C1_D5(float4 scalar,__global float * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{
int x=get_global_id(0);
int y=get_global_id(1);
if ( (x < cols) & (y < rows))
{
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
dstMat[idx] = scalar.x;
}
}
__kernel void set_to_without_mask_C4_D5(float4 scalar,__global float4 * dstMat,
int cols,int rows,int dstStep_in_pixel,int offset_in_pixel)
{
int x=get_global_id(0);
int y=get_global_id(1);
if ( (x < cols) & (y < rows))
{
int idx = mad24(y,dstStep_in_pixel,x+ offset_in_pixel);
dstMat[idx] = scalar;
}
}
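The per-type setTo kernels above (C4_D0, C1_D4, C4_D4, C1_D5, C4_D5) collapse into a single generic set_to_without_mask kernel; only the byte-packed C1_D0 path keeps a dedicated kernel. The element type is injected at build time through a -D GENTYPE=... option. A rough sketch of how such an option string can be assembled, assuming a hypothetical helper (the real host-side switch appears later in matrix_operations.cpp):

    #include <string>

    // Hypothetical helper: map an OpenCV depth/channel pair to the -D option the
    // generic kernel expects. Depth codes follow CV_8U(0) ... CV_64F(6).
    static std::string gentypeOption(int depth, int channels)
    {
        static const char* base[] = { "uchar", "char", "ushort", "short", "int", "float", "double" };
        std::string t = base[depth];
        if (channels == 4)
            t += "4";                      // e.g. "uchar4", "float4"
        return "-D GENTYPE=" + t;
    }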

@ -35,12 +35,6 @@
// //
/*#if defined (__ATI__)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (__NVIDIA__)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
*/
/* /*
__kernel void set_to_with_mask_C1_D0( __kernel void set_to_with_mask_C1_D0(
float4 scalar, float4 scalar,
@ -67,7 +61,7 @@ __kernel void set_to_with_mask_C1_D0(
*/ */
//#pragma OPENCL EXTENSION cl_amd_printf : enable //#pragma OPENCL EXTENSION cl_amd_printf : enable
__kernel void set_to_with_mask_C1_D0( __kernel void set_to_with_mask_C1_D0(
float4 scalar, uchar scalar,
__global uchar* dstMat, __global uchar* dstMat,
int cols, int cols,
int rows, int rows,
@ -85,7 +79,7 @@ __kernel void set_to_with_mask_C1_D0(
int mask_addr_start = mad24(y,maskStep,maskoffset); int mask_addr_start = mad24(y,maskStep,maskoffset);
int mask_addr_end = mad24(y,maskStep,cols+maskoffset); int mask_addr_end = mad24(y,maskStep,cols+maskoffset);
int maskidx = mad24(y,maskStep,x+ maskoffset & (int)0xfffffffc); int maskidx = mad24(y,maskStep,x+ maskoffset & (int)0xfffffffc);
uchar out = convert_uchar_sat(scalar.x);
int off_mask = (maskoffset & 3) - (dstoffset_in_pixel & 3) +3; int off_mask = (maskoffset & 3) - (dstoffset_in_pixel & 3) +3;
if ( (x < cols) & (y < rows) ) if ( (x < cols) & (y < rows) )
@ -107,104 +101,16 @@ __kernel void set_to_with_mask_C1_D0(
temp_mask2.z = (maskidx+6 >=mask_addr_start)&(maskidx+6 < mask_addr_end) ? temp_mask2.z : 0; temp_mask2.z = (maskidx+6 >=mask_addr_start)&(maskidx+6 < mask_addr_end) ? temp_mask2.z : 0;
temp_mask2.w = (maskidx+7 >=mask_addr_start)&(maskidx+7 < mask_addr_end) ? temp_mask2.w : 0; temp_mask2.w = (maskidx+7 >=mask_addr_start)&(maskidx+7 < mask_addr_end) ? temp_mask2.w : 0;
uchar trans_mask[10] = {temp_mask1.y,temp_mask1.z,temp_mask1.w,temp_mask.x,temp_mask.y,temp_mask.z,temp_mask.w,temp_mask2.x,temp_mask2.y,temp_mask2.z}; uchar trans_mask[10] = {temp_mask1.y,temp_mask1.z,temp_mask1.w,temp_mask.x,temp_mask.y,temp_mask.z,temp_mask.w,temp_mask2.x,temp_mask2.y,temp_mask2.z};
temp_dst.x = (dstidx>=dst_addr_start)&(dstidx<dst_addr_end)& trans_mask[off_mask] ? out : temp_dst.x; temp_dst.x = (dstidx>=dst_addr_start)&(dstidx<dst_addr_end)& trans_mask[off_mask] ? scalar : temp_dst.x;
temp_dst.y = (dstidx+1>=dst_addr_start)&(dstidx+1<dst_addr_end)& trans_mask[off_mask+1] ? out : temp_dst.y; temp_dst.y = (dstidx+1>=dst_addr_start)&(dstidx+1<dst_addr_end)& trans_mask[off_mask+1] ? scalar : temp_dst.y;
temp_dst.z = (dstidx+2>=dst_addr_start)&(dstidx+2<dst_addr_end)& trans_mask[off_mask+2] ? out : temp_dst.z; temp_dst.z = (dstidx+2>=dst_addr_start)&(dstidx+2<dst_addr_end)& trans_mask[off_mask+2] ? scalar : temp_dst.z;
temp_dst.w = (dstidx+3>=dst_addr_start)&(dstidx+3<dst_addr_end)& trans_mask[off_mask+3] ? out : temp_dst.w; temp_dst.w = (dstidx+3>=dst_addr_start)&(dstidx+3<dst_addr_end)& trans_mask[off_mask+3] ? scalar : temp_dst.w;
*(__global uchar4*)(dstMat+dstidx) = temp_dst; *(__global uchar4*)(dstMat+dstidx) = temp_dst;
} }
} }
__kernel void set_to_with_mask_C4_D0( __kernel void set_to_with_mask(
float4 scalar, GENTYPE scalar,
__global uchar4 * dstMat, __global GENTYPE * dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
__global const uchar * restrict maskMat,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = convert_uchar4_sat(scalar);
}
}
__kernel void set_to_with_mask_C1_D4(
float4 scalar,
__global int * dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
__global const uchar * restrict maskMat,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = convert_int_sat(scalar.x);
}
}
__kernel void set_to_with_mask_C4_D4(
float4 scalar,
__global int4 * dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
__global const uchar * restrict maskMat,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = convert_int4_sat(scalar);
}
}
__kernel void set_to_with_mask_C1_D5(
float4 scalar,
__global float * dstMat,
int cols,
int rows,
int dstStep_in_pixel,
int dstoffset_in_pixel,
__global const uchar * restrict maskMat,
int maskStep,
int maskoffset)
{
int x=get_global_id(0);
int y=get_global_id(1);
int dstidx = mad24(y,dstStep_in_pixel,x+ dstoffset_in_pixel);
int maskidx = mad24(y,maskStep,x+ maskoffset);
uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask)
{
dstMat[dstidx] = scalar.x;
}
}
__kernel void set_to_with_mask_C4_D5(
float4 scalar,
__global float4 * dstMat,
int cols, int cols,
int rows, int rows,
int dstStep_in_pixel, int dstStep_in_pixel,
@ -220,7 +126,7 @@ __kernel void set_to_with_mask_C4_D5(
uchar mask = maskMat[maskidx]; uchar mask = maskMat[maskidx];
if ( (x < cols) & (y < rows) & mask) if ( (x < cols) & (y < rows) & mask)
{ {
dstMat[dstidx] = scalar; dstMat[dstidx] = scalar;
} }
} }
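The masked variants are reduced the same way: the uchar C1 kernel keeps its four-pixels-per-item path with the trans_mask alignment trick, while the remaining per-type kernels become one generic set_to_with_mask that writes a single element wherever the mask byte is non-zero. A scalar C++ sketch of what one work-item of the generic kernel does (names are illustrative):

    // Reference behaviour of the generic masked set, one element per work-item.
    // Steps and offsets are in elements; the mask is a single-channel uchar plane.
    template <typename T>
    void set_to_with_mask_ref(T* dst, const unsigned char* mask, T scalar,
                              int cols, int rows,
                              int dstStep, int dstOffset, int maskStep, int maskOffset)
    {
        for (int y = 0; y < rows; ++y)
            for (int x = 0; x < cols; ++x)
            {
                int dstIdx  = y * dstStep  + x + dstOffset;
                int maskIdx = y * maskStep + x + maskOffset;
                if (mask[maskIdx])
                    dst[dstIdx] = scalar;   // the kernel performs exactly this test-and-store
            }
    }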

@ -120,6 +120,7 @@ namespace cv
extern const char *operator_convertTo; extern const char *operator_convertTo;
extern const char *operator_setTo; extern const char *operator_setTo;
extern const char *operator_setToM; extern const char *operator_setToM;
extern const char *convertC3C4;
} }
} }
@ -127,43 +128,98 @@ namespace cv
// convert_C3C4 // convert_C3C4
void convert_C3C4(const cl_mem &src, oclMat &dst, int srcStep) void convert_C3C4(const cl_mem &src, oclMat &dst, int srcStep)
{ {
int dstStep = dst.step1() / dst.channels(); int dstStep_in_pixel = dst.step1() / dst.channels();
int pixel_end = dst.wholecols * dst.wholerows -1;
Context *clCxt = dst.clCxt; Context *clCxt = dst.clCxt;
string kernelName = "convertC3C4"; string kernelName = "convertC3C4";
char compile_option[32];
switch(dst.depth())
{
case 0:
sprintf(compile_option, "-D GENTYPE4=uchar4");
break;
case 1:
sprintf(compile_option, "-D GENTYPE4=char4");
break;
case 2:
sprintf(compile_option, "-D GENTYPE4=ushort4");
break;
case 3:
sprintf(compile_option, "-D GENTYPE4=short4");
break;
case 4:
sprintf(compile_option, "-D GENTYPE4=int4");
break;
case 5:
sprintf(compile_option, "-D GENTYPE4=float4");
break;
case 6:
sprintf(compile_option, "-D GENTYPE4=double4");
break;
default:
CV_Error(-217,"unknown depth");
}
vector< pair<size_t, const void *> > args; vector< pair<size_t, const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&src)); args.push_back( make_pair( sizeof(cl_mem), (void *)&src));
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst.data));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.wholecols)); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.wholecols));
args.push_back( make_pair( sizeof(cl_int), (void *)&dst.wholerows)); args.push_back( make_pair( sizeof(cl_int), (void *)&dst.wholerows));
args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep)); args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep_in_pixel));
args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep)); args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end));
size_t globalThreads[3] = {(dst.wholecols *dst.wholerows + 255) / 256 * 256, 1, 1}; size_t globalThreads[3] = {((dst.wholecols *dst.wholerows+3)/4 + 255) / 256 * 256, 1, 1};
size_t localThreads[3] = {256, 1, 1}; size_t localThreads[3] = {256, 1, 1};
openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, dst.elemSize1() >> 1); openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1,compile_option);
} }
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// convert_C4C3 // convert_C4C3
void convert_C4C3(const oclMat &src, cl_mem &dst, int dstStep) void convert_C4C3(const oclMat &src, cl_mem &dst, int dstStep)
{ {
int srcStep = src.step1() / src.channels(); int srcStep_in_pixel = src.step1() / src.channels();
int pixel_end = src.wholecols*src.wholerows -1;
Context *clCxt = src.clCxt; Context *clCxt = src.clCxt;
string kernelName = "convertC4C3"; string kernelName = "convertC4C3";
char compile_option[32];
switch(src.depth())
{
case 0:
sprintf(compile_option, "-D GENTYPE4=uchar4");
break;
case 1:
sprintf(compile_option, "-D GENTYPE4=char4");
break;
case 2:
sprintf(compile_option, "-D GENTYPE4=ushort4");
break;
case 3:
sprintf(compile_option, "-D GENTYPE4=short4");
break;
case 4:
sprintf(compile_option, "-D GENTYPE4=int4");
break;
case 5:
sprintf(compile_option, "-D GENTYPE4=float4");
break;
case 6:
sprintf(compile_option, "-D GENTYPE4=double4");
break;
default:
CV_Error(-217,"unknown depth");
}
vector< pair<size_t, const void *> > args; vector< pair<size_t, const void *> > args;
args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data)); args.push_back( make_pair( sizeof(cl_mem), (void *)&src.data));
args.push_back( make_pair( sizeof(cl_mem), (void *)&dst)); args.push_back( make_pair( sizeof(cl_mem), (void *)&dst));
args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholecols)); args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholecols));
args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholerows)); args.push_back( make_pair( sizeof(cl_int), (void *)&src.wholerows));
args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep)); args.push_back( make_pair( sizeof(cl_int), (void *)&srcStep_in_pixel));
args.push_back( make_pair( sizeof(cl_int), (void *)&dstStep)); args.push_back( make_pair( sizeof(cl_int), (void *)&pixel_end));
size_t globalThreads[3] = {(src.wholecols *src.wholerows + 255) / 256 * 256, 1, 1}; size_t globalThreads[3] = {((src.wholecols *src.wholerows+3)/4 + 255) / 256 * 256, 1, 1};
size_t localThreads[3] = {256, 1, 1}; size_t localThreads[3] = {256, 1, 1};
openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, src.elemSize1() >> 1); openCLExecuteKernel(clCxt, &convertC3C4, kernelName, globalThreads, localThreads, args, -1, -1,compile_option);
} }
void cv::ocl::oclMat::upload(const Mat &m) void cv::ocl::oclMat::upload(const Mat &m)
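convert_C3C4 and convert_C4C3 above now choose the element type the same way, via a -D GENTYPE4=<type>4 compile option, and pass the step in pixels plus a pixel_end bound instead of raw byte steps. Each work-item also handles four pixels, so the global work size shrinks accordingly. A small sketch of that launch-size arithmetic, using the expressions from the code above (helper name is illustrative):

    // Launch geometry for the C3<->C4 converters: four pixels per work-item,
    // rounded up to a multiple of the 256-wide work-group.
    size_t c3c4GlobalSize(int wholecols, int wholerows)
    {
        size_t pixels = (size_t)wholecols * wholerows;
        size_t items  = (pixels + 3) / 4;          // 4 pixels per work-item
        return (items + 255) / 256 * 256;          // round up to localThreads[0] == 256
    }
    // e.g. 640x480 -> 307200 pixels -> 76800 work-items, launched as {76800, 1, 1}
    // with localThreads {256, 1, 1}.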
@ -173,23 +229,47 @@ void cv::ocl::oclMat::upload(const Mat &m)
Point ofs; Point ofs;
m.locateROI(wholeSize, ofs); m.locateROI(wholeSize, ofs);
int type = m.type(); int type = m.type();
//if(m.channels() == 3) if(m.channels() == 3)
//type = CV_MAKETYPE(m.depth(), 4); {
type = CV_MAKETYPE(m.depth(), 4);
}
create(wholeSize, type); create(wholeSize, type);
//if(m.channels() == 3) if(m.channels() == 3)
//{ {
//int pitch = GPU_MATRIX_MALLOC_STEP(wholeSize.width * 3 * m.elemSize1()); int pitch = wholeSize.width * 3 * m.elemSize1();
//int err; int tail_padding = m.elemSize1()*3072;
//cl_mem temp = clCreateBuffer(clCxt->clContext,CL_MEM_READ_WRITE, int err;
//pitch*wholeSize.height,0,&err); cl_mem temp = clCreateBuffer(clCxt->impl->clContext,CL_MEM_READ_WRITE,
//CV_DbgAssert(err==0); (pitch*wholeSize.height+tail_padding-1)/tail_padding*tail_padding,0,&err);
openCLVerifyCall(err);
//openCLMemcpy2D(clCxt,temp,pitch,m.datastart,m.step,wholeSize.width*m.elemSize(),wholeSize.height,clMemcpyHostToDevice);
//convert_C3C4(temp, *this, pitch); openCLMemcpy2D(clCxt,temp,pitch,m.datastart,m.step,wholeSize.width*m.elemSize(),wholeSize.height,clMemcpyHostToDevice,3);
//} convert_C3C4(temp, *this, pitch);
//else //int* cputemp=new int[wholeSize.height*wholeSize.width * 3];
openCLMemcpy2D(clCxt, data, step, m.datastart, m.step, wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice); //int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE,
// 0, wholeSize.height*wholeSize.width * 3* sizeof(int), cputemp, 0, NULL, NULL));
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE,
// 0, this->step*this->wholerows, cpudata, 0, NULL, NULL));
//for(int i=0;i<wholeSize.height;i++)
//{
// int *a = cputemp+i*wholeSize.width * 3,*b = cpudata + i*this->step/sizeof(int);
// for(int j=0;j<wholeSize.width;j++)
// {
// if((a[3*j] != b[4*j])||(a[3*j+1] != b[4*j+1])||(a[3*j+2] != b[4*j+2]))
// printf("rows=%d,cols=%d,cputtemp=%d,%d,%d;cpudata=%d,%d,%d\n",
// i,j,a[3*j],a[3*j+1],a[3*j+2],b[4*j],b[4*j+1],b[4*j+2]);
// }
//}
//delete []cputemp;
//delete []cpudata;
openCLSafeCall(clReleaseMemObject(temp));
}
else
{
openCLMemcpy2D(clCxt, data, step, m.datastart, m.step, wholeSize.width * elemSize(), wholeSize.height, clMemcpyHostToDevice);
}
rows = m.rows; rows = m.rows;
cols = m.cols; cols = m.cols;
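For 3-channel Mats the upload path now allocates the oclMat as 4-channel, stages the packed C3 data in a temporary device buffer, and expands it with convert_C3C4 on the GPU; the staging buffer is over-allocated so the vectorized kernel can safely read past the last pixel. A sketch of that size computation, mirroring the pitch and tail_padding arithmetic above (function name is illustrative):

    // Byte size of the temporary packed-C3 staging buffer used during upload.
    // elemSize1 is the per-channel element size (1 for CV_8U, 4 for CV_32F, ...).
    size_t c3StagingBytes(int width, int height, size_t elemSize1)
    {
        size_t pitch        = (size_t)width * 3 * elemSize1;    // one packed C3 row
        size_t tail_padding = elemSize1 * 3072;                  // slack for the vectorized kernel
        size_t raw          = pitch * height;
        return (raw + tail_padding - 1) / tail_padding * tail_padding;  // round up
    }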
@ -201,23 +281,47 @@ void cv::ocl::oclMat::download(cv::Mat &m) const
{ {
CV_DbgAssert(!this->empty()); CV_DbgAssert(!this->empty());
int t = type(); int t = type();
//if(download_channels == 3) if(download_channels == 3)
//t = CV_MAKETYPE(depth(), 3); {
t = CV_MAKETYPE(depth(), 3);
}
m.create(wholerows, wholecols, t); m.create(wholerows, wholecols, t);
//if(download_channels == 3) if(download_channels == 3)
//{ {
//int pitch = GPU_MATRIX_MALLOC_STEP(wholecols * 3 * m.elemSize1()); int pitch = wholecols * 3 * m.elemSize1();
//int err; int tail_padding = m.elemSize1()*3072;
//cl_mem temp = clCreateBuffer(clCxt->clContext,CL_MEM_READ_WRITE, int err;
//pitch*wholerows,0,&err); cl_mem temp = clCreateBuffer(clCxt->impl->clContext,CL_MEM_READ_WRITE,
//CV_DbgAssert(err==0); (pitch*wholerows+tail_padding-1)/tail_padding*tail_padding,0,&err);
openCLVerifyCall(err);
//convert_C4C3(*this, temp, pitch/m.elemSize1());
//openCLMemcpy2D(clCxt,m.data,m.step,temp,pitch,wholecols*m.elemSize(),wholerows,clMemcpyDeviceToHost); convert_C4C3(*this, temp, pitch/m.elemSize1());
//} openCLMemcpy2D(clCxt,m.data,m.step,temp,pitch,wholecols*m.elemSize(),wholerows,clMemcpyDeviceToHost,3);
//else //int* cputemp=new int[wholecols*wholerows * 3];
openCLMemcpy2D(clCxt, m.data, m.step, data, step, wholecols * elemSize(), wholerows, clMemcpyDeviceToHost); //int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE,
// 0, wholecols*wholerows * 3* sizeof(int), cputemp, 0, NULL, NULL));
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, (cl_mem)data, CL_TRUE,
// 0, this->step*this->wholerows, cpudata, 0, NULL, NULL));
//for(int i=0;i<wholerows;i++)
//{
// int *a = cputemp+i*wholecols * 3,*b = cpudata + i*this->step/sizeof(int);
// for(int j=0;j<wholecols;j++)
// {
// if((a[3*j] != b[4*j])||(a[3*j+1] != b[4*j+1])||(a[3*j+2] != b[4*j+2]))
// printf("rows=%d,cols=%d,cputtemp=%d,%d,%d;cpudata=%d,%d,%d\n",
// i,j,a[3*j],a[3*j+1],a[3*j+2],b[4*j],b[4*j+1],b[4*j+2]);
// }
//}
//delete []cputemp;
//delete []cpudata;
openCLSafeCall(clReleaseMemObject(temp));
}
else
{
openCLMemcpy2D(clCxt, m.data, m.step, data, step, wholecols * elemSize(), wholerows, clMemcpyDeviceToHost);
}
Size wholesize; Size wholesize;
Point ofs; Point ofs;
locateROI(wholesize, ofs); locateROI(wholesize, ofs);
@ -373,11 +477,7 @@ oclMat &cv::ocl::oclMat::operator = (const Scalar &s)
void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, string kernelName) void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, string kernelName)
{ {
vector<pair<size_t , const void *> > args; vector<pair<size_t , const void *> > args;
cl_float4 val;
val.s[0] = scalar.val[0];
val.s[1] = scalar.val[1];
val.s[2] = scalar.val[2];
val.s[3] = scalar.val[3];
size_t localThreads[3] = {16, 16, 1}; size_t localThreads[3] = {16, 16, 1};
size_t globalThreads[3]; size_t globalThreads[3];
globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
@ -388,25 +488,168 @@ void set_to_withoutmask_run(const oclMat &dst, const Scalar &scalar, string kern
{ {
globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0]; globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
} }
args.push_back( make_pair( sizeof(cl_float4) , (void *)&val )); char compile_option[32];
union sc
{
cl_uchar4 uval;
cl_char4 cval;
cl_ushort4 usval;
cl_short4 shval;
cl_int4 ival;
cl_float4 fval;
cl_double4 dval;
}val;
switch(dst.depth())
{
case 0:
val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]);
val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]);
val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]);
val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]);
switch(dst.channels())
{
case 1:
sprintf(compile_option, "-D GENTYPE=uchar");
args.push_back( make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] ));
break;
case 4:
sprintf(compile_option, "-D GENTYPE=uchar4");
args.push_back( make_pair( sizeof(cl_uchar4) , (void *)&val.uval ));
break;
default:
CV_Error(-217,"unsupported channels");
}
break;
case 1:
val.cval.s[0] = saturate_cast<char>(scalar.val[0]);
val.cval.s[1] = saturate_cast<char>(scalar.val[1]);
val.cval.s[2] = saturate_cast<char>(scalar.val[2]);
val.cval.s[3] = saturate_cast<char>(scalar.val[3]);
switch(dst.channels())
{
case 1:
sprintf(compile_option, "-D GENTYPE=char");
args.push_back( make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] ));
break;
case 4:
sprintf(compile_option, "-D GENTYPE=char4");
args.push_back( make_pair( sizeof(cl_char4) , (void *)&val.cval ));
break;
default:
CV_Error(-217,"unsupported channels");
}
break;
case 2:
val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]);
val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]);
val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]);
val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]);
switch(dst.channels())
{
case 1:
sprintf(compile_option, "-D GENTYPE=ushort");
args.push_back( make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] ));
break;
case 4:
sprintf(compile_option, "-D GENTYPE=ushort4");
args.push_back( make_pair( sizeof(cl_ushort4) , (void *)&val.usval ));
break;
default:
CV_Error(-217,"unsupported channels");
}
break;
case 3:
val.shval.s[0] = saturate_cast<short>(scalar.val[0]);
val.shval.s[1] = saturate_cast<short>(scalar.val[1]);
val.shval.s[2] = saturate_cast<short>(scalar.val[2]);
val.shval.s[3] = saturate_cast<short>(scalar.val[3]);
switch(dst.channels())
{
case 1:
sprintf(compile_option, "-D GENTYPE=short");
args.push_back( make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] ));
break;
case 4:
sprintf(compile_option, "-D GENTYPE=short4");
args.push_back( make_pair( sizeof(cl_short4) , (void *)&val.shval ));
break;
default:
CV_Error(-217,"unsupported channels");
}
break;
case 4:
val.ival.s[0] = saturate_cast<int>(scalar.val[0]);
val.ival.s[1] = saturate_cast<int>(scalar.val[1]);
val.ival.s[2] = saturate_cast<int>(scalar.val[2]);
val.ival.s[3] = saturate_cast<int>(scalar.val[3]);
switch(dst.channels())
{
case 1:
sprintf(compile_option, "-D GENTYPE=int");
args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
break;
case 4:
sprintf(compile_option, "-D GENTYPE=int4");
args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival ));
break;
default:
CV_Error(-217,"unsupported channels");
}
break;
case 5:
val.fval.s[0] = scalar.val[0];
val.fval.s[1] = scalar.val[1];
val.fval.s[2] = scalar.val[2];
val.fval.s[3] = scalar.val[3];
switch(dst.channels())
{
case 1:
sprintf(compile_option, "-D GENTYPE=float");
args.push_back( make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] ));
break;
case 4:
sprintf(compile_option, "-D GENTYPE=float4");
args.push_back( make_pair( sizeof(cl_float4) , (void *)&val.fval ));
break;
default:
CV_Error(-217,"unsupported channels");
}
break;
case 6:
val.dval.s[0] = scalar.val[0];
val.dval.s[1] = scalar.val[1];
val.dval.s[2] = scalar.val[2];
val.dval.s[3] = scalar.val[3];
switch(dst.channels())
{
case 1:
sprintf(compile_option, "-D GENTYPE=double");
args.push_back( make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] ));
break;
case 4:
sprintf(compile_option, "-D GENTYPE=double4");
args.push_back( make_pair( sizeof(cl_double4) , (void *)&val.dval ));
break;
default:
CV_Error(-217,"unsupported channels");
}
break;
default:
CV_Error(-217,"unknown depth");
}
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols )); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows )); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&step_in_pixel )); args.push_back( make_pair( sizeof(cl_int) , (void *)&step_in_pixel ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&offset_in_pixel)); args.push_back( make_pair( sizeof(cl_int) , (void *)&offset_in_pixel));
openCLExecuteKernel(dst.clCxt , &operator_setTo, kernelName, globalThreads, openCLExecuteKernel(dst.clCxt , &operator_setTo, kernelName, globalThreads,
localThreads, args, dst.channels(), dst.depth()); localThreads, args, -1, -1,compile_option);
} }
void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &mask, string kernelName) void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &mask, string kernelName)
{ {
CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols); CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols);
vector<pair<size_t , const void *> > args; vector<pair<size_t , const void *> > args;
cl_float4 val;
val.s[0] = scalar.val[0];
val.s[1] = scalar.val[1];
val.s[2] = scalar.val[2];
val.s[3] = scalar.val[3];
size_t localThreads[3] = {16, 16, 1}; size_t localThreads[3] = {16, 16, 1};
size_t globalThreads[3]; size_t globalThreads[3];
globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0]; globalThreads[0] = (dst.cols + localThreads[0] - 1) / localThreads[0] * localThreads[0];
@ -417,7 +660,155 @@ void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &
globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0]; globalThreads[0] = ((dst.cols + 4) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
} }
int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize(); int step_in_pixel = dst.step / dst.elemSize(), offset_in_pixel = dst.offset / dst.elemSize();
args.push_back( make_pair( sizeof(cl_float4) , (void *)&val )); char compile_option[32];
union sc
{
cl_uchar4 uval;
cl_char4 cval;
cl_ushort4 usval;
cl_short4 shval;
cl_int4 ival;
cl_float4 fval;
cl_double4 dval;
}val;
switch(dst.depth())
{
case 0:
val.uval.s[0] = saturate_cast<uchar>(scalar.val[0]);
val.uval.s[1] = saturate_cast<uchar>(scalar.val[1]);
val.uval.s[2] = saturate_cast<uchar>(scalar.val[2]);
val.uval.s[3] = saturate_cast<uchar>(scalar.val[3]);
switch(dst.channels())
{
case 1:
sprintf(compile_option, "-D GENTYPE=uchar");
args.push_back( make_pair( sizeof(cl_uchar) , (void *)&val.uval.s[0] ));
break;
case 4:
sprintf(compile_option, "-D GENTYPE=uchar4");
args.push_back( make_pair( sizeof(cl_uchar4) , (void *)&val.uval ));
break;
default:
CV_Error(-217,"unsupported channels");
}
break;
case 1:
val.cval.s[0] = saturate_cast<char>(scalar.val[0]);
val.cval.s[1] = saturate_cast<char>(scalar.val[1]);
val.cval.s[2] = saturate_cast<char>(scalar.val[2]);
val.cval.s[3] = saturate_cast<char>(scalar.val[3]);
switch(dst.channels())
{
case 1:
sprintf(compile_option, "-D GENTYPE=char");
args.push_back( make_pair( sizeof(cl_char) , (void *)&val.cval.s[0] ));
break;
case 4:
sprintf(compile_option, "-D GENTYPE=char4");
args.push_back( make_pair( sizeof(cl_char4) , (void *)&val.cval ));
break;
default:
CV_Error(-217,"unsupported channels");
}
break;
case 2:
val.usval.s[0] = saturate_cast<ushort>(scalar.val[0]);
val.usval.s[1] = saturate_cast<ushort>(scalar.val[1]);
val.usval.s[2] = saturate_cast<ushort>(scalar.val[2]);
val.usval.s[3] = saturate_cast<ushort>(scalar.val[3]);
switch(dst.channels())
{
case 1:
sprintf(compile_option, "-D GENTYPE=ushort");
args.push_back( make_pair( sizeof(cl_ushort) , (void *)&val.usval.s[0] ));
break;
case 4:
sprintf(compile_option, "-D GENTYPE=ushort4");
args.push_back( make_pair( sizeof(cl_ushort4) , (void *)&val.usval ));
break;
default:
CV_Error(-217,"unsupported channels");
}
break;
case 3:
val.shval.s[0] = saturate_cast<short>(scalar.val[0]);
val.shval.s[1] = saturate_cast<short>(scalar.val[1]);
val.shval.s[2] = saturate_cast<short>(scalar.val[2]);
val.shval.s[3] = saturate_cast<short>(scalar.val[3]);
switch(dst.channels())
{
case 1:
sprintf(compile_option, "-D GENTYPE=short");
args.push_back( make_pair( sizeof(cl_short) , (void *)&val.shval.s[0] ));
break;
case 4:
sprintf(compile_option, "-D GENTYPE=short4");
args.push_back( make_pair( sizeof(cl_short4) , (void *)&val.shval ));
break;
default:
CV_Error(-217,"unsupported channels");
}
break;
case 4:
val.ival.s[0] = saturate_cast<int>(scalar.val[0]);
val.ival.s[1] = saturate_cast<int>(scalar.val[1]);
val.ival.s[2] = saturate_cast<int>(scalar.val[2]);
val.ival.s[3] = saturate_cast<int>(scalar.val[3]);
switch(dst.channels())
{
case 1:
sprintf(compile_option, "-D GENTYPE=int");
args.push_back( make_pair( sizeof(cl_int) , (void *)&val.ival.s[0] ));
break;
case 4:
sprintf(compile_option, "-D GENTYPE=int4");
args.push_back( make_pair( sizeof(cl_int4) , (void *)&val.ival ));
break;
default:
CV_Error(-217,"unsupported channels");
}
break;
case 5:
val.fval.s[0] = scalar.val[0];
val.fval.s[1] = scalar.val[1];
val.fval.s[2] = scalar.val[2];
val.fval.s[3] = scalar.val[3];
switch(dst.channels())
{
case 1:
sprintf(compile_option, "-D GENTYPE=float");
args.push_back( make_pair( sizeof(cl_float) , (void *)&val.fval.s[0] ));
break;
case 4:
sprintf(compile_option, "-D GENTYPE=float4");
args.push_back( make_pair( sizeof(cl_float4) , (void *)&val.fval ));
break;
default:
CV_Error(-217,"unsupported channels");
}
break;
case 6:
val.dval.s[0] = scalar.val[0];
val.dval.s[1] = scalar.val[1];
val.dval.s[2] = scalar.val[2];
val.dval.s[3] = scalar.val[3];
switch(dst.channels())
{
case 1:
sprintf(compile_option, "-D GENTYPE=double");
args.push_back( make_pair( sizeof(cl_double) , (void *)&val.dval.s[0] ));
break;
case 4:
sprintf(compile_option, "-D GENTYPE=double4");
args.push_back( make_pair( sizeof(cl_double4) , (void *)&val.dval ));
break;
default:
CV_Error(-217,"unsupported channels");
}
break;
default:
CV_Error(-217,"unknown depth");
}
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data )); args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols )); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.cols ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows )); args.push_back( make_pair( sizeof(cl_int) , (void *)&dst.rows ));
@ -427,7 +818,7 @@ void set_to_withmask_run(const oclMat &dst, const Scalar &scalar, const oclMat &
args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.step )); args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.step ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.offset )); args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.offset ));
openCLExecuteKernel(dst.clCxt , &operator_setToM, kernelName, globalThreads, openCLExecuteKernel(dst.clCxt , &operator_setToM, kernelName, globalThreads,
localThreads, args, dst.channels(), dst.depth()); localThreads, args, -1, -1,compile_option);
} }
oclMat &cv::ocl::oclMat::setTo(const Scalar &scalar, const oclMat &mask) oclMat &cv::ocl::oclMat::setTo(const Scalar &scalar, const oclMat &mask)
@ -446,11 +837,25 @@ oclMat &cv::ocl::oclMat::setTo(const Scalar &scalar, const oclMat &mask)
// (cl_mem)mem,1,0,sizeof(double)*4,s,0,0,0)); // (cl_mem)mem,1,0,sizeof(double)*4,s,0,0,0));
if (mask.empty()) if (mask.empty())
{ {
set_to_withoutmask_run(*this, scalar, "set_to_without_mask"); if(type()==CV_8UC1)
{
set_to_withoutmask_run(*this, scalar, "set_to_without_mask_C1_D0");
}
else
{
set_to_withoutmask_run(*this, scalar, "set_to_without_mask");
}
} }
else else
{ {
set_to_withmask_run(*this, scalar, mask, "set_to_with_mask"); if(type()==CV_8UC1)
{
set_to_withmask_run(*this, scalar, mask,"set_to_with_mask_C1_D0");
}
else
{
set_to_withmask_run(*this, scalar, mask, "set_to_with_mask");
}
} }
return *this; return *this;
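With the kernels made generic, both set_to_withoutmask_run and set_to_withmask_run stop passing the scalar as a cl_float4: they saturate the Scalar into a union of cl_uchar4 through cl_double4, push the matching member (or just its first lane for single-channel images) as the kernel argument, and select the corresponding -D GENTYPE option, while CV_8UC1 still dispatches to the specialized *_C1_D0 kernels. A compressed sketch of that marshalling for the unsigned-char case (illustrative only; the committed code repeats the pattern for every depth):

    #include <CL/cl.h>
    #include <opencv2/core/core.hpp>          // cv::Scalar, cv::saturate_cast

    // Pack a cv::Scalar into a cl_uchar4 with saturation, as the CV_8U branch does.
    static cl_uchar4 packScalarU8(const cv::Scalar& s)
    {
        cl_uchar4 v;
        for (int i = 0; i < 4; ++i)
            v.s[i] = cv::saturate_cast<uchar>(s.val[i]);   // clamp to [0, 255]
        return v;
    }
    // A 1-channel image then passes sizeof(cl_uchar) pointing at v.s[0] and builds
    // with "-D GENTYPE=uchar"; a 4-channel image passes sizeof(cl_uchar4) pointing
    // at v and builds with "-D GENTYPE=uchar4".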

@ -97,7 +97,7 @@ namespace cv
size_t widthInBytes, size_t height); size_t widthInBytes, size_t height);
void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch, void openCLMemcpy2D(Context *clCxt, void *dst, size_t dpitch,
const void *src, size_t spitch, const void *src, size_t spitch,
size_t width, size_t height, enum openCLMemcpyKind kind); size_t width, size_t height, enum openCLMemcpyKind kind, int channels=-1);
void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset, void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
const void *src, size_t spitch, const void *src, size_t spitch,
size_t width, size_t height, int src_offset, enum openCLMemcpyKind kind); size_t width, size_t height, int src_offset, enum openCLMemcpyKind kind);
@ -126,8 +126,8 @@ namespace cv
cl_mem openCLMalloc(cl_context clCxt, size_t size, cl_mem_flags flags, void *host_ptr); cl_mem openCLMalloc(cl_context clCxt, size_t size, cl_mem_flags flags, void *host_ptr);
void openCLMemcpy2DWithNoPadding(cl_command_queue command_queue, cl_mem buffer, size_t size, size_t offset, void *ptr, //void openCLMemcpy2DWithNoPadding(cl_command_queue command_queue, cl_mem buffer, size_t size, size_t offset, void *ptr,
enum openCLMemcpyKind kind, cl_bool blocking_write); // enum openCLMemcpyKind kind, cl_bool blocking_write);
int savetofile(const Context *clcxt, cl_program &program, const char *fileName); int savetofile(const Context *clcxt, cl_program &program, const char *fileName);
struct Context::Impl struct Context::Impl
{ {

@ -958,7 +958,7 @@ TEST_P(Remap, Mat)
if((interpolation == 1 && map1Type == CV_16SC2) ||(interpolation == 1 && map1Type == CV_16SC1 && map2Type == CV_16SC1)) if((interpolation == 1 && map1Type == CV_16SC2) ||(interpolation == 1 && map1Type == CV_16SC1 && map2Type == CV_16SC1))
{ {
cout << "LINEAR don't support the map1Type and map2Type" << endl; cout << "LINEAR don't support the map1Type and map2Type" << endl;
return; return;
} }
int bordertype[] = {cv::BORDER_CONSTANT,cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/}; int bordertype[] = {cv::BORDER_CONSTANT,cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/};
const char* borderstr[]={"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/}; const char* borderstr[]={"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/};

@ -396,6 +396,101 @@ TEST_P(SetTo, With_mask)
} }
} }
//convertC3C4
PARAM_TEST_CASE(convertC3C4, MatType, cv::Size)
{
int type;
cv::Size ksize;
//src mat
cv::Mat mat1;
cv::Mat dst;
// set up roi
int roicols;
int roirows;
int src1x;
int src1y;
int dstx;
int dsty;
//src mat with roi
cv::Mat mat1_roi;
cv::Mat dst_roi;
std::vector<cv::ocl::Info> oclinfo;
//ocl dst mat for testing
cv::ocl::oclMat gdst_whole;
//ocl mat with roi
cv::ocl::oclMat gmat1;
cv::ocl::oclMat gdst;
virtual void SetUp()
{
type = GET_PARAM(0);
ksize = GET_PARAM(1);
//dst = randomMat(rng, size, type, 5, 16, false);
int devnums = getDevice(oclinfo);
CV_Assert(devnums > 0);
//if you want to use undefault device, set it here
//setDevice(oclinfo[1]);
}
void random_roi()
{
#ifdef RANDOMROI
//randomize ROI
cv::RNG &rng = TS::ptr()->get_rng();
roicols = rng.uniform(2, mat1.cols);
roirows = rng.uniform(2, mat1.rows);
src1x = rng.uniform(0, mat1.cols - roicols);
src1y = rng.uniform(0, mat1.rows - roirows);
dstx = rng.uniform(0, dst.cols - roicols);
dsty = rng.uniform(0, dst.rows - roirows);
#else
roicols = mat1.cols;
roirows = mat1.rows;
src1x = 0;
src1y = 0;
dstx = 0;
dsty = 0;
#endif
mat1_roi = mat1(Rect(src1x, src1y, roicols, roirows));
dst_roi = dst(Rect(dstx, dsty, roicols, roirows));
gdst_whole = dst;
gdst = gdst_whole(Rect(dstx, dsty, roicols, roirows));
gmat1 = mat1_roi;
}
};
TEST_P(convertC3C4, Accuracy)
{
cv::RNG &rng = TS::ptr()->get_rng();
for(int j = 0; j < LOOP_TIMES; j++)
{
//random_roi();
int width = rng.uniform(2, MWIDTH);
int height = rng.uniform(2, MHEIGHT);
cv::Size size(width, height);
mat1 = randomMat(rng, size, type, 0, 40, false);
gmat1 = mat1;
cv::Mat cpu_dst;
gmat1.download(cpu_dst);
char sss[1024];
sprintf(sss, "cols=%d,rows=%d", mat1.cols, mat1.rows);
EXPECT_MAT_NEAR(mat1, cpu_dst, 0.0, sss);
}
}
INSTANTIATE_TEST_CASE_P(MatrixOperation, ConvertTo, Combine( INSTANTIATE_TEST_CASE_P(MatrixOperation, ConvertTo, Combine(
Values(CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4), Values(CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4),
@ -408,5 +503,8 @@ INSTANTIATE_TEST_CASE_P(MatrixOperation, CopyTo, Combine(
INSTANTIATE_TEST_CASE_P(MatrixOperation, SetTo, Combine( INSTANTIATE_TEST_CASE_P(MatrixOperation, SetTo, Combine(
Values(CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4), Values(CV_8UC1, CV_8UC4, CV_32SC1, CV_32SC4, CV_32FC1, CV_32FC4),
Values(false))); // Values(false) is the reserved parameter Values(false))); // Values(false) is the reserved parameter
INSTANTIATE_TEST_CASE_P(MatrixOperation, convertC3C4, Combine(
Values(CV_8UC3, CV_32SC3, CV_32FC3),
Values(cv::Size())));
#endif #endif
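The new convertC3C4 test simply round-trips a random 3-channel Mat through the oclMat upload/download path and expects bit-exact equality, which exercises both convert_C3C4 and convert_C4C3. A minimal standalone version of the same check, outside the gtest harness and assuming an OpenCL device has already been selected with cv::ocl::getDevice:

    #include <opencv2/core/core.hpp>
    #include <opencv2/ocl/ocl.hpp>

    // Round-trip a 3-channel matrix (CV_8UC3 / CV_32SC3 / CV_32FC3) through the
    // OpenCL upload/download path and verify the C3<->C4 conversion is lossless.
    bool roundTripC3(const cv::Mat& src)
    {
        cv::ocl::oclMat gpu(src);        // upload: packs C3 data into a C4 oclMat
        cv::Mat back;
        gpu.download(back);              // download: converts C4 back to packed C3
        return cv::norm(src, back, cv::NORM_INF) == 0;
    }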

@ -69,7 +69,7 @@ class OpenCVTests(unittest.TestCase):
def get_sample(self, filename, iscolor = cv.CV_LOAD_IMAGE_COLOR): def get_sample(self, filename, iscolor = cv.CV_LOAD_IMAGE_COLOR):
if not filename in self.image_cache: if not filename in self.image_cache:
filedata = urllib.urlopen("http://code.opencv.org/svn/opencv/trunk/opencv/" + filename).read() filedata = urllib.urlopen("http://code.opencv.org/projects/opencv/repository/revisions/master/raw/" + filename).read()
imagefiledata = cv.CreateMatHeader(1, len(filedata), cv.CV_8UC1) imagefiledata = cv.CreateMatHeader(1, len(filedata), cv.CV_8UC1)
cv.SetData(imagefiledata, filedata, len(filedata)) cv.SetData(imagefiledata, filedata, len(filedata))
self.image_cache[filename] = cv.DecodeImageM(imagefiledata, iscolor) self.image_cache[filename] = cv.DecodeImageM(imagefiledata, iscolor)
