Merge branch '2.4'

pull/72/head^2
Andrey Kamaev 12 years ago
commit 57775988e7
  1. 2
      cmake/OpenCVDetectOpenCL.cmake
  2. 16
      ios/build_framework.py
  3. 71
      modules/core/include/opencv2/core/operations.hpp
  4. 10
      modules/core/src/arithm.cpp
  5. 8
      modules/core/test/test_arithm.cpp
  6. 2
      modules/gpu/src/surf.cpp
  7. 2
      modules/ocl/CMakeLists.txt
  8. 20
      modules/ocl/include/opencv2/ocl/ocl.hpp
  9. 76
      modules/ocl/perf/main.cpp
  10. 6
      modules/ocl/perf/perf_arithm.cpp
  11. 6
      modules/ocl/perf/perf_canny.cpp
  12. 4
      modules/ocl/perf/perf_filters.cpp
  13. 11
      modules/ocl/perf/perf_haar.cpp
  14. 9
      modules/ocl/perf/perf_hog.cpp
  15. 1
      modules/ocl/perf/perf_imgproc.cpp
  16. 1
      modules/ocl/perf/perf_matrix_operation.cpp
  17. 2
      modules/ocl/perf/perf_split_merge.cpp
  18. 4
      modules/ocl/perf/perf_surf.cpp
  19. 20
      modules/ocl/src/arithm.cpp
  20. 28
      modules/ocl/src/brute_force_matcher.cpp
  21. 8
      modules/ocl/src/fft.cpp
  22. 8
      modules/ocl/src/filtering.cpp
  23. 11
      modules/ocl/src/gemm.cpp
  24. 43
      modules/ocl/src/haar.cpp
  25. 95
      modules/ocl/src/imgproc.cpp
  26. 17
      modules/ocl/src/initialization.cpp
  27. 158
      modules/ocl/src/kernels/arithm_addWeighted.cl
  28. 96
      modules/ocl/src/kernels/arithm_bitwise_and.cl
  29. 25
      modules/ocl/src/kernels/arithm_bitwise_not.cl
  30. 36
      modules/ocl/src/kernels/arithm_bitwise_or.cl
  31. 98
      modules/ocl/src/kernels/arithm_bitwise_xor.cl
  32. 422
      modules/ocl/src/kernels/arithm_compare_eq.cl
  33. 414
      modules/ocl/src/kernels/arithm_compare_ne.cl
  34. 50
      modules/ocl/src/kernels/arithm_magnitudeSqr.cl
  35. 1337
      modules/ocl/src/kernels/imgproc_remap.cl
  36. 435
      modules/ocl/src/kernels/split_mat.cl
  37. 18
      modules/ocl/src/match_template.cpp
  38. 10
      modules/ocl/src/matrix_operations.cpp
  39. 6
      modules/ocl/src/mcwutil.cpp
  40. 9
      modules/ocl/src/precomp.hpp
  41. 87
      modules/ocl/src/pyrlk.cpp
  42. 43
      modules/ocl/test/main.cpp
  43. 2
      modules/ocl/test/precomp.hpp
  44. 9
      modules/ocl/test/test_arithm.cpp
  45. 11
      modules/ocl/test/test_canny.cpp
  46. 15
      modules/ocl/test/test_haar.cpp
  47. 6
      modules/ocl/test/test_hog.cpp
  48. 16
      modules/ocl/test/test_imgproc.cpp
  49. 30
      modules/ocl/test/test_match_template.cpp
  50. 6
      modules/ocl/test/test_pyrlk.cpp
  51. 5
      modules/python/src2/cv2.cpp
  52. 11
      samples/cpp/stitching_detailed.cpp

@ -2,7 +2,7 @@ if(APPLE)
set(OPENCL_FOUND YES) set(OPENCL_FOUND YES)
set(OPENCL_LIBRARIES "-framework OpenCL") set(OPENCL_LIBRARIES "-framework OpenCL")
else() else()
find_package(OpenCL QUIET) #find_package(OpenCL QUIET)
if(WITH_OPENCLAMDFFT) if(WITH_OPENCLAMDFFT)
find_path(CLAMDFFT_INCLUDE_DIR find_path(CLAMDFFT_INCLUDE_DIR
NAMES clAmdFft.h) NAMES clAmdFft.h)

@ -27,10 +27,10 @@ However, OpenCV.framework directory is erased and recreated on each run.
import glob, re, os, os.path, shutil, string, sys import glob, re, os, os.path, shutil, string, sys
def build_opencv(srcroot, buildroot, target): def build_opencv(srcroot, buildroot, target, arch):
"builds OpenCV for device or simulator" "builds OpenCV for device or simulator"
builddir = os.path.join(buildroot, target) builddir = os.path.join(buildroot, target + '-' + arch)
if not os.path.isdir(builddir): if not os.path.isdir(builddir):
os.makedirs(builddir) os.makedirs(builddir)
currdir = os.getcwd() currdir = os.getcwd()
@ -52,8 +52,8 @@ def build_opencv(srcroot, buildroot, target):
if os.path.isfile(wlib): if os.path.isfile(wlib):
os.remove(wlib) os.remove(wlib)
os.system("xcodebuild -parallelizeTargets -jobs 8 -sdk %s -configuration Release -target ALL_BUILD" % target.lower()) os.system("xcodebuild -parallelizeTargets ARCHS=%s -jobs 8 -sdk %s -configuration Release -target ALL_BUILD" % (arch, target.lower()))
os.system("xcodebuild -sdk %s -configuration Release -target install install" % target.lower()) os.system("xcodebuild ARCHS=%s -sdk %s -configuration Release -target install install" % (arch, target.lower()))
os.chdir(currdir) os.chdir(currdir)
def put_framework_together(srcroot, dstroot): def put_framework_together(srcroot, dstroot):
@ -112,8 +112,10 @@ def put_framework_together(srcroot, dstroot):
def build_framework(srcroot, dstroot): def build_framework(srcroot, dstroot):
"main function to do all the work" "main function to do all the work"
for target in ["iPhoneOS", "iPhoneSimulator"]: targets = ["iPhoneOS", "iPhoneOS", "iPhoneSimulator"]
build_opencv(srcroot, os.path.join(dstroot, "build"), target) archs = ["armv7", "armv7s", "i386"]
for i in range(len(targets)):
build_opencv(srcroot, os.path.join(dstroot, "build"), targets[i], archs[i])
put_framework_together(srcroot, dstroot) put_framework_together(srcroot, dstroot)
@ -123,4 +125,4 @@ if __name__ == "__main__":
print "Usage:\n\t./build_framework.py <outputdir>\n\n" print "Usage:\n\t./build_framework.py <outputdir>\n\n"
sys.exit(0) sys.exit(0)
build_framework(os.path.abspath(os.path.join(os.path.dirname(sys.argv[0]), "..")), os.path.abspath(sys.argv[1])) build_framework(os.path.abspath(os.path.join(os.path.dirname(sys.argv[0]), "..")), os.path.abspath(sys.argv[1]))

@ -3780,23 +3780,6 @@ struct CV_EXPORTS Formatted
vector<int> params; vector<int> params;
}; };
/** Writes a point to an output stream in Matlab notation
*/
template<typename _Tp> inline std::ostream& operator<<(std::ostream& out, const Point_<_Tp>& p)
{
out << "[" << p.x << ", " << p.y << "]";
return out;
}
/** Writes a point to an output stream in Matlab notation
*/
template<typename _Tp> inline std::ostream& operator<<(std::ostream& out, const Point3_<_Tp>& p)
{
out << "[" << p.x << ", " << p.y << ", " << p.z << "]";
return out;
}
static inline Formatted format(const Mat& mtx, const char* fmt, static inline Formatted format(const Mat& mtx, const char* fmt,
const vector<int>& params=vector<int>()) const vector<int>& params=vector<int>())
{ {
@ -3858,6 +3841,60 @@ template<typename _Tp> static inline std::ostream& operator << (std::ostream& ou
} }
/** Writes a Matx to an output stream.
*/
template<typename _Tp, int m, int n> inline std::ostream& operator<<(std::ostream& out, const Matx<_Tp, m, n>& matx)
{
out << cv::Mat(matx);
return out;
}
/** Writes a point to an output stream in Matlab notation
*/
template<typename _Tp> inline std::ostream& operator<<(std::ostream& out, const Point_<_Tp>& p)
{
out << "[" << p.x << ", " << p.y << "]";
return out;
}
/** Writes a point to an output stream in Matlab notation
*/
template<typename _Tp> inline std::ostream& operator<<(std::ostream& out, const Point3_<_Tp>& p)
{
out << "[" << p.x << ", " << p.y << ", " << p.z << "]";
return out;
}
/** Writes a Vec to an output stream. Format example : [10, 20, 30]
*/
template<typename _Tp, int n> inline std::ostream& operator<<(std::ostream& out, const Vec<_Tp, n>& vec)
{
out << "[";
for (int i = 0; i < n - 1; ++i) {
out << vec[i] << ", ";
}
out << vec[n-1] << "]";
return out;
}
/** Writes a Size_ to an output stream. Format example : [640 x 480]
*/
template<typename _Tp> inline std::ostream& operator<<(std::ostream& out, const Size_<_Tp>& size)
{
out << "[" << size.width << " x " << size.height << "]";
return out;
}
/** Writes a Rect_ to an output stream. Format example : [640 x 480 from (10, 20)]
*/
template<typename _Tp> inline std::ostream& operator<<(std::ostream& out, const Rect_<_Tp>& rect)
{
out << "[" << rect.width << " x " << rect.height << " from (" << rect.x << ", " << rect.y << ")]";
return out;
}
template<typename _Tp> inline Ptr<_Tp> Algorithm::create(const string& name) template<typename _Tp> inline Ptr<_Tp> Algorithm::create(const string& name)
{ {
return _create(name).ptr<_Tp>(); return _create(name).ptr<_Tp>();

@ -1228,10 +1228,10 @@ static int actualScalarDepth(const double* data, int len)
maxval = MAX(maxval, ival); maxval = MAX(maxval, ival);
} }
return i < len ? CV_64F : return i < len ? CV_64F :
minval >= 0 && maxval <= UCHAR_MAX ? CV_8U : minval >= 0 && maxval <= (int)UCHAR_MAX ? CV_8U :
minval >= SCHAR_MIN && maxval <= SCHAR_MAX ? CV_8S : minval >= (int)SCHAR_MIN && maxval <= (int)SCHAR_MAX ? CV_8S :
minval >= 0 && maxval <= USHRT_MAX ? CV_16U : minval >= 0 && maxval <= (int)USHRT_MAX ? CV_16U :
minval >= SHRT_MIN && maxval <= SHRT_MAX ? CV_16S : minval >= (int)SHRT_MIN && maxval <= (int)SHRT_MAX ? CV_16S :
CV_32S; CV_32S;
} }
@ -1281,7 +1281,7 @@ static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
depth2 = CV_32F; depth2 = CV_32F;
} }
else else
depth2 = src1.depth() < CV_32S || src1.depth() == CV_32F ? CV_32F : CV_64F; depth2 = CV_64F;
} }
int cn = src1.channels(), depth1 = src1.depth(), wtype; int cn = src1.channels(), depth1 = src1.depth(), wtype;

@ -1522,4 +1522,12 @@ protected:
TEST(Core_ArithmMask, uninitialized) { CV_ArithmMaskTest test; test.safe_run(); } TEST(Core_ArithmMask, uninitialized) { CV_ArithmMaskTest test; test.safe_run(); }
TEST(Multiply, FloatingPointRounding)
{
cv::Mat src(1, 1, CV_8UC1, cv::Scalar::all(110)), dst;
cv::Scalar s(147.286359696927, 1, 1 ,1);
cv::multiply(src, s, dst, 1, CV_16U);
// with CV_32F this produce result 16202
ASSERT_EQ(dst.at<ushort>(0,0), 16201);
}

@ -150,8 +150,6 @@ namespace
sumOffset = bindSumTex(surf_.sum); sumOffset = bindSumTex(surf_.sum);
return;
if (use_mask) if (use_mask)
{ {
min(mask, 1.0, surf_.mask1); min(mask, 1.0, surf_.mask1);

@ -13,7 +13,7 @@ set(kernels_cpp "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp")
set(cl2cpp_script "${CMAKE_CURRENT_SOURCE_DIR}/cl2cpp.py") set(cl2cpp_script "${CMAKE_CURRENT_SOURCE_DIR}/cl2cpp.py")
set(cl2cpp_script_W32 "${CMAKE_CURRENT_SOURCE_DIR}/cl2cpp.js") set(cl2cpp_script_W32 "${CMAKE_CURRENT_SOURCE_DIR}/cl2cpp.js")
IF(WIN32) IF(0)
add_custom_command( add_custom_command(
OUTPUT ${kernels_cpp} OUTPUT ${kernels_cpp}
COMMAND wscript.exe ${cl2cpp_script_W32} "${CMAKE_CURRENT_SOURCE_DIR}/src/kernels" ${kernels_cpp} COMMAND wscript.exe ${cl2cpp_script_W32} "${CMAKE_CURRENT_SOURCE_DIR}/src/kernels" ${kernels_cpp}

@ -57,13 +57,15 @@ namespace cv
namespace ocl namespace ocl
{ {
using std::auto_ptr; using std::auto_ptr;
enum
#define CVCL_DEVICE_TYPE_DEFAULT (1 << 0) {
#define CVCL_DEVICE_TYPE_CPU (1 << 1) CVCL_DEVICE_TYPE_DEFAULT = (1 << 0),
#define CVCL_DEVICE_TYPE_GPU (1 << 2) CVCL_DEVICE_TYPE_CPU = (1 << 1),
#define CVCL_DEVICE_TYPE_ACCELERATOR (1 << 3) CVCL_DEVICE_TYPE_GPU = (1 << 2),
//#define CVCL_DEVICE_TYPE_CUSTOM (1 << 4) CVCL_DEVICE_TYPE_ACCELERATOR = (1 << 3),
#define CVCL_DEVICE_TYPE_ALL 0xFFFFFFFF //CVCL_DEVICE_TYPE_CUSTOM = (1 << 4)
CVCL_DEVICE_TYPE_ALL = 0xFFFFFFFF
};
//this class contains ocl runtime information //this class contains ocl runtime information
class CV_EXPORTS Info class CV_EXPORTS Info
{ {
@ -825,7 +827,6 @@ namespace cv
}; };
#ifdef HAVE_CLAMDFFT
///////////////////////////////////////// clAmdFft related ///////////////////////////////////////// ///////////////////////////////////////// clAmdFft related /////////////////////////////////////////
//! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix. //! Performs a forward or inverse discrete Fourier transform (1D or 2D) of floating point matrix.
//! Param dft_size is the size of DFT transform. //! Param dft_size is the size of DFT transform.
@ -839,16 +840,13 @@ namespace cv
// real to complex dft output is not the same with cpu version // real to complex dft output is not the same with cpu version
// real to complex and complex to real does not support DFT_ROWS // real to complex and complex to real does not support DFT_ROWS
CV_EXPORTS void dft(const oclMat &src, oclMat &dst, Size dft_size = Size(0, 0), int flags = 0); CV_EXPORTS void dft(const oclMat &src, oclMat &dst, Size dft_size = Size(0, 0), int flags = 0);
#endif // HAVE_CLAMDFFT
#ifdef HAVE_CLAMDBLAS
//! implements generalized matrix product algorithm GEMM from BLAS //! implements generalized matrix product algorithm GEMM from BLAS
// The functionality requires clAmdBlas library // The functionality requires clAmdBlas library
// only support type CV_32FC1 // only support type CV_32FC1
// flag GEMM_3_T is not supported // flag GEMM_3_T is not supported
CV_EXPORTS void gemm(const oclMat &src1, const oclMat &src2, double alpha, CV_EXPORTS void gemm(const oclMat &src1, const oclMat &src2, double alpha,
const oclMat &src3, double beta, oclMat &dst, int flags = 0); const oclMat &src3, double beta, oclMat &dst, int flags = 0);
#endif
//////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector ////////////// //////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////

@ -73,22 +73,86 @@ void print_info()
#endif #endif
} }
std::string workdir;
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
std::vector<cv::ocl::Info> oclinfo;
TS::ptr()->init("ocl"); TS::ptr()->init("ocl");
InitGoogleTest(&argc, argv); InitGoogleTest(&argc, argv);
const char *keys =
"{ h | help | false | print help message }"
"{ w | workdir | ../../../samples/c/| set working directory }"
"{ t | type | gpu | set device type:cpu or gpu}"
"{ p | platform | 0 | set platform id }"
"{ d | device | 0 | set device id }";
CommandLineParser cmd(argc, argv, keys);
if (cmd.get<bool>("help"))
{
cout << "Avaible options besides goole test option:" << endl;
cmd.printParams();
}
workdir = cmd.get<string>("workdir");
string type = cmd.get<string>("type");
unsigned int pid = cmd.get<unsigned int>("platform");
int device = cmd.get<int>("device");
print_info(); print_info();
int flag = CVCL_DEVICE_TYPE_GPU;
if(type == "cpu")
{
flag = CVCL_DEVICE_TYPE_CPU;
}
std::vector<cv::ocl::Info> oclinfo;
int devnums = getDevice(oclinfo); int devnums = getDevice(oclinfo);
if(devnums < 1) if(devnums <= device || device < 0)
{ {
std::cout << "no device found\n";
std::cout << "device invalid\n";
return -1; return -1;
} }
//if you want to use undefault device, set it here
//setDevice(oclinfo[0]); if(pid >= oclinfo.size())
{
std::cout << "platform invalid\n";
return -1;
}
if(pid != 0 || device != 0)
{
setDevice(oclinfo[pid], device);
}
cout << "Device type:" << type << endl << "Device name:" << oclinfo[pid].DeviceName[device] << endl;
setBinpath(CLBINPATH); setBinpath(CLBINPATH);
return RUN_ALL_TESTS(); return RUN_ALL_TESTS();
} }

@ -2672,13 +2672,13 @@ TEST_P(CountNonZero, MAT)
Has_roi(k); Has_roi(k);
t0 = (double)cvGetTickCount();//cpu start t0 = (double)cvGetTickCount();//cpu start
int cpures = cv::countNonZero(mat1_roi); cv::countNonZero(mat1_roi);
t0 = (double)cvGetTickCount() - t0;//cpu end t0 = (double)cvGetTickCount() - t0;//cpu end
t1 = (double)cvGetTickCount();//gpu start1 t1 = (double)cvGetTickCount();//gpu start1
gmat1 = mat1_roi; gmat1 = mat1_roi;
t2 = (double)cvGetTickCount(); //kernel t2 = (double)cvGetTickCount(); //kernel
int gpures = cv::ocl::countNonZero(gmat1); cv::ocl::countNonZero(gmat1);
t2 = (double)cvGetTickCount() - t2;//kernel t2 = (double)cvGetTickCount() - t2;//kernel
t1 = (double)cvGetTickCount() - t1;//gpu end1 t1 = (double)cvGetTickCount() - t1;//gpu end1
if(j == 0) if(j == 0)
@ -2713,7 +2713,7 @@ TEST_P(CountNonZero, MAT)
{ {
cout << "\nwith roi:"; cout << "\nwith roi:";
}; };
int gpures = cv::ocl::countNonZero(gmat1); cv::ocl::countNonZero(gmat1);
}; };
#endif #endif

@ -52,8 +52,6 @@ using namespace cvtest;
using namespace testing; using namespace testing;
using namespace std; using namespace std;
#define FILTER_IMAGE "../../../samples/gpu/road.png"
#ifndef MWC_TEST_UTILITY #ifndef MWC_TEST_UTILITY
#define MWC_TEST_UTILITY #define MWC_TEST_UTILITY
@ -79,7 +77,7 @@ IMPLEMENT_PARAM_CLASS(Channels, int)
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Canny1 // Canny1
extern std::string workdir;
IMPLEMENT_PARAM_CLASS(AppertureSize, int); IMPLEMENT_PARAM_CLASS(AppertureSize, int);
IMPLEMENT_PARAM_CLASS(L2gradient, bool); IMPLEMENT_PARAM_CLASS(L2gradient, bool);
@ -101,7 +99,7 @@ PARAM_TEST_CASE(Canny1, AppertureSize, L2gradient)
TEST_P(Canny1, Performance) TEST_P(Canny1, Performance)
{ {
cv::Mat img = readImage(FILTER_IMAGE, cv::IMREAD_GRAYSCALE); cv::Mat img = readImage(workdir + "fruits.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty()); ASSERT_FALSE(img.empty());
double low_thresh = 100.0; double low_thresh = 100.0;

@ -1184,11 +1184,11 @@ INSTANTIATE_TEST_CASE_P(Filters, Laplacian, Combine(
//INSTANTIATE_TEST_CASE_P(Filter, ErodeDilate, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 2, 3))); //INSTANTIATE_TEST_CASE_P(Filter, ErodeDilate, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 2, 3)));
INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(false))); INSTANTIATE_TEST_CASE_P(Filter, Erode, Combine(Values(CV_8UC1, CV_8UC1), Values(false)));
//INSTANTIATE_TEST_CASE_P(Filter, ErodeDilate, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 2, 3))); //INSTANTIATE_TEST_CASE_P(Filter, ErodeDilate, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(1, 2, 3)));
INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(Values(CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4), Values(false))); INSTANTIATE_TEST_CASE_P(Filter, Dilate, Combine(Values(CV_8UC1, CV_8UC1), Values(false)));
INSTANTIATE_TEST_CASE_P(Filter, Sobel, Combine(Values(CV_8UC1, CV_32FC1), INSTANTIATE_TEST_CASE_P(Filter, Sobel, Combine(Values(CV_8UC1, CV_32FC1),

@ -52,7 +52,7 @@ using namespace cvtest;
using namespace testing; using namespace testing;
using namespace std; using namespace std;
using namespace cv; using namespace cv;
extern std::string workdir;
struct getRect struct getRect
{ {
Rect operator ()(const CvAvgComp &e) const Rect operator ()(const CvAvgComp &e) const
@ -80,9 +80,6 @@ PARAM_TEST_CASE(HaarTestBase, int, int)
if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName))) if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)))
{ {
cout << "ERROR: Could not load classifier cascade" << endl; cout << "ERROR: Could not load classifier cascade" << endl;
cout << "Usage: facedetect [--cascade=<cascade_path>]\n"
" [--scale[=<image scale>\n"
" [filename|camera_index]\n" << endl ;
return; return;
} }
//int devnums = getDevice(oclinfo); //int devnums = getDevice(oclinfo);
@ -99,16 +96,16 @@ struct Haar : HaarTestBase {};
TEST_F(Haar, FaceDetect) TEST_F(Haar, FaceDetect)
{ {
string imgName = "../../../samples/c/lena.jpg"; string imgName = workdir + "lena.jpg";
Mat img = imread( imgName, 1 ); Mat img = imread( imgName, 1 );
if(img.empty()) if(img.empty())
{ {
std::cout << "Couldn't read test" << index << ".jpg" << std::endl; std::cout << imgName << std::endl;
return ; return ;
} }
int i = 0; //int i = 0;
double t = 0; double t = 0;
vector<Rect> faces, oclfaces; vector<Rect> faces, oclfaces;

@ -53,8 +53,7 @@ using namespace cv::ocl;
using namespace cvtest; using namespace cvtest;
using namespace testing; using namespace testing;
using namespace std; using namespace std;
extern std::string workdir;
#define FILTER_IMAGE "../../../samples/gpu/road.png"
#ifndef MWC_TEST_UTILITY #ifndef MWC_TEST_UTILITY
#define MWC_TEST_UTILITY #define MWC_TEST_UTILITY
@ -100,15 +99,15 @@ PARAM_TEST_CASE(HOG, WinSizw48, bool)
TEST_P(HOG, Performance) TEST_P(HOG, Performance)
{ {
cv::Mat img = readImage(FILTER_IMAGE, cv::IMREAD_GRAYSCALE); cv::Mat img = readImage(workdir + "lena.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty()); ASSERT_FALSE(img.empty());
// define HOG related arguments // define HOG related arguments
float scale = 1.05; float scale = 1.05;
int nlevels = 13; //int nlevels = 13;
float gr_threshold = 8; float gr_threshold = 8;
float hit_threshold = 1.4; float hit_threshold = 1.4;
bool hit_threshold_auto = true; //bool hit_threshold_auto = true;
int win_width = is48 ? 48 : 64; int win_width = is48 ? 48 : 64;
int win_stride_width = 8; int win_stride_width = 8;

@ -1246,6 +1246,7 @@ TEST_P(Remap, Mat)
} }
int bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/}; int bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/};
const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/}; const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/};
cout << borderstr[0] << endl;
#ifndef PRINT_KERNEL_RUN_TIME #ifndef PRINT_KERNEL_RUN_TIME
double totalcputick = 0; double totalcputick = 0;
double totalgputick = 0; double totalgputick = 0;

@ -714,7 +714,6 @@ TEST_P(DataTransfer, perf)
totaluploadtick = t0 + totaluploadtick; totaluploadtick = t0 + totaluploadtick;
totaldownloadtick = t1 + totaldownloadtick; totaldownloadtick = t1 + totaldownloadtick;
} }
EXPECT_MAT_SIMILAR(mat, cpu_dst, 0.0);
totaltick = totaluploadtick + totaldownloadtick; totaltick = totaluploadtick + totaldownloadtick;
cout << "average upload time is " << totaluploadtick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl; cout << "average upload time is " << totaluploadtick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;
cout << "average download time is " << totaldownloadtick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl; cout << "average download time is " << totaldownloadtick / ((double)cvGetTickFrequency()* LOOP_TIMES * 1000.) << "ms" << endl;

@ -450,7 +450,7 @@ TEST_P(Split, Accuracy)
for(int j = LOOPROISTART; j < LOOPROIEND; j ++) for(int j = LOOPROISTART; j < LOOPROIEND; j ++)
{ {
Has_roi(j); Has_roi(j);
cv::Mat dev_dst[4] = {dst1_roi, dst2_roi, dst3_roi, dst4_roi}; //cv::Mat dev_dst[4] = {dst1_roi, dst2_roi, dst3_roi, dst4_roi};
cv::ocl::oclMat dev_gdst[4] = {gdst1, gdst2, gdst3, gdst4}; cv::ocl::oclMat dev_gdst[4] = {gdst1, gdst2, gdst3, gdst4};
gdst1_whole = dst1; gdst1_whole = dst1;
gdst1 = gdst1_whole(Rect(dst1x, dst1y, roicols, roirows)); gdst1 = gdst1_whole(Rect(dst1x, dst1y, roicols, roirows));

@ -54,11 +54,11 @@ using namespace cvtest;
using namespace testing; using namespace testing;
using namespace std; using namespace std;
#define FILTER_IMAGE "../../../samples/gpu/road.png" extern std::string workdir;
TEST(SURF, Performance) TEST(SURF, Performance)
{ {
cv::Mat img = readImage(FILTER_IMAGE, cv::IMREAD_GRAYSCALE); cv::Mat img = readImage(workdir+"lena.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty()); ASSERT_FALSE(img.empty());
ocl::SURF_OCL d_surf; ocl::SURF_OCL d_surf;

@ -772,7 +772,7 @@ Scalar arithmetic_sum(const oclMat &src, int type = 0)
{ {
size_t groupnum = src.clCxt->impl->maxComputeUnits; size_t groupnum = src.clCxt->impl->maxComputeUnits;
CV_Assert(groupnum != 0); CV_Assert(groupnum != 0);
int vlen = src.oclchannels() == 3 ? 12 : 8, dbsize = groupnum * vlen, status; int vlen = src.oclchannels() == 3 ? 12 : 8, dbsize = groupnum * vlen;
Context *clCxt = src.clCxt; Context *clCxt = src.clCxt;
T *p = new T[dbsize]; T *p = new T[dbsize];
cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize * sizeof(T)); cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize * sizeof(T));
@ -930,7 +930,7 @@ template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal,
CV_Assert(groupnum != 0); CV_Assert(groupnum != 0);
groupnum = groupnum * 2; groupnum = groupnum * 2;
int vlen = 8; int vlen = 8;
int dbsize = groupnum * 2 * vlen * sizeof(T) , status; int dbsize = groupnum * 2 * vlen * sizeof(T) ;
Context *clCxt = src.clCxt; Context *clCxt = src.clCxt;
cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize); cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize);
*minVal = std::numeric_limits<double>::max() , *maxVal = -std::numeric_limits<double>::max(); *minVal = std::numeric_limits<double>::max() , *maxVal = -std::numeric_limits<double>::max();
@ -945,11 +945,11 @@ template <typename T> void arithmetic_minMax(const oclMat &src, double *minVal,
T *p = new T[groupnum * vlen * 2]; T *p = new T[groupnum * vlen * 2];
memset(p, 0, dbsize); memset(p, 0, dbsize);
openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize); openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize);
for(int i = 0; i < vlen * groupnum; i++) for(int i = 0; i < vlen * (int)groupnum; i++)
{ {
*minVal = *minVal < p[i] ? *minVal : p[i]; *minVal = *minVal < p[i] ? *minVal : p[i];
} }
for(int i = vlen * groupnum; i < 2 * vlen * groupnum; i++) for(int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
{ {
*maxVal = *maxVal > p[i] ? *maxVal : p[i]; *maxVal = *maxVal > p[i] ? *maxVal : p[i];
} }
@ -1606,7 +1606,7 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
size_t groupnum = src.clCxt->impl->maxComputeUnits; size_t groupnum = src.clCxt->impl->maxComputeUnits;
CV_Assert(groupnum != 0); CV_Assert(groupnum != 0);
int minloc = -1 , maxloc = -1; int minloc = -1 , maxloc = -1;
int vlen = 4, dbsize = groupnum * vlen * 4 * sizeof(T) , status; int vlen = 4, dbsize = groupnum * vlen * 4 * sizeof(T) ;
Context *clCxt = src.clCxt; Context *clCxt = src.clCxt;
cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize); cl_mem dstBuffer = openCLCreateBuffer(clCxt, CL_MEM_WRITE_ONLY, dbsize);
*minVal = std::numeric_limits<double>::max() , *maxVal = -std::numeric_limits<double>::max(); *minVal = std::numeric_limits<double>::max() , *maxVal = -std::numeric_limits<double>::max();
@ -1621,15 +1621,15 @@ void arithmetic_minMaxLoc(const oclMat &src, double *minVal, double *maxVal,
T *p = new T[groupnum * vlen * 4]; T *p = new T[groupnum * vlen * 4];
memset(p, 0, dbsize); memset(p, 0, dbsize);
openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize); openCLReadBuffer(clCxt, dstBuffer, (void *)p, dbsize);
for(int i = 0; i < vlen * groupnum; i++) for(int i = 0; i < vlen * (int)groupnum; i++)
{ {
*minVal = (*minVal < p[i] || p[i + 2 * vlen * groupnum] == -1) ? *minVal : p[i]; *minVal = (*minVal < p[i] || p[i + 2 * vlen * groupnum] == -1) ? *minVal : p[i];
minloc = (*minVal < p[i] || p[i + 2 * vlen * groupnum] == -1) ? minloc : p[i + 2 * vlen * groupnum]; minloc = (*minVal < p[i] || p[i + 2 * vlen * groupnum] == -1) ? minloc : cvRound(p[i + 2 * vlen * groupnum]);
} }
for(int i = vlen * groupnum; i < 2 * vlen * groupnum; i++) for(int i = vlen * (int)groupnum; i < 2 * vlen * (int)groupnum; i++)
{ {
*maxVal = (*maxVal > p[i] || p[i + 2 * vlen * groupnum] == -1) ? *maxVal : p[i]; *maxVal = (*maxVal > p[i] || p[i + 2 * vlen * groupnum] == -1) ? *maxVal : p[i];
maxloc = (*maxVal > p[i] || p[i + 2 * vlen * groupnum] == -1) ? maxloc : p[i + 2 * vlen * groupnum]; maxloc = (*maxVal > p[i] || p[i + 2 * vlen * groupnum] == -1) ? maxloc : cvRound(p[i + 2 * vlen * groupnum]);
} }
int pre_rows = src.offset / src.step; int pre_rows = src.offset / src.step;
@ -1717,7 +1717,7 @@ int cv::ocl::countNonZero(const oclMat &src)
} }
CV_Assert(groupnum != 0); CV_Assert(groupnum != 0);
groupnum = groupnum * 2; groupnum = groupnum * 2;
int vlen = 8 , dbsize = groupnum * vlen, status; int vlen = 8 , dbsize = groupnum * vlen;
//cl_ulong start, end; //cl_ulong start, end;
Context *clCxt = src.clCxt; Context *clCxt = src.clCxt;
string kernelName = "arithm_op_nonzero"; string kernelName = "arithm_op_nonzero";

@ -227,8 +227,8 @@ void matchUnrolledCached(const oclMat &query, const oclMat &train, const oclMat
} }
template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ > template < int BLOCK_SIZE, int MAX_DESC_LEN, typename T/*, typename Mask*/ >
void matchUnrolledCached(const oclMat query, const oclMat *trains, int n, const oclMat mask, void matchUnrolledCached(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const oclMat /*mask*/,
const oclMat &bestTrainIdx, const oclMat &bestImgIdx, const oclMat &bestDistance, int distType) const oclMat &/*bestTrainIdx*/, const oclMat & /*bestImgIdx*/, const oclMat & /*bestDistance*/, int /*distType*/)
{ {
} }
@ -266,8 +266,8 @@ void match(const oclMat &query, const oclMat &train, const oclMat &mask,
} }
template < int BLOCK_SIZE, typename T/*, typename Mask*/ > template < int BLOCK_SIZE, typename T/*, typename Mask*/ >
void match(const oclMat query, const oclMat *trains, int n, const oclMat mask, void match(const oclMat /*query*/, const oclMat * /*trains*/, int /*n*/, const oclMat /*mask*/,
const oclMat &bestTrainIdx, const oclMat &bestImgIdx, const oclMat &bestDistance, int distType) const oclMat &/*bestTrainIdx*/, const oclMat & /*bestImgIdx*/, const oclMat & /*bestDistance*/, int /*distType*/)
{ {
} }
@ -796,7 +796,7 @@ void match2Dispatcher(const oclMat &query, const oclMat &train, const oclMat &ma
} }
template <int BLOCK_SIZE> template <int BLOCK_SIZE>
void findKnnMatch(int k, const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int distType) void findKnnMatch(int k, const oclMat &trainIdx, const oclMat &distance, const oclMat &allDist, int /*distType*/)
{ {
cv::ocl::Context *ctx = trainIdx.clCxt; cv::ocl::Context *ctx = trainIdx.clCxt;
size_t globalSize[] = {trainIdx.rows * BLOCK_SIZE, 1, 1}; size_t globalSize[] = {trainIdx.rows * BLOCK_SIZE, 1, 1};
@ -1406,7 +1406,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat &query, const oc
} }
void cv::ocl::BruteForceMatcher_OCL_base::knnMatch2Collection(const oclMat &query, const oclMat &trainCollection, void cv::ocl::BruteForceMatcher_OCL_base::knnMatch2Collection(const oclMat &query, const oclMat &trainCollection,
oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, const oclMat &maskCollection) oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, const oclMat &/*maskCollection*/)
{ {
if (query.empty() || trainCollection.empty()) if (query.empty() || trainCollection.empty())
return; return;
@ -1548,7 +1548,8 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat &query, vector<
temp.reserve(2 * k); temp.reserve(2 * k);
matches.resize(query.rows); matches.resize(query.rows);
for_each(matches.begin(), matches.end(), bind2nd(mem_fun_ref(&vector<DMatch>::reserve), k)); for(size_t queryIdx = 0; queryIdx < matches.size(); queryIdx++ )
matches[queryIdx].reserve(k);
for (size_t imgIdx = 0, size = trainDescCollection.size(); imgIdx < size; ++imgIdx) for (size_t imgIdx = 0, size = trainDescCollection.size(); imgIdx < size; ++imgIdx)
{ {
@ -1572,8 +1573,15 @@ void cv::ocl::BruteForceMatcher_OCL_base::knnMatch(const oclMat &query, vector<
if (compactResult) if (compactResult)
{ {
vector< vector<DMatch> >::iterator new_end = remove_if(matches.begin(), matches.end(), mem_fun_ref(&vector<DMatch>::empty)); size_t i, j = 0;
matches.erase(new_end, matches.end()); for( i = 0; i < matches.size(); i++ )
if( !matches[i].empty() )
{
if( i > j )
matches[j] = matches[i];
j++;
}
matches.resize(j);
} }
} }
} }
@ -1702,7 +1710,7 @@ void cv::ocl::BruteForceMatcher_OCL_base::radiusMatch(const oclMat &query, const
} }
void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchCollection(const oclMat &query, oclMat &trainIdx, oclMat &imgIdx, oclMat &distance, void cv::ocl::BruteForceMatcher_OCL_base::radiusMatchCollection(const oclMat &query, oclMat &trainIdx, oclMat &imgIdx, oclMat &distance,
oclMat &nMatches, float maxDistance, const vector<oclMat> &masks) oclMat &nMatches, float /*maxDistance*/, const vector<oclMat> &masks)
{ {
if (query.empty() || empty()) if (query.empty() || empty())
return; return;

@ -51,13 +51,17 @@ using namespace cv;
using namespace cv::ocl; using namespace cv::ocl;
using namespace std; using namespace std;
#if !defined (HAVE_OPENCL) #if !defined HAVE_OPENCL
void cv::ocl::dft(const oclMat &src, oclMat &dst, int flags) void cv::ocl::dft(const oclMat &src, oclMat &dst, int flags)
{ {
throw_nogpu(); throw_nogpu();
} }
#elif !defined HAVE_CLAMDFFT
void cv::ocl::dft(const oclMat &src, oclMat &dst, int flags)
{
CV_Error(CV_StsNotImplemented, "OpenCL DFT is not implemented");
}
#else #else
#include <clAmdFft.h> #include <clAmdFft.h>
namespace cv namespace cv

@ -340,12 +340,12 @@ void GPUErode(const oclMat &src, oclMat &dst, oclMat &mat_kernel, Size &ksize, c
Context *clCxt = src.clCxt; Context *clCxt = src.clCxt;
string kernelName; string kernelName;
size_t localThreads[3] = {16, 16, 1}; size_t localThreads[3] = {16, 16, 1};
size_t globalThreads[3] = {(src.cols + localThreads[0]) / localThreads[0] *localThreads[0], (src.rows + localThreads[1]) / localThreads[1] *localThreads[1], 1}; size_t globalThreads[3] = {(src.cols + localThreads[0] - 1) / localThreads[0] *localThreads[0], (src.rows + localThreads[1] - 1) / localThreads[1] *localThreads[1], 1};
if(src.type() == CV_8UC1) if(src.type() == CV_8UC1)
{ {
kernelName = "morph_C1_D0"; kernelName = "morph_C1_D0";
globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0]) / localThreads[0] * localThreads[0]; globalThreads[0] = ((src.cols + 3) / 4 + localThreads[0] - 1) / localThreads[0] * localThreads[0];
CV_Assert( localThreads[0]*localThreads[1] * 8 >= (localThreads[0] * 4 + ksize.width - 1) * (localThreads[1] + ksize.height - 1) ); CV_Assert( localThreads[0]*localThreads[1] * 8 >= (localThreads[0] * 4 + ksize.width - 1) * (localThreads[1] + ksize.height - 1) );
} }
else else
@ -489,7 +489,7 @@ namespace
MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU> &filter2D_, int iters_) : MorphologyFilterEngine_GPU(const Ptr<BaseFilter_GPU> &filter2D_, int iters_) :
Filter2DEngine_GPU(filter2D_), iters(iters_) {} Filter2DEngine_GPU(filter2D_), iters(iters_) {}
virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) virtual void apply(const oclMat &src, oclMat &dst)
{ {
Filter2DEngine_GPU::apply(src, dst); Filter2DEngine_GPU::apply(src, dst);
//if (iters > 1) //if (iters > 1)
@ -778,7 +778,7 @@ namespace
virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1)) virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1))
{ {
Size src_size = src.size(); Size src_size = src.size();
int src_type = src.type(); //int src_type = src.type();
int cn = src.oclchannels(); int cn = src.oclchannels();
//dst.create(src_size, src_type); //dst.create(src_size, src_type);

@ -50,11 +50,18 @@
#include "clAmdBlas.h" #include "clAmdBlas.h"
#if !defined (HAVE_OPENCL) #if !defined HAVE_OPENCL
void cv::ocl::dft(const oclMat &src, oclMat &dst, int flags) void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha,
const oclMat &src3, double beta, oclMat &dst, int flags)
{ {
throw_nogpu(); throw_nogpu();
} }
#elif !defined HAVE_CLAMDBLAS
void cv::ocl::gemm(const oclMat &src1, const oclMat &src2, double alpha,
const oclMat &src3, double beta, oclMat &dst, int flags)
{
CV_Error(CV_StsNotImplemented, "OpenCL BLAS is not implemented");
}
#else #else
using namespace cv; using namespace cv;

@ -513,13 +513,13 @@ gpuCreateHidHaarClassifierCascade( CvHaarClassifierCascade *cascade, int *size,
#define sum_elem_ptr(sum,row,col) \ #define sum_elem_ptr(sum,row,col) \
((sumtype*)CV_MAT_ELEM_PTR_FAST((sum),(row),(col),sizeof(sumtype))) ((sumtype*)CV_MAT_ELEM_PTR_FAST((sum),(row),(col),sizeof(sumtype)))
#define sqsum_elem_ptr(sqsum,row,col) \ #define sqsum_elem_ptr(sqsum,row,col) \
((sqsumtype*)CV_MAT_ELEM_PTR_FAST((sqsum),(row),(col),sizeof(sqsumtype))) ((sqsumtype*)CV_MAT_ELEM_PTR_FAST((sqsum),(row),(col),sizeof(sqsumtype)))
#define calc_sum(rect,offset) \ #define calc_sum(rect,offset) \
((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset]) ((rect).p0[offset] - (rect).p1[offset] - (rect).p2[offset] + (rect).p3[offset])
CV_IMPL void CV_IMPL void
@ -813,14 +813,9 @@ gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade
CvHaarFeature *feature = CvHaarFeature *feature =
&_cascade->stage_classifier[i].classifier[j].haar_feature[l]; &_cascade->stage_classifier[i].classifier[j].haar_feature[l];
GpuHidHaarTreeNode *hidnode = &stage_classifier[i].classifier[j].node[l]; GpuHidHaarTreeNode *hidnode = &stage_classifier[i].classifier[j].node[l];
double sum0 = 0, area0 = 0;
CvRect r[3]; CvRect r[3];
int base_w = -1, base_h = -1;
int new_base_w = 0, new_base_h = 0;
int kx, ky;
int flagx = 0, flagy = 0;
int x0 = 0, y0 = 0;
int nr; int nr;
/* align blocks */ /* align blocks */
@ -872,7 +867,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
std::vector<cv::Rect> rectList; std::vector<cv::Rect> rectList;
std::vector<int> rweights; std::vector<int> rweights;
double factor; double factor;
int coi;
int datasize; int datasize;
int totalclassifier; int totalclassifier;
@ -885,9 +879,9 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
int *candidate; int *candidate;
cl_int status; cl_int status;
bool doCannyPruning = (flags & CV_HAAR_DO_CANNY_PRUNING) != 0; // bool doCannyPruning = (flags & CV_HAAR_DO_CANNY_PRUNING) != 0;
bool findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0; bool findBiggestObject = (flags & CV_HAAR_FIND_BIGGEST_OBJECT) != 0;
bool roughSearch = (flags & CV_HAAR_DO_ROUGH_SEARCH) != 0; // bool roughSearch = (flags & CV_HAAR_DO_ROUGH_SEARCH) != 0;
//the Intel HD Graphics is unsupported //the Intel HD Graphics is unsupported
if (gimg.clCxt->impl->devName.find("Intel(R) HD Graphics") != string::npos) if (gimg.clCxt->impl->devName.find("Intel(R) HD Graphics") != string::npos)
@ -1015,7 +1009,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
gimgroi = gsum(roi); gimgroi = gsum(roi);
gimgroisq = gsqsum(roi); gimgroisq = gsqsum(roi);
//scaleinfo[i].rows = gimgroi.rows; //scaleinfo[i].rows = gimgroi.rows;
int ystep = 1; // factor > 2 ? 1 : 2;
int width = gimgroi.cols - 1 - cascade->orig_window_size.width; int width = gimgroi.cols - 1 - cascade->orig_window_size.width;
int height = gimgroi.rows - 1 - cascade->orig_window_size.height; int height = gimgroi.rows - 1 - cascade->orig_window_size.height;
scaleinfo[i].width_height = (width << 16) | height; scaleinfo[i].width_height = (width << 16) | height;
@ -1109,7 +1102,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
pq.s[2] = gcascade->pq2; pq.s[2] = gcascade->pq2;
pq.s[3] = gcascade->pq3; pq.s[3] = gcascade->pq3;
float correction = gcascade->inv_window_area; float correction = gcascade->inv_window_area;
int argcount = 0;
//int grpnumperline = ((m + localThreads[0] - 1) / localThreads[0]); //int grpnumperline = ((m + localThreads[0] - 1) / localThreads[0]);
//int totalgrp = ((n + localThreads[1] - 1) / localThreads[1])*grpnumperline; //int totalgrp = ((n + localThreads[1] - 1) / localThreads[1])*grpnumperline;
// openCLVerifyKernel(gsum.clCxt, kernel, &blocksize, globalThreads, localThreads); // openCLVerifyKernel(gsum.clCxt, kernel, &blocksize, globalThreads, localThreads);
@ -1184,7 +1177,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
{ {
CvSize winsize0 = cascade->orig_window_size; CvSize winsize0 = cascade->orig_window_size;
int n_factors = 0; int n_factors = 0;
int flag = 0;
oclMat gsum; oclMat gsum;
oclMat gsqsum; oclMat gsqsum;
cv::ocl::integral(gimg, gsum, gsqsum); cv::ocl::integral(gimg, gsum, gsqsum);
@ -1276,7 +1268,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
scaleinfo[i].imgoff = 0; scaleinfo[i].imgoff = 0;
scaleinfo[i].factor = factor; scaleinfo[i].factor = factor;
int startnodenum = nodenum * i; int startnodenum = nodenum * i;
int argcounts = 0;
float factor2 = (float)factor; float factor2 = (float)factor;
/* /*
openCLSafeCall(clSetKernelArg(kernel2, argcounts++, sizeof(cl_mem), (void *)&nodebuffer)); openCLSafeCall(clSetKernelArg(kernel2, argcounts++, sizeof(cl_mem), (void *)&nodebuffer));
@ -1294,7 +1285,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
args1.push_back ( make_pair(sizeof(cl_int) , (void *)&startnodenum )); args1.push_back ( make_pair(sizeof(cl_int) , (void *)&startnodenum ));
size_t globalThreads2[3] = {nodenum, 1, 1}; size_t globalThreads2[3] = {nodenum, 1, 1};
size_t localThreads2[3] = {256, 1, 1};
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1); openCLExecuteKernel(gsum.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier", globalThreads2, NULL/*localThreads2*/, args1, -1, -1);
@ -2478,13 +2468,13 @@ else
// } /* j */ // } /* j */
// } // }
//} //}
/*
CV_INLINE CV_INLINE
double gpuEvalHidHaarClassifier( GpuHidHaarClassifier *classifier, double gpuEvalHidHaarClassifier( GpuHidHaarClassifier *classifier,
double variance_norm_factor, double variance_norm_factor,
size_t p_offset ) size_t p_offset )
{ {
/*
int idx = 0; int idx = 0;
do do
{ {
@ -2501,14 +2491,15 @@ double gpuEvalHidHaarClassifier( GpuHidHaarClassifier *classifier,
} }
while( idx > 0 ); while( idx > 0 );
return classifier->alpha[-idx]; return classifier->alpha[-idx];
*/
return 0.; return 0.;
} }
*/
CV_IMPL int CV_IMPL int
gpuRunHaarClassifierCascade( const CvHaarClassifierCascade *_cascade, gpuRunHaarClassifierCascade( /*const CvHaarClassifierCascade *_cascade,
CvPoint pt, int start_stage ) CvPoint pt, int start_stage */)
{ {
/* /*
int result = -1; int result = -1;
@ -2620,7 +2611,7 @@ namespace cv
for( y = y1; y < y2; y += ystep ) for( y = y1; y < y2; y += ystep )
for( x = 0; x < ssz.width; x += ystep ) for( x = 0; x < ssz.width; x += ystep )
{ {
if( gpuRunHaarClassifierCascade( cascade, cvPoint(x, y), 0 ) > 0 ) if( gpuRunHaarClassifierCascade( /*cascade, cvPoint(x, y), 0*/ ) > 0 )
vec->push_back(Rect(cvRound(x * factor), cvRound(y * factor), vec->push_back(Rect(cvRound(x * factor), cvRound(y * factor),
winSize.width, winSize.height)); winSize.width, winSize.height));
} }
@ -2679,7 +2670,7 @@ namespace cv
} }
} }
int result = gpuRunHaarClassifierCascade( cascade, cvPoint(x, y), 0 ); int result = gpuRunHaarClassifierCascade(/* cascade, cvPoint(x, y), 0 */);
if( result > 0 ) if( result > 0 )
vec->push_back(Rect(x, y, winsize.width, winsize.height)); vec->push_back(Rect(x, y, winsize.width, winsize.height));
ixstep = result != 0 ? 1 : 2; ixstep = result != 0 ? 1 : 2;

@ -296,9 +296,9 @@ namespace cv
kernelName = "remapNNF1Constant"; kernelName = "remapNNF1Constant";
} }
int channels = dst.oclchannels(); //int channels = dst.oclchannels();
int depth = dst.depth(); //int depth = dst.depth();
int type = src.type(); //int type = src.type();
size_t blkSizeX = 16, blkSizeY = 16; size_t blkSizeX = 16, blkSizeY = 16;
size_t glbSizeX; size_t glbSizeX;
int cols = dst.cols; int cols = dst.cols;
@ -308,7 +308,7 @@ namespace cv
glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX; glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
} }
else if(src.type() == CV_8UC3 || src.type() == CV_8UC4 || src.type() == CV_32FC1) else if(src.type() == CV_32FC1 && interpolation == INTER_LINEAR)
{ {
cols = (dst.cols + (dst.offset >> 2) % 4 + 3) / 4; cols = (dst.cols + (dst.offset >> 2) % 4 + 3) / 4;
glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX; glbSizeX = cols % blkSizeX == 0 ? cols : (cols / blkSizeX + 1) * blkSizeX;
@ -322,73 +322,6 @@ namespace cv
size_t glbSizeY = dst.rows % blkSizeY == 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY; size_t glbSizeY = dst.rows % blkSizeY == 0 ? dst.rows : (dst.rows / blkSizeY + 1) * blkSizeY;
size_t globalThreads[3] = {glbSizeX, glbSizeY, 1}; size_t globalThreads[3] = {glbSizeX, glbSizeY, 1};
size_t localThreads[3] = {blkSizeX, blkSizeY, 1}; size_t localThreads[3] = {blkSizeX, blkSizeY, 1};
/*
/////////////////////////////
//using the image buffer
/////////////////////////////
size_t image_row_pitch = 0;
cl_int err1, err2, err3;
cl_mem_flags flags1 = CL_MEM_READ_ONLY;
cl_image_format format;
if(src.type() == CV_8UC1)
{
format.image_channel_order = CL_R;
format.image_channel_data_type = CL_UNSIGNED_INT8;
}
else if(src.type() == CV_8UC4)
{
format.image_channel_order = CL_RGBA;
format.image_channel_data_type = CL_UNSIGNED_INT8;
}
else if(src.type() == CV_32FC1)
{
format.image_channel_order = CL_R;
format.image_channel_data_type = CL_FLOAT;
}
else if(src.type() == CV_32FC4)
{
format.image_channel_order = CL_RGBA;
format.image_channel_data_type = CL_FLOAT;
}
cl_mem srcImage = clCreateImage2D(clCxt->impl->clContext, flags1, &format, src.cols, src.rows,
image_row_pitch, NULL, &err1);
if(err1 != CL_SUCCESS)
{
printf("Error creating CL image buffer, error code %d\n", err1);
return;
}
const size_t src_origin[3] = {0, 0, 0};
const size_t region[3] = {src.cols, src.rows, 1};
cl_event BtoI_event, ItoB_event;
err3 = clEnqueueCopyBufferToImage(clCxt->impl->clCmdQueue, (cl_mem)src.data, srcImage,
0, src_origin, region, 0, NULL, NULL);
if(err3 != CL_SUCCESS)
{
printf("Error copying buffer to image\n");
printf("Error code %d \n", err3);
return;
}
// clWaitForEvents(1, &BtoI_event);
cl_int ret;
Mat test(src.rows, src.cols, CV_8UC1);
memset(test.data, 0, src.rows*src.cols);
ret = clEnqueueReadImage(clCxt->impl->clCmdQueue, srcImage, CL_TRUE,
src_origin, region, 0, 0, test.data, NULL, NULL, &ItoB_event);
if(ret != CL_SUCCESS)
{
printf("read image error, %d ", ret);
return;
}
clWaitForEvents(1, &ItoB_event);
cout << "src" << endl;
cout << src << endl;
cout<<"image:"<<endl;
cout<< test << endl;
*/
vector< pair<size_t, const void *> > args; vector< pair<size_t, const void *> > args;
@ -396,7 +329,6 @@ namespace cv
{ {
args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data)); args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data)); args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
// args.push_back( make_pair(sizeof(cl_mem),(void*)&srcImage)); //imageBuffer
args.push_back( make_pair(sizeof(cl_mem), (void *)&map1.data)); args.push_back( make_pair(sizeof(cl_mem), (void *)&map1.data));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset)); args.push_back( make_pair(sizeof(cl_int), (void *)&src.offset));
@ -425,7 +357,6 @@ namespace cv
{ {
args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data)); args.push_back( make_pair(sizeof(cl_mem), (void *)&dst.data));
args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data)); args.push_back( make_pair(sizeof(cl_mem), (void *)&src.data));
// args.push_back( make_pair(sizeof(cl_mem),(void*)&srcImage)); //imageBuffer
args.push_back( make_pair(sizeof(cl_mem), (void *)&map1.data)); args.push_back( make_pair(sizeof(cl_mem), (void *)&map1.data));
args.push_back( make_pair(sizeof(cl_mem), (void *)&map2.data)); args.push_back( make_pair(sizeof(cl_mem), (void *)&map2.data));
args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset)); args.push_back( make_pair(sizeof(cl_int), (void *)&dst.offset));
@ -1369,10 +1300,10 @@ namespace cv
if( src.depth() != CV_8U || src.oclchannels() != 4 ) if( src.depth() != CV_8U || src.oclchannels() != 4 )
CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" ); CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
if(src.clCxt->impl->double_support == 0) // if(src.clCxt->impl->double_support == 0)
{ // {
CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation is exists.\nIf the accuracy is acceptable, the error can be ignored.\n"); // CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n");
} // }
dst.create( src.size(), CV_8UC4 ); dst.create( src.size(), CV_8UC4 );
@ -1437,10 +1368,10 @@ namespace cv
if( src.depth() != CV_8U || src.oclchannels() != 4 ) if( src.depth() != CV_8U || src.oclchannels() != 4 )
CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" ); CV_Error( CV_StsUnsupportedFormat, "Only 8-bit, 4-channel images are supported" );
if(src.clCxt->impl->double_support == 0) // if(src.clCxt->impl->double_support == 0)
{ // {
CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation is exists.\nIf the accuracy is acceptable, the error can be ignored.\n"); // CV_Error( CV_GpuNotSupported, "Selected device doesn't support double, so a deviation exists.\nIf the accuracy is acceptable, the error can be ignored.\n");
} // }
dstr.create( src.size(), CV_8UC4 ); dstr.create( src.size(), CV_8UC4 );
dstsp.create( src.size(), CV_16SC2 ); dstsp.create( src.size(), CV_16SC2 );
@ -1603,7 +1534,7 @@ namespace cv
int borderType ) int borderType )
{ {
int cn = src.channels(); int cn = src.channels();
int i, j, k, maxk, radius; int i, j, maxk, radius;
Size size = src.size(); Size size = src.size();
CV_Assert( (src.channels() == 1 || src.channels() == 3) && CV_Assert( (src.channels() == 1 || src.channels() == 3) &&

@ -424,7 +424,7 @@ namespace cv
void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset, void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
const void *src, size_t spitch, const void *src, size_t spitch,
size_t width, size_t height, int src_offset, enum openCLMemcpyKind kind) size_t width, size_t height, int src_offset)
{ {
size_t src_origin[3] = {src_offset % spitch, src_offset / spitch, 0}; size_t src_origin[3] = {src_offset % spitch, src_offset / spitch, 0};
size_t dst_origin[3] = {dst_offset % dpitch, dst_offset / dpitch, 0}; size_t dst_origin[3] = {dst_offset % dpitch, dst_offset / dpitch, 0};
@ -451,7 +451,7 @@ namespace cv
} }
int savetofile(const Context *clcxt, cl_program &program, const char *fileName) int savetofile(const Context *clcxt, cl_program &program, const char *fileName)
{ {
cl_int status; //cl_int status;
size_t numDevices = 1; size_t numDevices = 1;
cl_device_id *devices = clcxt->impl->devices; cl_device_id *devices = clcxt->impl->devices;
//figure out the sizes of each of the binaries. //figure out the sizes of each of the binaries.
@ -507,7 +507,7 @@ namespace cv
FILE *fp = fopen(fileName, "wb+"); FILE *fp = fopen(fileName, "wb+");
if(fp == NULL) if(fp == NULL)
{ {
char *temp; char *temp = NULL;
sprintf(temp, "Failed to load kernel file : %s\r\n", fileName); sprintf(temp, "Failed to load kernel file : %s\r\n", fileName);
CV_Error(CV_GpuApiCallError, temp); CV_Error(CV_GpuApiCallError, temp);
} }
@ -639,8 +639,7 @@ namespace cv
return kernel; return kernel;
} }
void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *blockSize, void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads)
size_t *globalThreads, size_t *localThreads)
{ {
size_t kernelWorkGroupSize; size_t kernelWorkGroupSize;
openCLSafeCall(clGetKernelWorkGroupInfo(kernel, clCxt->impl->devices[0], openCLSafeCall(clGetKernelWorkGroupInfo(kernel, clCxt->impl->devices[0],
@ -679,10 +678,10 @@ namespace cv
globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1]; globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2]; globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];
size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2]; //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
cv::ocl::openCLVerifyKernel(clCxt, kernel, &blockSize, globalThreads, localThreads); cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads);
} }
for(int i = 0; i < args.size(); i ++) for(size_t i = 0; i < args.size(); i ++)
openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second)); openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
#ifndef PRINT_KERNEL_RUN_TIME #ifndef PRINT_KERNEL_RUN_TIME
@ -897,7 +896,7 @@ namespace cv
impl->maxComputeUnits = m.impl->maxComputeUnits; impl->maxComputeUnits = m.impl->maxComputeUnits;
impl->double_support = m.impl->double_support; impl->double_support = m.impl->double_support;
memcpy(impl->extra_options, m.impl->extra_options, 512); memcpy(impl->extra_options, m.impl->extra_options, 512);
for(int i = 0; i < m.impl->devices.size(); i++) for(size_t i = 0; i < m.impl->devices.size(); i++)
{ {
impl->devices.push_back(m.impl->devices[i]); impl->devices.push_back(m.impl->devices[i]);
impl->devName.push_back(m.impl->devName[i]); impl->devName.push_back(m.impl->devName[i]);

@ -61,30 +61,29 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data ,src2_data; uchar4 src1_data ,src2_data;
src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0; src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0;
src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0; src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0;
src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0; src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0;
src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0; src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0;
src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0; src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0;
src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0; src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0;
src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0; src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0;
src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0; src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0;
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
// short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama; // short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
@ -118,21 +117,35 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) & (int)0xfffffff8); int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) & (int)0xfffffff8);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
// int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama; // int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
@ -164,22 +177,36 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) - (dst_align << 1 )); int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) - (dst_align << 1 ));
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); int src1_index_fix = src1_index < 0 ? 0 : src1_index;
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
// int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama; // int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
int4 tmp; int4 tmp;
@ -209,24 +236,39 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define bitOfInt (sizeof(int)== 4 ? 2: 3) #define bitOfInt (sizeof(int)== 4 ? 2: 3)
#define dst_align ((dst_offset >> bitOfInt) & 3) #define dst_align ((dst_offset >> bitOfInt) & 3)
int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt)); int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt)); int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt)); int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
int4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
int4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
// double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ; // double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
float4 tmp; float4 tmp;
@ -257,23 +299,37 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 2) & 3) #define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2)); int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index)); int src1_index_fix = src1_index < 0 ? 0 : src1_index;
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index)); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
if(src1_index < 0)
{
float4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
float4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
// double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ; // double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
// float4 tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ; // float4 tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
@ -305,23 +361,37 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 3) & 3) #define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3)); int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index)); int src1_index_fix = src1_index < 0 ? 0 : src1_index;
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index)); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index)); double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
if(src1_index < 0)
{
double4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
double4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
// double4 tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ; // double4 tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
double4 tmp_data; double4 tmp_data;
tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama; tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;

@ -63,15 +63,29 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
uchar4 src1_data = vload4(0, src1 + src1_index); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src2_data = vload4(0, src2 + src2_index); uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data & src2_data; uchar4 tmp_data = src1_data & src2_data;
@ -99,16 +113,30 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
char4 src1_data = vload4(0, src1 + src1_index); int src1_index_fix = src1_index < 0 ? 0 : src1_index;
char4 src2_data = vload4(0, src2 + src2_index); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
char4 src1_data = vload4(0, src1 + src1_index_fix);
char4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
char4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
char4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
char4 dst_data = *((__global char4 *)(dst + dst_index)); char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data & src2_data; char4 tmp_data = src1_data & src2_data;
@ -136,16 +164,30 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); int src1_index_fix = src1_index < 0 ? 0 : src1_index;
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 tmp_data = src1_data & src2_data; ushort4 tmp_data = src1_data & src2_data;
@ -174,16 +216,30 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); int src1_index_fix = src1_index < 0 ? 0 : src1_index;
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 tmp_data = src1_data & src2_data; short4 tmp_data = src1_data & src2_data;

@ -62,17 +62,24 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
uchar4 src1_data = vload4(0, src1 + src1_index); uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = ~ src1_data; uchar4 tmp_data = ~ src1_data;
/* if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
*/
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
@ -95,7 +102,7 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -129,7 +136,7 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -164,7 +171,7 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -238,12 +245,12 @@ __kernel void arithm_bitwise_not_D6 (__global char *src, int src_step, int src_o
{ {
int src_index = mad24(y, src_step, (x << 3) + src_offset); int src_index = mad24(y, src_step, (x << 3) + src_offset);
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); int dst_index = mad24(y, dst_step, (x << 3) + dst_offset);
char8 data; char8 data;
data = *((__global char8 *)((__global char *)src + src_index)); data = *((__global char8 *)((__global char *)src + src_index));
data = ~ data; data = ~ data;
*((__global char8 *)((__global char *)dst + dst_index)) = data; *((__global char8 *)((__global char *)dst + dst_index)) = data;
} }
} }

@ -63,16 +63,28 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
uchar4 src1_data = vload4(0, src1 + src1_index); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src2_data = vload4(0, src2 + src2_index); uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data | src2_data; uchar4 tmp_data = src1_data | src2_data;
@ -99,8 +111,8 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -136,8 +148,8 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
@ -174,8 +186,8 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);

@ -63,16 +63,30 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index); int src1_index_fix = src1_index < 0 ? 0 : src1_index;
uchar4 src2_data = vload4(0, src2 + src2_index); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data ^ src2_data; uchar4 tmp_data = src1_data ^ src2_data;
@ -99,16 +113,30 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
char4 src1_data = vload4(0, src1 + src1_index); int src1_index_fix = src1_index < 0 ? 0 : src1_index;
char4 src2_data = vload4(0, src2 + src2_index); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
char4 src1_data = vload4(0, src1 + src1_index_fix);
char4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
char4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
char4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
char4 dst_data = *((__global char4 *)(dst + dst_index)); char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data ^ src2_data; char4 tmp_data = src1_data ^ src2_data;
@ -136,16 +164,30 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); int src1_index_fix = src1_index < 0 ? 0 : src1_index;
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 tmp_data = src1_data ^ src2_data; ushort4 tmp_data = src1_data ^ src2_data;
@ -174,17 +216,35 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); int src1_index_fix = src1_index < 0 ? 0 : src1_index;
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
short4 tmp_data = src1_data ^ src2_data; short4 tmp_data = src1_data ^ src2_data;
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;

@ -63,16 +63,31 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
uchar4 src1_data = vload4(0, src1 + src1_index); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src2_data = vload4(0, src2 + src2_index); uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
@ -85,7 +100,8 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src
} }
} }
__kernel void arithm_compare_eq_D2 (__global ushort *src1, int src1_step, int src1_offset,
__kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset, __global ushort *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset, __global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
@ -98,16 +114,30 @@ __kernel void arithm_compare_eq_D2 (__global ushort *src1, int src1_step, int sr
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1)& 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
@ -122,7 +152,6 @@ __kernel void arithm_compare_eq_D2 (__global ushort *src1, int src1_step, int sr
} }
__kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src1_offset, __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset, __global short *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset, __global uchar *dst, int dst_step, int dst_offset,
@ -137,16 +166,32 @@ __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
@ -170,18 +215,33 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 2) & 3) #define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
int4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
int4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
@ -206,15 +266,23 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 2) & 3) #define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src2_index < 0)
{
float4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
@ -240,15 +308,30 @@ __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int sr
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 3) & 3) #define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
double4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
double4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); uchar4 tmp_data = convert_uchar4((src1_data == src2_data));
@ -276,16 +359,31 @@ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
uchar4 src1_data = vload4(0, src1 + src1_index); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src2_data = vload4(0, src2 + src2_index); uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
@ -312,16 +410,31 @@ __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
@ -350,15 +463,30 @@ __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
@ -384,15 +512,31 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 2) & 3) #define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
int4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
int4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
@ -417,15 +561,30 @@ __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 2) & 3) #define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
float4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
float4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
@ -451,15 +610,30 @@ __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int sr
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 3) & 3) #define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
double4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
double4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); uchar4 tmp_data = convert_uchar4((src1_data > src2_data));
@ -487,15 +661,31 @@ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
uchar4 src1_data = vload4(0, src1 + src1_index); int src1_index_fix = src1_index < 0 ? 0 : src1_index;
uchar4 src2_data = vload4(0, src2 + src2_index); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
@ -525,15 +715,32 @@ __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
@ -563,15 +770,31 @@ __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1)& 3) #define dst_align ((dst_offset >> 1)& 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
@ -598,16 +821,31 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 2)& 3) #define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); if(src1_index < 0)
{
int4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
int4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@ -632,15 +870,31 @@ __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 2)& 3) #define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index)); int src1_index_fix = src1_index < 0 ? 0 : src1_index;
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index)); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
float4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
float4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
@ -667,16 +921,28 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 3)& 3) #define dst_align ((dst_offset >> 3)& 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index)); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index)); double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
double4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
double4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;

@ -59,15 +59,29 @@ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
uchar4 src1_data = vload4(0, src1 + src1_index); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src2_data = vload4(0, src2 + src2_index); uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
@ -97,15 +111,29 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1)& 3) #define dst_align ((dst_offset >> 1)& 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
@ -135,15 +163,29 @@ __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1)& 3) #define dst_align ((dst_offset >> 1)& 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
@ -169,15 +211,31 @@ __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 2)& 3) #define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
int4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
int4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
@ -202,16 +260,29 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 2) & 3) #define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index)); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index)); float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src1_index < 0)
{
float4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
float4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@ -236,15 +307,30 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 3) & 3) #define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
double4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
double4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
@ -258,7 +344,7 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
} }
#endif #endif
/***********************************Compare LT*******************************/ /***********************************Compare LT*******************************/
__kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset, __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset, __global uchar *src2, int src2_step, int src2_offset,
@ -273,15 +359,29 @@ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
uchar4 src1_data = vload4(0, src1 + src1_index); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src2_data = vload4(0, src2 + src2_index); uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
@ -311,15 +411,30 @@ __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
@ -349,15 +464,30 @@ __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
@ -383,15 +513,34 @@ __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 2) & 3) #define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
int4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
int4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
@ -416,16 +565,31 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 2) & 3) #define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index)); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index)); float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
float4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
float4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@ -450,16 +614,31 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 3) & 3) #define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index)); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index)); double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
double4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
double4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@ -486,15 +665,30 @@ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data)); uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
@ -524,15 +718,30 @@ __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int sr
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data)); uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
@ -562,15 +771,30 @@ __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data)); uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
@ -596,15 +820,30 @@ __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 2)& 3) #define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
int4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
int4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data =convert_uchar4((src1_data <= src2_data)); uchar4 tmp_data =convert_uchar4((src1_data <= src2_data));
@ -629,15 +868,29 @@ __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 2)& 3) #define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
float4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
float4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data)); uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));
@ -663,15 +916,30 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 3)& 3) #define dst_align ((dst_offset >> 3)& 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
double4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
double4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data <= src2_data)); uchar4 tmp_data = convert_uchar4((src1_data <= src2_data));

@ -60,23 +60,36 @@ __kernel void magnitudeSqr_C1_D5 (__global float *src1,int src1_step,int src1_of
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 2) & 3) #define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2)); int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index)); int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index)); float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
float4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
float4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
float4 tmp_data ; float4 tmp_data ;
@ -112,21 +125,32 @@ __kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_of
int y = get_global_id(1); int y = get_global_id(1);
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 2) & 3) #define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2)); int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
float8 src1_data = vload8(0, (__global float *)((__global char *)src1 + src1_index_fix));
if(src1_index==-6)
src1_data.s01234567 = src1_data.s67012345;
if(src1_index==-4)
src1_data.s01234567 = src1_data.s45670123;
if(src1_index== -2)
src1_data.s01234567 = src1_data.s23456701;
float8 src1_data = vload8(0, (__global float *)((__global char *)src1 + src1_index));
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
float4 tmp_data ; float4 tmp_data ;

File diff suppressed because it is too large Load Diff

@ -51,9 +51,9 @@
////////////vector fuction name format: split_vector_C(channels number)_D(data type depth)////// ////////////vector fuction name format: split_vector_C(channels number)_D(data type depth)//////
//////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int src_offset, __kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int src_offset,
__global uchar *mat_dst0, int dst0_step, int dst0_offset, __global uchar *mat_dst0, int dst0_step, int dst0_offset,
__global uchar *mat_dst1, int dst1_step, int dst1_offset, __global uchar *mat_dst1, int dst1_step, int dst1_offset,
__global uchar *mat_dst2, int dst2_step, int dst2_offset, __global uchar *mat_dst2, int dst2_step, int dst2_offset,
__global uchar *mat_dst3, int dst3_step, int dst3_offset, __global uchar *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
@ -61,37 +61,37 @@ __kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int s
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
x = x << 2; x = x << 2;
int src_idx = mad24(y, src_step, src_offset + (x << 2)); int src_idx = mad24(y, src_step, src_offset + (x << 2));
int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x) & (int)0xfffffffc; int dst0_idx = mad24(y, dst0_step, dst0_offset + x) & (int)0xfffffffc;
int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x) & (int)0xfffffffc; int dst1_idx = mad24(y, dst1_step, dst1_offset + x) & (int)0xfffffffc;
int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x) & (int)0xfffffffc; int dst2_idx = mad24(y, dst2_step, dst2_offset + x) & (int)0xfffffffc;
int dst3_start = mad24(y, dst3_step, dst3_offset); int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1); int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + x) & (int)0xfffffffc; int dst3_idx = mad24(y, dst3_step, dst3_offset + x) & (int)0xfffffffc;
uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx)));
uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx)));
uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx)));
uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 ));
uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx))); int total_bytes = src_offset + rows * src_step;
uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx))); uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx)));
uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx))); uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx)));
uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 )); uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));
int total_bytes = src_offset + rows * src_step;
uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx)));
uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx)));
uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx)));
uchar4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3; uchar4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
@ -164,33 +164,33 @@ __kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int s
} }
__kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int src_offset, __kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int src_offset,
__global uchar *mat_dst0, int dst0_step, int dst0_offset, __global uchar *mat_dst0, int dst0_step, int dst0_offset,
__global uchar *mat_dst1, int dst1_step, int dst1_offset, __global uchar *mat_dst1, int dst1_step, int dst1_offset,
__global uchar *mat_dst2, int dst2_step, int dst2_offset, __global uchar *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
x = x << 2; x = x << 2;
int src_idx = mad24(y, src_step, src_offset); int src_idx = mad24(y, src_step, src_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc); int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc); int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc); int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx)); uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx)); uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx)); uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx));
@ -227,10 +227,10 @@ __kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int s
uchar data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18}; uchar data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
int index = 3 - dst0_offset & 3; int index = 3 - dst0_offset & 3;
tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]); tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
uchar4 data0, data1, data2; uchar4 data0, data1, data2;
data0 = (uchar4)(src_data_1, src_data_4, src_data_7, src_data_10); data0 = (uchar4)(src_data_1, src_data_4, src_data_7, src_data_10);
data1 = (dst1_offset & 3) == 2 ? (uchar4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0; data1 = (dst1_offset & 3) == 2 ? (uchar4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
data2 = (dst1_offset & 3) == 1 ? (uchar4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1; data2 = (dst1_offset & 3) == 1 ? (uchar4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
@ -263,33 +263,47 @@ __kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int s
} }
__kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int src_offset, __kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int src_offset,
__global uchar *mat_dst0, int dst0_step, int dst0_offset, __global uchar *mat_dst0, int dst0_step, int dst0_offset,
__global uchar *mat_dst1, int dst1_step, int dst1_offset, __global uchar *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
x = x << 2; x = x << 2;
#define dst0_align ((dst0_offset & 3) << 1) #define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1) #define dst1_align ((dst1_offset & 3) << 1)
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1)); int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1)); int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc); int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc); int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
uchar8 src_data_0 = vload8(0, mat_src + src_idx_0); uchar8 src_data_0 = vload8(0, mat_src + src_idx_0);
uchar8 src_data_1 = vload8(0, mat_src + src_idx_1); uchar8 src_data_1 = vload8(0, mat_src + src_idx_1);
if(src_idx_0 == -6)
src_data_0.s01234567 = src_data_0.s67012345;
if(src_idx_0 == -4)
src_data_0.s01234567 = src_data_0.s45670123;
if(src_idx_0 == -2)
src_data_0.s01234567 = src_data_0.s23456701;
if(src_idx_1 == -6)
src_data_1.s01234567 = src_data_1.s67012345;
if(src_idx_1 == -4)
src_data_1.s01234567 = src_data_1.s45670123;
if(src_idx_1 == -2)
src_data_1.s01234567 = src_data_1.s23456701;
uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx)); uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx));
uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx)); uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx));
@ -312,9 +326,9 @@ __kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int s
} }
__kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int src_offset, __kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int src_offset,
__global char *mat_dst0, int dst0_step, int dst0_offset, __global char *mat_dst0, int dst0_step, int dst0_offset,
__global char *mat_dst1, int dst1_step, int dst1_offset, __global char *mat_dst1, int dst1_step, int dst1_offset,
__global char *mat_dst2, int dst2_step, int dst2_offset, __global char *mat_dst2, int dst2_step, int dst2_offset,
__global char *mat_dst3, int dst3_step, int dst3_offset, __global char *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
@ -322,35 +336,35 @@ __kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int sr
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
x = x << 2; x = x << 2;
int src_idx = mad24(y, src_step, src_offset + (x << 2)); int src_idx = mad24(y, src_step, src_offset + (x << 2));
int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc); int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc); int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc); int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
int dst3_start = mad24(y, dst3_step, dst3_offset); int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1); int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + x & (int)0xfffffffc); int dst3_idx = mad24(y, dst3_step, dst3_offset + x & (int)0xfffffffc);
char4 data_0 = *((global char4 *)(mat_src + src_idx - 12)); char4 data_0 = *((global char4 *)(mat_src + src_idx - 12));
char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 )); char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 ));
char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 )); char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 ));
char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 )); char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 ));
char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 )); char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 ));
char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 )); char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 ));
char4 data_6 = *((global char4 *)(mat_src + src_idx + 12)); char4 data_6 = *((global char4 *)(mat_src + src_idx + 12));
char4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3; char4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3;
@ -423,33 +437,33 @@ __kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int sr
} }
__kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int src_offset, __kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int src_offset,
__global char *mat_dst0, int dst0_step, int dst0_offset, __global char *mat_dst0, int dst0_step, int dst0_offset,
__global char *mat_dst1, int dst1_step, int dst1_offset, __global char *mat_dst1, int dst1_step, int dst1_offset,
__global char *mat_dst2, int dst2_step, int dst2_offset, __global char *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
x = x << 2; x = x << 2;
int src_idx = mad24(y, src_step, src_offset); int src_idx = mad24(y, src_step, src_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc); int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc); int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc); int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc);
char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx)); char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx)); char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
char4 dst2_data = *((__global char4 *)(mat_dst2 + dst2_idx)); char4 dst2_data = *((__global char4 *)(mat_dst2 + dst2_idx));
@ -486,10 +500,10 @@ __kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int sr
char data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18}; char data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18};
int index = 3 - dst0_offset & 3; int index = 3 - dst0_offset & 3;
tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]); tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]);
char4 data0, data1, data2; char4 data0, data1, data2;
data0 = (char4)(src_data_1, src_data_4, src_data_7, src_data_10); data0 = (char4)(src_data_1, src_data_4, src_data_7, src_data_10);
data1 = (dst1_offset & 3) == 2 ? (char4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0; data1 = (dst1_offset & 3) == 2 ? (char4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0;
data2 = (dst1_offset & 3) == 1 ? (char4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1; data2 = (dst1_offset & 3) == 1 ? (char4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1;
@ -522,34 +536,46 @@ __kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int sr
} }
__kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int src_offset, __kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int src_offset,
__global char *mat_dst0, int dst0_step, int dst0_offset, __global char *mat_dst0, int dst0_step, int dst0_offset,
__global char *mat_dst1, int dst1_step, int dst1_offset, __global char *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
x = x << 2; x = x << 2;
#define dst0_align ((dst0_offset & 3) << 1) #define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1) #define dst1_align ((dst1_offset & 3) << 1)
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1)); int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1)); int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1));
int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc); int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc); int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc);
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
char8 src_data_0 = vload8(0, mat_src + src_idx_0); char8 src_data_0 = vload8(0, mat_src + src_idx_0);
char8 src_data_1 = vload8(0, mat_src + src_idx_1); char8 src_data_1 = vload8(0, mat_src + src_idx_1);
if(src_idx_0 == -6)
src_data_0.s01234567 = src_data_0.s67012345;
if(src_idx_0 == -4)
src_data_0.s01234567 = src_data_0.s45670123;
if(src_idx_0 == -2)
src_data_0.s01234567 = src_data_0.s23456701;
if(src_idx_1 == -6)
src_data_1.s01234567 = src_data_1.s67012345;
if(src_idx_1 == -4)
src_data_1.s01234567 = src_data_1.s45670123;
if(src_idx_1 == -2)
src_data_1.s01234567 = src_data_1.s23456701;
char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx)); char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx));
char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx)); char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx));
@ -571,9 +597,9 @@ __kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int sr
} }
__kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int src_offset, __kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int src_offset,
__global ushort *mat_dst0, int dst0_step, int dst0_offset, __global ushort *mat_dst0, int dst0_step, int dst0_offset,
__global ushort *mat_dst1, int dst1_step, int dst1_offset, __global ushort *mat_dst1, int dst1_step, int dst1_offset,
__global ushort *mat_dst2, int dst2_step, int dst2_offset, __global ushort *mat_dst2, int dst2_step, int dst2_offset,
__global ushort *mat_dst3, int dst3_step, int dst3_offset, __global ushort *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
@ -581,30 +607,37 @@ __kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
x = x << 1; x = x << 1;
int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8); int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8); int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc); int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
int dst3_start = mad24(y, dst3_step, dst3_offset); int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1); int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc); int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
ushort8 src_data0 = vload8(0, (__global ushort *)((__global char *)mat_src + src_idx_0)); int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
ushort8 src_data0 = vload8(0,(__global ushort *)((__global char *)mat_src + src_idx_0));
if(src_idx_0 == -6)
src_data0.s01234567 = src_data0.s67012345;
if(src_idx_0 == -4)
src_data0.s01234567 = src_data0.s45670123;
if(src_idx_0 == -2)
src_data0.s01234567 = src_data0.s23456701;
ushort4 src_data1 = *((__global ushort4 *)((__global char *)mat_src + src_idx_1)); ushort4 src_data1 = *((__global ushort4 *)((__global char *)mat_src + src_idx_1));
ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx)); ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
@ -639,33 +672,33 @@ __kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int
} }
__kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int src_offset, __kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int src_offset,
__global ushort *mat_dst0, int dst0_step, int dst0_offset, __global ushort *mat_dst0, int dst0_step, int dst0_offset,
__global ushort *mat_dst1, int dst1_step, int dst1_offset, __global ushort *mat_dst1, int dst1_step, int dst1_offset,
__global ushort *mat_dst2, int dst2_step, int dst2_offset, __global ushort *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
x = x << 1; x = x << 1;
int src_idx = mad24(y, src_step, src_offset); int src_idx = mad24(y, src_step, src_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc); int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx)); ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx)); ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
ushort2 dst2_data = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx)); ushort2 dst2_data = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx));
@ -702,34 +735,48 @@ __kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int
} }
__kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int src_offset, __kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int src_offset,
__global ushort *mat_dst0, int dst0_step, int dst0_offset, __global ushort *mat_dst0, int dst0_step, int dst0_offset,
__global ushort *mat_dst1, int dst1_step, int dst1_offset, __global ushort *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
x = x << 1; x = x << 1;
#define dst0_align ((dst0_offset & 3) << 1) #define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1) #define dst1_align ((dst1_offset & 3) << 1)
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2)); int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2)); int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)mat_src + src_idx_0)); int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)mat_src + src_idx_1)); int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)mat_src + src1_index_fix));
ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)mat_src + src2_index_fix));
if(src_idx_0 < 0)
{
ushort4 tmp;
tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
src_data_0.xyzw = (src_idx_1 == -1) ? src_data_0.wxyz:tmp.xyzw;
}
if(src_idx_1 < 0)
{
ushort4 tmp;
tmp.xyzw = (src_idx_1 == -2) ? src_data_1.zwxy : src_data_1.yzwx;
src_data_1.xyzw = (src_idx_1 == -1) ? src_data_1.wxyz : tmp.xyzw;
}
ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx)); ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx));
ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx)); ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx));
@ -746,9 +793,9 @@ __kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int
} }
} }
__kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int src_offset, __kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int src_offset,
__global short *mat_dst0, int dst0_step, int dst0_offset, __global short *mat_dst0, int dst0_step, int dst0_offset,
__global short *mat_dst1, int dst1_step, int dst1_offset, __global short *mat_dst1, int dst1_step, int dst1_offset,
__global short *mat_dst2, int dst2_step, int dst2_offset, __global short *mat_dst2, int dst2_step, int dst2_offset,
__global short *mat_dst3, int dst3_step, int dst3_offset, __global short *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
@ -756,30 +803,38 @@ __kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int s
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
x = x << 1; x = x << 1;
int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8); int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8);
int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8); int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8);
int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc); int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
int dst3_start = mad24(y, dst3_step, dst3_offset); int dst3_start = mad24(y, dst3_step, dst3_offset);
int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1); int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1);
int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc); int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc);
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
short8 src_data0 = vload8(0, (__global short *)((__global char *)mat_src + src_idx_0)); short8 src_data0 = vload8(0,(__global short *)((__global char *)mat_src + src_idx_0));
if(src_idx_0 == -6)
src_data0.s01234567 = src_data0.s67012345;
if(src_idx_0 == -4)
src_data0.s01234567 = src_data0.s45670123;
if(src_idx_0 == -2)
src_data0.s01234567 = src_data0.s23456701;
short4 src_data1 = *((__global short4 *)((__global char *)mat_src + src_idx_1)); short4 src_data1 = *((__global short4 *)((__global char *)mat_src + src_idx_1));
short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx)); short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
@ -813,33 +868,33 @@ __kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int s
} }
} }
__kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int src_offset, __kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int src_offset,
__global short *mat_dst0, int dst0_step, int dst0_offset, __global short *mat_dst0, int dst0_step, int dst0_offset,
__global short *mat_dst1, int dst1_step, int dst1_offset, __global short *mat_dst1, int dst1_step, int dst1_offset,
__global short *mat_dst2, int dst2_step, int dst2_offset, __global short *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
x = x << 1; x = x << 1;
int src_idx = mad24(y, src_step, src_offset); int src_idx = mad24(y, src_step, src_offset);
int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_start = mad24(y, dst2_step, dst2_offset);
int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1);
int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc); int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc);
short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx)); short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx)); short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
short2 dst2_data = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx)); short2 dst2_data = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx));
@ -877,33 +932,47 @@ __kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int s
__kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int src_offset, __kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int src_offset,
__global short *mat_dst0, int dst0_step, int dst0_offset, __global short *mat_dst0, int dst0_step, int dst0_offset,
__global short *mat_dst1, int dst1_step, int dst1_offset, __global short *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
x = x << 1; x = x << 1;
#define dst0_align ((dst0_offset & 3) << 1) #define dst0_align ((dst0_offset & 3) << 1)
#define dst1_align ((dst1_offset & 3) << 1) #define dst1_align ((dst1_offset & 3) << 1)
int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2)); int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2));
int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2)); int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2));
int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_start = mad24(y, dst0_step, dst0_offset);
int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1);
int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc);
int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_start = mad24(y, dst1_step, dst1_offset);
int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1);
int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc);
int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0;
int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1;
short4 src_data_0 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_0)); short4 src_data_0 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_0));
short4 src_data_1 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_1)); short4 src_data_1 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_1));
if(src_idx_0 < 0)
{
short4 tmp;
tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx;
src_data_0.xyzw = (src_idx_0 == -1) ? src_data_0.wxyz:tmp.xyzw;
}
if(src_idx_1< 0)
{
short4 tmp;
tmp.xyzw = ( src_idx_1== -2) ? src_data_1.zwxy : src_data_1.yzwx;
src_data_1.xyzw = ( src_idx_1== -1) ? src_data_1.wxyz : tmp.xyzw;
}
short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx)); short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx));
short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx)); short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx));
@ -921,9 +990,9 @@ __kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int s
} }
} }
__kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src_offset, __kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src_offset,
__global int *mat_dst0, int dst0_step, int dst0_offset, __global int *mat_dst0, int dst0_step, int dst0_offset,
__global int *mat_dst1, int dst1_step, int dst1_offset, __global int *mat_dst1, int dst1_step, int dst1_offset,
__global int *mat_dst2, int dst2_step, int dst2_offset, __global int *mat_dst2, int dst2_step, int dst2_offset,
__global int *mat_dst3, int dst3_step, int dst3_offset, __global int *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
@ -931,14 +1000,14 @@ __kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
int src_idx = mad24(y, src_step, src_offset); int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset); int dst2_idx = mad24(y, dst2_step, dst2_offset);
int dst3_idx = mad24(y, dst3_step, dst3_offset); int dst3_idx = mad24(y, dst3_step, dst3_offset);
int4 src_data = ((__global int4 *)((__global char *)mat_src + src_idx))[x]; int4 src_data = ((__global int4 *)((__global char *)mat_src + src_idx))[x];
((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@ -948,18 +1017,18 @@ __kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src
} }
} }
__kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src_offset, __kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src_offset,
__global int *mat_dst0, int dst0_step, int dst0_offset, __global int *mat_dst0, int dst0_step, int dst0_offset,
__global int *mat_dst1, int dst1_step, int dst1_offset, __global int *mat_dst1, int dst1_step, int dst1_offset,
__global int *mat_dst2, int dst2_step, int dst2_offset, __global int *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
int src_idx = mad24(y, src_step, src_offset); int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset); int dst2_idx = mad24(y, dst2_step, dst2_offset);
@ -975,20 +1044,20 @@ __kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src
} }
__kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src_offset, __kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src_offset,
__global int *mat_dst0, int dst0_step, int dst0_offset, __global int *mat_dst0, int dst0_step, int dst0_offset,
__global int *mat_dst1, int dst1_step, int dst1_offset, __global int *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
int src_idx = mad24(y, src_step, src_offset); int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset);
int2 src_data = ((__global int2 *)((__global char *)mat_src + src_idx))[x]; int2 src_data = ((__global int2 *)((__global char *)mat_src + src_idx))[x];
((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@ -997,9 +1066,9 @@ __kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src
} }
__kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int src_offset, __kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int src_offset,
__global float *mat_dst0, int dst0_step, int dst0_offset, __global float *mat_dst0, int dst0_step, int dst0_offset,
__global float *mat_dst1, int dst1_step, int dst1_offset, __global float *mat_dst1, int dst1_step, int dst1_offset,
__global float *mat_dst2, int dst2_step, int dst2_offset, __global float *mat_dst2, int dst2_step, int dst2_offset,
__global float *mat_dst3, int dst3_step, int dst3_offset, __global float *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
@ -1007,14 +1076,14 @@ __kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int s
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
int src_idx = mad24(y, src_step, src_offset); int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset); int dst2_idx = mad24(y, dst2_step, dst2_offset);
int dst3_idx = mad24(y, dst3_step, dst3_offset); int dst3_idx = mad24(y, dst3_step, dst3_offset);
float4 src_data = ((__global float4 *)((__global char *)mat_src + src_idx))[x]; float4 src_data = ((__global float4 *)((__global char *)mat_src + src_idx))[x];
((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@ -1025,18 +1094,18 @@ __kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int s
} }
__kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int src_offset, __kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int src_offset,
__global float *mat_dst0, int dst0_step, int dst0_offset, __global float *mat_dst0, int dst0_step, int dst0_offset,
__global float *mat_dst1, int dst1_step, int dst1_offset, __global float *mat_dst1, int dst1_step, int dst1_offset,
__global float *mat_dst2, int dst2_step, int dst2_offset, __global float *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
int src_idx = mad24(y, src_step, src_offset); int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset); int dst2_idx = mad24(y, dst2_step, dst2_offset);
@ -1052,20 +1121,20 @@ __kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int s
} }
__kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int src_offset, __kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int src_offset,
__global float *mat_dst0, int dst0_step, int dst0_offset, __global float *mat_dst0, int dst0_step, int dst0_offset,
__global float *mat_dst1, int dst1_step, int dst1_offset, __global float *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
int src_idx = mad24(y, src_step, src_offset); int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset);
float2 src_data = ((__global float2 *)((__global char *)mat_src + src_idx))[x]; float2 src_data = ((__global float2 *)((__global char *)mat_src + src_idx))[x];
((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@ -1075,9 +1144,9 @@ __kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int s
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
__kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int src_offset, __kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int src_offset,
__global double *mat_dst0, int dst0_step, int dst0_offset, __global double *mat_dst0, int dst0_step, int dst0_offset,
__global double *mat_dst1, int dst1_step, int dst1_offset, __global double *mat_dst1, int dst1_step, int dst1_offset,
__global double *mat_dst2, int dst2_step, int dst2_offset, __global double *mat_dst2, int dst2_step, int dst2_offset,
__global double *mat_dst3, int dst3_step, int dst3_offset, __global double *mat_dst3, int dst3_step, int dst3_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
@ -1085,14 +1154,14 @@ __kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
int src_idx = mad24(y, src_step, src_offset); int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset); int dst2_idx = mad24(y, dst2_step, dst2_offset);
int dst3_idx = mad24(y, dst3_step, dst3_offset); int dst3_idx = mad24(y, dst3_step, dst3_offset);
double4 src_data = ((__global double4 *)((__global char *)mat_src + src_idx))[x]; double4 src_data = ((__global double4 *)((__global char *)mat_src + src_idx))[x];
((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;
@ -1103,18 +1172,18 @@ __kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int
} }
__kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int src_offset, __kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int src_offset,
__global double *mat_dst0, int dst0_step, int dst0_offset, __global double *mat_dst0, int dst0_step, int dst0_offset,
__global double *mat_dst1, int dst1_step, int dst1_offset, __global double *mat_dst1, int dst1_step, int dst1_offset,
__global double *mat_dst2, int dst2_step, int dst2_offset, __global double *mat_dst2, int dst2_step, int dst2_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
int src_idx = mad24(y, src_step, src_offset); int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset);
int dst2_idx = mad24(y, dst2_step, dst2_offset); int dst2_idx = mad24(y, dst2_step, dst2_offset);
@ -1130,20 +1199,20 @@ __kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int
} }
__kernel void split_vector_C2_D6 (__global double *mat_src, int src_step, int src_offset, __kernel void split_vector_C2_D6 (__global double *mat_src, int src_step, int src_offset,
__global double *mat_dst0, int dst0_step, int dst0_offset, __global double *mat_dst0, int dst0_step, int dst0_offset,
__global double *mat_dst1, int dst1_step, int dst1_offset, __global double *mat_dst1, int dst1_step, int dst1_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
if((x < cols) && (y < rows)) if((x < cols) && (y < rows))
{ {
int src_idx = mad24(y, src_step, src_offset); int src_idx = mad24(y, src_step, src_offset);
int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset);
int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset);
double2 src_data = ((__global double2 *)((__global char *)mat_src + src_idx))[x]; double2 src_data = ((__global double2 *)((__global char *)mat_src + src_idx))[x];
((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x;

@ -124,7 +124,7 @@ namespace cv
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth())) if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
{ {
matchTemplateNaive_SQDIFF(image, templ, result, image.channels()); matchTemplateNaive_SQDIFF(image, templ, result, image.oclchannels());
return; return;
} }
else else
@ -172,7 +172,7 @@ namespace cv
CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U ) CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
|| ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F) || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
); );
CV_Assert(image.channels() == templ.channels() && (image.channels() == 1 || image.oclchannels() == 4) && result.channels() == 1); CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1); CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
Context *clCxt = image.clCxt; Context *clCxt = image.clCxt;
@ -209,7 +209,7 @@ namespace cv
result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F); result.create(image.rows - templ.rows + 1, image.cols - templ.cols + 1, CV_32F);
if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth())) if (templ.size().area() < getTemplateThreshold(CV_TM_SQDIFF, image.depth()))
{ {
matchTemplateNaive_CCORR(image, templ, result, image.channels()); matchTemplateNaive_CCORR(image, templ, result, image.oclchannels());
return; return;
} }
else else
@ -220,8 +220,8 @@ namespace cv
image.convertTo(buf.imagef, CV_32F); image.convertTo(buf.imagef, CV_32F);
templ.convertTo(buf.templf, CV_32F); templ.convertTo(buf.templf, CV_32F);
} }
CV_Assert(image.channels() == 1); CV_Assert(image.oclchannels() == 1);
oclMat o_result(image.size(), CV_MAKETYPE(CV_32F, image.channels())); oclMat o_result(image.size(), CV_MAKETYPE(CV_32F, image.oclchannels()));
filter2D(buf.imagef, o_result, CV_32F, buf.templf, Point(0, 0)); filter2D(buf.imagef, o_result, CV_32F, buf.templf, Point(0, 0));
result = o_result(Rect(0, 0, image.rows - templ.rows + 1, image.cols - templ.cols + 1)); result = o_result(Rect(0, 0, image.rows - templ.rows + 1, image.cols - templ.cols + 1));
} }
@ -265,7 +265,7 @@ namespace cv
CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U ) CV_Assert((image.depth() == CV_8U && templ.depth() == CV_8U )
|| ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F) || ((image.depth() == CV_32F && templ.depth() == CV_32F) && result.depth() == CV_32F)
); );
CV_Assert(image.channels() == templ.channels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.channels() == 1); CV_Assert(image.oclchannels() == templ.oclchannels() && (image.oclchannels() == 1 || image.oclchannels() == 4) && result.oclchannels() == 1);
CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1); CV_Assert(result.rows == image.rows - templ.rows + 1 && result.cols == image.cols - templ.cols + 1);
Context *clCxt = image.clCxt; Context *clCxt = image.clCxt;
@ -320,7 +320,7 @@ namespace cv
args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset)); args.push_back( make_pair( sizeof(cl_int), (void *)&result.offset));
args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
// to be continued in the following section // to be continued in the following section
if(image.channels() == 1) if(image.oclchannels() == 1)
{ {
buf.image_sums.resize(1); buf.image_sums.resize(1);
integral(image, buf.image_sums[0]); integral(image, buf.image_sums[0]);
@ -340,7 +340,7 @@ namespace cv
buf.image_sums.resize(buf.images.size()); buf.image_sums.resize(buf.images.size());
for(int i = 0; i < image.channels(); i ++) for(int i = 0; i < image.oclchannels(); i ++)
{ {
integral(buf.images[i], buf.image_sums[i]); integral(buf.images[i], buf.image_sums[i]);
} }
@ -394,7 +394,7 @@ namespace cv
args.push_back( make_pair( sizeof(cl_int), (void *)&result.step)); args.push_back( make_pair( sizeof(cl_int), (void *)&result.step));
args.push_back( make_pair( sizeof(cl_float), (void *)&scale) ); args.push_back( make_pair( sizeof(cl_float), (void *)&scale) );
// to be continued in the following section // to be continued in the following section
if(image.channels() == 1) if(image.oclchannels() == 1)
{ {
buf.image_sums.resize(1); buf.image_sums.resize(1);
buf.image_sqsums.resize(1); buf.image_sqsums.resize(1);

@ -126,7 +126,7 @@ namespace cv
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// convert_C3C4 // convert_C3C4
void convert_C3C4(const cl_mem &src, oclMat &dst, int srcStep) void convert_C3C4(const cl_mem &src, oclMat &dst)
{ {
int dstStep_in_pixel = dst.step1() / dst.oclchannels(); int dstStep_in_pixel = dst.step1() / dst.oclchannels();
int pixel_end = dst.wholecols * dst.wholerows - 1; int pixel_end = dst.wholecols * dst.wholerows - 1;
@ -174,7 +174,7 @@ void convert_C3C4(const cl_mem &src, oclMat &dst, int srcStep)
} }
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
// convert_C4C3 // convert_C4C3
void convert_C4C3(const oclMat &src, cl_mem &dst, int dstStep) void convert_C4C3(const oclMat &src, cl_mem &dst)
{ {
int srcStep_in_pixel = src.step1() / src.oclchannels(); int srcStep_in_pixel = src.step1() / src.oclchannels();
int pixel_end = src.wholecols * src.wholerows - 1; int pixel_end = src.wholecols * src.wholerows - 1;
@ -245,7 +245,7 @@ void cv::ocl::oclMat::upload(const Mat &m)
openCLVerifyCall(err); openCLVerifyCall(err);
openCLMemcpy2D(clCxt, temp, pitch, m.datastart, m.step, wholeSize.width * m.elemSize(), wholeSize.height, clMemcpyHostToDevice, 3); openCLMemcpy2D(clCxt, temp, pitch, m.datastart, m.step, wholeSize.width * m.elemSize(), wholeSize.height, clMemcpyHostToDevice, 3);
convert_C3C4(temp, *this, pitch); convert_C3C4(temp, *this);
//int* cputemp=new int[wholeSize.height*wholeSize.width * 3]; //int* cputemp=new int[wholeSize.height*wholeSize.width * 3];
//int* cpudata=new int[this->step*this->wholerows/sizeof(int)]; //int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
//openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE, //openCLSafeCall(clEnqueueReadBuffer(clCxt->impl->clCmdQueue, temp, CL_TRUE,
@ -296,7 +296,7 @@ void cv::ocl::oclMat::download(cv::Mat &m) const
(pitch * wholerows + tail_padding - 1) / tail_padding * tail_padding, 0, &err); (pitch * wholerows + tail_padding - 1) / tail_padding * tail_padding, 0, &err);
openCLVerifyCall(err); openCLVerifyCall(err);
convert_C4C3(*this, temp, pitch / m.elemSize1()); convert_C4C3(*this, temp);
openCLMemcpy2D(clCxt, m.data, m.step, temp, pitch, wholecols * m.elemSize(), wholerows, clMemcpyDeviceToHost, 3); openCLMemcpy2D(clCxt, m.data, m.step, temp, pitch, wholecols * m.elemSize(), wholerows, clMemcpyDeviceToHost, 3);
//int* cputemp=new int[wholecols*wholerows * 3]; //int* cputemp=new int[wholecols*wholerows * 3];
//int* cpudata=new int[this->step*this->wholerows/sizeof(int)]; //int* cpudata=new int[this->step*this->wholerows/sizeof(int)];
@ -382,7 +382,7 @@ void cv::ocl::oclMat::copyTo( oclMat &m ) const
CV_DbgAssert(!this->empty()); CV_DbgAssert(!this->empty());
m.create(size(), type()); m.create(size(), type());
openCLCopyBuffer2D(clCxt, m.data, m.step, m.offset, openCLCopyBuffer2D(clCxt, m.data, m.step, m.offset,
data, step, cols * elemSize(), rows, offset, clMemcpyDeviceToDevice); data, step, cols * elemSize(), rows, offset);
} }
void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const void cv::ocl::oclMat::copyTo( oclMat &mat, const oclMat &mask) const

@ -85,10 +85,10 @@ namespace cv
globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1]; globalThreads[1] = divUp(globalThreads[1], localThreads[1]) * localThreads[1];
globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2]; globalThreads[2] = divUp(globalThreads[2], localThreads[2]) * localThreads[2];
size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2]; //size_t blockSize = localThreads[0] * localThreads[1] * localThreads[2];
cv::ocl::openCLVerifyKernel(clCxt, kernel, &blockSize, globalThreads, localThreads); cv::ocl::openCLVerifyKernel(clCxt, kernel, localThreads);
} }
for(int i = 0; i < args.size(); i ++) for(size_t i = 0; i < args.size(); i ++)
openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second)); openCLSafeCall(clSetKernelArg(kernel, i, args[i].first, args[i].second));
openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads, openCLSafeCall(clEnqueueNDRangeKernel(clCxt->impl->clCmdQueue, kernel, 3, NULL, globalThreads,

@ -47,7 +47,7 @@
#define __OPENCV_PRECOMP_H__ #define __OPENCV_PRECOMP_H__
#if _MSC_VER >= 1200 #if _MSC_VER >= 1200
#pragma warning( disable: 4251 4710 4711 4514 4996 ) #pragma warning( disable: 4244 4251 4710 4711 4514 4996 )
#endif #endif
#ifdef HAVE_CVCONFIG_H #ifdef HAVE_CVCONFIG_H
@ -81,7 +81,7 @@
#if defined __APPLE__ #if defined __APPLE__
#include <OpenCL/OpenCL.h> #include <OpenCL/OpenCL.h>
#else #else
#include <CL/cl.h> #include <CL/opencl.h>
#endif #endif
#include "safe_call.hpp" #include "safe_call.hpp"
@ -100,7 +100,7 @@ namespace cv
size_t width, size_t height, enum openCLMemcpyKind kind, int channels = -1); size_t width, size_t height, enum openCLMemcpyKind kind, int channels = -1);
void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset, void openCLCopyBuffer2D(Context *clCxt, void *dst, size_t dpitch, int dst_offset,
const void *src, size_t spitch, const void *src, size_t spitch,
size_t width, size_t height, int src_offset, enum openCLMemcpyKind kind); size_t width, size_t height, int src_offset);
void openCLFree(void *devPtr); void openCLFree(void *devPtr);
cl_mem openCLCreateBuffer(Context *clCxt, size_t flag, size_t size); cl_mem openCLCreateBuffer(Context *clCxt, size_t flag, size_t size);
void openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size); void openCLReadBuffer(Context *clCxt, cl_mem dst_buffer, void *host_buffer, size_t size);
@ -108,8 +108,7 @@ namespace cv
const char **source, string kernelName); const char **source, string kernelName);
cl_kernel openCLGetKernelFromSource(const Context *clCxt, cl_kernel openCLGetKernelFromSource(const Context *clCxt,
const char **source, string kernelName, const char *build_options); const char **source, string kernelName, const char *build_options);
void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *blockSize, void openCLVerifyKernel(const Context *clCxt, cl_kernel kernel, size_t *localThreads);
size_t *globalThreads, size_t *localThreads);
void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, vector< std::pair<size_t, const void *> > &args, void openCLExecuteKernel(Context *clCxt , const char **source, string kernelName, vector< std::pair<size_t, const void *> > &args,
int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1); int globalcols , int globalrows, size_t blockSize = 16, int kernel_expand_depth = -1, int kernel_expand_channel = -1);
void openCLExecuteKernel_(Context *clCxt , const char **source, string kernelName, void openCLExecuteKernel_(Context *clCxt , const char **source, string kernelName,

@ -61,6 +61,7 @@ namespace cv
extern const char *pyrlk; extern const char *pyrlk;
extern const char *operator_setTo; extern const char *operator_setTo;
extern const char *operator_convertTo; extern const char *operator_convertTo;
extern const char *operator_copyToM;
extern const char *arithm_mul; extern const char *arithm_mul;
extern const char *pyr_down; extern const char *pyr_down;
} }
@ -397,6 +398,71 @@ oclMat &setTo(oclMat &src, const Scalar &scalar)
return src; return src;
} }
///////////////////////////////////////////////////////////////////////////
////////////////////////////////// CopyTo /////////////////////////////////
///////////////////////////////////////////////////////////////////////////
void copy_to_with_mask_cus(const oclMat &src, oclMat &dst, const oclMat &mask, string kernelName)
{
CV_DbgAssert( dst.rows == mask.rows && dst.cols == mask.cols &&
src.rows == dst.rows && src.cols == dst.cols
&& mask.type() == CV_8UC1);
vector<pair<size_t , const void *> > args;
std::string string_types[4][7] = {{"uchar", "char", "ushort", "short", "int", "float", "double"},
{"uchar2", "char2", "ushort2", "short2", "int2", "float2", "double2"},
{"uchar3", "char3", "ushort3", "short3", "int3", "float3", "double3"},
{"uchar4", "char4", "ushort4", "short4", "int4", "float4", "double4"}
};
char compile_option[32];
sprintf(compile_option, "-D GENTYPE=%s", string_types[dst.oclchannels() - 1][dst.depth()].c_str());
size_t localThreads[3] = {16, 16, 1};
size_t globalThreads[3];
globalThreads[0] = divUp(dst.cols, localThreads[0]) * localThreads[0];
globalThreads[1] = divUp(dst.rows, localThreads[1]) * localThreads[1];
globalThreads[2] = 1;
int dststep_in_pixel = dst.step / dst.elemSize(), dstoffset_in_pixel = dst.offset / dst.elemSize();
int srcstep_in_pixel = src.step / src.elemSize(), srcoffset_in_pixel = src.offset / src.elemSize();
args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data ));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst.data ));
args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.cols ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&src.rows ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&srcstep_in_pixel ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&srcoffset_in_pixel ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dststep_in_pixel ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&dstoffset_in_pixel ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.step ));
args.push_back( make_pair( sizeof(cl_int) , (void *)&mask.offset ));
openCLExecuteKernel2(dst.clCxt , &operator_copyToM, kernelName, globalThreads,
localThreads, args, -1, -1, compile_option, CLFLUSH);
}
void copyTo(const oclMat &src, oclMat &m )
{
CV_DbgAssert(!src.empty());
m.create(src.size(), src.type());
openCLCopyBuffer2D(src.clCxt, m.data, m.step, m.offset,
src.data, src.step, src.cols * src.elemSize(), src.rows, src.offset);
}
void copyTo(const oclMat &src, oclMat &mat, const oclMat &mask)
{
if (mask.empty())
{
copyTo(src, mat);
}
else
{
mat.create(src.size(), src.type());
copy_to_with_mask_cus(src, mat, mask, "copy_to_with_mask");
}
}
void arithmetic_run(const oclMat &src1, oclMat &dst, string kernelName, const char **kernelString, void *_scalar) void arithmetic_run(const oclMat &src1, oclMat &dst, string kernelName, const char **kernelString, void *_scalar)
{ {
if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F) if(src1.clCxt -> impl -> double_support == 0 && src1.type() == CV_64F)
@ -879,20 +945,23 @@ void cv::ocl::PyrLKOpticalFlow::dense(const oclMat &prevImg, const oclMat &nextI
nextPyr_.resize(maxLevel + 1); nextPyr_.resize(maxLevel + 1);
prevPyr_[0] = prevImg; prevPyr_[0] = prevImg;
nextImg.convertTo(nextPyr_[0], CV_32F); //nextImg.convertTo(nextPyr_[0], CV_32F);
convertTo(nextImg, nextPyr_[0], CV_32F);
for (int level = 1; level <= maxLevel; ++level) for (int level = 1; level <= maxLevel; ++level)
{ {
pyrDown(prevPyr_[level - 1], prevPyr_[level]); pyrDown_cus(prevPyr_[level - 1], prevPyr_[level]);
pyrDown(nextPyr_[level - 1], nextPyr_[level]); pyrDown_cus(nextPyr_[level - 1], nextPyr_[level]);
} }
ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]); ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[0]);
ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]); ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[0]);
ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]); ensureSizeIsEnough(prevImg.size(), CV_32FC1, uPyr_[1]);
ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]); ensureSizeIsEnough(prevImg.size(), CV_32FC1, vPyr_[1]);
uPyr_[1].setTo(Scalar::all(0)); //uPyr_[1].setTo(Scalar::all(0));
vPyr_[1].setTo(Scalar::all(0)); //vPyr_[1].setTo(Scalar::all(0));
setTo(uPyr_[1], Scalar::all(0));
setTo(vPyr_[1], Scalar::all(0));
Size winSize2i(winSize.width, winSize.height); Size winSize2i(winSize.width, winSize.height);
@ -909,8 +978,12 @@ void cv::ocl::PyrLKOpticalFlow::dense(const oclMat &prevImg, const oclMat &nextI
idx = idx2; idx = idx2;
} }
uPyr_[idx].copyTo(u); //uPyr_[idx].copyTo(u);
vPyr_[idx].copyTo(v); //vPyr_[idx].copyTo(v);
copyTo(uPyr_[idx], u);
copyTo(vPyr_[idx], v);
clFinish(prevImg.clCxt->impl->clCmdQueue);
} }
#endif /* !defined (HAVE_CUDA) */ #endif /* !defined (HAVE_CUDA) */

@ -73,22 +73,53 @@ void print_info()
#endif #endif
} }
std::string workdir;
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
TS::ptr()->init("ocl"); TS::ptr()->init("ocl");
InitGoogleTest(&argc, argv); InitGoogleTest(&argc, argv);
const char *keys =
"{ h | help | false | print help message }"
"{ w | workdir | ../../../samples/c/| set working directory }"
"{ t | type | gpu | set device type:cpu or gpu}"
"{ p | platform | 0 | set platform id }"
"{ d | device | 0 | set device id }";
print_info(); CommandLineParser cmd(argc, argv, keys);
if (cmd.get<bool>("help"))
{
cout << "Avaible options besides goole test option:" << endl;
cmd.printParams();
return 0;
}
workdir = cmd.get<string>("workdir");
string type = cmd.get<string>("type");
unsigned int pid = cmd.get<unsigned int>("platform");
int device = cmd.get<int>("device");
print_info();
int flag = CVCL_DEVICE_TYPE_GPU;
if(type == "cpu")
{
flag = CVCL_DEVICE_TYPE_CPU;
}
std::vector<cv::ocl::Info> oclinfo; std::vector<cv::ocl::Info> oclinfo;
int devnums = getDevice(oclinfo); int devnums = getDevice(oclinfo, flag);
if(devnums < 1) if(devnums <= device || device < 0)
{
std::cout << "device invalid\n";
return -1;
}
if(pid >= oclinfo.size())
{ {
std::cout << "no device found\n"; std::cout << "platform invalid\n";
return -1; return -1;
} }
//setDevice(oclinfo[1]); if(pid != 0 || device != 0)
{
setDevice(oclinfo[pid], device);
}
cout << "Device type:" << type << endl << "Device name:" << oclinfo[pid].DeviceName[device] << endl;
return RUN_ALL_TESTS(); return RUN_ALL_TESTS();
} }

@ -67,7 +67,5 @@
#include "interpolation.hpp" #include "interpolation.hpp"
//#include "add_test_info.h" //#include "add_test_info.h"
#define OPENCV_DEFAULT_OPENCL_DEVICE CVCL_DEVICE_TYPE_GPU
#endif #endif

@ -133,10 +133,9 @@ PARAM_TEST_CASE(ArithmTestBase, MatType, bool)
void random_roi() void random_roi()
{ {
cv::RNG &rng = TS::ptr()->get_rng();
#ifdef RANDOMROI #ifdef RANDOMROI
//randomize ROI //randomize ROI
cv::RNG &rng = TS::ptr()->get_rng();
roicols = rng.uniform(1, mat1.cols); roicols = rng.uniform(1, mat1.cols);
roirows = rng.uniform(1, mat1.rows); roirows = rng.uniform(1, mat1.rows);
src1x = rng.uniform(0, mat1.cols - roicols); src1x = rng.uniform(0, mat1.cols - roicols);
@ -234,7 +233,7 @@ TEST_P(Exp, Mat)
char s[1024]; char s[1024];
sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y); sprintf(s, "roicols=%d,roirows=%d,src1x=%d,src1y=%d,dstx=%d,dsty=%d,maskx=%d,masky=%d,src2x=%d,src2y=%d", roicols, roirows, src1x, src1y, dstx, dsty, maskx, masky, src2x, src2y);
EXPECT_MAT_NEAR(dst, cpu_dst, 1, s); EXPECT_MAT_NEAR(dst, cpu_dst, 2, s);
} }
} }
@ -855,7 +854,7 @@ TEST_P(MinMaxLoc, MAT)
cv::Point minLoc_, maxLoc_; cv::Point minLoc_, maxLoc_;
cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, cv::ocl::oclMat()); cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, cv::ocl::oclMat());
double error0, error1, minlocVal, minlocVal_, maxlocVal, maxlocVal_; double error0 = 0., error1 = 0., minlocVal = 0., minlocVal_ = 0., maxlocVal = 0., maxlocVal_ = 0.;
if(depth == 0) if(depth == 0)
{ {
minlocVal = mat1_roi.at<unsigned char>(minLoc); minlocVal = mat1_roi.at<unsigned char>(minLoc);
@ -975,7 +974,7 @@ TEST_P(MinMaxLoc, MASK)
cv::Point minLoc_, maxLoc_; cv::Point minLoc_, maxLoc_;
cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, gmask); cv::ocl::minMaxLoc(gmat1, &minVal_, &maxVal_, &minLoc_, &maxLoc_, gmask);
double error0, error1, minlocVal, minlocVal_, maxlocVal, maxlocVal_; double error0 = 0., error1 = 0., minlocVal = 0., minlocVal_ = 0., maxlocVal = 0., maxlocVal_ = 0.;
if(minLoc_.x == -1 || minLoc_.y == -1 || maxLoc_.x == -1 || maxLoc_.y == -1) continue; if(minLoc_.x == -1 || minLoc_.y == -1 || maxLoc_.x == -1 || maxLoc_.y == -1) continue;
if(depth == 0) if(depth == 0)
{ {

@ -45,16 +45,11 @@
#include "precomp.hpp" #include "precomp.hpp"
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
#ifdef WIN32
#define FILTER_IMAGE "C:/Users/Public/Pictures/Sample Pictures/Penguins.jpg"
#else
#define FILTER_IMAGE "/Users/Test/Valve_original.PNG" // user need to specify a valid image path
#endif
#define SHOW_RESULT 0 #define SHOW_RESULT 0
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// Canny // Canny
extern std::string workdir;
IMPLEMENT_PARAM_CLASS(AppertureSize, int); IMPLEMENT_PARAM_CLASS(AppertureSize, int);
IMPLEMENT_PARAM_CLASS(L2gradient, bool); IMPLEMENT_PARAM_CLASS(L2gradient, bool);
@ -76,7 +71,7 @@ PARAM_TEST_CASE(Canny, AppertureSize, L2gradient)
TEST_P(Canny, Accuracy) TEST_P(Canny, Accuracy)
{ {
cv::Mat img = readImage(FILTER_IMAGE, cv::IMREAD_GRAYSCALE); cv::Mat img = readImage(workdir + "fruits.jpg", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(img.empty()); ASSERT_FALSE(img.empty());
double low_thresh = 50.0; double low_thresh = 50.0;
@ -110,4 +105,4 @@ TEST_P(Canny, Accuracy)
INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny, testing::Combine( INSTANTIATE_TEST_CASE_P(GPU_ImgProc, Canny, testing::Combine(
testing::Values(AppertureSize(3), AppertureSize(5)), testing::Values(AppertureSize(3), AppertureSize(5)),
testing::Values(L2gradient(false), L2gradient(true)))); testing::Values(L2gradient(false), L2gradient(true))));
#endif #endif

@ -52,7 +52,7 @@ using namespace cvtest;
using namespace testing; using namespace testing;
using namespace std; using namespace std;
using namespace cv; using namespace cv;
extern string workdir;
struct getRect struct getRect
{ {
Rect operator ()(const CvAvgComp &e) const Rect operator ()(const CvAvgComp &e) const
@ -75,14 +75,11 @@ PARAM_TEST_CASE(HaarTestBase, int, int)
{ {
scale = 1.0; scale = 1.0;
index = 0; index = 0;
string cascadeName = "../../../data/haarcascades/haarcascade_frontalface_alt.xml"; string cascadeName = workdir + "../../data/haarcascades/haarcascade_frontalface_alt.xml";
if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName))) if( (!cascade.load( cascadeName )) || (!cpucascade.load(cascadeName)))
{ {
cout << "ERROR: Could not load classifier cascade" << endl; cout << "ERROR: Could not load classifier cascade" << endl;
cout << "Usage: facedetect [--cascade=<cascade_path>]\n"
" [--scale[=<image scale>\n"
" [filename|camera_index]\n" << endl ;
return; return;
} }
//int devnums = getDevice(oclinfo); //int devnums = getDevice(oclinfo);
@ -99,17 +96,17 @@ struct Haar : HaarTestBase {};
TEST_F(Haar, FaceDetect) TEST_F(Haar, FaceDetect)
{ {
string imgName = "../../../samples/c/lena.jpg"; string imgName = workdir + "lena.jpg";
Mat img = imread( imgName, 1 ); Mat img = imread( imgName, 1 );
if(img.empty()) if(img.empty())
{ {
std::cout << "Couldn't read test" << index << ".jpg" << std::endl; std::cout << "Couldn't read " << imgName << std::endl;
return ; return ;
} }
int i = 0; //int i = 0;
double t = 0; //double t = 0;
vector<Rect> faces, oclfaces; vector<Rect> faces, oclfaces;
const static Scalar colors[] = { CV_RGB(0, 0, 255), const static Scalar colors[] = { CV_RGB(0, 0, 255),

@ -48,7 +48,7 @@
using namespace std; using namespace std;
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
extern string workdir;
PARAM_TEST_CASE(HOG, cv::Size, int) PARAM_TEST_CASE(HOG, cv::Size, int)
{ {
cv::Size winSize; cv::Size winSize;
@ -63,7 +63,7 @@ PARAM_TEST_CASE(HOG, cv::Size, int)
TEST_P(HOG, GetDescriptors) TEST_P(HOG, GetDescriptors)
{ {
// Load image // Load image
cv::Mat img_rgb = readImage("../../../samples/gpu/road.png"); cv::Mat img_rgb = readImage(workdir + "lena.jpg");
ASSERT_FALSE(img_rgb.empty()); ASSERT_FALSE(img_rgb.empty());
// Convert image // Convert image
@ -120,7 +120,7 @@ bool match_rect(cv::Rect r1, cv::Rect r2, int threshold)
TEST_P(HOG, Detect) TEST_P(HOG, Detect)
{ {
// Load image // Load image
cv::Mat img_rgb = readImage("../../../samples/gpu/road.png"); cv::Mat img_rgb = readImage(workdir + "lena.jpg");
ASSERT_FALSE(img_rgb.empty()); ASSERT_FALSE(img_rgb.empty());
// Convert image // Convert image

@ -1008,7 +1008,7 @@ TEST_P(Remap, Mat)
int bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/}; int bordertype[] = {cv::BORDER_CONSTANT, cv::BORDER_REPLICATE/*,BORDER_REFLECT,BORDER_WRAP,BORDER_REFLECT_101*/};
const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/}; const char *borderstr[] = {"BORDER_CONSTANT", "BORDER_REPLICATE"/*, "BORDER_REFLECT","BORDER_WRAP","BORDER_REFLECT_101"*/};
// for(int i = 0; i < sizeof(bordertype)/sizeof(int); i++) // for(int i = 0; i < sizeof(bordertype)/sizeof(int); i++)
for(int j = 0; j < 100; j++) for(int j = 0; j < LOOP_TIMES; j++)
{ {
random_roi(); random_roi();
cv::remap(src_roi, dst_roi, map1_roi, map2_roi, interpolation, bordertype[0], val); cv::remap(src_roi, dst_roi, map1_roi, map2_roi, interpolation, bordertype[0], val);
@ -1017,7 +1017,7 @@ TEST_P(Remap, Mat)
gdst.download(cpu_dst); gdst.download(cpu_dst);
char sss[1024]; char sss[1024];
sprintf(sss, "src_roicols=%d,src_roirows=%d,dst_roicols=%d,dst_roirows=%d,src1x =%d,src1y=%d,dstx=%d,dsty=%d", src_roicols, src_roirows, dst_roicols, dst_roirows, srcx, srcy, dstx, dsty); sprintf(sss, "src_roicols=%d,src_roirows=%d,dst_roicols=%d,dst_roirows=%d,src1x =%d,src1y=%d,dstx=%d,dsty=%d bordertype=%s", src_roicols, src_roirows, dst_roicols, dst_roirows, srcx, srcy, dstx, dsty, borderstr[0]);
if(interpolation == 0) if(interpolation == 0)
@ -1371,7 +1371,9 @@ TEST_P(meanShiftFiltering, Mat)
gdst.download(cpu_gdst); gdst.download(cpu_gdst);
char sss[1024]; char sss[1024];
char warning[300] = "Warning: If the selected device doesn't support double, a deviation will exist.\nIf the accuracy is acceptable, please ignore it.\n";
sprintf(sss, "roicols=%d,roirows=%d,srcx=%d,srcy=%d,dstx=%d,dsty=%d\n", roicols, roirows, srcx, srcy, dstx, dsty); sprintf(sss, "roicols=%d,roirows=%d,srcx=%d,srcy=%d,dstx=%d,dsty=%d\n", roicols, roirows, srcx, srcy, dstx, dsty);
strcat(sss, warning);
EXPECT_MAT_NEAR(dst, cpu_gdst, 0.0, sss); EXPECT_MAT_NEAR(dst, cpu_gdst, 0.0, sss);
} }
@ -1397,7 +1399,9 @@ TEST_P(meanShiftProc, Mat)
gdstCoor.download(cpu_gdstCoor); gdstCoor.download(cpu_gdstCoor);
char sss[1024]; char sss[1024];
char warning[300] = "Warning: If the selected device doesn't support double, a deviation will exist.\nIf the accuracy is acceptable, please ignore it.\n";
sprintf(sss, "roicols=%d,roirows=%d,srcx=%d,srcy=%d,dstx=%d,dsty=%d\n", roicols, roirows, srcx, srcy, dstx, dsty); sprintf(sss, "roicols=%d,roirows=%d,srcx=%d,srcy=%d,dstx=%d,dsty=%d\n", roicols, roirows, srcx, srcy, dstx, dsty);
strcat(sss, warning);
EXPECT_MAT_NEAR(dst, cpu_gdst, 0.0, sss); EXPECT_MAT_NEAR(dst, cpu_gdst, 0.0, sss);
EXPECT_MAT_NEAR(dstCoor, cpu_gdstCoor, 0.0, sss); EXPECT_MAT_NEAR(dstCoor, cpu_gdstCoor, 0.0, sss);
} }
@ -1740,7 +1744,7 @@ INSTANTIATE_TEST_CASE_P(Imgproc, meanShiftProc, Combine(
)); ));
INSTANTIATE_TEST_CASE_P(Imgproc, Remap, Combine( INSTANTIATE_TEST_CASE_P(Imgproc, Remap, Combine(
Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC4), Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_32FC1, CV_32FC3, CV_32FC4),
Values(CV_32FC1, CV_16SC2, CV_32FC2), Values(-1, CV_32FC1), Values(CV_32FC1, CV_16SC2, CV_32FC2), Values(-1, CV_32FC1),
Values((int)cv::INTER_NEAREST, (int)cv::INTER_LINEAR), Values((int)cv::INTER_NEAREST, (int)cv::INTER_LINEAR),
Values((int)cv::BORDER_CONSTANT))); Values((int)cv::BORDER_CONSTANT)));
@ -1751,7 +1755,7 @@ INSTANTIATE_TEST_CASE_P(histTestBase, calcHist, Combine(
ONE_TYPE(CV_32SC1) //no use ONE_TYPE(CV_32SC1) //no use
)); ));
INSTANTIATE_TEST_CASE_P(ConvolveTestBase, Convolve, Combine( //INSTANTIATE_TEST_CASE_P(ConvolveTestBase, Convolve, Combine(
Values(CV_32FC1, CV_32FC1), // Values(CV_32FC1, CV_32FC1),
Values(false))); // Values(false) is the reserved parameter // Values(false))); // Values(false) is the reserved parameter
#endif // HAVE_OPENCL #endif // HAVE_OPENCL

@ -44,7 +44,7 @@
#include "precomp.hpp" #include "precomp.hpp"
#define PERF_TEST 0 //#define PERF_TEST 0
#ifdef HAVE_OPENCL #ifdef HAVE_OPENCL
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// MatchTemplate // MatchTemplate
@ -157,18 +157,18 @@ TEST_P(MatchTemplate32F, Accuracy)
#endif // PERF_TEST #endif // PERF_TEST
} }
//INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U, INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate8U,
// testing::Combine( testing::Combine(
// MTEMP_SIZES, MTEMP_SIZES,
// testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/), testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
// testing::Values(Channels(1), Channels(3), Channels(4)), testing::Values(Channels(1), Channels(3), Channels(4)),
// ALL_TEMPLATE_METHODS ALL_TEMPLATE_METHODS
// ) )
// ); );
//
//INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine( INSTANTIATE_TEST_CASE_P(GPU_ImgProc, MatchTemplate32F, testing::Combine(
// MTEMP_SIZES, MTEMP_SIZES,
// testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/), testing::Values(TemplateSize(cv::Size(5, 5)), TemplateSize(cv::Size(16, 16))/*, TemplateSize(cv::Size(30, 30))*/),
// testing::Values(Channels(1), Channels(3), Channels(4)), testing::Values(Channels(1), Channels(3), Channels(4)),
// testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR)))); testing::Values(TemplateMethod(cv::TM_SQDIFF), TemplateMethod(cv::TM_CCORR))));
#endif #endif

@ -54,7 +54,7 @@ using namespace std;
///////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////
// BroxOpticalFlow // BroxOpticalFlow
extern string workdir;
#define BROX_OPTICAL_FLOW_DUMP_FILE "opticalflow/brox_optical_flow.bin" #define BROX_OPTICAL_FLOW_DUMP_FILE "opticalflow/brox_optical_flow.bin"
#define BROX_OPTICAL_FLOW_DUMP_FILE_CC20 "opticalflow/brox_optical_flow_cc20.bin" #define BROX_OPTICAL_FLOW_DUMP_FILE_CC20 "opticalflow/brox_optical_flow_cc20.bin"
@ -78,10 +78,10 @@ PARAM_TEST_CASE(Sparse, bool, bool)
TEST_P(Sparse, Mat) TEST_P(Sparse, Mat)
{ {
cv::Mat frame0 = readImage("../../../samples/gpu/rubberwhale1.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR); cv::Mat frame0 = readImage(workdir + "../gpu/rubberwhale1.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
ASSERT_FALSE(frame0.empty()); ASSERT_FALSE(frame0.empty());
cv::Mat frame1 = readImage("../../../samples/gpu/rubberwhale2.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR); cv::Mat frame1 = readImage(workdir + "../gpu/rubberwhale2.png", useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
ASSERT_FALSE(frame1.empty()); ASSERT_FALSE(frame1.empty());
cv::Mat gray_frame; cv::Mat gray_frame;

@ -279,11 +279,14 @@ static int pyopencv_to(const PyObject* o, Mat& m, const ArgInfo info, bool allow
needcopy = true; needcopy = true;
} }
if( ismultichannel && _strides[1] != (npy_intp)elemsize*_sizes[2] )
needcopy = true;
if (needcopy) if (needcopy)
{ {
if (info.outputarg) if (info.outputarg)
{ {
failmsg("output array %s is not row-contiguous (step[ndims-1] != elemsize)", info.name); failmsg("Layout of the output array %s is incompatible with cv::Mat (step[ndims-1] != elemsize or step[1] != elemsize*nchannels)", info.name);
return false; return false;
} }
if( needcast ) if( needcast )

@ -326,7 +326,10 @@ static int parseCmdArgs(int argc, char** argv)
int main(int argc, char* argv[]) int main(int argc, char* argv[])
{ {
#if ENABLE_LOG
int64 app_start_time = getTickCount(); int64 app_start_time = getTickCount();
#endif
cv::setBreakOnError(true); cv::setBreakOnError(true);
int retval = parseCmdArgs(argc, argv); int retval = parseCmdArgs(argc, argv);
@ -345,7 +348,9 @@ int main(int argc, char* argv[])
bool is_work_scale_set = false, is_seam_scale_set = false, is_compose_scale_set = false; bool is_work_scale_set = false, is_seam_scale_set = false, is_compose_scale_set = false;
LOGLN("Finding features..."); LOGLN("Finding features...");
#if ENABLE_LOG
int64 t = getTickCount(); int64 t = getTickCount();
#endif
Ptr<FeaturesFinder> finder; Ptr<FeaturesFinder> finder;
if (features_type == "surf") if (features_type == "surf")
@ -420,7 +425,9 @@ int main(int argc, char* argv[])
LOGLN("Finding features, time: " << ((getTickCount() - t) / getTickFrequency()) << " sec"); LOGLN("Finding features, time: " << ((getTickCount() - t) / getTickFrequency()) << " sec");
LOG("Pairwise matching"); LOG("Pairwise matching");
#if ENABLE_LOG
t = getTickCount(); t = getTickCount();
#endif
vector<MatchesInfo> pairwise_matches; vector<MatchesInfo> pairwise_matches;
BestOf2NearestMatcher matcher(try_gpu, match_conf); BestOf2NearestMatcher matcher(try_gpu, match_conf);
matcher(features, pairwise_matches); matcher(features, pairwise_matches);
@ -516,7 +523,9 @@ int main(int argc, char* argv[])
} }
LOGLN("Warping images (auxiliary)... "); LOGLN("Warping images (auxiliary)... ");
#if ENABLE_LOG
t = getTickCount(); t = getTickCount();
#endif
vector<Point> corners(num_images); vector<Point> corners(num_images);
vector<Mat> masks_warped(num_images); vector<Mat> masks_warped(num_images);
@ -634,7 +643,9 @@ int main(int argc, char* argv[])
masks.clear(); masks.clear();
LOGLN("Compositing..."); LOGLN("Compositing...");
#if ENABLE_LOG
t = getTickCount(); t = getTickCount();
#endif
Mat img_warped, img_warped_s; Mat img_warped, img_warped_s;
Mat dilated_mask, seam_mask, mask, mask_warped; Mat dilated_mask, seam_mask, mask, mask_warped;

Loading…
Cancel
Save