|
|
|
@ -103,7 +103,11 @@ static void arithmetic_run_generic(const oclMat &src1, const oclMat &src2, const |
|
|
|
|
int dststep1 = dst.step / dst.elemSize(), dstoffset1 = dst.offset / dst.elemSize(); |
|
|
|
|
std::vector<uchar> m; |
|
|
|
|
|
|
|
|
|
#ifdef ANDROID |
|
|
|
|
size_t localThreads[3] = { 16, 10, 1 }; |
|
|
|
|
#else |
|
|
|
|
size_t localThreads[3] = { 16, 16, 1 }; |
|
|
|
|
#endif |
|
|
|
|
size_t globalThreads[3] = { dst.cols, dst.rows, 1 }; |
|
|
|
|
|
|
|
|
|
std::string kernelName = "arithm_binary_op"; |
|
|
|
@ -337,10 +341,15 @@ static void arithmetic_sum_buffer_run(const oclMat &src, cl_mem &dst, int groupn |
|
|
|
|
args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); |
|
|
|
|
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst )); |
|
|
|
|
size_t globalThreads[3] = { groupnum * 256, 1, 1 }; |
|
|
|
|
size_t localThreads[3] = { 256, 1, 1 }; |
|
|
|
|
|
|
|
|
|
#ifdef ANDROID |
|
|
|
|
openCLExecuteKernel(src.clCxt, &arithm_sum, "arithm_op_sum", globalThreads, NULL, |
|
|
|
|
args, -1, -1, buildOptions.c_str()); |
|
|
|
|
#else |
|
|
|
|
size_t localThreads[3] = { 256, 1, 1 }; |
|
|
|
|
openCLExecuteKernel(src.clCxt, &arithm_sum, "arithm_op_sum", globalThreads, localThreads, |
|
|
|
|
args, -1, -1, buildOptions.c_str()); |
|
|
|
|
#endif |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
template <typename T> |
|
|
|
@ -515,6 +524,7 @@ static void arithmetic_minMax_run(const oclMat &src, const oclMat & mask, cl_mem |
|
|
|
|
size_t globalThreads[3] = {groupnum * 256, 1, 1}; |
|
|
|
|
size_t localThreads[3] = {256, 1, 1}; |
|
|
|
|
|
|
|
|
|
// the kernel uses a fixed grid size; replacing localThreads with NULL is impossible without kernel changes
|
|
|
|
|
openCLExecuteKernel(src.clCxt, &arithm_minMax, kernelName, globalThreads, localThreads, |
|
|
|
|
args, -1, -1, buildOptions.c_str()); |
|
|
|
|
} |
|
|
|
@ -622,7 +632,11 @@ static void arithm_absdiff_nonsaturate_run(const oclMat & src1, const oclMat & s |
|
|
|
|
int diffstep1 = diff.step / diff.elemSize(), diffoffset1 = diff.offset / diff.elemSize(); |
|
|
|
|
|
|
|
|
|
string kernelName = "arithm_absdiff_nonsaturate"; |
|
|
|
|
#ifdef ANDROID |
|
|
|
|
size_t localThreads[3] = { 16, 10, 1 }; |
|
|
|
|
#else |
|
|
|
|
size_t localThreads[3] = { 16, 16, 1 }; |
|
|
|
|
#endif |
|
|
|
|
size_t globalThreads[3] = { diff.cols, diff.rows, 1 }; |
|
|
|
|
|
|
|
|
|
const char * const typeMap[] = { "uchar", "char", "ushort", "short", "int", "float", "double" }; |
|
|
|
@ -842,7 +856,11 @@ static void arithmetic_exp_log_run(const oclMat &src, oclMat &dst, string kernel |
|
|
|
|
int srcoffset1 = src.offset / src.elemSize1(), dstoffset1 = dst.offset / dst.elemSize1(); |
|
|
|
|
int srcstep1 = src.step1(), dststep1 = dst.step1(); |
|
|
|
|
|
|
|
|
|
#ifdef ANDROID |
|
|
|
|
size_t localThreads[3] = { 64, 2, 1 }; |
|
|
|
|
#else |
|
|
|
|
size_t localThreads[3] = { 64, 4, 1 }; |
|
|
|
|
#endif |
|
|
|
|
size_t globalThreads[3] = { dst.cols, dst.rows, 1 }; |
|
|
|
|
|
|
|
|
|
std::string buildOptions = format("-D srcT=%s", |
|
|
|
@ -880,7 +898,11 @@ static void arithmetic_magnitude_phase_run(const oclMat &src1, const oclMat &src |
|
|
|
|
{ |
|
|
|
|
int depth = dst.depth(); |
|
|
|
|
|
|
|
|
|
#ifdef ANDROID |
|
|
|
|
size_t localThreads[3] = { 64, 2, 1 }; |
|
|
|
|
#else |
|
|
|
|
size_t localThreads[3] = { 64, 4, 1 }; |
|
|
|
|
#endif |
|
|
|
|
size_t globalThreads[3] = { dst.cols, dst.rows, 1 }; |
|
|
|
|
|
|
|
|
|
int src1_step = src1.step / src1.elemSize(), src1_offset = src1.offset / src1.elemSize(); |
|
|
|
@ -928,7 +950,11 @@ static void arithmetic_phase_run(const oclMat &src1, const oclMat &src2, oclMat |
|
|
|
|
int src2step1 = src2.step / src2.elemSize1(), src2offset1 = src2.offset / src2.elemSize1(); |
|
|
|
|
int dststep1 = dst.step / dst.elemSize1(), dstoffset1 = dst.offset / dst.elemSize1(); |
|
|
|
|
|
|
|
|
|
#ifdef ANDROID |
|
|
|
|
size_t localThreads[3] = { 64, 2, 1 }; |
|
|
|
|
#else |
|
|
|
|
size_t localThreads[3] = { 64, 4, 1 }; |
|
|
|
|
#endif |
|
|
|
|
size_t globalThreads[3] = { cols1, dst.rows, 1 }; |
|
|
|
|
|
|
|
|
|
vector<pair<size_t , const void *> > args; |
|
|
|
@ -974,7 +1000,11 @@ static void arithmetic_cartToPolar_run(const oclMat &src1, const oclMat &src2, o |
|
|
|
|
|
|
|
|
|
int cols = src1.cols * channels; |
|
|
|
|
|
|
|
|
|
#ifdef ANDROID |
|
|
|
|
size_t localThreads[3] = { 64, 2, 1 }; |
|
|
|
|
#else |
|
|
|
|
size_t localThreads[3] = { 64, 4, 1 }; |
|
|
|
|
#endif |
|
|
|
|
size_t globalThreads[3] = { cols, src1.rows, 1 }; |
|
|
|
|
|
|
|
|
|
int src1_step = src1.step / src1.elemSize1(), src1_offset = src1.offset / src1.elemSize1(); |
|
|
|
@ -1028,7 +1058,11 @@ static void arithmetic_ptc_run(const oclMat &src1, const oclMat &src2, oclMat &d |
|
|
|
|
int channels = src2.oclchannels(), depth = src2.depth(); |
|
|
|
|
int cols = src2.cols * channels, rows = src2.rows; |
|
|
|
|
|
|
|
|
|
#ifdef ANDROID |
|
|
|
|
size_t localThreads[3] = { 64, 2, 1 }; |
|
|
|
|
#else |
|
|
|
|
size_t localThreads[3] = { 64, 4, 1 }; |
|
|
|
|
#endif |
|
|
|
|
size_t globalThreads[3] = { cols, rows, 1 }; |
|
|
|
|
|
|
|
|
|
int src1_step = src1.step / src1.elemSize1(), src1_offset = src1.offset / src1.elemSize1(); |
|
|
|
@ -1104,6 +1138,8 @@ static void arithmetic_minMaxLoc_run(const oclMat &src, cl_mem &dst, int vlen , |
|
|
|
|
char build_options[50]; |
|
|
|
|
sprintf(build_options, "-D DEPTH_%d -D REPEAT_S%d -D REPEAT_E%d", src.depth(), repeat_s, repeat_e); |
|
|
|
|
size_t gt[3] = {groupnum * 256, 1, 1}, lt[3] = {256, 1, 1}; |
|
|
|
|
|
|
|
|
|
// the kernel uses a fixed grid size; replacing lt with NULL is impossible without kernel changes
|
|
|
|
|
openCLExecuteKernel(src.clCxt, &arithm_minMaxLoc, "arithm_op_minMaxLoc", gt, lt, args, -1, -1, build_options); |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -1133,6 +1169,7 @@ static void arithmetic_minMaxLoc_mask_run(const oclMat &src, const oclMat &mask, |
|
|
|
|
args.push_back( make_pair( sizeof(cl_mem) , (void *)&mask.data )); |
|
|
|
|
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst )); |
|
|
|
|
|
|
|
|
|
// the kernel uses a fixed grid size; replacing lt with NULL is impossible without kernel changes
|
|
|
|
|
openCLExecuteKernel(src.clCxt, &arithm_minMaxLoc_mask, "arithm_op_minMaxLoc_mask", gt, lt, args, -1, -1, build_options); |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
@ -1250,10 +1287,15 @@ static void arithmetic_countNonZero_run(const oclMat &src, cl_mem &dst, int grou |
|
|
|
|
args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst )); |
|
|
|
|
|
|
|
|
|
size_t globalThreads[3] = { groupnum * 256, 1, 1 }; |
|
|
|
|
size_t localThreads[3] = { 256, 1, 1 }; |
|
|
|
|
|
|
|
|
|
#ifdef ANDROID |
|
|
|
|
openCLExecuteKernel(src.clCxt, &arithm_nonzero, kernelName, globalThreads, NULL, |
|
|
|
|
args, -1, -1, buildOptions.c_str()); |
|
|
|
|
#else |
|
|
|
|
size_t localThreads[3] = { 256, 1, 1 }; |
|
|
|
|
openCLExecuteKernel(src.clCxt, &arithm_nonzero, kernelName, globalThreads, localThreads, |
|
|
|
|
args, -1, -1, buildOptions.c_str()); |
|
|
|
|
#endif |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
int cv::ocl::countNonZero(const oclMat &src) |
|
|
|
@ -1311,7 +1353,11 @@ static void bitwise_unary_run(const oclMat &src1, oclMat &dst, string kernelName |
|
|
|
|
int offset_cols = (dst.offset / dst.elemSize1()) & (vector_length - 1); |
|
|
|
|
int cols = divUp(dst.cols * channels + offset_cols, vector_length); |
|
|
|
|
|
|
|
|
|
#ifdef ANDROID |
|
|
|
|
size_t localThreads[3] = { 64, 2, 1 }; |
|
|
|
|
#else |
|
|
|
|
size_t localThreads[3] = { 64, 4, 1 }; |
|
|
|
|
#endif |
|
|
|
|
size_t globalThreads[3] = { cols, dst.rows, 1 }; |
|
|
|
|
|
|
|
|
|
int dst_step1 = dst.cols * dst.elemSize(); |
|
|
|
@ -1351,7 +1397,11 @@ static void bitwise_binary_run(const oclMat &src1, const oclMat &src2, const Sca |
|
|
|
|
operationMap[operationType], vlenstr.c_str(), vlenstr.c_str(), |
|
|
|
|
(int)src1.elemSize(), vlen, vlenstr.c_str()); |
|
|
|
|
|
|
|
|
|
#ifdef ANDROID |
|
|
|
|
size_t localThreads[3] = { 16, 10, 1 }; |
|
|
|
|
#else |
|
|
|
|
size_t localThreads[3] = { 16, 16, 1 }; |
|
|
|
|
#endif |
|
|
|
|
size_t globalThreads[3] = { dst.cols, dst.rows, 1 }; |
|
|
|
|
|
|
|
|
|
vector<pair<size_t , const void *> > args; |
|
|
|
@ -1599,7 +1649,6 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, |
|
|
|
|
typeMap[depth], hasDouble ? "double" : "float", typeMap[depth], |
|
|
|
|
depth >= CV_32F ? "" : "_sat_rte"); |
|
|
|
|
|
|
|
|
|
size_t localThreads[3] = { 256, 1, 1 }; |
|
|
|
|
size_t globalThreads[3] = { cols1, dst.rows, 1}; |
|
|
|
|
|
|
|
|
|
float alpha_f = static_cast<float>(alpha), |
|
|
|
@ -1633,8 +1682,14 @@ void cv::ocl::addWeighted(const oclMat &src1, double alpha, const oclMat &src2, |
|
|
|
|
args.push_back( make_pair( sizeof(cl_int), (void *)&cols1 )); |
|
|
|
|
args.push_back( make_pair( sizeof(cl_int), (void *)&src1.rows )); |
|
|
|
|
|
|
|
|
|
#ifdef ANDROID |
|
|
|
|
openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, NULL, |
|
|
|
|
args, -1, -1, buildOptions.c_str()); |
|
|
|
|
#else |
|
|
|
|
size_t localThreads[3] = { 256, 1, 1}; |
|
|
|
|
openCLExecuteKernel(clCxt, &arithm_addWeighted, "addWeighted", globalThreads, localThreads, |
|
|
|
|
args, -1, -1, buildOptions.c_str()); |
|
|
|
|
#endif |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
|