From 91ac9688a85ae5671de781b303941f3774fa67d7 Mon Sep 17 00:00:00 2001 From: Andrey Kamaev Date: Fri, 15 Mar 2013 23:56:31 +0400 Subject: [PATCH] Allow OpenCL acceleration in every OpenCV module --- CMakeLists.txt | 2 +- cmake/OpenCVModule.cmake | 15 + {modules/ocl => cmake}/cl2cpp.cmake | 0 modules/ocl/CMakeLists.txt | 42 +- modules/ocl/src/kernels/brute_force_match.cl | 865 ------------------ .../src/{kernels => opencl}/arithm_2_mat.cl | 0 .../ocl/src/{kernels => opencl}/arithm_LUT.cl | 0 .../src/{kernels => opencl}/arithm_absdiff.cl | 0 .../ocl/src/{kernels => opencl}/arithm_add.cl | 0 .../{kernels => opencl}/arithm_addWeighted.cl | 68 +- .../{kernels => opencl}/arithm_add_scalar.cl | 0 .../arithm_add_scalar_mask.cl | 0 .../{kernels => opencl}/arithm_bitwise_and.cl | 57 +- .../arithm_bitwise_and_mask.cl | 1 - .../arithm_bitwise_and_scalar.cl | 0 .../arithm_bitwise_and_scalar_mask.cl | 1 - .../{kernels => opencl}/arithm_bitwise_not.cl | 15 +- .../{kernels => opencl}/arithm_bitwise_or.cl | 17 +- .../arithm_bitwise_or_mask.cl | 1 - .../arithm_bitwise_or_scalar.cl | 1 - .../arithm_bitwise_or_scalar_mask.cl | 1 - .../{kernels => opencl}/arithm_bitwise_xor.cl | 57 +- .../arithm_bitwise_xor_mask.cl | 1 - .../arithm_bitwise_xor_scalar.cl | 0 .../arithm_bitwise_xor_scalar_mask.cl | 1 - .../{kernels => opencl}/arithm_cartToPolar.cl | 0 .../{kernels => opencl}/arithm_compare_eq.cl | 631 +++++++------ .../{kernels => opencl}/arithm_compare_ne.cl | 636 +++++++------ .../ocl/src/{kernels => opencl}/arithm_div.cl | 2 - .../ocl/src/{kernels => opencl}/arithm_exp.cl | 0 .../src/{kernels => opencl}/arithm_flip.cl | 0 .../src/{kernels => opencl}/arithm_flip_rc.cl | 0 .../ocl/src/{kernels => opencl}/arithm_log.cl | 0 .../{kernels => opencl}/arithm_magnitude.cl | 0 .../arithm_magnitudeSqr.cl | 22 +- .../src/{kernels => opencl}/arithm_minMax.cl | 0 .../{kernels => opencl}/arithm_minMaxLoc.cl | 0 .../arithm_minMaxLoc_mask.cl | 1 - .../{kernels => opencl}/arithm_minMax_mask.cl | 1 - .../ocl/src/{kernels => opencl}/arithm_mul.cl | 0 .../src/{kernels => opencl}/arithm_nonzero.cl | 0 .../src/{kernels => opencl}/arithm_phase.cl | 0 .../{kernels => opencl}/arithm_polarToCart.cl | 0 .../ocl/src/{kernels => opencl}/arithm_pow.cl | 0 .../ocl/src/{kernels => opencl}/arithm_sub.cl | 0 .../{kernels => opencl}/arithm_sub_scalar.cl | 0 .../arithm_sub_scalar_mask.cl | 0 .../ocl/src/{kernels => opencl}/arithm_sum.cl | 1 - .../src/{kernels => opencl}/arithm_sum_3.cl | 1 - .../{kernels => opencl}/arithm_transpose.cl | 0 .../src/{kernels => opencl}/blend_linear.cl | 7 +- modules/ocl/src/opencl/brute_force_match.cl | 865 ++++++++++++++++++ .../src/{kernels => opencl}/build_warps.cl | 1 - .../src/{kernels => opencl}/convertC3C4.cl | 0 .../ocl/src/{kernels => opencl}/cvt_color.cl | 0 .../src/{kernels => opencl}/filter_sep_col.cl | 0 .../src/{kernels => opencl}/filter_sep_row.cl | 2 - .../filtering_boxFilter.cl | 0 .../filtering_laplacian.cl | 0 .../{kernels => opencl}/filtering_morph.cl | 0 .../{kernels => opencl}/haarobjectdetect.cl | 4 - .../haarobjectdetect_scaled2.cl | 1 - .../{kernels => opencl}/imgproc_bilateral.cl | 0 .../{kernels => opencl}/imgproc_calcHarris.cl | 0 .../imgproc_calcMinEigenVal.cl | 0 .../src/{kernels => opencl}/imgproc_canny.cl | 0 .../{kernels => opencl}/imgproc_columnsum.cl | 0 .../{kernels => opencl}/imgproc_convolve.cl | 2 - .../imgproc_copymakeboder.cl | 0 .../{kernels => opencl}/imgproc_histogram.cl | 1 - .../{kernels => opencl}/imgproc_integral.cl | 0 .../imgproc_integral_sum.cl | 0 .../src/{kernels => opencl}/imgproc_median.cl | 1 - .../src/{kernels => opencl}/imgproc_remap.cl | 101 +- .../src/{kernels => opencl}/imgproc_resize.cl | 1 - .../{kernels => opencl}/imgproc_threshold.cl | 1 - .../{kernels => opencl}/imgproc_warpAffine.cl | 0 .../imgproc_warpPerspective.cl | 1 - .../{kernels => opencl}/interpolate_frames.cl | 0 .../src/{kernels => opencl}/match_template.cl | 1 - .../ocl/src/{kernels => opencl}/meanShift.cl | 1 - .../ocl/src/{kernels => opencl}/merge_mat.cl | 0 .../ocl/src/{kernels => opencl}/moments.cl | 4 +- .../src/{kernels => opencl}/nonfree_surf.cl | 182 ++-- .../src/{kernels => opencl}/objdetect_hog.cl | 0 .../{kernels => opencl}/operator_convertTo.cl | 0 .../{kernels => opencl}/operator_copyToM.cl | 0 .../src/{kernels => opencl}/operator_setTo.cl | 0 .../{kernels => opencl}/operator_setToM.cl | 1 - .../ocl/src/{kernels => opencl}/pyr_down.cl | 0 modules/ocl/src/{kernels => opencl}/pyr_up.cl | 0 modules/ocl/src/{kernels => opencl}/pyrlk.cl | 0 .../src/{kernels => opencl}/pyrlk_no_image.cl | 0 .../ocl/src/{kernels => opencl}/split_mat.cl | 424 ++++----- .../ocl/src/{kernels => opencl}/stereobm.cl | 42 +- 95 files changed, 2008 insertions(+), 2075 deletions(-) rename {modules/ocl => cmake}/cl2cpp.cmake (100%) delete mode 100644 modules/ocl/src/kernels/brute_force_match.cl rename modules/ocl/src/{kernels => opencl}/arithm_2_mat.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_LUT.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_absdiff.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_add.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_addWeighted.cl (95%) rename modules/ocl/src/{kernels => opencl}/arithm_add_scalar.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_add_scalar_mask.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_bitwise_and.cl (95%) rename modules/ocl/src/{kernels => opencl}/arithm_bitwise_and_mask.cl (99%) rename modules/ocl/src/{kernels => opencl}/arithm_bitwise_and_scalar.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_bitwise_and_scalar_mask.cl (99%) rename modules/ocl/src/{kernels => opencl}/arithm_bitwise_not.cl (99%) rename modules/ocl/src/{kernels => opencl}/arithm_bitwise_or.cl (98%) rename modules/ocl/src/{kernels => opencl}/arithm_bitwise_or_mask.cl (99%) rename modules/ocl/src/{kernels => opencl}/arithm_bitwise_or_scalar.cl (99%) rename modules/ocl/src/{kernels => opencl}/arithm_bitwise_or_scalar_mask.cl (99%) rename modules/ocl/src/{kernels => opencl}/arithm_bitwise_xor.cl (95%) rename modules/ocl/src/{kernels => opencl}/arithm_bitwise_xor_mask.cl (99%) rename modules/ocl/src/{kernels => opencl}/arithm_bitwise_xor_scalar.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_bitwise_xor_scalar_mask.cl (99%) rename modules/ocl/src/{kernels => opencl}/arithm_cartToPolar.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_compare_eq.cl (74%) rename modules/ocl/src/{kernels => opencl}/arithm_compare_ne.cl (73%) rename modules/ocl/src/{kernels => opencl}/arithm_div.cl (99%) rename modules/ocl/src/{kernels => opencl}/arithm_exp.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_flip.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_flip_rc.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_log.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_magnitude.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_magnitudeSqr.cl (98%) rename modules/ocl/src/{kernels => opencl}/arithm_minMax.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_minMaxLoc.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_minMaxLoc_mask.cl (99%) rename modules/ocl/src/{kernels => opencl}/arithm_minMax_mask.cl (99%) rename modules/ocl/src/{kernels => opencl}/arithm_mul.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_nonzero.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_phase.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_polarToCart.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_pow.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_sub.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_sub_scalar.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_sub_scalar_mask.cl (100%) rename modules/ocl/src/{kernels => opencl}/arithm_sum.cl (99%) rename modules/ocl/src/{kernels => opencl}/arithm_sum_3.cl (99%) rename modules/ocl/src/{kernels => opencl}/arithm_transpose.cl (100%) rename modules/ocl/src/{kernels => opencl}/blend_linear.cl (98%) create mode 100644 modules/ocl/src/opencl/brute_force_match.cl rename modules/ocl/src/{kernels => opencl}/build_warps.cl (99%) rename modules/ocl/src/{kernels => opencl}/convertC3C4.cl (100%) rename modules/ocl/src/{kernels => opencl}/cvt_color.cl (100%) rename modules/ocl/src/{kernels => opencl}/filter_sep_col.cl (100%) rename modules/ocl/src/{kernels => opencl}/filter_sep_row.cl (99%) rename modules/ocl/src/{kernels => opencl}/filtering_boxFilter.cl (100%) rename modules/ocl/src/{kernels => opencl}/filtering_laplacian.cl (100%) rename modules/ocl/src/{kernels => opencl}/filtering_morph.cl (100%) rename modules/ocl/src/{kernels => opencl}/haarobjectdetect.cl (99%) rename modules/ocl/src/{kernels => opencl}/haarobjectdetect_scaled2.cl (99%) rename modules/ocl/src/{kernels => opencl}/imgproc_bilateral.cl (100%) rename modules/ocl/src/{kernels => opencl}/imgproc_calcHarris.cl (100%) rename modules/ocl/src/{kernels => opencl}/imgproc_calcMinEigenVal.cl (100%) rename modules/ocl/src/{kernels => opencl}/imgproc_canny.cl (100%) rename modules/ocl/src/{kernels => opencl}/imgproc_columnsum.cl (100%) rename modules/ocl/src/{kernels => opencl}/imgproc_convolve.cl (99%) rename modules/ocl/src/{kernels => opencl}/imgproc_copymakeboder.cl (100%) rename modules/ocl/src/{kernels => opencl}/imgproc_histogram.cl (99%) rename modules/ocl/src/{kernels => opencl}/imgproc_integral.cl (100%) rename modules/ocl/src/{kernels => opencl}/imgproc_integral_sum.cl (100%) rename modules/ocl/src/{kernels => opencl}/imgproc_median.cl (99%) rename modules/ocl/src/{kernels => opencl}/imgproc_remap.cl (98%) rename modules/ocl/src/{kernels => opencl}/imgproc_resize.cl (99%) rename modules/ocl/src/{kernels => opencl}/imgproc_threshold.cl (99%) rename modules/ocl/src/{kernels => opencl}/imgproc_warpAffine.cl (100%) rename modules/ocl/src/{kernels => opencl}/imgproc_warpPerspective.cl (99%) rename modules/ocl/src/{kernels => opencl}/interpolate_frames.cl (100%) rename modules/ocl/src/{kernels => opencl}/match_template.cl (99%) rename modules/ocl/src/{kernels => opencl}/meanShift.cl (99%) rename modules/ocl/src/{kernels => opencl}/merge_mat.cl (100%) rename modules/ocl/src/{kernels => opencl}/moments.cl (99%) rename modules/ocl/src/{kernels => opencl}/nonfree_surf.cl (94%) rename modules/ocl/src/{kernels => opencl}/objdetect_hog.cl (100%) rename modules/ocl/src/{kernels => opencl}/operator_convertTo.cl (100%) rename modules/ocl/src/{kernels => opencl}/operator_copyToM.cl (100%) rename modules/ocl/src/{kernels => opencl}/operator_setTo.cl (100%) rename modules/ocl/src/{kernels => opencl}/operator_setToM.cl (99%) rename modules/ocl/src/{kernels => opencl}/pyr_down.cl (100%) rename modules/ocl/src/{kernels => opencl}/pyr_up.cl (100%) rename modules/ocl/src/{kernels => opencl}/pyrlk.cl (100%) rename modules/ocl/src/{kernels => opencl}/pyrlk_no_image.cl (100%) rename modules/ocl/src/{kernels => opencl}/split_mat.cl (87%) rename modules/ocl/src/{kernels => opencl}/stereobm.cl (96%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6657de2c05..351273e888 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -782,7 +782,7 @@ if(HAVE_CUDA) status(" Use fast math:" CUDA_FAST_MATH THEN YES ELSE NO) endif() -if(HAVE_OPENCL AND BUILD_opencv_ocl) +if(HAVE_OPENCL) status("") status(" OpenCL") if(OPENCL_INCLUDE_DIR) diff --git a/cmake/OpenCVModule.cmake b/cmake/OpenCVModule.cmake index b6d129a267..abb0393956 100644 --- a/cmake/OpenCVModule.cmake +++ b/cmake/OpenCVModule.cmake @@ -432,10 +432,22 @@ macro(ocv_glob_module_sources) file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h") file(GLOB lib_hdrs_detail "include/opencv2/${name}/detail/*.hpp" "include/opencv2/${name}/detail/*.h") + file(GLOB cl_kernels "src/opencl/*.cl") + source_group("Src" FILES ${lib_srcs} ${lib_int_hdrs}) source_group("Include" FILES ${lib_hdrs}) source_group("Include\\detail" FILES ${lib_hdrs_detail}) + if(HAVE_OPENCL AND cl_kernels) + ocv_include_directories(${OPENCL_INCLUDE_DIRS}) + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp" + COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/opencl" -DOUTPUT="${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp" -P "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake" + DEPENDS ${cl_kernels} "${OpenCV_SOURCE_DIR}/cmake/cl2cpp.cmake") + source_group("Src\\OpenCL" FILES ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp") + list(APPEND lib_srcs ${cl_kernels} "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp") + endif() + ocv_set_module_sources(${ARGN} HEADERS ${lib_hdrs} ${lib_hdrs_detail} SOURCES ${lib_srcs} ${lib_int_hdrs}) endmacro() @@ -449,6 +461,9 @@ macro(ocv_create_module) if(NOT "${ARGN}" STREQUAL "SKIP_LINK") target_link_libraries(${the_module} ${OPENCV_MODULE_${the_module}_DEPS} ${OPENCV_MODULE_${the_module}_DEPS_EXT} ${OPENCV_LINKER_LIBS} ${IPP_LIBS} ${ARGN}) + if(HAVE_OPENCL AND OPENCL_LIBRARIES) + target_link_libraries(${the_module} ${OPENCL_LIBRARIES}) + endif() endif() add_dependencies(opencv_modules ${the_module}) diff --git a/modules/ocl/cl2cpp.cmake b/cmake/cl2cpp.cmake similarity index 100% rename from modules/ocl/cl2cpp.cmake rename to cmake/cl2cpp.cmake diff --git a/modules/ocl/CMakeLists.txt b/modules/ocl/CMakeLists.txt index 7e621f42ba..8dbe90c316 100644 --- a/modules/ocl/CMakeLists.txt +++ b/modules/ocl/CMakeLists.txt @@ -3,45 +3,5 @@ if(NOT HAVE_OPENCL) endif() set(the_description "OpenCL-accelerated Computer Vision") -ocv_add_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video opencv_nonfree) -ocv_module_include_directories(${OPENCL_INCLUDE_DIRS}) - -file(GLOB CL_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/kernels/*.cl") -set(kernels_cpp "${CMAKE_CURRENT_BINARY_DIR}/kernels.cpp") -set(cl2cpp_script "${CMAKE_CURRENT_SOURCE_DIR}/cl2cpp.cmake") - -add_custom_command( - OUTPUT ${kernels_cpp} - COMMAND ${CMAKE_COMMAND} -DCL_DIR="${CMAKE_CURRENT_SOURCE_DIR}/src/kernels" -DOUTPUT="${kernels_cpp}" -P ${cl2cpp_script} - DEPENDS ${CL_FILES} ${cl2cpp_script}) - -file(GLOB lib_hdrs "include/opencv2/${name}/*.hpp" "include/opencv2/${name}/*.h") -file(GLOB lib_srcs "src/*.cpp") -file(GLOB lib_int_hdrs "src/*.h*") - -source_group("Include" FILES ${lib_hdrs}) -source_group("Src\\Host" FILES ${lib_srcs} ${lib_int_hdrs} ${kernels_cpp}) - +ocv_define_module(ocl opencv_core opencv_imgproc opencv_features2d opencv_objdetect opencv_video opencv_nonfree) ocv_warnings_disable(CMAKE_CXX_FLAGS -Wshadow) - -ocv_set_module_sources(HEADERS ${lib_hdrs} SOURCES ${lib_int_hdrs} ${lib_srcs} ${kernels_cpp}) -ocv_create_module(${OPENCL_LIBRARIES}) -ocv_add_precompiled_headers(${the_module}) - -################################################################################################################ -################################ OpenCL Module Tests ################################################## -################################################################################################################ -file(GLOB test_srcs "test/*.cpp") -file(GLOB test_hdrs "test/*.hpp" "test/*.h") - -ocv_add_accuracy_tests(FILES "Include" ${test_hdrs} - FILES "Src" ${test_srcs}) - -################################################################################################################ -################################ OpenCL Module Performance ################################################## -################################################################################################################ -file(GLOB perf_srcs "perf/*.cpp") -file(GLOB perf_hdrs "perf/*.hpp" "perf/*.h") - -ocv_add_perf_tests(FILES "Include" ${perf_hdrs} - FILES "Src" ${perf_srcs}) diff --git a/modules/ocl/src/kernels/brute_force_match.cl b/modules/ocl/src/kernels/brute_force_match.cl deleted file mode 100644 index e5dd29ee0a..0000000000 --- a/modules/ocl/src/kernels/brute_force_match.cl +++ /dev/null @@ -1,865 +0,0 @@ -#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable -#define MAX_FLOAT 1e7f - -int bit1Count(float x) -{ - int c = 0; - int ix = (int)x; - - for (int i = 0 ; i < 32 ; i++) - { - c += ix & 0x1; - ix >>= 1; - } - - return (float)c; -} -/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size -local size: dim0 is block_size, dim1 is block_size. -*/ -__kernel void BruteForceMatch_UnrollMatch( - __global float *query, - __global float *train, - //__global float *mask, - __global int *bestTrainIdx, - __global float *bestDistance, - __local float *sharebuffer, - int block_size, - int max_desc_len, - int query_rows, - int query_cols, - int train_rows, - int train_cols, - int step, - int distType -) -{ - const int lidx = get_local_id(0); - const int lidy = get_local_id(1); - const int groupidx = get_group_id(0); - - __local float *s_query = sharebuffer; - __local float *s_train = sharebuffer + block_size * max_desc_len; - - int queryIdx = groupidx * block_size + lidy; - - // load the query into local memory. - for (int i = 0 ; i < max_desc_len / block_size; i ++) - { - int loadx = lidx + i * block_size; - s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; - } - - float myBestDistance = MAX_FLOAT; - int myBestTrainIdx = -1; - - // loopUnrolledCached to find the best trainIdx and best distance. - volatile int imgIdx = 0; - - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) - { - float result = 0; - - for (int i = 0 ; i < max_desc_len / block_size ; i++) - { - //load a block_size * block_size block into local train. - const int loadx = lidx + i * block_size; - s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; - - //synchronize to make sure each elem for reduceIteration in share memory is written already. - barrier(CLK_LOCAL_MEM_FENCE); - - /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch (distType) - { - case 0: - - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]); - } - - break; - case 1: - - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - - break; - case 2: - - for (int j = 0 ; j < block_size ; j++) - { - //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]); - } - - break; - } - - barrier(CLK_LOCAL_MEM_FENCE); - } - - int trainIdx = t * block_size + lidx; - - if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/) - { - //bestImgIdx = imgIdx; - myBestDistance = result; - myBestTrainIdx = trainIdx; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - __local float *s_distance = (__local float *)(sharebuffer); - __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); - - //find BestMatch - s_distance += lidy * block_size; - s_trainIdx += lidy * block_size; - s_distance[lidx] = myBestDistance; - s_trainIdx[lidx] = myBestTrainIdx; - - barrier(CLK_LOCAL_MEM_FENCE); - - //reduce -- now all reduce implement in each threads. - for (int k = 0 ; k < block_size; k++) - { - if (myBestDistance > s_distance[k]) - { - myBestDistance = s_distance[k]; - myBestTrainIdx = s_trainIdx[k]; - } - } - - if (queryIdx < query_rows && lidx == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestDistance[queryIdx] = myBestDistance; - } -} - -__kernel void BruteForceMatch_Match( - __global float *query, - __global float *train, - //__global float *mask, - __global int *bestTrainIdx, - __global float *bestDistance, - __local float *sharebuffer, - int block_size, - int query_rows, - int query_cols, - int train_rows, - int train_cols, - int step, - int distType -) -{ - const int lidx = get_local_id(0); - const int lidy = get_local_id(1); - const int groupidx = get_group_id(0); - - const int queryIdx = groupidx * block_size + lidy; - - float myBestDistance = MAX_FLOAT; - int myBestTrainIdx = -1; - - __local float *s_query = sharebuffer; - __local float *s_train = sharebuffer + block_size * block_size; - - // loop - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) - { - //Dist dist; - float result = 0; - - for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++) - { - const int loadx = lidx + i * block_size; - //load query and train into local memory - s_query[lidy * block_size + lidx] = 0; - s_train[lidx * block_size + lidy] = 0; - - if (loadx < query_cols) - { - s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; - s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch (distType) - { - case 0: - - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); - } - - break; - case 1: - - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - - break; - case 2: - - for (int j = 0 ; j < block_size ; j++) - { - //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]); - } - - break; - } - - barrier(CLK_LOCAL_MEM_FENCE); - } - - const int trainIdx = t * block_size + lidx; - - if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/) - { - //myBestImgidx = imgIdx; - myBestDistance = result; - myBestTrainIdx = trainIdx; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - __local float *s_distance = (__local float *)sharebuffer; - __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); - - //findBestMatch - s_distance += lidy * block_size; - s_trainIdx += lidy * block_size; - s_distance[lidx] = myBestDistance; - s_trainIdx[lidx] = myBestTrainIdx; - - barrier(CLK_LOCAL_MEM_FENCE); - - //reduce -- now all reduce implement in each threads. - for (int k = 0 ; k < block_size; k++) - { - if (myBestDistance > s_distance[k]) - { - myBestDistance = s_distance[k]; - myBestTrainIdx = s_trainIdx[k]; - } - } - - if (queryIdx < query_rows && lidx == 0) - { - bestTrainIdx[queryIdx] = myBestTrainIdx; - bestDistance[queryIdx] = myBestDistance; - } -} - -//radius_unrollmatch -__kernel void BruteForceMatch_RadiusUnrollMatch( - __global float *query, - __global float *train, - float maxDistance, - //__global float *mask, - __global int *bestTrainIdx, - __global float *bestDistance, - __global int *nMatches, - __local float *sharebuffer, - int block_size, - int max_desc_len, - int query_rows, - int query_cols, - int train_rows, - int train_cols, - int bestTrainIdx_cols, - int step, - int ostep, - int distType -) -{ - const int lidx = get_local_id(0); - const int lidy = get_local_id(1); - const int groupidx = get_group_id(0); - const int groupidy = get_group_id(1); - - const int queryIdx = groupidy * block_size + lidy; - const int trainIdx = groupidx * block_size + lidx; - - __local float *s_query = sharebuffer; - __local float *s_train = sharebuffer + block_size * block_size; - - float result = 0; - - for (int i = 0 ; i < max_desc_len / block_size ; ++i) - { - //load a block_size * block_size block into local train. - const int loadx = lidx + i * block_size; - - s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; - s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; - - //synchronize to make sure each elem for reduceIteration in share memory is written already. - barrier(CLK_LOCAL_MEM_FENCE); - - /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch (distType) - { - case 0: - - for (int j = 0 ; j < block_size ; ++j) - { - result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); - } - - break; - case 1: - - for (int j = 0 ; j < block_size ; ++j) - { - float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - - break; - case 2: - - for (int j = 0 ; j < block_size ; ++j) - { - result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]); - } - - break; - } - - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/) - { - unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); - - if (ind < bestTrainIdx_cols) - { - //bestImgIdx = imgIdx; - bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; - bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; - } - } -} - -//radius_match -__kernel void BruteForceMatch_RadiusMatch( - __global float *query, - __global float *train, - float maxDistance, - //__global float *mask, - __global int *bestTrainIdx, - __global float *bestDistance, - __global int *nMatches, - __local float *sharebuffer, - int block_size, - int query_rows, - int query_cols, - int train_rows, - int train_cols, - int bestTrainIdx_cols, - int step, - int ostep, - int distType -) -{ - const int lidx = get_local_id(0); - const int lidy = get_local_id(1); - const int groupidx = get_group_id(0); - const int groupidy = get_group_id(1); - - const int queryIdx = groupidy * block_size + lidy; - const int trainIdx = groupidx * block_size + lidx; - - __local float *s_query = sharebuffer; - __local float *s_train = sharebuffer + block_size * block_size; - - float result = 0; - - for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i) - { - //load a block_size * block_size block into local train. - const int loadx = lidx + i * block_size; - - s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; - s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; - - //synchronize to make sure each elem for reduceIteration in share memory is written already. - barrier(CLK_LOCAL_MEM_FENCE); - - /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch (distType) - { - case 0: - - for (int j = 0 ; j < block_size ; ++j) - { - result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); - } - - break; - case 1: - - for (int j = 0 ; j < block_size ; ++j) - { - float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - - break; - case 2: - - for (int j = 0 ; j < block_size ; ++j) - { - result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]); - } - - break; - } - - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/) - { - unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); - - if (ind < bestTrainIdx_cols) - { - //bestImgIdx = imgIdx; - bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; - bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; - } - } -} - - -__kernel void BruteForceMatch_knnUnrollMatch( - __global float *query, - __global float *train, - //__global float *mask, - __global int2 *bestTrainIdx, - __global float2 *bestDistance, - __local float *sharebuffer, - int block_size, - int max_desc_len, - int query_rows, - int query_cols, - int train_rows, - int train_cols, - int step, - int distType -) -{ - const int lidx = get_local_id(0); - const int lidy = get_local_id(1); - const int groupidx = get_group_id(0); - - const int queryIdx = groupidx * block_size + lidy; - local float *s_query = sharebuffer; - local float *s_train = sharebuffer + block_size * max_desc_len; - - // load the query into local memory. - for (int i = 0 ; i < max_desc_len / block_size; i ++) - { - int loadx = lidx + i * block_size; - s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; - } - - float myBestDistance1 = MAX_FLOAT; - float myBestDistance2 = MAX_FLOAT; - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; - - //loopUnrolledCached - volatile int imgIdx = 0; - - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) - { - float result = 0; - - for (int i = 0 ; i < max_desc_len / block_size ; i++) - { - const int loadX = lidx + i * block_size; - //load a block_size * block_size block into local train. - const int loadx = lidx + i * block_size; - s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; - - //synchronize to make sure each elem for reduceIteration in share memory is written already. - barrier(CLK_LOCAL_MEM_FENCE); - - /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch (distType) - { - case 0: - - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]); - } - - break; - case 1: - - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - - break; - case 2: - - for (int j = 0 ; j < block_size ; j++) - { - //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]); - } - - break; - } - - barrier(CLK_LOCAL_MEM_FENCE); - } - - const int trainIdx = t * block_size + lidx; - - if (queryIdx < query_rows && trainIdx < train_rows) - { - if (result < myBestDistance1) - { - myBestDistance2 = myBestDistance1; - myBestTrainIdx2 = myBestTrainIdx1; - myBestDistance1 = result; - myBestTrainIdx1 = trainIdx; - } - else if (result < myBestDistance2) - { - myBestDistance2 = result; - myBestTrainIdx2 = trainIdx; - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - local float *s_distance = (local float *)sharebuffer; - local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size); - - // find BestMatch - s_distance += lidy * block_size; - s_trainIdx += lidy * block_size; - - s_distance[lidx] = myBestDistance1; - s_trainIdx[lidx] = myBestTrainIdx1; - - float bestDistance1 = MAX_FLOAT; - float bestDistance2 = MAX_FLOAT; - int bestTrainIdx1 = -1; - int bestTrainIdx2 = -1; - barrier(CLK_LOCAL_MEM_FENCE); - - if (lidx == 0) - { - for (int i = 0 ; i < block_size ; i++) - { - float val = s_distance[i]; - - if (val < bestDistance1) - { - bestDistance2 = bestDistance1; - bestTrainIdx2 = bestTrainIdx1; - - bestDistance1 = val; - bestTrainIdx1 = s_trainIdx[i]; - } - else if (val < bestDistance2) - { - bestDistance2 = val; - bestTrainIdx2 = s_trainIdx[i]; - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - s_distance[lidx] = myBestDistance2; - s_trainIdx[lidx] = myBestTrainIdx2; - - barrier(CLK_LOCAL_MEM_FENCE); - - if (lidx == 0) - { - for (int i = 0 ; i < block_size ; i++) - { - float val = s_distance[i]; - - if (val < bestDistance2) - { - bestDistance2 = val; - bestTrainIdx2 = s_trainIdx[i]; - } - } - } - - myBestDistance1 = bestDistance1; - myBestDistance2 = bestDistance2; - - myBestTrainIdx1 = bestTrainIdx1; - myBestTrainIdx2 = bestTrainIdx2; - - if (queryIdx < query_rows && lidx == 0) - { - bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2); - bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2); - } -} - -__kernel void BruteForceMatch_knnMatch( - __global float *query, - __global float *train, - //__global float *mask, - __global int2 *bestTrainIdx, - __global float2 *bestDistance, - __local float *sharebuffer, - int block_size, - int query_rows, - int query_cols, - int train_rows, - int train_cols, - int step, - int distType -) -{ - const int lidx = get_local_id(0); - const int lidy = get_local_id(1); - const int groupidx = get_group_id(0); - - const int queryIdx = groupidx * block_size + lidy; - local float *s_query = sharebuffer; - local float *s_train = sharebuffer + block_size * block_size; - - float myBestDistance1 = MAX_FLOAT; - float myBestDistance2 = MAX_FLOAT; - int myBestTrainIdx1 = -1; - int myBestTrainIdx2 = -1; - - //loop - for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) - { - float result = 0.0f; - - for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++) - { - const int loadx = lidx + i * block_size; - //load query and train into local memory - s_query[lidy * block_size + lidx] = 0; - s_train[lidx * block_size + lidy] = 0; - - if (loadx < query_cols) - { - s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; - s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to - sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ - - switch (distType) - { - case 0: - - for (int j = 0 ; j < block_size ; j++) - { - result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); - } - - break; - case 1: - - for (int j = 0 ; j < block_size ; j++) - { - float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; - result += qr * qr; - } - - break; - case 2: - - for (int j = 0 ; j < block_size ; j++) - { - //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); - result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]); - } - - break; - } - - barrier(CLK_LOCAL_MEM_FENCE); - } - - const int trainIdx = t * block_size + lidx; - - if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/) - { - if (result < myBestDistance1) - { - myBestDistance2 = myBestDistance1; - myBestTrainIdx2 = myBestTrainIdx1; - myBestDistance1 = result; - myBestTrainIdx1 = trainIdx; - } - else if (result < myBestDistance2) - { - myBestDistance2 = result; - myBestTrainIdx2 = trainIdx; - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - __local float *s_distance = (__local float *)sharebuffer; - __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); - - //findBestMatch - s_distance += lidy * block_size; - s_trainIdx += lidy * block_size; - - s_distance[lidx] = myBestDistance1; - s_trainIdx[lidx] = myBestTrainIdx1; - - float bestDistance1 = MAX_FLOAT; - float bestDistance2 = MAX_FLOAT; - int bestTrainIdx1 = -1; - int bestTrainIdx2 = -1; - barrier(CLK_LOCAL_MEM_FENCE); - - if (lidx == 0) - { - for (int i = 0 ; i < block_size ; i++) - { - float val = s_distance[i]; - - if (val < bestDistance1) - { - bestDistance2 = bestDistance1; - bestTrainIdx2 = bestTrainIdx1; - - bestDistance1 = val; - bestTrainIdx1 = s_trainIdx[i]; - } - else if (val < bestDistance2) - { - bestDistance2 = val; - bestTrainIdx2 = s_trainIdx[i]; - } - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - s_distance[lidx] = myBestDistance2; - s_trainIdx[lidx] = myBestTrainIdx2; - - barrier(CLK_LOCAL_MEM_FENCE); - - if (lidx == 0) - { - for (int i = 0 ; i < block_size ; i++) - { - float val = s_distance[i]; - - if (val < bestDistance2) - { - bestDistance2 = val; - bestTrainIdx2 = s_trainIdx[i]; - } - } - } - - myBestDistance1 = bestDistance1; - myBestDistance2 = bestDistance2; - - myBestTrainIdx1 = bestTrainIdx1; - myBestTrainIdx2 = bestTrainIdx2; - - if (queryIdx < query_rows && lidx == 0) - { - bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2); - bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2); - } -} - -kernel void BruteForceMatch_calcDistanceUnrolled( - __global float *query, - __global float *train, - //__global float *mask, - __global float *allDist, - __local float *sharebuffer, - int block_size, - int max_desc_len, - int query_rows, - int query_cols, - int train_rows, - int train_cols, - int step, - int distType) -{ - /* Todo */ -} - -kernel void BruteForceMatch_calcDistance( - __global float *query, - __global float *train, - //__global float *mask, - __global float *allDist, - __local float *sharebuffer, - int block_size, - int query_rows, - int query_cols, - int train_rows, - int train_cols, - int step, - int distType) -{ - /* Todo */ -} - -kernel void BruteForceMatch_findBestMatch( - __global float *allDist, - __global int *bestTrainIdx, - __global float *bestDistance, - int k, - int block_size -) -{ - /* Todo */ -} \ No newline at end of file diff --git a/modules/ocl/src/kernels/arithm_2_mat.cl b/modules/ocl/src/opencl/arithm_2_mat.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_2_mat.cl rename to modules/ocl/src/opencl/arithm_2_mat.cl diff --git a/modules/ocl/src/kernels/arithm_LUT.cl b/modules/ocl/src/opencl/arithm_LUT.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_LUT.cl rename to modules/ocl/src/opencl/arithm_LUT.cl diff --git a/modules/ocl/src/kernels/arithm_absdiff.cl b/modules/ocl/src/opencl/arithm_absdiff.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_absdiff.cl rename to modules/ocl/src/opencl/arithm_absdiff.cl diff --git a/modules/ocl/src/kernels/arithm_add.cl b/modules/ocl/src/opencl/arithm_add.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_add.cl rename to modules/ocl/src/opencl/arithm_add.cl diff --git a/modules/ocl/src/kernels/arithm_addWeighted.cl b/modules/ocl/src/opencl/arithm_addWeighted.cl similarity index 95% rename from modules/ocl/src/kernels/arithm_addWeighted.cl rename to modules/ocl/src/opencl/arithm_addWeighted.cl index 7e9df6f253..d76f994aa0 100644 --- a/modules/ocl/src/kernels/arithm_addWeighted.cl +++ b/modules/ocl/src/opencl/arithm_addWeighted.cl @@ -61,29 +61,29 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset int y = get_global_id(1); if (x < cols && y < rows) - + { x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - uchar4 src1_data ,src2_data; + uchar4 src1_data ,src2_data; - src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0; - src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0; - src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0; - src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0; + src1_data.x= src1_index+0 >= 0 ? src1[src1_index+0] : 0; + src1_data.y= src1_index+1 >= 0 ? src1[src1_index+1] : 0; + src1_data.z= src1_index+2 >= 0 ? src1[src1_index+2] : 0; + src1_data.w= src1_index+3 >= 0 ? src1[src1_index+3] : 0; - src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0; - src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0; - src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0; - src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0; + src2_data.x= src2_index+0 >= 0 ? src2[src2_index+0] : 0; + src2_data.y= src2_index+1 >= 0 ? src2[src2_index+1] : 0; + src2_data.z= src2_index+2 >= 0 ? src2[src2_index+2] : 0; + src2_data.w= src2_index+3 >= 0 ? src2[src2_index+3] : 0; uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); // short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama; @@ -117,14 +117,14 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs int y = get_global_id(1); if (x < cols && y < rows) - + { x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -177,14 +177,14 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse int y = get_global_id(1); if (x < cols && y < rows) - + { x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -236,18 +236,18 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset, int y = get_global_id(1); if (x < cols && y < rows) - + { - + x = x << 2; #define bitOfInt (sizeof(int)== 4 ? 2: 3) #define dst_align ((dst_offset >> bitOfInt) & 3) - int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt)); - int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt)); - + int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt)); + int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt)); + int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt)); @@ -256,7 +256,7 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset, int src2_index_fix = src2_index < 0 ? 0 : src2_index; int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix)); - + if(src1_index < 0) { int4 tmp; @@ -299,16 +299,16 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset int y = get_global_id(1); if (x < cols && y < rows) - + { - + x = x << 2; #define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); - + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2)); @@ -361,16 +361,16 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs int y = get_global_id(1); if (x < cols && y < rows) - + { - + x = x << 2; #define dst_align ((dst_offset >> 3) & 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); - + int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); + int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); + int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3)); diff --git a/modules/ocl/src/kernels/arithm_add_scalar.cl b/modules/ocl/src/opencl/arithm_add_scalar.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_add_scalar.cl rename to modules/ocl/src/opencl/arithm_add_scalar.cl diff --git a/modules/ocl/src/kernels/arithm_add_scalar_mask.cl b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_add_scalar_mask.cl rename to modules/ocl/src/opencl/arithm_add_scalar_mask.cl diff --git a/modules/ocl/src/kernels/arithm_bitwise_and.cl b/modules/ocl/src/opencl/arithm_bitwise_and.cl similarity index 95% rename from modules/ocl/src/kernels/arithm_bitwise_and.cl rename to modules/ocl/src/opencl/arithm_bitwise_and.cl index f954452b1f..8adc56de5f 100644 --- a/modules/ocl/src/kernels/arithm_bitwise_and.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_and.cl @@ -63,8 +63,8 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -75,14 +75,14 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr uchar4 src2_data = vload4(0, src2 + src2_index_fix); if(src1_index < 0) - { + { uchar4 tmp; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; + } + if(src2_index < 0) + { + uchar4 tmp; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; } @@ -113,8 +113,8 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -126,14 +126,14 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src char4 src2_data = vload4(0, src2 + src2_index_fix); if(src1_index < 0) - { + { char4 tmp; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - char4 tmp; + } + if(src2_index < 0) + { + char4 tmp; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; } @@ -164,8 +164,8 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -177,14 +177,14 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix)); if(src1_index < 0) - { + { ushort4 tmp; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - ushort4 tmp; + } + if(src2_index < 0) + { + ushort4 tmp; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; } @@ -216,8 +216,8 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -229,14 +229,14 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix)); if(src1_index < 0) - { + { short4 tmp; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - short4 tmp; + } + if(src2_index < 0) + { + short4 tmp; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; } @@ -320,4 +320,3 @@ __kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src } } #endif - diff --git a/modules/ocl/src/kernels/arithm_bitwise_and_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl similarity index 99% rename from modules/ocl/src/kernels/arithm_bitwise_and_mask.cl rename to modules/ocl/src/opencl/arithm_bitwise_and_mask.cl index d1f745ff29..595fb2ceb7 100644 --- a/modules/ocl/src/kernels/arithm_bitwise_and_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl @@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_and_with_mask_C4_D6 (__global char *src1, int src1_ } } #endif - diff --git a/modules/ocl/src/kernels/arithm_bitwise_and_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_bitwise_and_scalar.cl rename to modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl diff --git a/modules/ocl/src/kernels/arithm_bitwise_and_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl similarity index 99% rename from modules/ocl/src/kernels/arithm_bitwise_and_scalar_mask.cl rename to modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl index 50304aa34a..beafd7e0a7 100644 --- a/modules/ocl/src/kernels/arithm_bitwise_and_scalar_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl @@ -1055,4 +1055,3 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D6 (__global short *src1, int sr } } #endif - diff --git a/modules/ocl/src/kernels/arithm_bitwise_not.cl b/modules/ocl/src/opencl/arithm_bitwise_not.cl similarity index 99% rename from modules/ocl/src/kernels/arithm_bitwise_not.cl rename to modules/ocl/src/opencl/arithm_bitwise_not.cl index 64bcc1799a..fd9d2ccf99 100644 --- a/modules/ocl/src/kernels/arithm_bitwise_not.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_not.cl @@ -62,7 +62,7 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -72,7 +72,7 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = ~ src1_data; - + /* if(src1_index < 0) { uchar4 tmp; @@ -102,7 +102,7 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -136,7 +136,7 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -171,7 +171,7 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -245,14 +245,13 @@ __kernel void arithm_bitwise_not_D6 (__global char *src, int src_step, int src_o { int src_index = mad24(y, src_step, (x << 3) + src_offset); int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); - + char8 data; data = *((__global char8 *)((__global char *)src + src_index)); data = ~ data; - + *((__global char8 *)((__global char *)dst + dst_index)) = data; } } #endif - diff --git a/modules/ocl/src/kernels/arithm_bitwise_or.cl b/modules/ocl/src/opencl/arithm_bitwise_or.cl similarity index 98% rename from modules/ocl/src/kernels/arithm_bitwise_or.cl rename to modules/ocl/src/opencl/arithm_bitwise_or.cl index 01e3a2f998..a95e59e0ca 100644 --- a/modules/ocl/src/kernels/arithm_bitwise_or.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_or.cl @@ -63,8 +63,8 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -111,8 +111,8 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1 x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -148,8 +148,8 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -186,8 +186,8 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -276,4 +276,3 @@ __kernel void arithm_bitwise_or_D6 (__global char *src1, int src1_step, int src1 } } #endif - diff --git a/modules/ocl/src/kernels/arithm_bitwise_or_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl similarity index 99% rename from modules/ocl/src/kernels/arithm_bitwise_or_mask.cl rename to modules/ocl/src/opencl/arithm_bitwise_or_mask.cl index 92d98ec01c..aedb68c474 100644 --- a/modules/ocl/src/kernels/arithm_bitwise_or_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl @@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_or_with_mask_C4_D6 (__global char *src1, int src1_s } } #endif - diff --git a/modules/ocl/src/kernels/arithm_bitwise_or_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl similarity index 99% rename from modules/ocl/src/kernels/arithm_bitwise_or_scalar.cl rename to modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl index bbd5f3fb2e..5b94591a30 100644 --- a/modules/ocl/src/kernels/arithm_bitwise_or_scalar.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl @@ -911,4 +911,3 @@ __kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, in } } #endif - diff --git a/modules/ocl/src/kernels/arithm_bitwise_or_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl similarity index 99% rename from modules/ocl/src/kernels/arithm_bitwise_or_scalar_mask.cl rename to modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl index 153398706f..54066c21a0 100644 --- a/modules/ocl/src/kernels/arithm_bitwise_or_scalar_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl @@ -1078,4 +1078,3 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D6 (__global short *src1, int src } } #endif - diff --git a/modules/ocl/src/kernels/arithm_bitwise_xor.cl b/modules/ocl/src/opencl/arithm_bitwise_xor.cl similarity index 95% rename from modules/ocl/src/kernels/arithm_bitwise_xor.cl rename to modules/ocl/src/opencl/arithm_bitwise_xor.cl index 6e83ef50ec..4f743776a4 100644 --- a/modules/ocl/src/kernels/arithm_bitwise_xor.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_xor.cl @@ -63,8 +63,8 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -76,14 +76,14 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr uchar4 src2_data = vload4(0, src2 + src2_index_fix); if(src1_index < 0) - { + { uchar4 tmp; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; + } + if(src2_index < 0) + { + uchar4 tmp; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; } @@ -113,8 +113,8 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -126,14 +126,14 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src char4 src2_data = vload4(0, src2 + src2_index_fix); if(src1_index < 0) - { + { char4 tmp; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - char4 tmp; + } + if(src2_index < 0) + { + char4 tmp; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; } @@ -164,8 +164,8 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -177,14 +177,14 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix)); if(src1_index < 0) - { + { ushort4 tmp; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - ushort4 tmp; + } + if(src2_index < 0) + { + ushort4 tmp; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; } @@ -216,8 +216,8 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); @@ -231,14 +231,14 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); if(src1_index < 0) - { + { short4 tmp; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - short4 tmp; + } + if(src2_index < 0) + { + short4 tmp; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; } @@ -324,4 +324,3 @@ __kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src } } #endif - diff --git a/modules/ocl/src/kernels/arithm_bitwise_xor_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl similarity index 99% rename from modules/ocl/src/kernels/arithm_bitwise_xor_mask.cl rename to modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl index 248654ef74..4359d860a5 100644 --- a/modules/ocl/src/kernels/arithm_bitwise_xor_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl @@ -1135,4 +1135,3 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D6 (__global char *src1, int src1_ } } #endif - diff --git a/modules/ocl/src/kernels/arithm_bitwise_xor_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_bitwise_xor_scalar.cl rename to modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl diff --git a/modules/ocl/src/kernels/arithm_bitwise_xor_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl similarity index 99% rename from modules/ocl/src/kernels/arithm_bitwise_xor_scalar_mask.cl rename to modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl index 4efa2dac6c..57ad9ee713 100644 --- a/modules/ocl/src/kernels/arithm_bitwise_xor_scalar_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl @@ -1055,4 +1055,3 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D6 (__global short *src1, int sr } } #endif - diff --git a/modules/ocl/src/kernels/arithm_cartToPolar.cl b/modules/ocl/src/opencl/arithm_cartToPolar.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_cartToPolar.cl rename to modules/ocl/src/opencl/arithm_cartToPolar.cl diff --git a/modules/ocl/src/kernels/arithm_compare_eq.cl b/modules/ocl/src/opencl/arithm_compare_eq.cl similarity index 74% rename from modules/ocl/src/kernels/arithm_compare_eq.cl rename to modules/ocl/src/opencl/arithm_compare_eq.cl index 1db0b7dd14..f818532ba2 100644 --- a/modules/ocl/src/kernels/arithm_compare_eq.cl +++ b/modules/ocl/src/opencl/arithm_compare_eq.cl @@ -63,31 +63,31 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; uchar4 src1_data = vload4(0, src1 + src1_index_fix); uchar4 src2_data = vload4(0, src2 + src2_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - + if(src1_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + + + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); @@ -115,29 +115,29 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 1)& 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } + ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); @@ -166,32 +166,32 @@ __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - short4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - short4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - - + short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + short4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + short4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + + + + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); @@ -215,32 +215,32 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_ int y = get_global_id(1); if (x < cols && y < rows) - { + { x = x << 2; #define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - int4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - int4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } + if(src1_index < 0) + { + int4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + int4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); @@ -266,22 +266,22 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src { x = x << 2; #define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src2_index < 0) - { - float4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src2_index < 0) + { + float4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); @@ -308,29 +308,29 @@ __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int sr { x = x << 2; #define dst_align ((dst_offset >> 3) & 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); + int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); + int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - double4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - double4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + if(src1_index < 0) + { + double4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + double4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data == src2_data)); @@ -359,31 +359,31 @@ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; uchar4 src1_data = vload4(0, src1 + src1_index_fix); uchar4 src2_data = vload4(0, src2 + src2_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - + if(src1_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + + + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); @@ -410,31 +410,31 @@ __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - + ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + + + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); @@ -463,29 +463,29 @@ __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - short4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - short4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + short4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + short4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); @@ -512,31 +512,31 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_ { x = x << 2; #define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - int4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - int4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - + if(src1_index < 0) + { + int4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + int4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); @@ -561,29 +561,29 @@ __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src { x = x << 2; #define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - float4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - float4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + if(src1_index < 0) + { + float4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + float4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); @@ -610,29 +610,29 @@ __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int sr { x = x << 2; #define dst_align ((dst_offset >> 3) & 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); + int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); + int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); - double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - double4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - double4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + double4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + double4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data > src2_data)); @@ -661,30 +661,30 @@ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; uchar4 src1_data = vload4(0, src1 + src1_index_fix); uchar4 src2_data = vload4(0, src2 + src2_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + if(src1_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); @@ -715,30 +715,30 @@ __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + @@ -770,30 +770,30 @@ __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src x = x << 2; #define dst_align ((dst_offset >> 1)& 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - short4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - short4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + short4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + short4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); @@ -821,30 +821,30 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_ x = x << 2; #define dst_align ((dst_offset >> 2)& 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - int4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - int4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } + if(src1_index < 0) + { + int4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + int4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); @@ -870,30 +870,30 @@ __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src x = x << 2; #define dst_align ((dst_offset >> 2)& 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - - float4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - float4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } + if(src1_index < 0) + { + + float4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + float4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); @@ -921,28 +921,28 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 3)& 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); + int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); + int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); - double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - double4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - double4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); + double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + double4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + double4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; @@ -954,4 +954,3 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr } } #endif - diff --git a/modules/ocl/src/kernels/arithm_compare_ne.cl b/modules/ocl/src/opencl/arithm_compare_ne.cl similarity index 73% rename from modules/ocl/src/kernels/arithm_compare_ne.cl rename to modules/ocl/src/opencl/arithm_compare_ne.cl index 1c5063a460..713dc13169 100644 --- a/modules/ocl/src/kernels/arithm_compare_ne.cl +++ b/modules/ocl/src/opencl/arithm_compare_ne.cl @@ -59,29 +59,29 @@ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; uchar4 src1_data = vload4(0, src1 + src1_index_fix); uchar4 src2_data = vload4(0, src2 + src2_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + if(src1_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); @@ -111,29 +111,29 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 1)& 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } + ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); @@ -163,29 +163,29 @@ __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src x = x << 2; #define dst_align ((dst_offset >> 1)& 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - short4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - short4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + short4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + short4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); @@ -211,30 +211,30 @@ __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_ { x = x << 2; #define dst_align ((dst_offset >> 2)& 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - int4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - int4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } + if(src1_index < 0) + { + int4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + int4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); @@ -260,28 +260,28 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src { x = x << 2; #define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src1_index < 0) - { - float4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - float4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src1_index < 0) + { + float4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + float4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); @@ -307,29 +307,29 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr { x = x << 2; #define dst_align ((dst_offset >> 3) & 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); + int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); + int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); - double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - double4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - double4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + double4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + double4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); @@ -344,7 +344,7 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr } #endif - + /***********************************Compare LT*******************************/ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset, __global uchar *src2, int src2_step, int src2_offset, @@ -359,29 +359,29 @@ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; uchar4 src1_data = vload4(0, src1 + src1_index_fix); uchar4 src2_data = vload4(0, src2 + src2_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + if(src1_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); @@ -411,30 +411,30 @@ __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); @@ -464,29 +464,29 @@ __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - short4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - short4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + short4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + short4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); @@ -513,34 +513,34 @@ __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_ { x = x << 2; #define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - int4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - int4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - - - - + if(src1_index < 0) + { + int4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + int4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + + + + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); @@ -565,29 +565,29 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src { x = x << 2; #define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - float4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - float4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + if(src1_index < 0) + { + float4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + float4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); @@ -614,29 +614,29 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr { x = x << 2; #define dst_align ((dst_offset >> 3) & 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); + int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); + int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); - double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - double4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - double4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + double4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + double4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); @@ -665,29 +665,29 @@ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src x = x << 2; #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); + int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; uchar4 src1_data = vload4(0, src1 + src1_index_fix); uchar4 src2_data = vload4(0, src2 + src2_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + if(src1_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); @@ -718,29 +718,29 @@ __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int sr x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index)); - ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); @@ -771,29 +771,29 @@ __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src x = x << 2; #define dst_align ((dst_offset >> 1) & 3) - int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); - int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); + int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); + int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index)); - short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - short4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - short4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index)); + if(src1_index < 0) + { + short4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + short4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); @@ -820,29 +820,29 @@ __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_ { x = x << 2; #define dst_align ((dst_offset >> 2)& 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); - if(src1_index < 0) - { - int4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - int4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } + if(src1_index < 0) + { + int4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + int4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data =convert_uchar4((src1_data <= src2_data)); @@ -868,28 +868,28 @@ __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src { x = x << 2; #define dst_align ((dst_offset >> 2)& 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - float4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - float4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } + float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + float4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + float4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data <= src2_data)); @@ -916,29 +916,29 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr { x = x << 2; #define dst_align ((dst_offset >> 3)& 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); + int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); + int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); - double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - double4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - double4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - + double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) + { + double4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + double4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data <= src2_data)); @@ -952,5 +952,3 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr } } #endif - - diff --git a/modules/ocl/src/kernels/arithm_div.cl b/modules/ocl/src/opencl/arithm_div.cl similarity index 99% rename from modules/ocl/src/kernels/arithm_div.cl rename to modules/ocl/src/opencl/arithm_div.cl index 54fe3cdc15..dcbe303106 100644 --- a/modules/ocl/src/kernels/arithm_div.cl +++ b/modules/ocl/src/opencl/arithm_div.cl @@ -455,5 +455,3 @@ __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offse } } #endif - - diff --git a/modules/ocl/src/kernels/arithm_exp.cl b/modules/ocl/src/opencl/arithm_exp.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_exp.cl rename to modules/ocl/src/opencl/arithm_exp.cl diff --git a/modules/ocl/src/kernels/arithm_flip.cl b/modules/ocl/src/opencl/arithm_flip.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_flip.cl rename to modules/ocl/src/opencl/arithm_flip.cl diff --git a/modules/ocl/src/kernels/arithm_flip_rc.cl b/modules/ocl/src/opencl/arithm_flip_rc.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_flip_rc.cl rename to modules/ocl/src/opencl/arithm_flip_rc.cl diff --git a/modules/ocl/src/kernels/arithm_log.cl b/modules/ocl/src/opencl/arithm_log.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_log.cl rename to modules/ocl/src/opencl/arithm_log.cl diff --git a/modules/ocl/src/kernels/arithm_magnitude.cl b/modules/ocl/src/opencl/arithm_magnitude.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_magnitude.cl rename to modules/ocl/src/opencl/arithm_magnitude.cl diff --git a/modules/ocl/src/kernels/arithm_magnitudeSqr.cl b/modules/ocl/src/opencl/arithm_magnitudeSqr.cl similarity index 98% rename from modules/ocl/src/kernels/arithm_magnitudeSqr.cl rename to modules/ocl/src/opencl/arithm_magnitudeSqr.cl index f1d0aa5733..3fd697ff1f 100644 --- a/modules/ocl/src/kernels/arithm_magnitudeSqr.cl +++ b/modules/ocl/src/opencl/arithm_magnitudeSqr.cl @@ -60,17 +60,17 @@ __kernel void magnitudeSqr_C1_D5 (__global float *src1,int src1_step,int src1_of int y = get_global_id(1); if (x < cols && y < rows) - + { - + x = x << 2; #define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); - int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); - + int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); + int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); + int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2)); @@ -125,16 +125,16 @@ __kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_of int y = get_global_id(1); if (x < cols && y < rows) - + { - + x = x << 2; #define dst_align ((dst_offset >> 2) & 3) - int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); - + int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); + int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2)); @@ -148,8 +148,8 @@ __kernel void magnitudeSqr_C2_D5 (__global float *src1,int src1_step,int src1_of src1_data.s01234567 = src1_data.s45670123; if(src1_index== -2) src1_data.s01234567 = src1_data.s23456701; - - + + float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); diff --git a/modules/ocl/src/kernels/arithm_minMax.cl b/modules/ocl/src/opencl/arithm_minMax.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_minMax.cl rename to modules/ocl/src/opencl/arithm_minMax.cl diff --git a/modules/ocl/src/kernels/arithm_minMaxLoc.cl b/modules/ocl/src/opencl/arithm_minMaxLoc.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_minMaxLoc.cl rename to modules/ocl/src/opencl/arithm_minMaxLoc.cl diff --git a/modules/ocl/src/kernels/arithm_minMaxLoc_mask.cl b/modules/ocl/src/opencl/arithm_minMaxLoc_mask.cl similarity index 99% rename from modules/ocl/src/kernels/arithm_minMaxLoc_mask.cl rename to modules/ocl/src/opencl/arithm_minMaxLoc_mask.cl index f87b928cec..0af4f7ba03 100644 --- a/modules/ocl/src/kernels/arithm_minMaxLoc_mask.cl +++ b/modules/ocl/src/opencl/arithm_minMaxLoc_mask.cl @@ -240,4 +240,3 @@ __kernel void arithm_op_minMaxLoc_mask (int cols,int invalid_cols,int offset,int dst[gid + 3 * groupnum] = CONVERT_RES_TYPE(lm_maxloc[0]); } } - diff --git a/modules/ocl/src/kernels/arithm_minMax_mask.cl b/modules/ocl/src/opencl/arithm_minMax_mask.cl similarity index 99% rename from modules/ocl/src/kernels/arithm_minMax_mask.cl rename to modules/ocl/src/opencl/arithm_minMax_mask.cl index 4097762331..734ccab750 100644 --- a/modules/ocl/src/kernels/arithm_minMax_mask.cl +++ b/modules/ocl/src/opencl/arithm_minMax_mask.cl @@ -194,4 +194,3 @@ __kernel void arithm_op_minMax_mask (int cols,int invalid_cols,int offset,int el dst[gid + groupnum] = localmem_max[0]; } } - diff --git a/modules/ocl/src/kernels/arithm_mul.cl b/modules/ocl/src/opencl/arithm_mul.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_mul.cl rename to modules/ocl/src/opencl/arithm_mul.cl diff --git a/modules/ocl/src/kernels/arithm_nonzero.cl b/modules/ocl/src/opencl/arithm_nonzero.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_nonzero.cl rename to modules/ocl/src/opencl/arithm_nonzero.cl diff --git a/modules/ocl/src/kernels/arithm_phase.cl b/modules/ocl/src/opencl/arithm_phase.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_phase.cl rename to modules/ocl/src/opencl/arithm_phase.cl diff --git a/modules/ocl/src/kernels/arithm_polarToCart.cl b/modules/ocl/src/opencl/arithm_polarToCart.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_polarToCart.cl rename to modules/ocl/src/opencl/arithm_polarToCart.cl diff --git a/modules/ocl/src/kernels/arithm_pow.cl b/modules/ocl/src/opencl/arithm_pow.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_pow.cl rename to modules/ocl/src/opencl/arithm_pow.cl diff --git a/modules/ocl/src/kernels/arithm_sub.cl b/modules/ocl/src/opencl/arithm_sub.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_sub.cl rename to modules/ocl/src/opencl/arithm_sub.cl diff --git a/modules/ocl/src/kernels/arithm_sub_scalar.cl b/modules/ocl/src/opencl/arithm_sub_scalar.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_sub_scalar.cl rename to modules/ocl/src/opencl/arithm_sub_scalar.cl diff --git a/modules/ocl/src/kernels/arithm_sub_scalar_mask.cl b/modules/ocl/src/opencl/arithm_sub_scalar_mask.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_sub_scalar_mask.cl rename to modules/ocl/src/opencl/arithm_sub_scalar_mask.cl diff --git a/modules/ocl/src/kernels/arithm_sum.cl b/modules/ocl/src/opencl/arithm_sum.cl similarity index 99% rename from modules/ocl/src/kernels/arithm_sum.cl rename to modules/ocl/src/opencl/arithm_sum.cl index d29a71c699..280b0a5111 100644 --- a/modules/ocl/src/kernels/arithm_sum.cl +++ b/modules/ocl/src/opencl/arithm_sum.cl @@ -203,4 +203,3 @@ __kernel void arithm_op_sum (int cols,int invalid_cols,int offset,int elemnum,in dst[gid] = localmem_sum[0]; } } - diff --git a/modules/ocl/src/kernels/arithm_sum_3.cl b/modules/ocl/src/opencl/arithm_sum_3.cl similarity index 99% rename from modules/ocl/src/kernels/arithm_sum_3.cl rename to modules/ocl/src/opencl/arithm_sum_3.cl index 1401889a73..3f6ed08803 100644 --- a/modules/ocl/src/kernels/arithm_sum_3.cl +++ b/modules/ocl/src/opencl/arithm_sum_3.cl @@ -245,4 +245,3 @@ __kernel void arithm_op_sum_3 (int cols,int invalid_cols,int offset,int elemnum, dst[gid*3+2] = localmem_sum3[0]; } } - diff --git a/modules/ocl/src/kernels/arithm_transpose.cl b/modules/ocl/src/opencl/arithm_transpose.cl similarity index 100% rename from modules/ocl/src/kernels/arithm_transpose.cl rename to modules/ocl/src/opencl/arithm_transpose.cl diff --git a/modules/ocl/src/kernels/blend_linear.cl b/modules/ocl/src/opencl/blend_linear.cl similarity index 98% rename from modules/ocl/src/kernels/blend_linear.cl rename to modules/ocl/src/opencl/blend_linear.cl index 06bde2f5c1..50c5c39c5f 100644 --- a/modules/ocl/src/kernels/blend_linear.cl +++ b/modules/ocl/src/opencl/blend_linear.cl @@ -15,7 +15,7 @@ // Third party copyrights are property of their respective owners. // // @Authors -// Liu Liujun, liujun@multicorewareinc.com +// Liu Liujun, liujun@multicorewareinc.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -61,7 +61,7 @@ __kernel void BlendLinear_C1_D0( int pos = mad24(idy,istep >> 2,idx); int wpos = mad24(idy,wstep >> 2,idx); float4 w1 = weight1[wpos], w2 = weight2[wpos]; - dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 + + dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 + convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f)); } } @@ -86,7 +86,7 @@ __kernel void BlendLinear_C4_D0( int wpos = mad24(idy,wstep, idx); float w1 = weight1[wpos]; float w2 = weight2[wpos]; - dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 + + dst[pos] = convert_uchar4((convert_float4(img1[pos]) * w1 + convert_float4(img2[pos]) * w2) / (w1 + w2 + 1e-5f)); } } @@ -138,4 +138,3 @@ __kernel void BlendLinear_C4_D5( dst[pos] = (img1[pos] * w1 + img2[pos] * w2) / (w1 + w2 + 1e-5f); } } - diff --git a/modules/ocl/src/opencl/brute_force_match.cl b/modules/ocl/src/opencl/brute_force_match.cl new file mode 100644 index 0000000000..0730ac5ac7 --- /dev/null +++ b/modules/ocl/src/opencl/brute_force_match.cl @@ -0,0 +1,865 @@ +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable +#define MAX_FLOAT 1e7f + +int bit1Count(float x) +{ + int c = 0; + int ix = (int)x; + + for (int i = 0 ; i < 32 ; i++) + { + c += ix & 0x1; + ix >>= 1; + } + + return (float)c; +} +/* 2dim launch, global size: dim0 is (query rows + block_size - 1) / block_size * block_size, dim1 is block_size +local size: dim0 is block_size, dim1 is block_size. +*/ +__kernel void BruteForceMatch_UnrollMatch( + __global float *query, + __global float *train, + //__global float *mask, + __global int *bestTrainIdx, + __global float *bestDistance, + __local float *sharebuffer, + int block_size, + int max_desc_len, + int query_rows, + int query_cols, + int train_rows, + int train_cols, + int step, + int distType +) +{ + const int lidx = get_local_id(0); + const int lidy = get_local_id(1); + const int groupidx = get_group_id(0); + + __local float *s_query = sharebuffer; + __local float *s_train = sharebuffer + block_size * max_desc_len; + + int queryIdx = groupidx * block_size + lidy; + + // load the query into local memory. + for (int i = 0 ; i < max_desc_len / block_size; i ++) + { + int loadx = lidx + i * block_size; + s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; + } + + float myBestDistance = MAX_FLOAT; + int myBestTrainIdx = -1; + + // loopUnrolledCached to find the best trainIdx and best distance. + volatile int imgIdx = 0; + + for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) + { + float result = 0; + + for (int i = 0 ; i < max_desc_len / block_size ; i++) + { + //load a block_size * block_size block into local train. + const int loadx = lidx + i * block_size; + s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; + + //synchronize to make sure each elem for reduceIteration in share memory is written already. + barrier(CLK_LOCAL_MEM_FENCE); + + /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to + sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ + + switch (distType) + { + case 0: + + for (int j = 0 ; j < block_size ; j++) + { + result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]); + } + + break; + case 1: + + for (int j = 0 ; j < block_size ; j++) + { + float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]; + result += qr * qr; + } + + break; + case 2: + + for (int j = 0 ; j < block_size ; j++) + { + //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); + result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]); + } + + break; + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + int trainIdx = t * block_size + lidx; + + if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance/* && mask(queryIdx, trainIdx)*/) + { + //bestImgIdx = imgIdx; + myBestDistance = result; + myBestTrainIdx = trainIdx; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + __local float *s_distance = (__local float *)(sharebuffer); + __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); + + //find BestMatch + s_distance += lidy * block_size; + s_trainIdx += lidy * block_size; + s_distance[lidx] = myBestDistance; + s_trainIdx[lidx] = myBestTrainIdx; + + barrier(CLK_LOCAL_MEM_FENCE); + + //reduce -- now all reduce implement in each threads. + for (int k = 0 ; k < block_size; k++) + { + if (myBestDistance > s_distance[k]) + { + myBestDistance = s_distance[k]; + myBestTrainIdx = s_trainIdx[k]; + } + } + + if (queryIdx < query_rows && lidx == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestDistance[queryIdx] = myBestDistance; + } +} + +__kernel void BruteForceMatch_Match( + __global float *query, + __global float *train, + //__global float *mask, + __global int *bestTrainIdx, + __global float *bestDistance, + __local float *sharebuffer, + int block_size, + int query_rows, + int query_cols, + int train_rows, + int train_cols, + int step, + int distType +) +{ + const int lidx = get_local_id(0); + const int lidy = get_local_id(1); + const int groupidx = get_group_id(0); + + const int queryIdx = groupidx * block_size + lidy; + + float myBestDistance = MAX_FLOAT; + int myBestTrainIdx = -1; + + __local float *s_query = sharebuffer; + __local float *s_train = sharebuffer + block_size * block_size; + + // loop + for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) + { + //Dist dist; + float result = 0; + + for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++) + { + const int loadx = lidx + i * block_size; + //load query and train into local memory + s_query[lidy * block_size + lidx] = 0; + s_train[lidx * block_size + lidy] = 0; + + if (loadx < query_cols) + { + s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; + s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to + sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ + + switch (distType) + { + case 0: + + for (int j = 0 ; j < block_size ; j++) + { + result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); + } + + break; + case 1: + + for (int j = 0 ; j < block_size ; j++) + { + float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; + result += qr * qr; + } + + break; + case 2: + + for (int j = 0 ; j < block_size ; j++) + { + //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); + result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]); + } + + break; + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + const int trainIdx = t * block_size + lidx; + + if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance /*&& mask(queryIdx, trainIdx)*/) + { + //myBestImgidx = imgIdx; + myBestDistance = result; + myBestTrainIdx = trainIdx; + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + __local float *s_distance = (__local float *)sharebuffer; + __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); + + //findBestMatch + s_distance += lidy * block_size; + s_trainIdx += lidy * block_size; + s_distance[lidx] = myBestDistance; + s_trainIdx[lidx] = myBestTrainIdx; + + barrier(CLK_LOCAL_MEM_FENCE); + + //reduce -- now all reduce implement in each threads. + for (int k = 0 ; k < block_size; k++) + { + if (myBestDistance > s_distance[k]) + { + myBestDistance = s_distance[k]; + myBestTrainIdx = s_trainIdx[k]; + } + } + + if (queryIdx < query_rows && lidx == 0) + { + bestTrainIdx[queryIdx] = myBestTrainIdx; + bestDistance[queryIdx] = myBestDistance; + } +} + +//radius_unrollmatch +__kernel void BruteForceMatch_RadiusUnrollMatch( + __global float *query, + __global float *train, + float maxDistance, + //__global float *mask, + __global int *bestTrainIdx, + __global float *bestDistance, + __global int *nMatches, + __local float *sharebuffer, + int block_size, + int max_desc_len, + int query_rows, + int query_cols, + int train_rows, + int train_cols, + int bestTrainIdx_cols, + int step, + int ostep, + int distType +) +{ + const int lidx = get_local_id(0); + const int lidy = get_local_id(1); + const int groupidx = get_group_id(0); + const int groupidy = get_group_id(1); + + const int queryIdx = groupidy * block_size + lidy; + const int trainIdx = groupidx * block_size + lidx; + + __local float *s_query = sharebuffer; + __local float *s_train = sharebuffer + block_size * block_size; + + float result = 0; + + for (int i = 0 ; i < max_desc_len / block_size ; ++i) + { + //load a block_size * block_size block into local train. + const int loadx = lidx + i * block_size; + + s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; + s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; + + //synchronize to make sure each elem for reduceIteration in share memory is written already. + barrier(CLK_LOCAL_MEM_FENCE); + + /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to + sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ + + switch (distType) + { + case 0: + + for (int j = 0 ; j < block_size ; ++j) + { + result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); + } + + break; + case 1: + + for (int j = 0 ; j < block_size ; ++j) + { + float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; + result += qr * qr; + } + + break; + case 2: + + for (int j = 0 ; j < block_size ; ++j) + { + result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]); + } + + break; + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/) + { + unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); + + if (ind < bestTrainIdx_cols) + { + //bestImgIdx = imgIdx; + bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; + bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; + } + } +} + +//radius_match +__kernel void BruteForceMatch_RadiusMatch( + __global float *query, + __global float *train, + float maxDistance, + //__global float *mask, + __global int *bestTrainIdx, + __global float *bestDistance, + __global int *nMatches, + __local float *sharebuffer, + int block_size, + int query_rows, + int query_cols, + int train_rows, + int train_cols, + int bestTrainIdx_cols, + int step, + int ostep, + int distType +) +{ + const int lidx = get_local_id(0); + const int lidy = get_local_id(1); + const int groupidx = get_group_id(0); + const int groupidy = get_group_id(1); + + const int queryIdx = groupidy * block_size + lidy; + const int trainIdx = groupidx * block_size + lidx; + + __local float *s_query = sharebuffer; + __local float *s_train = sharebuffer + block_size * block_size; + + float result = 0; + + for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; ++i) + { + //load a block_size * block_size block into local train. + const int loadx = lidx + i * block_size; + + s_query[lidy * block_size + lidx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; + s_train[lidx * block_size + lidy] = loadx < query_cols ? train[min(groupidx * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; + + //synchronize to make sure each elem for reduceIteration in share memory is written already. + barrier(CLK_LOCAL_MEM_FENCE); + + /* there are three types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to + sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ + + switch (distType) + { + case 0: + + for (int j = 0 ; j < block_size ; ++j) + { + result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); + } + + break; + case 1: + + for (int j = 0 ; j < block_size ; ++j) + { + float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; + result += qr * qr; + } + + break; + case 2: + + for (int j = 0 ; j < block_size ; ++j) + { + result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[j * block_size + lidx]); + } + + break; + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + if (queryIdx < query_rows && trainIdx < train_rows && result < maxDistance/* && mask(queryIdx, trainIdx)*/) + { + unsigned int ind = atom_inc(nMatches + queryIdx/*, (unsigned int) -1*/); + + if (ind < bestTrainIdx_cols) + { + //bestImgIdx = imgIdx; + bestTrainIdx[queryIdx * (ostep / sizeof(int)) + ind] = trainIdx; + bestDistance[queryIdx * (ostep / sizeof(float)) + ind] = result; + } + } +} + + +__kernel void BruteForceMatch_knnUnrollMatch( + __global float *query, + __global float *train, + //__global float *mask, + __global int2 *bestTrainIdx, + __global float2 *bestDistance, + __local float *sharebuffer, + int block_size, + int max_desc_len, + int query_rows, + int query_cols, + int train_rows, + int train_cols, + int step, + int distType +) +{ + const int lidx = get_local_id(0); + const int lidy = get_local_id(1); + const int groupidx = get_group_id(0); + + const int queryIdx = groupidx * block_size + lidy; + local float *s_query = sharebuffer; + local float *s_train = sharebuffer + block_size * max_desc_len; + + // load the query into local memory. + for (int i = 0 ; i < max_desc_len / block_size; i ++) + { + int loadx = lidx + i * block_size; + s_query[lidy * max_desc_len + loadx] = loadx < query_cols ? query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx] : 0; + } + + float myBestDistance1 = MAX_FLOAT; + float myBestDistance2 = MAX_FLOAT; + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + + //loopUnrolledCached + volatile int imgIdx = 0; + + for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) + { + float result = 0; + + for (int i = 0 ; i < max_desc_len / block_size ; i++) + { + const int loadX = lidx + i * block_size; + //load a block_size * block_size block into local train. + const int loadx = lidx + i * block_size; + s_train[lidx * block_size + lidy] = loadx < train_cols ? train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx] : 0; + + //synchronize to make sure each elem for reduceIteration in share memory is written already. + barrier(CLK_LOCAL_MEM_FENCE); + + /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to + sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ + + switch (distType) + { + case 0: + + for (int j = 0 ; j < block_size ; j++) + { + result += fabs(s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]); + } + + break; + case 1: + + for (int j = 0 ; j < block_size ; j++) + { + float qr = s_query[lidy * max_desc_len + i * block_size + j] - s_train[j * block_size + lidx]; + result += qr * qr; + } + + break; + case 2: + + for (int j = 0 ; j < block_size ; j++) + { + //result += popcount((uint)s_query[lidy * max_desc_len + i * block_size + j] ^ (uint)s_train[j * block_size + lidx]); + result += bit1Count((uint)s_query[lidy * max_desc_len + i * block_size + j] ^(uint)s_train[j * block_size + lidx]); + } + + break; + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + const int trainIdx = t * block_size + lidx; + + if (queryIdx < query_rows && trainIdx < train_rows) + { + if (result < myBestDistance1) + { + myBestDistance2 = myBestDistance1; + myBestTrainIdx2 = myBestTrainIdx1; + myBestDistance1 = result; + myBestTrainIdx1 = trainIdx; + } + else if (result < myBestDistance2) + { + myBestDistance2 = result; + myBestTrainIdx2 = trainIdx; + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + local float *s_distance = (local float *)sharebuffer; + local int *s_trainIdx = (local int *)(sharebuffer + block_size * block_size); + + // find BestMatch + s_distance += lidy * block_size; + s_trainIdx += lidy * block_size; + + s_distance[lidx] = myBestDistance1; + s_trainIdx[lidx] = myBestTrainIdx1; + + float bestDistance1 = MAX_FLOAT; + float bestDistance2 = MAX_FLOAT; + int bestTrainIdx1 = -1; + int bestTrainIdx2 = -1; + barrier(CLK_LOCAL_MEM_FENCE); + + if (lidx == 0) + { + for (int i = 0 ; i < block_size ; i++) + { + float val = s_distance[i]; + + if (val < bestDistance1) + { + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + + bestDistance1 = val; + bestTrainIdx1 = s_trainIdx[i]; + } + else if (val < bestDistance2) + { + bestDistance2 = val; + bestTrainIdx2 = s_trainIdx[i]; + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + s_distance[lidx] = myBestDistance2; + s_trainIdx[lidx] = myBestTrainIdx2; + + barrier(CLK_LOCAL_MEM_FENCE); + + if (lidx == 0) + { + for (int i = 0 ; i < block_size ; i++) + { + float val = s_distance[i]; + + if (val < bestDistance2) + { + bestDistance2 = val; + bestTrainIdx2 = s_trainIdx[i]; + } + } + } + + myBestDistance1 = bestDistance1; + myBestDistance2 = bestDistance2; + + myBestTrainIdx1 = bestTrainIdx1; + myBestTrainIdx2 = bestTrainIdx2; + + if (queryIdx < query_rows && lidx == 0) + { + bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2); + bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2); + } +} + +__kernel void BruteForceMatch_knnMatch( + __global float *query, + __global float *train, + //__global float *mask, + __global int2 *bestTrainIdx, + __global float2 *bestDistance, + __local float *sharebuffer, + int block_size, + int query_rows, + int query_cols, + int train_rows, + int train_cols, + int step, + int distType +) +{ + const int lidx = get_local_id(0); + const int lidy = get_local_id(1); + const int groupidx = get_group_id(0); + + const int queryIdx = groupidx * block_size + lidy; + local float *s_query = sharebuffer; + local float *s_train = sharebuffer + block_size * block_size; + + float myBestDistance1 = MAX_FLOAT; + float myBestDistance2 = MAX_FLOAT; + int myBestTrainIdx1 = -1; + int myBestTrainIdx2 = -1; + + //loop + for (int t = 0 ; t < (train_rows + block_size - 1) / block_size ; t++) + { + float result = 0.0f; + + for (int i = 0 ; i < (query_cols + block_size - 1) / block_size ; i++) + { + const int loadx = lidx + i * block_size; + //load query and train into local memory + s_query[lidy * block_size + lidx] = 0; + s_train[lidx * block_size + lidy] = 0; + + if (loadx < query_cols) + { + s_query[lidy * block_size + lidx] = query[min(queryIdx, query_rows - 1) * (step / sizeof(float)) + loadx]; + s_train[lidx * block_size + lidy] = train[min(t * block_size + lidy, train_rows - 1) * (step / sizeof(float)) + loadx]; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + /* there are threee types in the reducer. the first is L1Dist, which to sum the abs(v1, v2), the second is L2Dist, which to + sum the (v1 - v2) * (v1 - v2), the third is humming, which to popc(v1 ^ v2), popc is to count the bits are set to 1*/ + + switch (distType) + { + case 0: + + for (int j = 0 ; j < block_size ; j++) + { + result += fabs(s_query[lidy * block_size + j] - s_train[j * block_size + lidx]); + } + + break; + case 1: + + for (int j = 0 ; j < block_size ; j++) + { + float qr = s_query[lidy * block_size + j] - s_train[j * block_size + lidx]; + result += qr * qr; + } + + break; + case 2: + + for (int j = 0 ; j < block_size ; j++) + { + //result += popcount((uint)s_query[lidy * block_size + j] ^ (uint)s_train[j * block_size + lidx]); + result += bit1Count((uint)s_query[lidy * block_size + j] ^(uint)s_train[(uint)j * block_size + lidx]); + } + + break; + } + + barrier(CLK_LOCAL_MEM_FENCE); + } + + const int trainIdx = t * block_size + lidx; + + if (queryIdx < query_rows && trainIdx < train_rows /*&& mask(queryIdx, trainIdx)*/) + { + if (result < myBestDistance1) + { + myBestDistance2 = myBestDistance1; + myBestTrainIdx2 = myBestTrainIdx1; + myBestDistance1 = result; + myBestTrainIdx1 = trainIdx; + } + else if (result < myBestDistance2) + { + myBestDistance2 = result; + myBestTrainIdx2 = trainIdx; + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + __local float *s_distance = (__local float *)sharebuffer; + __local int *s_trainIdx = (__local int *)(sharebuffer + block_size * block_size); + + //findBestMatch + s_distance += lidy * block_size; + s_trainIdx += lidy * block_size; + + s_distance[lidx] = myBestDistance1; + s_trainIdx[lidx] = myBestTrainIdx1; + + float bestDistance1 = MAX_FLOAT; + float bestDistance2 = MAX_FLOAT; + int bestTrainIdx1 = -1; + int bestTrainIdx2 = -1; + barrier(CLK_LOCAL_MEM_FENCE); + + if (lidx == 0) + { + for (int i = 0 ; i < block_size ; i++) + { + float val = s_distance[i]; + + if (val < bestDistance1) + { + bestDistance2 = bestDistance1; + bestTrainIdx2 = bestTrainIdx1; + + bestDistance1 = val; + bestTrainIdx1 = s_trainIdx[i]; + } + else if (val < bestDistance2) + { + bestDistance2 = val; + bestTrainIdx2 = s_trainIdx[i]; + } + } + } + + barrier(CLK_LOCAL_MEM_FENCE); + + s_distance[lidx] = myBestDistance2; + s_trainIdx[lidx] = myBestTrainIdx2; + + barrier(CLK_LOCAL_MEM_FENCE); + + if (lidx == 0) + { + for (int i = 0 ; i < block_size ; i++) + { + float val = s_distance[i]; + + if (val < bestDistance2) + { + bestDistance2 = val; + bestTrainIdx2 = s_trainIdx[i]; + } + } + } + + myBestDistance1 = bestDistance1; + myBestDistance2 = bestDistance2; + + myBestTrainIdx1 = bestTrainIdx1; + myBestTrainIdx2 = bestTrainIdx2; + + if (queryIdx < query_rows && lidx == 0) + { + bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2); + bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2); + } +} + +kernel void BruteForceMatch_calcDistanceUnrolled( + __global float *query, + __global float *train, + //__global float *mask, + __global float *allDist, + __local float *sharebuffer, + int block_size, + int max_desc_len, + int query_rows, + int query_cols, + int train_rows, + int train_cols, + int step, + int distType) +{ + /* Todo */ +} + +kernel void BruteForceMatch_calcDistance( + __global float *query, + __global float *train, + //__global float *mask, + __global float *allDist, + __local float *sharebuffer, + int block_size, + int query_rows, + int query_cols, + int train_rows, + int train_cols, + int step, + int distType) +{ + /* Todo */ +} + +kernel void BruteForceMatch_findBestMatch( + __global float *allDist, + __global int *bestTrainIdx, + __global float *bestDistance, + int k, + int block_size +) +{ + /* Todo */ +} \ No newline at end of file diff --git a/modules/ocl/src/kernels/build_warps.cl b/modules/ocl/src/opencl/build_warps.cl similarity index 99% rename from modules/ocl/src/kernels/build_warps.cl rename to modules/ocl/src/opencl/build_warps.cl index 13d7bb95ca..07cccee1a3 100644 --- a/modules/ocl/src/kernels/build_warps.cl +++ b/modules/ocl/src/opencl/build_warps.cl @@ -234,4 +234,3 @@ __kernel map_y[y * step_y + x] = ycoo; } } - diff --git a/modules/ocl/src/kernels/convertC3C4.cl b/modules/ocl/src/opencl/convertC3C4.cl similarity index 100% rename from modules/ocl/src/kernels/convertC3C4.cl rename to modules/ocl/src/opencl/convertC3C4.cl diff --git a/modules/ocl/src/kernels/cvt_color.cl b/modules/ocl/src/opencl/cvt_color.cl similarity index 100% rename from modules/ocl/src/kernels/cvt_color.cl rename to modules/ocl/src/opencl/cvt_color.cl diff --git a/modules/ocl/src/kernels/filter_sep_col.cl b/modules/ocl/src/opencl/filter_sep_col.cl similarity index 100% rename from modules/ocl/src/kernels/filter_sep_col.cl rename to modules/ocl/src/opencl/filter_sep_col.cl diff --git a/modules/ocl/src/kernels/filter_sep_row.cl b/modules/ocl/src/opencl/filter_sep_row.cl similarity index 99% rename from modules/ocl/src/kernels/filter_sep_row.cl rename to modules/ocl/src/opencl/filter_sep_row.cl index dbca8bd3a6..bfe6cd4dd6 100644 --- a/modules/ocl/src/kernels/filter_sep_row.cl +++ b/modules/ocl/src/opencl/filter_sep_row.cl @@ -466,5 +466,3 @@ __kernel __attribute__((reqd_work_group_size(LSIZE0,LSIZE1,1))) void row_filter_ dst[start_addr] = sum; } } - - diff --git a/modules/ocl/src/kernels/filtering_boxFilter.cl b/modules/ocl/src/opencl/filtering_boxFilter.cl similarity index 100% rename from modules/ocl/src/kernels/filtering_boxFilter.cl rename to modules/ocl/src/opencl/filtering_boxFilter.cl diff --git a/modules/ocl/src/kernels/filtering_laplacian.cl b/modules/ocl/src/opencl/filtering_laplacian.cl similarity index 100% rename from modules/ocl/src/kernels/filtering_laplacian.cl rename to modules/ocl/src/opencl/filtering_laplacian.cl diff --git a/modules/ocl/src/kernels/filtering_morph.cl b/modules/ocl/src/opencl/filtering_morph.cl similarity index 100% rename from modules/ocl/src/kernels/filtering_morph.cl rename to modules/ocl/src/opencl/filtering_morph.cl diff --git a/modules/ocl/src/kernels/haarobjectdetect.cl b/modules/ocl/src/opencl/haarobjectdetect.cl similarity index 99% rename from modules/ocl/src/kernels/haarobjectdetect.cl rename to modules/ocl/src/opencl/haarobjectdetect.cl index 7835b4bcc5..2fa0906b41 100644 --- a/modules/ocl/src/kernels/haarobjectdetect.cl +++ b/modules/ocl/src/opencl/haarobjectdetect.cl @@ -559,7 +559,3 @@ if(result) } } */ - - - - diff --git a/modules/ocl/src/kernels/haarobjectdetect_scaled2.cl b/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl similarity index 99% rename from modules/ocl/src/kernels/haarobjectdetect_scaled2.cl rename to modules/ocl/src/opencl/haarobjectdetect_scaled2.cl index 22d3004e29..9912b9c7a1 100644 --- a/modules/ocl/src/kernels/haarobjectdetect_scaled2.cl +++ b/modules/ocl/src/opencl/haarobjectdetect_scaled2.cl @@ -283,4 +283,3 @@ __kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuH newnode[counter].alpha[0] = t1.alpha[0]; newnode[counter].alpha[1] = t1.alpha[1]; } - diff --git a/modules/ocl/src/kernels/imgproc_bilateral.cl b/modules/ocl/src/opencl/imgproc_bilateral.cl similarity index 100% rename from modules/ocl/src/kernels/imgproc_bilateral.cl rename to modules/ocl/src/opencl/imgproc_bilateral.cl diff --git a/modules/ocl/src/kernels/imgproc_calcHarris.cl b/modules/ocl/src/opencl/imgproc_calcHarris.cl similarity index 100% rename from modules/ocl/src/kernels/imgproc_calcHarris.cl rename to modules/ocl/src/opencl/imgproc_calcHarris.cl diff --git a/modules/ocl/src/kernels/imgproc_calcMinEigenVal.cl b/modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl similarity index 100% rename from modules/ocl/src/kernels/imgproc_calcMinEigenVal.cl rename to modules/ocl/src/opencl/imgproc_calcMinEigenVal.cl diff --git a/modules/ocl/src/kernels/imgproc_canny.cl b/modules/ocl/src/opencl/imgproc_canny.cl similarity index 100% rename from modules/ocl/src/kernels/imgproc_canny.cl rename to modules/ocl/src/opencl/imgproc_canny.cl diff --git a/modules/ocl/src/kernels/imgproc_columnsum.cl b/modules/ocl/src/opencl/imgproc_columnsum.cl similarity index 100% rename from modules/ocl/src/kernels/imgproc_columnsum.cl rename to modules/ocl/src/opencl/imgproc_columnsum.cl diff --git a/modules/ocl/src/kernels/imgproc_convolve.cl b/modules/ocl/src/opencl/imgproc_convolve.cl similarity index 99% rename from modules/ocl/src/kernels/imgproc_convolve.cl rename to modules/ocl/src/opencl/imgproc_convolve.cl index d113eb8169..76e7cfc55b 100644 --- a/modules/ocl/src/kernels/imgproc_convolve.cl +++ b/modules/ocl/src/opencl/imgproc_convolve.cl @@ -107,5 +107,3 @@ __kernel void convolve_D5 (__global float *src, __global float *temp1, __global dst[gy*(dst_step >> 2)+gx] = res; } } - - diff --git a/modules/ocl/src/kernels/imgproc_copymakeboder.cl b/modules/ocl/src/opencl/imgproc_copymakeboder.cl similarity index 100% rename from modules/ocl/src/kernels/imgproc_copymakeboder.cl rename to modules/ocl/src/opencl/imgproc_copymakeboder.cl diff --git a/modules/ocl/src/kernels/imgproc_histogram.cl b/modules/ocl/src/opencl/imgproc_histogram.cl similarity index 99% rename from modules/ocl/src/kernels/imgproc_histogram.cl rename to modules/ocl/src/opencl/imgproc_histogram.cl index 01e333fbc1..6bfa095f30 100644 --- a/modules/ocl/src/kernels/imgproc_histogram.cl +++ b/modules/ocl/src/opencl/imgproc_histogram.cl @@ -267,4 +267,3 @@ __kernel __attribute__((reqd_work_group_size(256,1,1)))void equalizeHist( } } */ - diff --git a/modules/ocl/src/kernels/imgproc_integral.cl b/modules/ocl/src/opencl/imgproc_integral.cl similarity index 100% rename from modules/ocl/src/kernels/imgproc_integral.cl rename to modules/ocl/src/opencl/imgproc_integral.cl diff --git a/modules/ocl/src/kernels/imgproc_integral_sum.cl b/modules/ocl/src/opencl/imgproc_integral_sum.cl similarity index 100% rename from modules/ocl/src/kernels/imgproc_integral_sum.cl rename to modules/ocl/src/opencl/imgproc_integral_sum.cl diff --git a/modules/ocl/src/kernels/imgproc_median.cl b/modules/ocl/src/opencl/imgproc_median.cl similarity index 99% rename from modules/ocl/src/kernels/imgproc_median.cl rename to modules/ocl/src/opencl/imgproc_median.cl index 2d9cd45f67..b87af96891 100644 --- a/modules/ocl/src/kernels/imgproc_median.cl +++ b/modules/ocl/src/opencl/imgproc_median.cl @@ -484,4 +484,3 @@ __kernel void medianFilter5_C1_D5(__global float * src, __global float * dst, i dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12; } #undef op(a,b) - diff --git a/modules/ocl/src/kernels/imgproc_remap.cl b/modules/ocl/src/opencl/imgproc_remap.cl similarity index 98% rename from modules/ocl/src/kernels/imgproc_remap.cl rename to modules/ocl/src/opencl/imgproc_remap.cl index 4917749561..ee40e935cc 100644 --- a/modules/ocl/src/kernels/imgproc_remap.cl +++ b/modules/ocl/src/opencl/imgproc_remap.cl @@ -48,7 +48,7 @@ #if defined DOUBLE_SUPPORT #pragma OPENCL EXTENSION cl_khr_fp64:enable typedef double4 F4 ; -#else +#else typedef float4 F4; #endif @@ -62,7 +62,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig { int x = get_global_id(0); int y = get_global_id(1); - + if(x < threadCols && y < dst_rows) { x = x << 2; @@ -79,7 +79,7 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig map1_data = *((__global short8 *)((__global char*)map1 + map1Start)); int4 srcIdx = convert_int4(map1_data.odd) * src_step + convert_int4(map1_data.even) + src_offset; - + uchar4 con = convert_uchar4(convert_int4(map1_data.even) >= (int4)(src_cols) || convert_int4(map1_data.odd) >= (int4)(src_rows) || convert_int4(map1_data.even) < (int4)(0) || convert_int4(map1_data.odd) < (int4)(0)); uchar4 src_data = val; @@ -91,12 +91,12 @@ __kernel void remapNNSConstant_C1_D0(__global unsigned char* dst, __global unsig src_data.s2 = *(src + srcIdx.s2); if (con.s3 == 0) src_data.s3 = *(src + srcIdx.s3); - + uchar4 dst_data; - + __global uchar4* d = (__global uchar4 *)(dst + dstStart); - uchar4 dVal = *d; + uchar4 dVal = *d; int4 dcon = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); dst_data = (convert_uchar4(dcon) != convert_uchar4((int4)(0))) ? src_data : dVal; @@ -113,7 +113,7 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig { int x = get_global_id(0); int y = get_global_id(1); - + if(x < threadCols && y < dst_rows) { x = x << 2; @@ -131,9 +131,9 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig map1_data = *((__global float8 *)((__global char*)map1 + map1Start)); int8 map1_dataZ = convert_int8_sat_rte(map1_data); int4 srcIdx = map1_dataZ.odd * src_step + map1_dataZ.even + src_offset; - + uchar4 src_data = val; - uchar4 con = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows) || map1_dataZ.even < (int4)(0) || map1_dataZ.odd < (int4)(0)); + uchar4 con = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows) || map1_dataZ.even < (int4)(0) || map1_dataZ.odd < (int4)(0)); if (con.s0 == 0) src_data.s0 = *(src + srcIdx.s0); @@ -147,10 +147,10 @@ __kernel void remapNNFConstant_C1_D0(__global unsigned char* dst, __global unsig // dst_data = convert_uchar4(map1_dataZ.even >= (int4)(src_cols) || map1_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data; __global uchar4* d = (__global uchar4 *)(dst + dstStart); - uchar4 dVal = *d; + uchar4 dVal = *d; int4 dcon = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); - + dst_data = (convert_uchar4(dcon) != convert_uchar4((int4)(0))) ? src_data : dVal; *d = dst_data; } @@ -162,7 +162,7 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi { int x = get_global_id(0); int y = get_global_id(1); - + if(x < threadCols && y < dst_rows) { x = x << 2; @@ -183,9 +183,9 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi float8 map_data = (float8)(map1_data.s0, map2_data.s0, map1_data.s1, map2_data.s1, map1_data.s2, map2_data.s2, map1_data.s3, map2_data.s3); int8 map_dataZ = convert_int8_sat_rte(map_data); int4 srcIdx = map_dataZ.odd * src_step + map_dataZ.even + src_offset; - + uchar4 src_data = val; - uchar4 con = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)|| map_dataZ.even < (int4)(0) || map_dataZ.odd < (int4)(0)); + uchar4 con = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)|| map_dataZ.even < (int4)(0) || map_dataZ.odd < (int4)(0)); if (con.s0 == 0) src_data.s0 = *(src + srcIdx.s0); @@ -196,14 +196,14 @@ __kernel void remapNNF1Constant_C1_D0(__global unsigned char* dst, __global unsi if (con.s3 == 0) src_data.s3 = *(src + srcIdx.s3); uchar4 dst_data; - + // dst_data = convert_uchar4(map_dataZ.even >= (int4)(src_cols) || map_dataZ.odd >= (int4)(src_rows)) ? (uchar4)(val) : src_data; __global uchar4* d = (__global uchar4 *)(dst + dstStart); - uchar4 dVal = *d; + uchar4 dVal = *d; int4 dcon = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); - + dst_data = (convert_uchar4(dcon) != convert_uchar4((int4)(0))) ? src_data : dVal; *d = dst_data; } @@ -272,7 +272,7 @@ __kernel void remapNNF1Constant_C4_D0(__global unsigned char* dst, __global unsi int y = get_global_id(1); if(x < threadCols && y < dst_rows) - { + { int dstIdx = y * dst_step + (x << 2) + dst_offset; int mapIdx = y * map1_step + (x << 2) + map1_offset; float map1_data = *((__global float *)((__global char*)map1 + mapIdx)); @@ -294,7 +294,7 @@ __kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const * { int x = get_global_id(0); int y = get_global_id(1); - + if(x < threadCols && y < dst_rows) { int dstIdx = y * dst_step + (x << 2) + dst_offset; @@ -309,7 +309,7 @@ __kernel void remapNNSConstant_C1_D5(__global float* dst, __global float const * src_data = *((__global float *)((__global uchar *)src + srcIdx)); *((__global float *)((__global uchar*)dst + dstIdx)) = src_data; - + } @@ -321,7 +321,7 @@ __kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const * { int x = get_global_id(0); int y = get_global_id(1); - + if(x < threadCols && y < dst_rows) { int dstIdx = y * dst_step + (x << 2) + dst_offset; @@ -337,7 +337,7 @@ __kernel void remapNNFConstant_C1_D5(__global float* dst, __global float const * src_data = *((__global float *)((__global uchar *)src + srcIdx)); *((__global float *)((__global uchar*)dst + dstIdx)) = src_data; - + } } @@ -348,7 +348,7 @@ __kernel void remapNNF1Constant_C1_D5(__global float* dst, __global float const { int x = get_global_id(0); int y = get_global_id(1); - + if(x < threadCols && y < dst_rows) { int dstIdx = y * dst_step + (x << 2) + dst_offset; @@ -367,7 +367,7 @@ __kernel void remapNNF1Constant_C1_D5(__global float* dst, __global float const src_data = *((__global float *)((__global uchar *)src + srcIdx)); *((__global float *)((__global uchar*)dst + dstIdx)) = src_data; - + } } @@ -391,9 +391,9 @@ __kernel void remapNNSConstant_C4_D5(__global float * dst, __global float const src_data = nval; else src_data = *((__global float4 *)((__global uchar *)src + srcIdx)); - *((__global float4 *)((__global uchar*)dst + dstIdx)) = src_data; + *((__global float4 *)((__global uchar*)dst + dstIdx)) = src_data; + - } } @@ -454,13 +454,13 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig int y = get_global_id(1); if(x < threadCols && y < dst_rows) { - x = x << 2; + x = x << 2; int gx = x - (dst_offset&3); int4 Gx = (int4)(gx, gx+1, gx+2, gx+3); uchar4 nval =convert_uchar4(nVal); uchar4 val = (uchar4)(nval.s0); - + int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3); @@ -518,12 +518,12 @@ __kernel void remapLNFConstant_C1_D0(__global unsigned char* dst, __global unsig d.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s2 * src_step + map1_dataDx1.s2 + src_offset)); if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0) d.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s3 * src_step + map1_dataDx1.s3 + src_offset)); - + uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v ); - + __global uchar4* D = (__global uchar4 *)(dst + dstStart); - uchar4 dVal = *D; + uchar4 dVal = *D; int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal; @@ -540,13 +540,13 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi int y = get_global_id(1); if(x < threadCols && y < dst_rows) { - x = x << 2; + x = x << 2; int gx = x - (dst_offset&3); int4 Gx = (int4)(gx, gx+1, gx+2, gx+3); uchar4 nval =convert_uchar4(nVal); uchar4 val = (uchar4)(nval.s0); - + int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&3); @@ -607,13 +607,13 @@ __kernel void remapLNF1Constant_C1_D0(__global unsigned char* dst, __global unsi d.s2 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s2 * src_step + map1_dataDx1.s2 + src_offset)); if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0) d.s3 = *((__global uchar*)((__global uchar *)src + map1_dataDy1.s3 * src_step + map1_dataDx1.s3 + src_offset)); - + uchar4 dst_data = convert_uchar4_sat_rte((convert_float4(a))* ud * vd +(convert_float4(b))* u * vd + (convert_float4(c))* ud * v + (convert_float4(d)) * u * v ); - + __global uchar4* D = (__global uchar4 *)(dst + dstStart); - uchar4 dVal = *D; + uchar4 dVal = *D; int4 con = (Gx >= 0 && Gx < dst_cols && y >= 0 && y < dst_rows); dst_data = (convert_uchar4(con) != (uchar4)(0)) ? dst_data : dVal; @@ -725,13 +725,13 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const * int y = get_global_id(1); if(x < threadCols && y < dst_rows) { - x = x << 4; + x = x << 4; int gx = x - (dst_offset&15); int4 Gx = (int4)(gx, gx+4, gx+8, gx+12); float4 nval =convert_float4(nVal); float4 val = (float4)(nval.s0); - + int dstStart = (y * dst_step + x + dst_offset) - (dst_offset&15); int map1Start = y * map1_step + (x << 1) + map1_offset - ((dst_offset & 15) << 1); float8 map1_data; @@ -787,12 +787,12 @@ __kernel void remapLNFConstant_C1_D5(__global float* dst, __global float const * d.s2 = *((__global float*)((__global uchar *)src + map1_dataDy1.s2 * src_step + (map1_dataDx1.s2 << 2) + src_offset)); if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0) d.s3 = *((__global float*)((__global uchar *)src + map1_dataDy1.s3 * src_step + (map1_dataDx1.s3 << 2) + src_offset)); - + float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ; - + __global float4* D = (__global float4 *)((__global char*)dst + dstStart); - float4 dVal = *D; + float4 dVal = *D; int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows); dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal; @@ -809,13 +809,13 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const int y = get_global_id(1); if(x < threadCols && y < dst_rows) { - x = x << 4; + x = x << 4; int gx = x - (dst_offset&15); int4 Gx = (int4)(gx, gx+4, gx+8, gx+12); float4 nval =convert_float4(nVal); float4 val = (float4)(nval.s0); - + int dstStart = y * dst_step + x + dst_offset - (dst_offset & 15); int map1Start = y * map1_step + x + map1_offset - (dst_offset & 15); float4 map1_data; @@ -874,13 +874,13 @@ __kernel void remapLNF1Constant_C1_D5(__global float* dst, __global float const d.s2 = *((__global float*)((__global uchar *)src + map1_dataDy1.s2 * src_step + (map1_dataDx1.s2 << 2) + src_offset)); if (map1_dataDx1.s3 < src_cols && map1_dataDx1.s3 >= 0 && map1_dataDy1.s3 < src_rows && map1_dataDy1.s3 >= 0) d.s3 = *((__global float*)((__global uchar *)src + map1_dataDy1.s3 * src_step + (map1_dataDx1.s3 << 2) + src_offset)); - - + + float4 dst_data = a * ud * vd + b * u * vd + c * ud * v + d * u * v ; - + __global float4* D = (__global float4 *)((__global char*)dst + dstStart); - float4 dVal = *D; + float4 dVal = *D; int4 con = (Gx >= 0 && Gx < (dst_cols << 2) && y >= 0 && y < dst_rows); dst_data = (convert_float4(con) != (float4)(0)) ? dst_data : dVal; @@ -928,7 +928,7 @@ __kernel void remapLNFConstant_C4_D5(__global float * dst, __global float const else d = *((__global float4 *)((__global uchar *)src + map_dataD.y * src_step + (map_dataD.x<<4) + src_offset )); - float4 dst_data = a * ((float4)(1.0-u.x)) * ((float4)(1.0-u.y)) + b *((float4)(u.x)) * ((float4)(1.0-u.y)) + c * ((float4)(1.0-u.x)) *((float4)(u.y)) + d *((float4)(u.x)) *((float4)(u.y)); + float4 dst_data = a * ((float4)(1.0-u.x)) * ((float4)(1.0-u.y)) + b *((float4)(u.x)) * ((float4)(1.0-u.y)) + c * ((float4)(1.0-u.x)) *((float4)(u.y)) + d *((float4)(u.x)) *((float4)(u.y)); *((__global float4 *)((__global uchar*)dst + dstIdx)) = dst_data ; } @@ -974,12 +974,9 @@ __kernel void remapLNF1Constant_C4_D5(__global float * dst, __global float const else d = *((__global float4 *)((__global uchar *)src + map_dataD.y * src_step + (map_dataD.x<<4) + src_offset )); - float4 dst_data = a * ((float4)(1.0-u.x)) * ((float4)(1.0-u.y)) + b *((float4)(u.x)) * ((float4)(1.0-u.y)) + c * ((float4)(1.0-u.x)) *((float4)(u.y)) + d *((float4)(u.x)) *((float4)(u.y)); + float4 dst_data = a * ((float4)(1.0-u.x)) * ((float4)(1.0-u.y)) + b *((float4)(u.x)) * ((float4)(1.0-u.y)) + c * ((float4)(1.0-u.x)) *((float4)(u.y)) + d *((float4)(u.x)) *((float4)(u.y)); *((__global float4 *)((__global uchar*)dst + dstIdx)) = dst_data ; } } - - - diff --git a/modules/ocl/src/kernels/imgproc_resize.cl b/modules/ocl/src/opencl/imgproc_resize.cl similarity index 99% rename from modules/ocl/src/kernels/imgproc_resize.cl rename to modules/ocl/src/opencl/imgproc_resize.cl index b6a25d3827..fd486de40a 100644 --- a/modules/ocl/src/kernels/imgproc_resize.cl +++ b/modules/ocl/src/opencl/imgproc_resize.cl @@ -411,4 +411,3 @@ __kernel void resizeNN_C4_D5(__global float4 * dst, __global float4 * src, dst[dpos] = src[spos]; } - diff --git a/modules/ocl/src/kernels/imgproc_threshold.cl b/modules/ocl/src/opencl/imgproc_threshold.cl similarity index 99% rename from modules/ocl/src/kernels/imgproc_threshold.cl rename to modules/ocl/src/opencl/imgproc_threshold.cl index e046b49a75..8ad501f7c1 100644 --- a/modules/ocl/src/kernels/imgproc_threshold.cl +++ b/modules/ocl/src/opencl/imgproc_threshold.cl @@ -150,4 +150,3 @@ __kernel void threshold_C1_D5(__global const float * restrict src, __global floa } } } - diff --git a/modules/ocl/src/kernels/imgproc_warpAffine.cl b/modules/ocl/src/opencl/imgproc_warpAffine.cl similarity index 100% rename from modules/ocl/src/kernels/imgproc_warpAffine.cl rename to modules/ocl/src/opencl/imgproc_warpAffine.cl diff --git a/modules/ocl/src/kernels/imgproc_warpPerspective.cl b/modules/ocl/src/opencl/imgproc_warpPerspective.cl similarity index 99% rename from modules/ocl/src/kernels/imgproc_warpPerspective.cl rename to modules/ocl/src/opencl/imgproc_warpPerspective.cl index 9a5ec83edd..a37ffa1bee 100644 --- a/modules/ocl/src/kernels/imgproc_warpPerspective.cl +++ b/modules/ocl/src/opencl/imgproc_warpPerspective.cl @@ -682,4 +682,3 @@ __kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 } } } - diff --git a/modules/ocl/src/kernels/interpolate_frames.cl b/modules/ocl/src/opencl/interpolate_frames.cl similarity index 100% rename from modules/ocl/src/kernels/interpolate_frames.cl rename to modules/ocl/src/opencl/interpolate_frames.cl diff --git a/modules/ocl/src/kernels/match_template.cl b/modules/ocl/src/opencl/match_template.cl similarity index 99% rename from modules/ocl/src/kernels/match_template.cl rename to modules/ocl/src/opencl/match_template.cl index ddbd86ba49..3133e62371 100644 --- a/modules/ocl/src/kernels/match_template.cl +++ b/modules/ocl/src/opencl/match_template.cl @@ -821,4 +821,3 @@ void matchTemplate_Prepared_CCOFF_NORMED_C4_D0 res[res_idx] = normAcc(num, denum); } } - diff --git a/modules/ocl/src/kernels/meanShift.cl b/modules/ocl/src/opencl/meanShift.cl similarity index 99% rename from modules/ocl/src/kernels/meanShift.cl rename to modules/ocl/src/opencl/meanShift.cl index 4b5a08b352..a5b110812d 100644 --- a/modules/ocl/src/kernels/meanShift.cl +++ b/modules/ocl/src/opencl/meanShift.cl @@ -240,4 +240,3 @@ __kernel void meanshiftproc_kernel( __global uchar4* in, __global uchar4* outr, // outsp[basesp] =(short2)((short)x0,(short)y0); } } - diff --git a/modules/ocl/src/kernels/merge_mat.cl b/modules/ocl/src/opencl/merge_mat.cl similarity index 100% rename from modules/ocl/src/kernels/merge_mat.cl rename to modules/ocl/src/opencl/merge_mat.cl diff --git a/modules/ocl/src/kernels/moments.cl b/modules/ocl/src/opencl/moments.cl similarity index 99% rename from modules/ocl/src/kernels/moments.cl rename to modules/ocl/src/opencl/moments.cl index 60488372e7..399ff32076 100644 --- a/modules/ocl/src/kernels/moments.cl +++ b/modules/ocl/src/opencl/moments.cl @@ -27,7 +27,7 @@ typedef long T; #define DST_ROW_A03 9 __kernel void icvContourMoments(int contour_total, - __global float* reader_oclmat_data, + __global float* reader_oclmat_data, __global T* dst_a, int dst_step) { @@ -58,7 +58,7 @@ __kernel void icvContourMoments(int contour_total, dxy = xi_1 * yi - xi * yi_1; xii_1 = xi_1 + xi; yii_1 = yi_1 + yi; - + dst_step /= sizeof(T); *( dst_a + DST_ROW_A00 * dst_step + idx) = dxy; *( dst_a + DST_ROW_A10 * dst_step + idx) = dxy * xii_1; diff --git a/modules/ocl/src/kernels/nonfree_surf.cl b/modules/ocl/src/opencl/nonfree_surf.cl similarity index 94% rename from modules/ocl/src/kernels/nonfree_surf.cl rename to modules/ocl/src/opencl/nonfree_surf.cl index 8cffe3d93a..8c373bc4cd 100644 --- a/modules/ocl/src/kernels/nonfree_surf.cl +++ b/modules/ocl/src/opencl/nonfree_surf.cl @@ -104,11 +104,11 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAM // N = 2 // for simple haar paatern float icvCalcHaarPatternSum_2( - IMAGE_INT32 sumTex, - __constant float src[2][5], - int oldSize, - int newSize, - int y, int x, + IMAGE_INT32 sumTex, + __constant float src[2][5], + int oldSize, + int newSize, + int y, int x, int rows, int cols, int elemPerRow) { @@ -137,11 +137,11 @@ float icvCalcHaarPatternSum_2( // N = 3 float icvCalcHaarPatternSum_3( - IMAGE_INT32 sumTex, - __constant float src[2][5], - int oldSize, - int newSize, - int y, int x, + IMAGE_INT32 sumTex, + __constant float src[2][5], + int oldSize, + int newSize, + int y, int x, int rows, int cols, int elemPerRow) { @@ -170,11 +170,11 @@ float icvCalcHaarPatternSum_3( // N = 4 float icvCalcHaarPatternSum_4( - IMAGE_INT32 sumTex, - __constant float src[2][5], - int oldSize, - int newSize, - int y, int x, + IMAGE_INT32 sumTex, + __constant float src[2][5], + int oldSize, + int newSize, + int y, int x, int rows, int cols, int elemPerRow) { @@ -265,7 +265,7 @@ __kernel void icvCalcLayerDetAndTrace( const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step); det [j + margin + det_step * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy; - trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy; + trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy; } } @@ -301,9 +301,9 @@ bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int ro // Non-maximal suppression to further filtering the candidates from previous step __kernel void icvFindMaximaInLayer_withmask( - __global const float * det, - __global const float * trace, - __global int4 * maxPosBuffer, + __global const float * det, + __global const float * trace, + __global int4 * maxPosBuffer, volatile __global int* maxCounter, int counter_offset, int det_step, // the step of det in bytes @@ -345,26 +345,26 @@ __kernel // Is this thread within the hessian buffer? const int zoff = get_local_size(0) * get_local_size(1); const int localLin = get_local_id(0) + get_local_id(1) * get_local_size(0) + zoff; - N9[localLin - zoff] = - det[det_step * + N9[localLin - zoff] = + det[det_step * (c_layer_rows * (layer - 1) + min(max(i, 0), c_img_rows - 1)) // y + min(max(j, 0), c_img_cols - 1)]; // x - N9[localLin ] = - det[det_step * + N9[localLin ] = + det[det_step * (c_layer_rows * (layer ) + min(max(i, 0), c_img_rows - 1)) // y + min(max(j, 0), c_img_cols - 1)]; // x - N9[localLin + zoff] = - det[det_step * + N9[localLin + zoff] = + det[det_step * (c_layer_rows * (layer + 1) + min(max(i, 0), c_img_rows - 1)) // y + min(max(j, 0), c_img_cols - 1)]; // x barrier(CLK_LOCAL_MEM_FENCE); - if (i < c_layer_rows - margin + if (i < c_layer_rows - margin && j < c_layer_cols - margin - && get_local_id(0) > 0 + && get_local_id(0) > 0 && get_local_id(0) < get_local_size(0) - 1 - && get_local_id(1) > 0 + && get_local_id(1) > 0 && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA ) { @@ -429,9 +429,9 @@ __kernel __kernel void icvFindMaximaInLayer( - __global float * det, - __global float * trace, - __global int4 * maxPosBuffer, + __global float * det, + __global float * trace, + __global int4 * maxPosBuffer, volatile __global int* maxCounter, int counter_offset, int det_step, // the step of det in bytes @@ -474,19 +474,19 @@ __kernel int l_x = min(max(j, 0), c_img_cols - 1); int l_y = c_layer_rows * layer + min(max(i, 0), c_img_rows - 1); - N9[localLin - zoff] = + N9[localLin - zoff] = det[det_step * (l_y - c_layer_rows) + l_x]; - N9[localLin ] = + N9[localLin ] = det[det_step * (l_y ) + l_x]; - N9[localLin + zoff] = + N9[localLin + zoff] = det[det_step * (l_y + c_layer_rows) + l_x]; barrier(CLK_LOCAL_MEM_FENCE); - if (i < c_layer_rows - margin + if (i < c_layer_rows - margin && j < c_layer_cols - margin - && get_local_id(0) > 0 + && get_local_id(0) > 0 && get_local_id(0) < get_local_size(0) - 1 - && get_local_id(1) > 0 + && get_local_id(1) > 0 && get_local_id(1) < get_local_size(1) - 1 // these are unnecessary conditions ported from CUDA ) { @@ -554,17 +554,17 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc { F invdet = 1.0 / det; - x[0] = invdet * + x[0] = invdet * (b[0] * (A[1][1] * A[2][2] - A[1][2] * A[2][1]) - A[0][1] * (b[1] * A[2][2] - A[1][2] * b[2] ) + A[0][2] * (b[1] * A[2][1] - A[1][1] * b[2] )); - x[1] = invdet * + x[1] = invdet * (A[0][0] * (b[1] * A[2][2] - A[1][2] * b[2] ) - b[0] * (A[1][0] * A[2][2] - A[1][2] * A[2][0]) + A[0][2] * (A[1][0] * b[2] - b[1] * A[2][0])); - x[2] = invdet * + x[2] = invdet * (A[0][0] * (A[1][1] * b[2] - b[1] * A[2][1]) - A[0][1] * (A[1][0] * b[2] - b[1] * A[2][0]) + b[0] * (A[1][0] * A[2][1] - A[1][1] * A[2][0])); @@ -585,9 +585,9 @@ inline bool solve3x3_float(volatile __local const float A[3][3], volatile __loc //////////////////////////////////////////////////////////////////////// // INTERPOLATION -__kernel +__kernel void icvInterpolateKeypoint( - __global const float * det, + __global const float * det, __global const int4 * maxPosBuffer, __global float * keypoints, volatile __global int * featureCounter, @@ -617,7 +617,7 @@ __kernel volatile __local float N9[3][3][3]; - N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] = + N9[get_local_id(2)][get_local_id(1)][get_local_id(0)] = det[det_step * (c_layer_rows * layer + i) + j]; barrier(CLK_LOCAL_MEM_FENCE); @@ -715,27 +715,27 @@ __kernel __constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6}; __constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0}; -__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, - 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, - 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, - 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, - 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, - 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, - 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, - 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, - 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, - 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, - 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, - 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, - 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, - 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, +__constant float c_aptW[ORI_SAMPLES] = {0.001455130288377404f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, + 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, + 0.00665318313986063f, 0.00720730796456337f, 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, + 0.002003900473937392f, 0.001707611023448408f, 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, + 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, + 0.0035081731621176f, 0.001707611023448408f, 0.002547456417232752f, 0.005233579315245152f, 0.009162282571196556f, + 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, 0.01366852037608624f, + 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.003238451667129993f, 0.00665318313986063f, + 0.01164754293859005f, 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, + 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.001455130288377404f, + 0.0035081731621176f, 0.00720730796456337f, 0.01261763460934162f, 0.0188232995569706f, 0.02392910048365593f, + 0.02592208795249462f, 0.02392910048365593f, 0.0188232995569706f, 0.01261763460934162f, 0.00720730796456337f, + 0.0035081731621176f, 0.001455130288377404f, 0.003238451667129993f, 0.00665318313986063f, 0.01164754293859005f, + 0.01737609319388866f, 0.02208934165537357f, 0.02392910048365593f, 0.02208934165537357f, 0.01737609319388866f, 0.01164754293859005f, 0.00665318313986063f, 0.003238451667129993f, 0.002547456417232752f, 0.005233579315245152f, - 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, - 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, - 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, + 0.009162282571196556f, 0.01366852037608624f, 0.01737609319388866f, 0.0188232995569706f, 0.01737609319388866f, + 0.01366852037608624f, 0.009162282571196556f, 0.005233579315245152f, 0.002547456417232752f, 0.001707611023448408f, + 0.0035081731621176f, 0.006141661666333675f, 0.009162282571196556f, 0.01164754293859005f, 0.01261763460934162f, 0.01164754293859005f, 0.009162282571196556f, 0.006141661666333675f, 0.0035081731621176f, 0.001707611023448408f, - 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, - 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, + 0.002003900473937392f, 0.0035081731621176f, 0.005233579315245152f, 0.00665318313986063f, 0.00720730796456337f, + 0.00665318313986063f, 0.005233579315245152f, 0.0035081731621176f, 0.002003900473937392f, 0.001707611023448408f, 0.002547456417232752f, 0.003238451667129993f, 0.0035081731621176f, 0.003238451667129993f, 0.002547456417232752f, 0.001707611023448408f, 0.001455130288377404f}; @@ -748,13 +748,13 @@ void reduce_32_sum(volatile __local float * data, volatile float* partial_reduc data[tid] = *partial_reduction; barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 16) + if (tid < 16) { data[tid] = *partial_reduction = op(partial_reduction, data[tid + 16]); data[tid] = *partial_reduction = op(partial_reduction, data[tid + 8 ]); data[tid] = *partial_reduction = op(partial_reduction, data[tid + 4 ]); data[tid] = *partial_reduction = op(partial_reduction, data[tid + 2 ]); - data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]); + data[tid] = *partial_reduction = op(partial_reduction, data[tid + 1 ]); } #undef op } @@ -958,8 +958,8 @@ __constant float c_DW[PATCH_SZ * PATCH_SZ] = // utility for linear filter inline uchar readerGet( - IMAGE_INT8 src, - const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, + IMAGE_INT8 src, + const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, int i, int j, int rows, int cols, int elemPerRow ) { @@ -969,8 +969,8 @@ inline uchar readerGet( } inline float linearFilter( - IMAGE_INT8 src, - const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, + IMAGE_INT8 src, + const float centerX, const float centerY, const float win_offset, const float cos_dir, const float sin_dir, float y, float x, int rows, int cols, int elemPerRow ) { @@ -1004,9 +1004,9 @@ void calc_dx_dy( volatile __local float s_dx_bin[25], volatile __local float s_dy_bin[25], volatile __local float s_PATCH[6][6], - __global const float* featureX, - __global const float* featureY, - __global const float* featureSize, + __global const float* featureX, + __global const float* featureY, + __global const float* featureSize, __global const float* featureDir, int rows, int cols, @@ -1058,26 +1058,26 @@ void calc_dx_dy( const float dw = c_DW[yIndex * PATCH_SZ + xIndex]; const float vx = ( - s_PATCH[get_local_id(1) ][get_local_id(0) + 1] - - s_PATCH[get_local_id(1) ][get_local_id(0) ] + - s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] - - s_PATCH[get_local_id(1) + 1][get_local_id(0) ]) + s_PATCH[get_local_id(1) ][get_local_id(0) + 1] - + s_PATCH[get_local_id(1) ][get_local_id(0) ] + + s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] - + s_PATCH[get_local_id(1) + 1][get_local_id(0) ]) * dw; const float vy = ( - s_PATCH[get_local_id(1) + 1][get_local_id(0) ] - - s_PATCH[get_local_id(1) ][get_local_id(0) ] + - s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] - - s_PATCH[get_local_id(1) ][get_local_id(0) + 1]) + s_PATCH[get_local_id(1) + 1][get_local_id(0) ] - + s_PATCH[get_local_id(1) ][get_local_id(0) ] + + s_PATCH[get_local_id(1) + 1][get_local_id(0) + 1] - + s_PATCH[get_local_id(1) ][get_local_id(0) + 1]) * dw; s_dx_bin[tid] = vx; s_dy_bin[tid] = vy; } } void reduce_sum25( - volatile __local float* sdata1, - volatile __local float* sdata2, - volatile __local float* sdata3, - volatile __local float* sdata4, + volatile __local float* sdata1, + volatile __local float* sdata2, + volatile __local float* sdata3, + volatile __local float* sdata4, int tid ) { @@ -1115,13 +1115,13 @@ void reduce_sum25( } } -__kernel +__kernel void compute_descriptors64( IMAGE_INT8 imgTex, - volatile __global float * descriptors, + volatile __global float * descriptors, __global const float * keypoints, int descriptors_step, - int keypoints_step, + int keypoints_step, int rows, int cols, int img_step @@ -1155,7 +1155,7 @@ __kernel if (tid < 25) { reduce_sum25(sdx, sdy, sdxabs, sdyabs, tid); - } + } barrier(CLK_LOCAL_MEM_FENCE); if (tid < 25) { @@ -1171,10 +1171,10 @@ __kernel } } } -__kernel +__kernel void compute_descriptors128( IMAGE_INT8 imgTex, - __global volatile float * descriptors, + __global volatile float * descriptors, __global float * keypoints, int descriptors_step, int keypoints_step, @@ -1269,7 +1269,7 @@ __kernel } } -__kernel +__kernel void normalize_descriptors128(__global float * descriptors, int descriptors_step) { descriptors_step /= sizeof(*descriptors); @@ -1310,7 +1310,7 @@ __kernel // normalize and store in output descriptor_base[get_local_id(0)] = lookup / len; } -__kernel +__kernel void normalize_descriptors64(__global float * descriptors, int descriptors_step) { descriptors_step /= sizeof(*descriptors); diff --git a/modules/ocl/src/kernels/objdetect_hog.cl b/modules/ocl/src/opencl/objdetect_hog.cl similarity index 100% rename from modules/ocl/src/kernels/objdetect_hog.cl rename to modules/ocl/src/opencl/objdetect_hog.cl diff --git a/modules/ocl/src/kernels/operator_convertTo.cl b/modules/ocl/src/opencl/operator_convertTo.cl similarity index 100% rename from modules/ocl/src/kernels/operator_convertTo.cl rename to modules/ocl/src/opencl/operator_convertTo.cl diff --git a/modules/ocl/src/kernels/operator_copyToM.cl b/modules/ocl/src/opencl/operator_copyToM.cl similarity index 100% rename from modules/ocl/src/kernels/operator_copyToM.cl rename to modules/ocl/src/opencl/operator_copyToM.cl diff --git a/modules/ocl/src/kernels/operator_setTo.cl b/modules/ocl/src/opencl/operator_setTo.cl similarity index 100% rename from modules/ocl/src/kernels/operator_setTo.cl rename to modules/ocl/src/opencl/operator_setTo.cl diff --git a/modules/ocl/src/kernels/operator_setToM.cl b/modules/ocl/src/opencl/operator_setToM.cl similarity index 99% rename from modules/ocl/src/kernels/operator_setToM.cl rename to modules/ocl/src/opencl/operator_setToM.cl index 59357fad6d..dde12d86f6 100644 --- a/modules/ocl/src/kernels/operator_setToM.cl +++ b/modules/ocl/src/opencl/operator_setToM.cl @@ -57,4 +57,3 @@ __kernel void set_to_with_mask( } } - diff --git a/modules/ocl/src/kernels/pyr_down.cl b/modules/ocl/src/opencl/pyr_down.cl similarity index 100% rename from modules/ocl/src/kernels/pyr_down.cl rename to modules/ocl/src/opencl/pyr_down.cl diff --git a/modules/ocl/src/kernels/pyr_up.cl b/modules/ocl/src/opencl/pyr_up.cl similarity index 100% rename from modules/ocl/src/kernels/pyr_up.cl rename to modules/ocl/src/opencl/pyr_up.cl diff --git a/modules/ocl/src/kernels/pyrlk.cl b/modules/ocl/src/opencl/pyrlk.cl similarity index 100% rename from modules/ocl/src/kernels/pyrlk.cl rename to modules/ocl/src/opencl/pyrlk.cl diff --git a/modules/ocl/src/kernels/pyrlk_no_image.cl b/modules/ocl/src/opencl/pyrlk_no_image.cl similarity index 100% rename from modules/ocl/src/kernels/pyrlk_no_image.cl rename to modules/ocl/src/opencl/pyrlk_no_image.cl diff --git a/modules/ocl/src/kernels/split_mat.cl b/modules/ocl/src/opencl/split_mat.cl similarity index 87% rename from modules/ocl/src/kernels/split_mat.cl rename to modules/ocl/src/opencl/split_mat.cl index 3c70859264..caee4366de 100644 --- a/modules/ocl/src/kernels/split_mat.cl +++ b/modules/ocl/src/opencl/split_mat.cl @@ -51,9 +51,9 @@ ////////////vector fuction name format: split_vector_C(channels number)_D(data type depth)////// //////////////////////////////////////////////////////////////////////////////////////////////// __kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int src_offset, - __global uchar *mat_dst0, int dst0_step, int dst0_offset, - __global uchar *mat_dst1, int dst1_step, int dst1_offset, - __global uchar *mat_dst2, int dst2_step, int dst2_offset, + __global uchar *mat_dst0, int dst0_step, int dst0_offset, + __global uchar *mat_dst1, int dst1_step, int dst1_offset, + __global uchar *mat_dst2, int dst2_step, int dst2_offset, __global uchar *mat_dst3, int dst3_step, int dst3_offset, int rows, int cols, int dst_step1) @@ -61,37 +61,37 @@ __kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int s int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 2; - int src_idx = mad24(y, src_step, src_offset + (x << 2)); + int src_idx = mad24(y, src_step, src_offset + (x << 2)); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + x) & (int)0xfffffffc; - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + x) & (int)0xfffffffc; - int dst2_start = mad24(y, dst2_step, dst2_offset); + int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_idx = mad24(y, dst2_step, dst2_offset + x) & (int)0xfffffffc; - int dst3_start = mad24(y, dst3_step, dst3_offset); + int dst3_start = mad24(y, dst3_step, dst3_offset); int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1); int dst3_idx = mad24(y, dst3_step, dst3_offset + x) & (int)0xfffffffc; - - uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx))); - uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx))); - uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx))); - uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 )); - int total_bytes = src_offset + rows * src_step; - uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx))); - uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx))); - uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx))); + uchar4 data_0 = *((global uchar4 *)(mat_src + (src_idx - 12 >= 0 ? src_idx - 12 : src_idx))); + uchar4 data_1 = *((global uchar4 *)(mat_src + (src_idx - 8 >= 0 ? src_idx - 8 : src_idx))); + uchar4 data_2 = *((global uchar4 *)(mat_src + (src_idx - 4 >= 0 ? src_idx - 4 : src_idx))); + uchar4 data_3 = *((global uchar4 *)(mat_src + src_idx + 0 )); + + int total_bytes = src_offset + rows * src_step; + uchar4 data_4 = *((global uchar4 *)(mat_src + (src_idx + 4 < total_bytes ? src_idx + 4 : src_idx))); + uchar4 data_5 = *((global uchar4 *)(mat_src + (src_idx + 8 < total_bytes ? src_idx + 8 : src_idx))); + uchar4 data_6 = *((global uchar4 *)(mat_src + (src_idx + 12 < total_bytes ? src_idx + 12 : src_idx))); uchar4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3; @@ -164,33 +164,33 @@ __kernel void split_vector_C4_D0 (__global uchar *mat_src, int src_step, int s } __kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int src_offset, - __global uchar *mat_dst0, int dst0_step, int dst0_offset, - __global uchar *mat_dst1, int dst1_step, int dst1_offset, - __global uchar *mat_dst2, int dst2_step, int dst2_offset, + __global uchar *mat_dst0, int dst0_step, int dst0_offset, + __global uchar *mat_dst1, int dst1_step, int dst1_offset, + __global uchar *mat_dst2, int dst2_step, int dst2_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 2; - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc); - int dst2_start = mad24(y, dst2_step, dst2_offset); + int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc); - + uchar4 dst0_data = *((__global uchar4 *)(mat_dst0 + dst0_idx)); uchar4 dst1_data = *((__global uchar4 *)(mat_dst1 + dst1_idx)); uchar4 dst2_data = *((__global uchar4 *)(mat_dst2 + dst2_idx)); @@ -227,10 +227,10 @@ __kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int s uchar data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18}; int index = 3 - dst0_offset & 3; - tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]); + tmp_data0 = (uchar4)(data[index], data[index + 1], data[index + 2], data[index + 3]); uchar4 data0, data1, data2; - + data0 = (uchar4)(src_data_1, src_data_4, src_data_7, src_data_10); data1 = (dst1_offset & 3) == 2 ? (uchar4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0; data2 = (dst1_offset & 3) == 1 ? (uchar4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1; @@ -263,33 +263,33 @@ __kernel void split_vector_C3_D0 (__global uchar *mat_src, int src_step, int s } __kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int src_offset, - __global uchar *mat_dst0, int dst0_step, int dst0_offset, - __global uchar *mat_dst1, int dst1_step, int dst1_offset, + __global uchar *mat_dst0, int dst0_step, int dst0_offset, + __global uchar *mat_dst1, int dst1_step, int dst1_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 2; #define dst0_align ((dst0_offset & 3) << 1) #define dst1_align ((dst1_offset & 3) << 1) - int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1)); - int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1)); + int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1)); + int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1)); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc); - - int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; - int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1; + + int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; + int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1; uchar8 src_data_0 = vload8(0, mat_src + src_idx_0); uchar8 src_data_1 = vload8(0, mat_src + src_idx_1); if(src_idx_0 == -6) @@ -326,9 +326,9 @@ __kernel void split_vector_C2_D0 (__global uchar *mat_src, int src_step, int s } __kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int src_offset, - __global char *mat_dst0, int dst0_step, int dst0_offset, - __global char *mat_dst1, int dst1_step, int dst1_offset, - __global char *mat_dst2, int dst2_step, int dst2_offset, + __global char *mat_dst0, int dst0_step, int dst0_offset, + __global char *mat_dst1, int dst1_step, int dst1_offset, + __global char *mat_dst2, int dst2_step, int dst2_offset, __global char *mat_dst3, int dst3_step, int dst3_offset, int rows, int cols, int dst_step1) @@ -336,35 +336,35 @@ __kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int sr int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 2; - int src_idx = mad24(y, src_step, src_offset + (x << 2)); + int src_idx = mad24(y, src_step, src_offset + (x << 2)); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc); - int dst2_start = mad24(y, dst2_step, dst2_offset); + int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc); - int dst3_start = mad24(y, dst3_step, dst3_offset); + int dst3_start = mad24(y, dst3_step, dst3_offset); int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1); int dst3_idx = mad24(y, dst3_step, dst3_offset + x & (int)0xfffffffc); - - char4 data_0 = *((global char4 *)(mat_src + src_idx - 12)); - char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 )); - char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 )); - char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 )); - char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 )); - char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 )); - char4 data_6 = *((global char4 *)(mat_src + src_idx + 12)); + + char4 data_0 = *((global char4 *)(mat_src + src_idx - 12)); + char4 data_1 = *((global char4 *)(mat_src + src_idx - 8 )); + char4 data_2 = *((global char4 *)(mat_src + src_idx - 4 )); + char4 data_3 = *((global char4 *)(mat_src + src_idx + 0 )); + char4 data_4 = *((global char4 *)(mat_src + src_idx + 4 )); + char4 data_5 = *((global char4 *)(mat_src + src_idx + 8 )); + char4 data_6 = *((global char4 *)(mat_src + src_idx + 12)); char4 tmp_data0=1, tmp_data1=2, tmp_data2, tmp_data3; @@ -437,33 +437,33 @@ __kernel void split_vector_C4_D1 (__global char *mat_src, int src_step, int sr } __kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int src_offset, - __global char *mat_dst0, int dst0_step, int dst0_offset, - __global char *mat_dst1, int dst1_step, int dst1_offset, - __global char *mat_dst2, int dst2_step, int dst2_offset, + __global char *mat_dst0, int dst0_step, int dst0_offset, + __global char *mat_dst1, int dst1_step, int dst1_offset, + __global char *mat_dst2, int dst2_step, int dst2_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 2; - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc); - int dst2_start = mad24(y, dst2_step, dst2_offset); + int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_idx = mad24(y, dst2_step, dst2_offset + x & (int)0xfffffffc); - + char4 dst0_data = *((__global char4 *)(mat_dst0 + dst0_idx)); char4 dst1_data = *((__global char4 *)(mat_dst1 + dst1_idx)); char4 dst2_data = *((__global char4 *)(mat_dst2 + dst2_idx)); @@ -500,10 +500,10 @@ __kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int sr char data[7] = {src_data_0, src_data_3, src_data_6, src_data_9, src_data_12, src_data_15, src_data_18}; int index = 3 - dst0_offset & 3; - tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]); + tmp_data0 = (char4)(data[index], data[index + 1], data[index + 2], data[index + 3]); char4 data0, data1, data2; - + data0 = (char4)(src_data_1, src_data_4, src_data_7, src_data_10); data1 = (dst1_offset & 3) == 2 ? (char4)(src_data_4, src_data_7, src_data_10, src_data_13) : data0; data2 = (dst1_offset & 3) == 1 ? (char4)(src_data_7, src_data_10, src_data_13, src_data_16) : data1; @@ -536,32 +536,32 @@ __kernel void split_vector_C3_D1 (__global char *mat_src, int src_step, int sr } __kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int src_offset, - __global char *mat_dst0, int dst0_step, int dst0_offset, - __global char *mat_dst1, int dst1_step, int dst1_offset, + __global char *mat_dst0, int dst0_step, int dst0_offset, + __global char *mat_dst1, int dst1_step, int dst1_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 2; #define dst0_align ((dst0_offset & 3) << 1) #define dst1_align ((dst1_offset & 3) << 1) - int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1)); - int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1)); + int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 1)); + int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 1)); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + x & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + x & (int)0xfffffffc); - int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; - int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1; + int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; + int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1; char8 src_data_0 = vload8(0, mat_src + src_idx_0); char8 src_data_1 = vload8(0, mat_src + src_idx_1); if(src_idx_0 == -6) @@ -597,9 +597,9 @@ __kernel void split_vector_C2_D1 (__global char *mat_src, int src_step, int sr } __kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int src_offset, - __global ushort *mat_dst0, int dst0_step, int dst0_offset, - __global ushort *mat_dst1, int dst1_step, int dst1_offset, - __global ushort *mat_dst2, int dst2_step, int dst2_offset, + __global ushort *mat_dst0, int dst0_step, int dst0_offset, + __global ushort *mat_dst1, int dst1_step, int dst1_offset, + __global ushort *mat_dst2, int dst2_step, int dst2_offset, __global ushort *mat_dst3, int dst3_step, int dst3_offset, int rows, int cols, int dst_step1) @@ -607,30 +607,30 @@ __kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 1; - int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8); - int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8); + int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8); + int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); - int dst2_start = mad24(y, dst2_step, dst2_offset); + int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc); - int dst3_start = mad24(y, dst3_step, dst3_offset); + int dst3_start = mad24(y, dst3_step, dst3_offset); int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1); int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc); - - int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; + + int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; ushort8 src_data0 = vload8(0,(__global ushort *)((__global char *)mat_src + src_idx_0)); if(src_idx_0 == -6) src_data0.s01234567 = src_data0.s67012345; @@ -672,33 +672,33 @@ __kernel void split_vector_C4_D2 (__global ushort *mat_src, int src_step, int } __kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int src_offset, - __global ushort *mat_dst0, int dst0_step, int dst0_offset, - __global ushort *mat_dst1, int dst1_step, int dst1_offset, - __global ushort *mat_dst2, int dst2_step, int dst2_offset, + __global ushort *mat_dst0, int dst0_step, int dst0_offset, + __global ushort *mat_dst1, int dst1_step, int dst1_offset, + __global ushort *mat_dst2, int dst2_step, int dst2_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 1; - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); - int dst2_start = mad24(y, dst2_step, dst2_offset); + int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc); - + ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx)); ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx)); ushort2 dst2_data = *((__global ushort2 *)((__global char *)mat_dst2 + dst2_idx)); @@ -735,48 +735,48 @@ __kernel void split_vector_C3_D2 (__global ushort *mat_src, int src_step, int } __kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int src_offset, - __global ushort *mat_dst0, int dst0_step, int dst0_offset, - __global ushort *mat_dst1, int dst1_step, int dst1_offset, + __global ushort *mat_dst0, int dst0_step, int dst0_offset, + __global ushort *mat_dst1, int dst1_step, int dst1_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 1; #define dst0_align ((dst0_offset & 3) << 1) #define dst1_align ((dst1_offset & 3) << 1) - int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2)); - int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2)); + int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2)); + int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2)); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); - - int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; - int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1; + + int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; + int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1; ushort4 src_data_0 = vload4(0, (__global ushort *)((__global char *)mat_src + src1_index_fix)); ushort4 src_data_1 = vload4(0, (__global ushort *)((__global char *)mat_src + src2_index_fix)); - if(src_idx_0 < 0) - { - ushort4 tmp; - tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx; - src_data_0.xyzw = (src_idx_1 == -1) ? src_data_0.wxyz:tmp.xyzw; - } - if(src_idx_1 < 0) - { - ushort4 tmp; - tmp.xyzw = (src_idx_1 == -2) ? src_data_1.zwxy : src_data_1.yzwx; - src_data_1.xyzw = (src_idx_1 == -1) ? src_data_1.wxyz : tmp.xyzw; - } - + if(src_idx_0 < 0) + { + ushort4 tmp; + tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx; + src_data_0.xyzw = (src_idx_1 == -1) ? src_data_0.wxyz:tmp.xyzw; + } + if(src_idx_1 < 0) + { + ushort4 tmp; + tmp.xyzw = (src_idx_1 == -2) ? src_data_1.zwxy : src_data_1.yzwx; + src_data_1.xyzw = (src_idx_1 == -1) ? src_data_1.wxyz : tmp.xyzw; + } + ushort2 dst0_data = *((__global ushort2 *)((__global char *)mat_dst0 + dst0_idx)); ushort2 dst1_data = *((__global ushort2 *)((__global char *)mat_dst1 + dst1_idx)); @@ -793,9 +793,9 @@ __kernel void split_vector_C2_D2 (__global ushort *mat_src, int src_step, int } } __kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int src_offset, - __global short *mat_dst0, int dst0_step, int dst0_offset, - __global short *mat_dst1, int dst1_step, int dst1_offset, - __global short *mat_dst2, int dst2_step, int dst2_offset, + __global short *mat_dst0, int dst0_step, int dst0_offset, + __global short *mat_dst1, int dst1_step, int dst1_offset, + __global short *mat_dst2, int dst2_step, int dst2_offset, __global short *mat_dst3, int dst3_step, int dst3_offset, int rows, int cols, int dst_step1) @@ -803,38 +803,38 @@ __kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int s int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 1; - int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8); - int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8); + int src_idx_0 = mad24(y, src_step, src_offset + (x << 3) - 8); + int src_idx_1 = mad24(y, src_step, src_offset + (x << 3) + 8); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); - int dst2_start = mad24(y, dst2_step, dst2_offset); + int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc); - int dst3_start = mad24(y, dst3_step, dst3_offset); + int dst3_start = mad24(y, dst3_step, dst3_offset); int dst3_end = mad24(y, dst3_step, dst3_offset + dst_step1); int dst3_idx = mad24(y, dst3_step, dst3_offset + (x << 1) & (int)0xfffffffc); - int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; + int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; short8 src_data0 = vload8(0,(__global short *)((__global char *)mat_src + src_idx_0)); - + if(src_idx_0 == -6) src_data0.s01234567 = src_data0.s67012345; if(src_idx_0 == -4) src_data0.s01234567 = src_data0.s45670123; if(src_idx_0 == -2) src_data0.s01234567 = src_data0.s23456701; - + short4 src_data1 = *((__global short4 *)((__global char *)mat_src + src_idx_1)); short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx)); @@ -868,33 +868,33 @@ __kernel void split_vector_C4_D3 (__global short *mat_src, int src_step, int s } } __kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int src_offset, - __global short *mat_dst0, int dst0_step, int dst0_offset, - __global short *mat_dst1, int dst1_step, int dst1_offset, - __global short *mat_dst2, int dst2_step, int dst2_offset, + __global short *mat_dst0, int dst0_step, int dst0_offset, + __global short *mat_dst1, int dst1_step, int dst1_offset, + __global short *mat_dst2, int dst2_step, int dst2_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 1; - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); - int dst2_start = mad24(y, dst2_step, dst2_offset); + int dst2_start = mad24(y, dst2_step, dst2_offset); int dst2_end = mad24(y, dst2_step, dst2_offset + dst_step1); int dst2_idx = mad24(y, dst2_step, dst2_offset + (x << 1) & (int)0xfffffffc); - + short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx)); short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx)); short2 dst2_data = *((__global short2 *)((__global char *)mat_dst2 + dst2_idx)); @@ -932,47 +932,47 @@ __kernel void split_vector_C3_D3 (__global short *mat_src, int src_step, int s __kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int src_offset, - __global short *mat_dst0, int dst0_step, int dst0_offset, - __global short *mat_dst1, int dst1_step, int dst1_offset, + __global short *mat_dst0, int dst0_step, int dst0_offset, + __global short *mat_dst1, int dst1_step, int dst1_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { x = x << 1; #define dst0_align ((dst0_offset & 3) << 1) #define dst1_align ((dst1_offset & 3) << 1) - int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2)); - int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2)); + int src_idx_0 = mad24(y, src_step, src_offset - dst0_align + (x << 2)); + int src_idx_1 = mad24(y, src_step, src_offset - dst1_align + (x << 2)); - int dst0_start = mad24(y, dst0_step, dst0_offset); + int dst0_start = mad24(y, dst0_step, dst0_offset); int dst0_end = mad24(y, dst0_step, dst0_offset + dst_step1); int dst0_idx = mad24(y, dst0_step, dst0_offset + (x << 1) & (int)0xfffffffc); - int dst1_start = mad24(y, dst1_step, dst1_offset); + int dst1_start = mad24(y, dst1_step, dst1_offset); int dst1_end = mad24(y, dst1_step, dst1_offset + dst_step1); int dst1_idx = mad24(y, dst1_step, dst1_offset + (x << 1) & (int)0xfffffffc); - int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; - int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1; + int src1_index_fix = src_idx_0 < 0 ? 0 : src_idx_0; + int src2_index_fix = src_idx_1 < 0 ? 0 : src_idx_1; short4 src_data_0 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_0)); short4 src_data_1 = vload4(0, (__global short *)((__global char *)mat_src + src_idx_1)); - if(src_idx_0 < 0) - { - short4 tmp; - tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx; - src_data_0.xyzw = (src_idx_0 == -1) ? src_data_0.wxyz:tmp.xyzw; - } - if(src_idx_1< 0) - { - short4 tmp; - tmp.xyzw = ( src_idx_1== -2) ? src_data_1.zwxy : src_data_1.yzwx; - src_data_1.xyzw = ( src_idx_1== -1) ? src_data_1.wxyz : tmp.xyzw; - } - + if(src_idx_0 < 0) + { + short4 tmp; + tmp.xyzw = (src_idx_0 == -2) ? src_data_0.zwxy : src_data_0.yzwx; + src_data_0.xyzw = (src_idx_0 == -1) ? src_data_0.wxyz:tmp.xyzw; + } + if(src_idx_1< 0) + { + short4 tmp; + tmp.xyzw = ( src_idx_1== -2) ? src_data_1.zwxy : src_data_1.yzwx; + src_data_1.xyzw = ( src_idx_1== -1) ? src_data_1.wxyz : tmp.xyzw; + } + short2 dst0_data = *((__global short2 *)((__global char *)mat_dst0 + dst0_idx)); short2 dst1_data = *((__global short2 *)((__global char *)mat_dst1 + dst1_idx)); @@ -990,9 +990,9 @@ __kernel void split_vector_C2_D3 (__global short *mat_src, int src_step, int s } } __kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src_offset, - __global int *mat_dst0, int dst0_step, int dst0_offset, - __global int *mat_dst1, int dst1_step, int dst1_offset, - __global int *mat_dst2, int dst2_step, int dst2_offset, + __global int *mat_dst0, int dst0_step, int dst0_offset, + __global int *mat_dst1, int dst1_step, int dst1_offset, + __global int *mat_dst2, int dst2_step, int dst2_offset, __global int *mat_dst3, int dst3_step, int dst3_offset, int rows, int cols, int dst_step1) @@ -1000,14 +1000,14 @@ __kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst2_idx = mad24(y, dst2_step, dst2_offset); int dst3_idx = mad24(y, dst3_step, dst3_offset); - + int4 src_data = ((__global int4 *)((__global char *)mat_src + src_idx))[x]; ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; @@ -1017,18 +1017,18 @@ __kernel void split_vector_C4_D4 (__global int *mat_src, int src_step, int src } } __kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src_offset, - __global int *mat_dst0, int dst0_step, int dst0_offset, - __global int *mat_dst1, int dst1_step, int dst1_offset, - __global int *mat_dst2, int dst2_step, int dst2_offset, + __global int *mat_dst0, int dst0_step, int dst0_offset, + __global int *mat_dst1, int dst1_step, int dst1_offset, + __global int *mat_dst2, int dst2_step, int dst2_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst2_idx = mad24(y, dst2_step, dst2_offset); @@ -1044,20 +1044,20 @@ __kernel void split_vector_C3_D4 (__global int *mat_src, int src_step, int src } __kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src_offset, - __global int *mat_dst0, int dst0_step, int dst0_offset, - __global int *mat_dst1, int dst1_step, int dst1_offset, + __global int *mat_dst0, int dst0_step, int dst0_offset, + __global int *mat_dst1, int dst1_step, int dst1_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset); - + int2 src_data = ((__global int2 *)((__global char *)mat_src + src_idx))[x]; ((__global int *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; @@ -1066,9 +1066,9 @@ __kernel void split_vector_C2_D4 (__global int *mat_src, int src_step, int src } __kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int src_offset, - __global float *mat_dst0, int dst0_step, int dst0_offset, - __global float *mat_dst1, int dst1_step, int dst1_offset, - __global float *mat_dst2, int dst2_step, int dst2_offset, + __global float *mat_dst0, int dst0_step, int dst0_offset, + __global float *mat_dst1, int dst1_step, int dst1_offset, + __global float *mat_dst2, int dst2_step, int dst2_offset, __global float *mat_dst3, int dst3_step, int dst3_offset, int rows, int cols, int dst_step1) @@ -1076,14 +1076,14 @@ __kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int s int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst2_idx = mad24(y, dst2_step, dst2_offset); int dst3_idx = mad24(y, dst3_step, dst3_offset); - + float4 src_data = ((__global float4 *)((__global char *)mat_src + src_idx))[x]; ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; @@ -1094,18 +1094,18 @@ __kernel void split_vector_C4_D5 (__global float *mat_src, int src_step, int s } __kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int src_offset, - __global float *mat_dst0, int dst0_step, int dst0_offset, - __global float *mat_dst1, int dst1_step, int dst1_offset, - __global float *mat_dst2, int dst2_step, int dst2_offset, + __global float *mat_dst0, int dst0_step, int dst0_offset, + __global float *mat_dst1, int dst1_step, int dst1_offset, + __global float *mat_dst2, int dst2_step, int dst2_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst2_idx = mad24(y, dst2_step, dst2_offset); @@ -1121,20 +1121,20 @@ __kernel void split_vector_C3_D5 (__global float *mat_src, int src_step, int s } __kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int src_offset, - __global float *mat_dst0, int dst0_step, int dst0_offset, - __global float *mat_dst1, int dst1_step, int dst1_offset, + __global float *mat_dst0, int dst0_step, int dst0_offset, + __global float *mat_dst1, int dst1_step, int dst1_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset); - + float2 src_data = ((__global float2 *)((__global char *)mat_src + src_idx))[x]; ((__global float *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; @@ -1144,9 +1144,9 @@ __kernel void split_vector_C2_D5 (__global float *mat_src, int src_step, int s #if defined (DOUBLE_SUPPORT) __kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int src_offset, - __global double *mat_dst0, int dst0_step, int dst0_offset, - __global double *mat_dst1, int dst1_step, int dst1_offset, - __global double *mat_dst2, int dst2_step, int dst2_offset, + __global double *mat_dst0, int dst0_step, int dst0_offset, + __global double *mat_dst1, int dst1_step, int dst1_offset, + __global double *mat_dst2, int dst2_step, int dst2_offset, __global double *mat_dst3, int dst3_step, int dst3_offset, int rows, int cols, int dst_step1) @@ -1154,14 +1154,14 @@ __kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst2_idx = mad24(y, dst2_step, dst2_offset); int dst3_idx = mad24(y, dst3_step, dst3_offset); - + double4 src_data = ((__global double4 *)((__global char *)mat_src + src_idx))[x]; ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; @@ -1172,18 +1172,18 @@ __kernel void split_vector_C4_D6 (__global double *mat_src, int src_step, int } __kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int src_offset, - __global double *mat_dst0, int dst0_step, int dst0_offset, - __global double *mat_dst1, int dst1_step, int dst1_offset, - __global double *mat_dst2, int dst2_step, int dst2_offset, + __global double *mat_dst0, int dst0_step, int dst0_offset, + __global double *mat_dst1, int dst1_step, int dst1_offset, + __global double *mat_dst2, int dst2_step, int dst2_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset); int dst2_idx = mad24(y, dst2_step, dst2_offset); @@ -1199,20 +1199,20 @@ __kernel void split_vector_C3_D6 (__global double *mat_src, int src_step, int } __kernel void split_vector_C2_D6 (__global double *mat_src, int src_step, int src_offset, - __global double *mat_dst0, int dst0_step, int dst0_offset, - __global double *mat_dst1, int dst1_step, int dst1_offset, + __global double *mat_dst0, int dst0_step, int dst0_offset, + __global double *mat_dst1, int dst1_step, int dst1_offset, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); - if((x < cols) && (y < rows)) + if((x < cols) && (y < rows)) { - int src_idx = mad24(y, src_step, src_offset); + int src_idx = mad24(y, src_step, src_offset); int dst0_idx = mad24(y, dst0_step, dst0_offset); int dst1_idx = mad24(y, dst1_step, dst1_offset); - + double2 src_data = ((__global double2 *)((__global char *)mat_src + src_idx))[x]; ((__global double *)((__global char *)mat_dst0 + dst0_idx))[x] = src_data.x; diff --git a/modules/ocl/src/kernels/stereobm.cl b/modules/ocl/src/opencl/stereobm.cl similarity index 96% rename from modules/ocl/src/kernels/stereobm.cl rename to modules/ocl/src/opencl/stereobm.cl index 4edab86b45..954283987b 100644 --- a/modules/ocl/src/kernels/stereobm.cl +++ b/modules/ocl/src/opencl/stereobm.cl @@ -55,9 +55,9 @@ int SQ(int a) return a * a; } -unsigned int CalcSSD(volatile __local unsigned int *col_ssd_cache, +unsigned int CalcSSD(volatile __local unsigned int *col_ssd_cache, volatile __local unsigned int *col_ssd, int radius) -{ +{ unsigned int cache = 0; unsigned int cache2 = 0; @@ -77,7 +77,7 @@ unsigned int CalcSSD(volatile __local unsigned int *col_ssd_cache, return col_ssd[0] + cache + cache2; } -uint2 MinSSD(volatile __local unsigned int *col_ssd_cache, +uint2 MinSSD(volatile __local unsigned int *col_ssd_cache, volatile __local unsigned int *col_ssd, int radius) { unsigned int ssd[N_DISPARITIES]; @@ -112,7 +112,7 @@ uint2 MinSSD(volatile __local unsigned int *col_ssd_cache, return (uint2)(mssd, bestIdx); } -void StepDown(int idx1, int idx2, __global unsigned char* imageL, +void StepDown(int idx1, int idx2, __global unsigned char* imageL, __global unsigned char* imageR, int d, volatile __local unsigned int *col_ssd, int radius) { unsigned char leftPixel1; @@ -179,8 +179,8 @@ void StepDown(int idx1, int idx2, __global unsigned char* imageL, col_ssd[7 * (BLOCK_W + 2 * radius)] += SQ(diff2) - SQ(diff1); } -void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL, - __global unsigned char* imageR, int d, +void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imageL, + __global unsigned char* imageR, int d, volatile __local unsigned int *col_ssd, int radius) { unsigned char leftPixel1; @@ -215,15 +215,15 @@ void InitColSSD(int x_tex, int y_tex, int im_pitch, __global unsigned char* imag col_ssd[7 * (BLOCK_W + 2 * radius)] = diffa[7]; } -__kernel void stereoKernel(__global unsigned char *left, __global unsigned char *right, +__kernel void stereoKernel(__global unsigned char *left, __global unsigned char *right, __global unsigned int *cminSSDImage, int cminSSD_step, __global unsigned char *disp, int disp_step,int cwidth, int cheight, - int img_step, int maxdisp, int radius, + int img_step, int maxdisp, int radius, __local unsigned int *col_ssd_cache) { volatile __local unsigned int *col_ssd = col_ssd_cache + BLOCK_W + get_local_id(0); - volatile __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? col_ssd + BLOCK_W : 0; + volatile __local unsigned int *col_ssd_extra = get_local_id(0) < (2 * radius) ? col_ssd + BLOCK_W : 0; int X = get_group_id(0) * BLOCK_W + get_local_id(0) + maxdisp + radius; // int Y = get_group_id(1) * ROWSperTHREAD + radius; @@ -266,8 +266,8 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char int idx1 = y_tex * img_step + x_tex; int idx2 = (y_tex + (2 * radius + 1)) * img_step + x_tex; - barrier(CLK_GLOBAL_MEM_FENCE); - barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_GLOBAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); StepDown(idx1, idx2, left, right, d, col_ssd, radius); if (col_ssd_extra > 0) @@ -276,7 +276,7 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char y_tex += 1; - barrier(CLK_LOCAL_MEM_FENCE); + barrier(CLK_LOCAL_MEM_FENCE); if (X < cwidth - radius && row < cheight - radius - Y) { @@ -296,7 +296,7 @@ __kernel void stereoKernel(__global unsigned char *left, __global unsigned char //////////////////////////// Sobel Prefiler (signal channel)////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////// -__kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned char *output, +__kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned char *output, int rows, int cols, int prefilterCap) { int x = get_global_id(0); @@ -304,7 +304,7 @@ __kernel void prefilter_xsobel(__global unsigned char *input, __global unsigned if(x < cols && y < rows) { - int cov = input[(y-1) * cols + (x-1)] * (-1) + input[(y-1) * cols + (x+1)] * (1) + + int cov = input[(y-1) * cols + (x-1)] * (-1) + input[(y-1) * cols + (x+1)] * (1) + input[(y) * cols + (x-1)] * (-2) + input[(y) * cols + (x+1)] * (2) + input[(y+1) * cols + (x-1)] * (-1) + input[(y+1) * cols + (x+1)] * (1); @@ -325,10 +325,10 @@ float sobel(__global unsigned char *input, int x, int y, int rows, int cols) int x1 = x==0? 0 : x-1; if(x < cols && y < rows) { - conv = (float)input[(y1) * cols + (x1)] * (-1) + (float)input[(y1) * cols + (x+1)] * (1) + + conv = (float)input[(y1) * cols + (x1)] * (-1) + (float)input[(y1) * cols + (x+1)] * (1) + (float)input[(y) * cols + (x1)] * (-2) + (float)input[(y) * cols + (x+1)] * (2) + (float)input[(y+1) * cols + (x1)] * (-1) + (float)input[(y+1) * cols + (x+1)] * (1); - + } return fabs(conv); } @@ -359,9 +359,9 @@ float CalcSums(__local float *cols, __local float *cols_cache, int winsz) } #define RpT (2 * ROWSperTHREAD) // got experimentally -__kernel void textureness_kernel(__global unsigned char *disp, int disp_rows, int disp_cols, - int disp_step, __global unsigned char *input, int input_rows, - int input_cols,int winsz, float threshold, +__kernel void textureness_kernel(__global unsigned char *disp, int disp_rows, int disp_cols, + int disp_step, __global unsigned char *input, int input_rows, + int input_cols,int winsz, float threshold, __local float *cols_cache) { int winsz2 = winsz/2; @@ -405,13 +405,13 @@ __kernel void textureness_kernel(__global unsigned char *disp, int disp_rows, in for(int y = beg_row + 1; y < end_row; ++y) { - sum = sum - sobel(input, x - winsz2, y - winsz2 - 1, input_rows, input_cols) + + sum = sum - sobel(input, x - winsz2, y - winsz2 - 1, input_rows, input_cols) + sobel(input, x - winsz2, y + winsz2, input_rows, input_cols); *cols = sum; if (cols_extra) { - sum_extra = sum_extra - sobel(input, x + group_size_x - winsz2, y - winsz2 - 1,input_rows, input_cols) + sum_extra = sum_extra - sobel(input, x + group_size_x - winsz2, y - winsz2 - 1,input_rows, input_cols) + sobel(input, x + group_size_x - winsz2, y + winsz2, input_rows, input_cols); *cols_extra = sum_extra; }