mirror of https://github.com/opencv/opencv.git
Merge pull request #1748 from SpecLad:merge-2.4
commit
8b19df3fec
112 changed files with 6928 additions and 7110 deletions
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,93 @@ |
||||
# James Bigler, NVIDIA Corp (nvidia.com - jbigler) |
||||
# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html |
||||
# |
||||
# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. |
||||
# |
||||
# Copyright (c) 2007-2009 |
||||
# Scientific Computing and Imaging Institute, University of Utah |
||||
# |
||||
# This code is licensed under the MIT License. See the FindCUDA.cmake script |
||||
# for the text of the license. |
||||
|
||||
# The MIT License |
||||
# |
||||
# License for the specific language governing rights and limitations under |
||||
# Permission is hereby granted, free of charge, to any person obtaining a |
||||
# copy of this software and associated documentation files (the "Software"), |
||||
# to deal in the Software without restriction, including without limitation |
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense, |
||||
# and/or sell copies of the Software, and to permit persons to whom the |
||||
# Software is furnished to do so, subject to the following conditions: |
||||
# |
||||
# The above copyright notice and this permission notice shall be included |
||||
# in all copies or substantial portions of the Software. |
||||
# |
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
||||
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
||||
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
||||
# DEALINGS IN THE SOFTWARE. |
||||
# |
||||
|
||||
####################################################################### |
||||
# This converts a file written in makefile syntax into one that can be included |
||||
# by CMake. |
||||
|
||||
file(READ ${input_file} depend_text) |
||||
|
||||
if (${depend_text} MATCHES ".+") |
||||
|
||||
# message("FOUND DEPENDS") |
||||
|
||||
# Remember, four backslashes is escaped to one backslash in the string. |
||||
string(REGEX REPLACE "\\\\ " " " depend_text ${depend_text}) |
||||
|
||||
# This works for the nvcc -M generated dependency files. |
||||
string(REGEX REPLACE "^.* : " "" depend_text ${depend_text}) |
||||
string(REGEX REPLACE "[ \\\\]*\n" ";" depend_text ${depend_text}) |
||||
|
||||
set(dependency_list "") |
||||
|
||||
foreach(file ${depend_text}) |
||||
|
||||
string(REGEX REPLACE "^ +" "" file ${file}) |
||||
|
||||
# OK, now if we had a UNC path, nvcc has a tendency to only output the first '/' |
||||
# instead of '//'. Here we will test to see if the file exists, if it doesn't then |
||||
# try to prepend another '/' to the path and test again. If it still fails remove the |
||||
# path. |
||||
|
||||
if(NOT EXISTS "${file}") |
||||
if (EXISTS "/${file}") |
||||
set(file "/${file}") |
||||
else() |
||||
message(WARNING " Removing non-existent dependency file: ${file}") |
||||
set(file "") |
||||
endif() |
||||
endif() |
||||
|
||||
if(NOT IS_DIRECTORY "${file}") |
||||
# If softlinks start to matter, we should change this to REALPATH. For now we need |
||||
# to flatten paths, because nvcc can generate stuff like /bin/../include instead of |
||||
# just /include. |
||||
get_filename_component(file_absolute "${file}" ABSOLUTE) |
||||
list(APPEND dependency_list "${file_absolute}") |
||||
endif() |
||||
|
||||
endforeach() |
||||
|
||||
else() |
||||
# message("FOUND NO DEPENDS") |
||||
endif() |
||||
|
||||
# Remove the duplicate entries and sort them. |
||||
list(REMOVE_DUPLICATES dependency_list) |
||||
list(SORT dependency_list) |
||||
|
||||
foreach(file ${dependency_list}) |
||||
set(cuda_nvcc_depend "${cuda_nvcc_depend} \"${file}\"\n") |
||||
endforeach() |
||||
|
||||
file(WRITE ${output_file} "# Generated by: make2cmake.cmake\nSET(CUDA_NVCC_DEPEND\n ${cuda_nvcc_depend})\n\n") |
@ -0,0 +1,110 @@ |
||||
# James Bigler, NVIDIA Corp (nvidia.com - jbigler) |
||||
# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html |
||||
# |
||||
# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. |
||||
# |
||||
# Copyright (c) 2007-2009 |
||||
# Scientific Computing and Imaging Institute, University of Utah |
||||
# |
||||
# This code is licensed under the MIT License. See the FindCUDA.cmake script |
||||
# for the text of the license. |
||||
|
||||
# The MIT License |
||||
# |
||||
# License for the specific language governing rights and limitations under |
||||
# Permission is hereby granted, free of charge, to any person obtaining a |
||||
# copy of this software and associated documentation files (the "Software"), |
||||
# to deal in the Software without restriction, including without limitation |
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense, |
||||
# and/or sell copies of the Software, and to permit persons to whom the |
||||
# Software is furnished to do so, subject to the following conditions: |
||||
# |
||||
# The above copyright notice and this permission notice shall be included |
||||
# in all copies or substantial portions of the Software. |
||||
# |
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
||||
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
||||
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
||||
# DEALINGS IN THE SOFTWARE. |
||||
# |
||||
|
||||
####################################################################### |
||||
# Parses a .cubin file produced by nvcc and reports statistics about the file. |
||||
|
||||
|
||||
file(READ ${input_file} file_text) |
||||
|
||||
if (${file_text} MATCHES ".+") |
||||
|
||||
# Remember, four backslashes is escaped to one backslash in the string. |
||||
string(REGEX REPLACE ";" "\\\\;" file_text ${file_text}) |
||||
string(REGEX REPLACE "\ncode" ";code" file_text ${file_text}) |
||||
|
||||
list(LENGTH file_text len) |
||||
|
||||
foreach(line ${file_text}) |
||||
|
||||
# Only look at "code { }" blocks. |
||||
if(line MATCHES "^code") |
||||
|
||||
# Break into individual lines. |
||||
string(REGEX REPLACE "\n" ";" line ${line}) |
||||
|
||||
foreach(entry ${line}) |
||||
|
||||
# Extract kernel names. |
||||
if (${entry} MATCHES "[^g]name = ([^ ]+)") |
||||
string(REGEX REPLACE ".* = ([^ ]+)" "\\1" entry ${entry}) |
||||
|
||||
# Check to see if the kernel name starts with "_" |
||||
set(skip FALSE) |
||||
# if (${entry} MATCHES "^_") |
||||
# Skip the rest of this block. |
||||
# message("Skipping ${entry}") |
||||
# set(skip TRUE) |
||||
# else () |
||||
message("Kernel: ${entry}") |
||||
# endif () |
||||
|
||||
endif() |
||||
|
||||
# Skip the rest of the block if necessary |
||||
if(NOT skip) |
||||
|
||||
# Registers |
||||
if (${entry} MATCHES "reg([ ]+)=([ ]+)([^ ]+)") |
||||
string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry ${entry}) |
||||
message("Registers: ${entry}") |
||||
endif() |
||||
|
||||
# Local memory |
||||
if (${entry} MATCHES "lmem([ ]+)=([ ]+)([^ ]+)") |
||||
string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry ${entry}) |
||||
message("Local: ${entry}") |
||||
endif() |
||||
|
||||
# Shared memory |
||||
if (${entry} MATCHES "smem([ ]+)=([ ]+)([^ ]+)") |
||||
string(REGEX REPLACE ".*([ ]+)=([ ]+)([^ ]+)" "\\3" entry ${entry}) |
||||
message("Shared: ${entry}") |
||||
endif() |
||||
|
||||
if (${entry} MATCHES "^}") |
||||
message("") |
||||
endif() |
||||
|
||||
endif() |
||||
|
||||
|
||||
endforeach() |
||||
|
||||
endif() |
||||
|
||||
endforeach() |
||||
|
||||
else() |
||||
# message("FOUND NO DEPENDS") |
||||
endif() |
@ -0,0 +1,288 @@ |
||||
# James Bigler, NVIDIA Corp (nvidia.com - jbigler) |
||||
# |
||||
# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. |
||||
# |
||||
# This code is licensed under the MIT License. See the FindCUDA.cmake script |
||||
# for the text of the license. |
||||
|
||||
# The MIT License |
||||
# |
||||
# License for the specific language governing rights and limitations under |
||||
# Permission is hereby granted, free of charge, to any person obtaining a |
||||
# copy of this software and associated documentation files (the "Software"), |
||||
# to deal in the Software without restriction, including without limitation |
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense, |
||||
# and/or sell copies of the Software, and to permit persons to whom the |
||||
# Software is furnished to do so, subject to the following conditions: |
||||
# |
||||
# The above copyright notice and this permission notice shall be included |
||||
# in all copies or substantial portions of the Software. |
||||
# |
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
||||
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
||||
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER |
||||
# DEALINGS IN THE SOFTWARE. |
||||
|
||||
|
||||
########################################################################## |
||||
# This file runs the nvcc commands to produce the desired output file along with |
||||
# the dependency file needed by CMake to compute dependencies. In addition the |
||||
# file checks the output of each command and if the command fails it deletes the |
||||
# output files. |
||||
|
||||
# Input variables |
||||
# |
||||
# verbose:BOOL=<> OFF: Be as quiet as possible (default) |
||||
# ON : Describe each step |
||||
# |
||||
# build_configuration:STRING=<> Typically one of Debug, MinSizeRel, Release, or |
||||
# RelWithDebInfo, but it should match one of the |
||||
# entries in CUDA_HOST_FLAGS. This is the build |
||||
# configuration used when compiling the code. If |
||||
# blank or unspecified Debug is assumed as this is |
||||
# what CMake does. |
||||
# |
||||
# generated_file:STRING=<> File to generate. This argument must be passed in. |
||||
# |
||||
# generated_cubin_file:STRING=<> File to generate. This argument must be passed |
||||
# in if build_cubin is true. |
||||
|
||||
if(NOT generated_file) |
||||
message(FATAL_ERROR "You must specify generated_file on the command line") |
||||
endif() |
||||
|
||||
# Set these up as variables to make reading the generated file easier |
||||
set(CMAKE_COMMAND "@CMAKE_COMMAND@") # path |
||||
set(source_file "@source_file@") # path |
||||
set(NVCC_generated_dependency_file "@NVCC_generated_dependency_file@") # path |
||||
set(cmake_dependency_file "@cmake_dependency_file@") # path |
||||
set(CUDA_make2cmake "@CUDA_make2cmake@") # path |
||||
set(CUDA_parse_cubin "@CUDA_parse_cubin@") # path |
||||
set(build_cubin @build_cubin@) # bool |
||||
set(CUDA_HOST_COMPILER "@CUDA_HOST_COMPILER@") # bool |
||||
# We won't actually use these variables for now, but we need to set this, in |
||||
# order to force this file to be run again if it changes. |
||||
set(generated_file_path "@generated_file_path@") # path |
||||
set(generated_file_internal "@generated_file@") # path |
||||
set(generated_cubin_file_internal "@generated_cubin_file@") # path |
||||
|
||||
set(CUDA_NVCC_EXECUTABLE "@CUDA_NVCC_EXECUTABLE@") # path |
||||
set(CUDA_NVCC_FLAGS @CUDA_NVCC_FLAGS@ ;; @CUDA_WRAP_OPTION_NVCC_FLAGS@) # list |
||||
@CUDA_NVCC_FLAGS_CONFIG@ |
||||
set(nvcc_flags @nvcc_flags@) # list |
||||
set(CUDA_NVCC_INCLUDE_ARGS "@CUDA_NVCC_INCLUDE_ARGS@") # list (needs to be in quotes to handle spaces properly). |
||||
set(format_flag "@format_flag@") # string |
||||
|
||||
if(build_cubin AND NOT generated_cubin_file) |
||||
message(FATAL_ERROR "You must specify generated_cubin_file on the command line") |
||||
endif() |
||||
|
||||
# This is the list of host compilation flags. It C or CXX should already have |
||||
# been chosen by FindCUDA.cmake. |
||||
@CUDA_HOST_FLAGS@ |
||||
|
||||
# Take the compiler flags and package them up to be sent to the compiler via -Xcompiler |
||||
set(nvcc_host_compiler_flags "") |
||||
# If we weren't given a build_configuration, use Debug. |
||||
if(NOT build_configuration) |
||||
set(build_configuration Debug) |
||||
endif() |
||||
string(TOUPPER "${build_configuration}" build_configuration) |
||||
#message("CUDA_NVCC_HOST_COMPILER_FLAGS = ${CUDA_NVCC_HOST_COMPILER_FLAGS}") |
||||
foreach(flag ${CMAKE_HOST_FLAGS} ${CMAKE_HOST_FLAGS_${build_configuration}}) |
||||
# Extra quotes are added around each flag to help nvcc parse out flags with spaces. |
||||
set(nvcc_host_compiler_flags "${nvcc_host_compiler_flags},\"${flag}\"") |
||||
endforeach() |
||||
if (nvcc_host_compiler_flags) |
||||
set(nvcc_host_compiler_flags "-Xcompiler" ${nvcc_host_compiler_flags}) |
||||
endif() |
||||
#message("nvcc_host_compiler_flags = \"${nvcc_host_compiler_flags}\"") |
||||
# Add the build specific configuration flags |
||||
list(APPEND CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS_${build_configuration}}) |
||||
|
||||
# Any -ccbin existing in CUDA_NVCC_FLAGS gets highest priority |
||||
list( FIND CUDA_NVCC_FLAGS "-ccbin" ccbin_found0 ) |
||||
list( FIND CUDA_NVCC_FLAGS "--compiler-bindir" ccbin_found1 ) |
||||
if( ccbin_found0 LESS 0 AND ccbin_found1 LESS 0 ) |
||||
if (CUDA_HOST_COMPILER STREQUAL "$(VCInstallDir)bin" AND DEFINED CCBIN) |
||||
set(CCBIN -ccbin "${CCBIN}") |
||||
else() |
||||
set(CCBIN -ccbin "${CUDA_HOST_COMPILER}") |
||||
endif() |
||||
endif() |
||||
|
||||
# cuda_execute_process - Executes a command with optional command echo and status message. |
||||
# |
||||
# status - Status message to print if verbose is true |
||||
# command - COMMAND argument from the usual execute_process argument structure |
||||
# ARGN - Remaining arguments are the command with arguments |
||||
# |
||||
# CUDA_result - return value from running the command |
||||
# |
||||
# Make this a macro instead of a function, so that things like RESULT_VARIABLE |
||||
# and other return variables are present after executing the process. |
||||
macro(cuda_execute_process status command) |
||||
set(_command ${command}) |
||||
if(NOT _command STREQUAL "COMMAND") |
||||
message(FATAL_ERROR "Malformed call to cuda_execute_process. Missing COMMAND as second argument. (command = ${command})") |
||||
endif() |
||||
if(verbose) |
||||
execute_process(COMMAND "${CMAKE_COMMAND}" -E echo -- ${status}) |
||||
# Now we need to build up our command string. We are accounting for quotes |
||||
# and spaces, anything else is left up to the user to fix if they want to |
||||
# copy and paste a runnable command line. |
||||
set(cuda_execute_process_string) |
||||
foreach(arg ${ARGN}) |
||||
# If there are quotes, excape them, so they come through. |
||||
string(REPLACE "\"" "\\\"" arg ${arg}) |
||||
# Args with spaces need quotes around them to get them to be parsed as a single argument. |
||||
if(arg MATCHES " ") |
||||
list(APPEND cuda_execute_process_string "\"${arg}\"") |
||||
else() |
||||
list(APPEND cuda_execute_process_string ${arg}) |
||||
endif() |
||||
endforeach() |
||||
# Echo the command |
||||
execute_process(COMMAND ${CMAKE_COMMAND} -E echo ${cuda_execute_process_string}) |
||||
endif() |
||||
# Run the command |
||||
execute_process(COMMAND ${ARGN} RESULT_VARIABLE CUDA_result ) |
||||
endmacro() |
||||
|
||||
# Delete the target file |
||||
cuda_execute_process( |
||||
"Removing ${generated_file}" |
||||
COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" |
||||
) |
||||
|
||||
# For CUDA 2.3 and below, -G -M doesn't work, so remove the -G flag |
||||
# for dependency generation and hope for the best. |
||||
set(depends_CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS}") |
||||
set(CUDA_VERSION @CUDA_VERSION@) |
||||
if(CUDA_VERSION VERSION_LESS "3.0") |
||||
cmake_policy(PUSH) |
||||
# CMake policy 0007 NEW states that empty list elements are not |
||||
# ignored. I'm just setting it to avoid the warning that's printed. |
||||
cmake_policy(SET CMP0007 NEW) |
||||
# Note that this will remove all occurances of -G. |
||||
list(REMOVE_ITEM depends_CUDA_NVCC_FLAGS "-G") |
||||
cmake_policy(POP) |
||||
endif() |
||||
|
||||
# nvcc doesn't define __CUDACC__ for some reason when generating dependency files. This |
||||
# can cause incorrect dependencies when #including files based on this macro which is |
||||
# defined in the generating passes of nvcc invokation. We will go ahead and manually |
||||
# define this for now until a future version fixes this bug. |
||||
set(CUDACC_DEFINE -D__CUDACC__) |
||||
|
||||
# Generate the dependency file |
||||
cuda_execute_process( |
||||
"Generating dependency file: ${NVCC_generated_dependency_file}" |
||||
COMMAND "${CUDA_NVCC_EXECUTABLE}" |
||||
-M |
||||
${CUDACC_DEFINE} |
||||
"${source_file}" |
||||
-o "${NVCC_generated_dependency_file}" |
||||
${CCBIN} |
||||
${nvcc_flags} |
||||
${nvcc_host_compiler_flags} |
||||
${depends_CUDA_NVCC_FLAGS} |
||||
-DNVCC |
||||
${CUDA_NVCC_INCLUDE_ARGS} |
||||
) |
||||
|
||||
if(CUDA_result) |
||||
message(FATAL_ERROR "Error generating ${generated_file}") |
||||
endif() |
||||
|
||||
# Generate the cmake readable dependency file to a temp file. Don't put the |
||||
# quotes just around the filenames for the input_file and output_file variables. |
||||
# CMake will pass the quotes through and not be able to find the file. |
||||
cuda_execute_process( |
||||
"Generating temporary cmake readable file: ${cmake_dependency_file}.tmp" |
||||
COMMAND "${CMAKE_COMMAND}" |
||||
-D "input_file:FILEPATH=${NVCC_generated_dependency_file}" |
||||
-D "output_file:FILEPATH=${cmake_dependency_file}.tmp" |
||||
-P "${CUDA_make2cmake}" |
||||
) |
||||
|
||||
if(CUDA_result) |
||||
message(FATAL_ERROR "Error generating ${generated_file}") |
||||
endif() |
||||
|
||||
# Copy the file if it is different |
||||
cuda_execute_process( |
||||
"Copy if different ${cmake_dependency_file}.tmp to ${cmake_dependency_file}" |
||||
COMMAND "${CMAKE_COMMAND}" -E copy_if_different "${cmake_dependency_file}.tmp" "${cmake_dependency_file}" |
||||
) |
||||
|
||||
if(CUDA_result) |
||||
message(FATAL_ERROR "Error generating ${generated_file}") |
||||
endif() |
||||
|
||||
# Delete the temporary file |
||||
cuda_execute_process( |
||||
"Removing ${cmake_dependency_file}.tmp and ${NVCC_generated_dependency_file}" |
||||
COMMAND "${CMAKE_COMMAND}" -E remove "${cmake_dependency_file}.tmp" "${NVCC_generated_dependency_file}" |
||||
) |
||||
|
||||
if(CUDA_result) |
||||
message(FATAL_ERROR "Error generating ${generated_file}") |
||||
endif() |
||||
|
||||
# Generate the code |
||||
cuda_execute_process( |
||||
"Generating ${generated_file}" |
||||
COMMAND "${CUDA_NVCC_EXECUTABLE}" |
||||
"${source_file}" |
||||
${format_flag} -o "${generated_file}" |
||||
${CCBIN} |
||||
${nvcc_flags} |
||||
${nvcc_host_compiler_flags} |
||||
${CUDA_NVCC_FLAGS} |
||||
-DNVCC |
||||
${CUDA_NVCC_INCLUDE_ARGS} |
||||
) |
||||
|
||||
if(CUDA_result) |
||||
# Since nvcc can sometimes leave half done files make sure that we delete the output file. |
||||
cuda_execute_process( |
||||
"Removing ${generated_file}" |
||||
COMMAND "${CMAKE_COMMAND}" -E remove "${generated_file}" |
||||
) |
||||
message(FATAL_ERROR "Error generating file ${generated_file}") |
||||
else() |
||||
if(verbose) |
||||
message("Generated ${generated_file} successfully.") |
||||
endif() |
||||
endif() |
||||
|
||||
# Cubin resource report commands. |
||||
if( build_cubin ) |
||||
# Run with -cubin to produce resource usage report. |
||||
cuda_execute_process( |
||||
"Generating ${generated_cubin_file}" |
||||
COMMAND "${CUDA_NVCC_EXECUTABLE}" |
||||
"${source_file}" |
||||
${CUDA_NVCC_FLAGS} |
||||
${nvcc_flags} |
||||
${CCBIN} |
||||
${nvcc_host_compiler_flags} |
||||
-DNVCC |
||||
-cubin |
||||
-o "${generated_cubin_file}" |
||||
${CUDA_NVCC_INCLUDE_ARGS} |
||||
) |
||||
|
||||
# Execute the parser script. |
||||
cuda_execute_process( |
||||
"Executing the parser script" |
||||
COMMAND "${CMAKE_COMMAND}" |
||||
-D "input_file:STRING=${generated_cubin_file}" |
||||
-P "${CUDA_parse_cubin}" |
||||
) |
||||
|
||||
endif() |
File diff suppressed because it is too large
Load Diff
@ -1,753 +0,0 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Jia Haipeng, jiahaipeng95@gmail.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
#if defined (DOUBLE_SUPPORT) |
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable |
||||
#endif |
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
/////////////////////////////////////////////flip rows and cols/////////////////////////////////////// |
||||
////////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
__kernel void arithm_flip_rc_C1_D0 (__global uchar *src, int src_step, int src_offset, |
||||
__global uchar *dst, int dst_step, int dst_offset, |
||||
int rows, int cols, int thread_rows, int dst_step1) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < thread_rows) |
||||
{ |
||||
int src_index_0 = mad24(y, src_step, (x) + src_offset); |
||||
int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) + src_offset); |
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset); |
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) + dst_offset); |
||||
|
||||
uchar data0 = *(src + src_index_0); |
||||
uchar data1 = *(src + src_index_1); |
||||
|
||||
*(dst + dst_index_0) = data1; |
||||
*(dst + dst_index_1) = data0; |
||||
} |
||||
} |
||||
__kernel void arithm_flip_rc_C1_D1 (__global char *src, int src_step, int src_offset, |
||||
__global char *dst, int dst_step, int dst_offset, |
||||
int rows, int cols, int thread_rows, int dst_step1) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < thread_rows) |
||||
{ |
||||
int src_index_0 = mad24(y, src_step, (x) + src_offset); |
||||
int src_index_1 = mad24(rows - y - 1, src_step, (cols - x -1) + src_offset); |
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x) + dst_offset); |
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, (cols - x -1) + dst_offset); |
||||
|
||||
char data0 = *(src + src_index_0); |
||||
char data1 = *(src + src_index_1); |
||||
|
||||
*(dst + dst_index_0) = data1; |
||||
*(dst + dst_index_1) = data0; |
||||
} |
||||
} |
||||
__kernel void arithm_flip_rc_C1_D2 (__global ushort *src, int src_step, int src_offset, |
||||
__global ushort *dst, int dst_step, int dst_offset, |
||||
int rows, int cols, int thread_rows, int dst_step1) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < thread_rows) |
||||
{ |
||||
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset); |
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset); |
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset); |
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset); |
||||
|
||||
ushort data0 = *((__global ushort *)((__global char *)src + src_index_0)); |
||||
ushort data1 = *((__global ushort *)((__global char *)src + src_index_1)); |
||||
|
||||
*((__global ushort *)((__global char *)dst + dst_index_0)) = data1; |
||||
*((__global ushort *)((__global char *)dst + dst_index_1)) = data0; |
||||
} |
||||
} |
||||
__kernel void arithm_flip_rc_C1_D3 (__global short *src, int src_step, int src_offset, |
||||
__global short *dst, int dst_step, int dst_offset, |
||||
int rows, int cols, int thread_rows, int dst_step1) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < thread_rows) |
||||
{ |
||||
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset); |
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset); |
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset); |
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset); |
||||
|
||||
short data0 = *((__global short *)((__global char *)src + src_index_0)); |
||||
short data1 = *((__global short *)((__global char *)src + src_index_1)); |
||||
|
||||
*((__global short *)((__global char *)dst + dst_index_0)) = data1; |
||||
*((__global short *)((__global char *)dst + dst_index_1)) = data0; |
||||
} |
||||
} |
||||
__kernel void arithm_flip_rc_C1_D4 (__global int *src, int src_step, int src_offset, |
||||
__global int *dst, int dst_step, int dst_offset, |
||||
int rows, int cols, int thread_rows, int dst_step1) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < thread_rows) |
||||
{ |
||||
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset); |
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset); |
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset); |
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset); |
||||
|
||||
int data0 = *((__global int *)((__global char *)src + src_index_0)); |
||||
int data1 = *((__global int *)((__global char *)src + src_index_1)); |
||||
|
||||
*((__global int *)((__global char *)dst + dst_index_0)) = data1; |
||||
*((__global int *)((__global char *)dst + dst_index_1)) = data0; |
||||
} |
||||
} |
||||
__kernel void arithm_flip_rc_C1_D5 (__global float *src, int src_step, int src_offset, |
||||
__global float *dst, int dst_step, int dst_offset, |
||||
int rows, int cols, int thread_rows, int dst_step1) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < thread_rows) |
||||
{ |
||||
int src_index_0 = mad24(y, src_step, (x << 2) + src_offset); |
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 2) + src_offset); |
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x << 2) + dst_offset); |
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 2) + dst_offset); |
||||
|
||||
float data0 = *((__global float *)((__global char *)src + src_index_0)); |
||||
float data1 = *((__global float *)((__global char *)src + src_index_1)); |
||||
|
||||
*((__global float *)((__global char *)dst + dst_index_0)) = data1; |
||||
*((__global float *)((__global char *)dst + dst_index_1)) = data0; |
||||
} |
||||
} |
||||
|
||||
#if defined (DOUBLE_SUPPORT) |
||||
__kernel void arithm_flip_rc_C1_D6 (__global double *src, int src_step, int src_offset, |
||||
__global double *dst, int dst_step, int dst_offset, |
||||
int rows, int cols, int thread_rows, int dst_step1) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < thread_rows) |
||||
{ |
||||
int src_index_0 = mad24(y, src_step, (x << 3) + src_offset); |
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 3) + src_offset); |
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x << 3) + dst_offset); |
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 3) + dst_offset); |
||||
|
||||
double data0 = *((__global double *)((__global char *)src + src_index_0)); |
||||
double data1 = *((__global double *)((__global char *)src + src_index_1)); |
||||
|
||||
*((__global double *)((__global char *)dst + dst_index_0)) = data1; |
||||
*((__global double *)((__global char *)dst + dst_index_1)) = data0; |
||||
} |
||||
} |
||||
#endif |
||||
__kernel void arithm_flip_rc_C2_D0 (__global uchar *src, int src_step, int src_offset, |
||||
__global uchar *dst, int dst_step, int dst_offset, |
||||
int rows, int cols, int thread_rows, int dst_step1) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < thread_rows) |
||||
{ |
||||
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset); |
||||
int src_index_1 = mad24(rows - y - 1, src_step, ((cols - x -1) << 1) + src_offset); |
||||
|
||||
int dst_index_0 = mad24(y, dst_step, (x << 1) + dst_offset); |
||||
int dst_index_1 = mad24(rows - y - 1, dst_step, ((cols - x -1) << 1) + dst_offset); |
||||
|
||||
uchar2 data0 = *((__global uchar2 *)(src + src_index_0)); |
||||
uchar2 data1 = *((__global uchar2 *)(src + src_index_1)); |
||||
|
||||
*((__global uchar2 *)(dst + dst_index_0)) = data1; |
||||
*((__global uchar2 *)(dst + dst_index_1)) = data0; |
||||
} |
||||
} |
||||
__kernel void arithm_flip_rc_C2_D1 (__global char *src, int src_step, int src_offset,
                                    __global char *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    // Both-axis flip, 2-channel signed char: swap (gy, gx) with the
    // point-mirrored element (rows-gy-1, cols-gx-1).
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx < cols && gy < thread_rows)
    {
        int mx = cols - gx - 1;   // mirrored column
        int my = rows - gy - 1;   // mirrored row

        // char2 is 2 bytes per element, hence << 1 on the column index.
        int src_top = mad24(gy, src_step, (gx << 1) + src_offset);
        int src_bot = mad24(my, src_step, (mx << 1) + src_offset);
        int dst_top = mad24(gy, dst_step, (gx << 1) + dst_offset);
        int dst_bot = mad24(my, dst_step, (mx << 1) + dst_offset);

        char2 top = *((__global char2 *)(src + src_top));
        char2 bot = *((__global char2 *)(src + src_bot));

        *((__global char2 *)(dst + dst_top)) = bot;
        *((__global char2 *)(dst + dst_bot)) = top;
    }
}
||||
__kernel void arithm_flip_rc_C2_D2 (__global ushort *src, int src_step, int src_offset,
                                    __global ushort *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    // Both-axis flip, 2-channel ushort: swap (gy, gx) with
    // (rows-gy-1, cols-gx-1).
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx < cols && gy < thread_rows)
    {
        int mx = cols - gx - 1;   // mirrored column
        int my = rows - gy - 1;   // mirrored row

        // ushort2 is 4 bytes per element, hence << 2; offsets are in bytes,
        // so the base pointers are addressed through a char cast.
        int src_top = mad24(gy, src_step, (gx << 2) + src_offset);
        int src_bot = mad24(my, src_step, (mx << 2) + src_offset);
        int dst_top = mad24(gy, dst_step, (gx << 2) + dst_offset);
        int dst_bot = mad24(my, dst_step, (mx << 2) + dst_offset);

        ushort2 top = *((__global ushort2 *)((__global char *)src + src_top));
        ushort2 bot = *((__global ushort2 *)((__global char *)src + src_bot));

        *((__global ushort2 *)((__global char *)dst + dst_top)) = bot;
        *((__global ushort2 *)((__global char *)dst + dst_bot)) = top;
    }
}
||||
__kernel void arithm_flip_rc_C2_D3 (__global short *src, int src_step, int src_offset,
                                    __global short *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    // Both-axis flip, 2-channel short: swap (gy, gx) with
    // (rows-gy-1, cols-gx-1).
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx < cols && gy < thread_rows)
    {
        int mx = cols - gx - 1;   // mirrored column
        int my = rows - gy - 1;   // mirrored row

        // short2 is 4 bytes per element (<< 2); byte addressing via char cast.
        int src_top = mad24(gy, src_step, (gx << 2) + src_offset);
        int src_bot = mad24(my, src_step, (mx << 2) + src_offset);
        int dst_top = mad24(gy, dst_step, (gx << 2) + dst_offset);
        int dst_bot = mad24(my, dst_step, (mx << 2) + dst_offset);

        short2 top = *((__global short2 *)((__global char *)src + src_top));
        short2 bot = *((__global short2 *)((__global char *)src + src_bot));

        *((__global short2 *)((__global char *)dst + dst_top)) = bot;
        *((__global short2 *)((__global char *)dst + dst_bot)) = top;
    }
}
||||
__kernel void arithm_flip_rc_C2_D4 (__global int *src, int src_step, int src_offset,
                                    __global int *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    // Both-axis flip, 2-channel int: swap (gy, gx) with
    // (rows-gy-1, cols-gx-1).
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx < cols && gy < thread_rows)
    {
        int mx = cols - gx - 1;   // mirrored column
        int my = rows - gy - 1;   // mirrored row

        // int2 is 8 bytes per element (<< 3); byte addressing via char cast.
        int src_top = mad24(gy, src_step, (gx << 3) + src_offset);
        int src_bot = mad24(my, src_step, (mx << 3) + src_offset);
        int dst_top = mad24(gy, dst_step, (gx << 3) + dst_offset);
        int dst_bot = mad24(my, dst_step, (mx << 3) + dst_offset);

        int2 top = *((__global int2 *)((__global char *)src + src_top));
        int2 bot = *((__global int2 *)((__global char *)src + src_bot));

        *((__global int2 *)((__global char *)dst + dst_top)) = bot;
        *((__global int2 *)((__global char *)dst + dst_bot)) = top;
    }
}
||||
__kernel void arithm_flip_rc_C2_D5 (__global float *src, int src_step, int src_offset,
                                    __global float *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    // Both-axis flip, 2-channel float: swap (gy, gx) with
    // (rows-gy-1, cols-gx-1).
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx < cols && gy < thread_rows)
    {
        int mx = cols - gx - 1;   // mirrored column
        int my = rows - gy - 1;   // mirrored row

        // float2 is 8 bytes per element (<< 3); byte addressing via char cast.
        int src_top = mad24(gy, src_step, (gx << 3) + src_offset);
        int src_bot = mad24(my, src_step, (mx << 3) + src_offset);
        int dst_top = mad24(gy, dst_step, (gx << 3) + dst_offset);
        int dst_bot = mad24(my, dst_step, (mx << 3) + dst_offset);

        float2 top = *((__global float2 *)((__global char *)src + src_top));
        float2 bot = *((__global float2 *)((__global char *)src + src_bot));

        *((__global float2 *)((__global char *)dst + dst_top)) = bot;
        *((__global float2 *)((__global char *)dst + dst_bot)) = top;
    }
}
||||
|
||||
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_flip_rc_C2_D6 (__global double *src, int src_step, int src_offset,
                                    __global double *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    // Both-axis flip, 2-channel double (compiled only with fp64 support):
    // swap (gy, gx) with (rows-gy-1, cols-gx-1).
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx < cols && gy < thread_rows)
    {
        int mx = cols - gx - 1;   // mirrored column
        int my = rows - gy - 1;   // mirrored row

        // double2 is 16 bytes per element (<< 4); byte addressing via char cast.
        int src_top = mad24(gy, src_step, (gx << 4) + src_offset);
        int src_bot = mad24(my, src_step, (mx << 4) + src_offset);
        int dst_top = mad24(gy, dst_step, (gx << 4) + dst_offset);
        int dst_bot = mad24(my, dst_step, (mx << 4) + dst_offset);

        double2 top = *((__global double2 *)((__global char *)src + src_top));
        double2 bot = *((__global double2 *)((__global char *)src + src_bot));

        *((__global double2 *)((__global char *)dst + dst_top)) = bot;
        *((__global double2 *)((__global char *)dst + dst_bot)) = top;
    }
}
#endif
||||
|
||||
__kernel void arithm_flip_rc_C3_D0 (__global uchar *src, int src_step, int src_offset,
                                    __global uchar *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    // Both-axis flip, 3-channel uchar: swap (gy, gx) with
    // (rows-gy-1, cols-gx-1), one channel at a time.
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx < cols && gy < thread_rows)
    {
        int mx = cols - gx - 1;   // mirrored column
        int my = rows - gy - 1;   // mirrored row

        // 3 bytes per pixel (one uchar per channel).
        int src_top = mad24(gy, src_step, (gx * 3) + src_offset);
        int src_bot = mad24(my, src_step, mx * 3 + src_offset);
        int dst_top = mad24(gy, dst_step, (gx * 3) + dst_offset);
        int dst_bot = mad24(my, dst_step, mx * 3 + dst_offset);

        // Exchange the three channels; reads precede writes per channel,
        // so in-place operation behaves like the fully unrolled form.
        for (int c = 0; c < 3; ++c)
        {
            uchar top = *(src + src_top + c);
            uchar bot = *(src + src_bot + c);
            *(dst + dst_top + c) = bot;
            *(dst + dst_bot + c) = top;
        }
    }
}
||||
__kernel void arithm_flip_rc_C3_D1 (__global char *src, int src_step, int src_offset,
                                    __global char *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    // Both-axis flip, 3-channel signed char: swap (gy, gx) with
    // (rows-gy-1, cols-gx-1), one channel at a time.
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx < cols && gy < thread_rows)
    {
        int mx = cols - gx - 1;   // mirrored column
        int my = rows - gy - 1;   // mirrored row

        // 3 bytes per pixel (one char per channel).
        int src_top = mad24(gy, src_step, (gx * 3) + src_offset);
        int src_bot = mad24(my, src_step, mx * 3 + src_offset);
        int dst_top = mad24(gy, dst_step, (gx * 3) + dst_offset);
        int dst_bot = mad24(my, dst_step, mx * 3 + dst_offset);

        for (int c = 0; c < 3; ++c)
        {
            char top = *(src + src_top + c);
            char bot = *(src + src_bot + c);
            *(dst + dst_top + c) = bot;
            *(dst + dst_bot + c) = top;
        }
    }
}
||||
__kernel void arithm_flip_rc_C3_D2 (__global ushort *src, int src_step, int src_offset,
                                    __global ushort *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    // Both-axis flip, 3-channel ushort: swap (gy, gx) with
    // (rows-gy-1, cols-gx-1), one channel at a time.
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx < cols && gy < thread_rows)
    {
        int mx = cols - gx - 1;   // mirrored column
        int my = rows - gy - 1;   // mirrored row

        // 6 bytes per pixel: 3 channels x 2 bytes (x*3 << 1).
        int src_top = mad24(gy, src_step, (gx * 3 << 1) + src_offset);
        int src_bot = mad24(my, src_step, (mx * 3 << 1) + src_offset);
        int dst_top = mad24(gy, dst_step, (gx * 3 << 1) + dst_offset);
        int dst_bot = mad24(my, dst_step, (mx * 3 << 1) + dst_offset);

        for (int c = 0; c < 3; ++c)
        {
            int off = c << 1;   // 2 bytes per ushort channel
            ushort top = *((__global ushort *)((__global char *)src + src_top + off));
            ushort bot = *((__global ushort *)((__global char *)src + src_bot + off));
            *((__global ushort *)((__global char *)dst + dst_top + off)) = bot;
            *((__global ushort *)((__global char *)dst + dst_bot + off)) = top;
        }
    }
}
||||
__kernel void arithm_flip_rc_C3_D3 (__global short *src, int src_step, int src_offset,
                                    __global short *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    // Both-axis flip, 3-channel short: swap (gy, gx) with
    // (rows-gy-1, cols-gx-1), one channel at a time.
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx < cols && gy < thread_rows)
    {
        int mx = cols - gx - 1;   // mirrored column
        int my = rows - gy - 1;   // mirrored row

        // 6 bytes per pixel: 3 channels x 2 bytes (x*3 << 1).
        int src_top = mad24(gy, src_step, (gx * 3 << 1) + src_offset);
        int src_bot = mad24(my, src_step, (mx * 3 << 1) + src_offset);
        int dst_top = mad24(gy, dst_step, (gx * 3 << 1) + dst_offset);
        int dst_bot = mad24(my, dst_step, (mx * 3 << 1) + dst_offset);

        for (int c = 0; c < 3; ++c)
        {
            int off = c << 1;   // 2 bytes per short channel
            short top = *((__global short *)((__global char *)src + src_top + off));
            short bot = *((__global short *)((__global char *)src + src_bot + off));
            *((__global short *)((__global char *)dst + dst_top + off)) = bot;
            *((__global short *)((__global char *)dst + dst_bot + off)) = top;
        }
    }
}
||||
|
||||
__kernel void arithm_flip_rc_C3_D4 (__global int *src, int src_step, int src_offset,
                                    __global int *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    // Both-axis flip, 3-channel int: swap (gy, gx) with
    // (rows-gy-1, cols-gx-1), one channel at a time.
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx < cols && gy < thread_rows)
    {
        int mx = cols - gx - 1;   // mirrored column
        int my = rows - gy - 1;   // mirrored row

        // 12 bytes per pixel: 3 channels x 4 bytes (x*3 << 2).
        int src_top = mad24(gy, src_step, (gx * 3 << 2) + src_offset);
        int src_bot = mad24(my, src_step, (mx * 3 << 2) + src_offset);
        int dst_top = mad24(gy, dst_step, (gx * 3 << 2) + dst_offset);
        int dst_bot = mad24(my, dst_step, (mx * 3 << 2) + dst_offset);

        for (int c = 0; c < 3; ++c)
        {
            int off = c << 2;   // 4 bytes per int channel
            int top = *((__global int *)((__global char *)src + src_top + off));
            int bot = *((__global int *)((__global char *)src + src_bot + off));
            *((__global int *)((__global char *)dst + dst_top + off)) = bot;
            *((__global int *)((__global char *)dst + dst_bot + off)) = top;
        }
    }
}
||||
__kernel void arithm_flip_rc_C3_D5 (__global float *src, int src_step, int src_offset,
                                    __global float *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    // Both-axis flip, 3-channel float: swap (gy, gx) with
    // (rows-gy-1, cols-gx-1), one channel at a time.
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx < cols && gy < thread_rows)
    {
        int mx = cols - gx - 1;   // mirrored column
        int my = rows - gy - 1;   // mirrored row

        // 12 bytes per pixel: 3 channels x 4 bytes (x*3 << 2).
        int src_top = mad24(gy, src_step, (gx * 3 << 2) + src_offset);
        int src_bot = mad24(my, src_step, (mx * 3 << 2) + src_offset);
        int dst_top = mad24(gy, dst_step, (gx * 3 << 2) + dst_offset);
        int dst_bot = mad24(my, dst_step, (mx * 3 << 2) + dst_offset);

        for (int c = 0; c < 3; ++c)
        {
            int off = c << 2;   // 4 bytes per float channel
            float top = *((__global float *)((__global char *)src + src_top + off));
            float bot = *((__global float *)((__global char *)src + src_bot + off));
            *((__global float *)((__global char *)dst + dst_top + off)) = bot;
            *((__global float *)((__global char *)dst + dst_bot + off)) = top;
        }
    }
}
||||
|
||||
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_flip_rc_C3_D6 (__global double *src, int src_step, int src_offset,
                                    __global double *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    // Both-axis flip, 3-channel double (fp64 builds only): swap (gy, gx)
    // with (rows-gy-1, cols-gx-1), one channel at a time.
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx < cols && gy < thread_rows)
    {
        int mx = cols - gx - 1;   // mirrored column
        int my = rows - gy - 1;   // mirrored row

        // 24 bytes per pixel: 3 channels x 8 bytes (x*3 << 3).
        int src_top = mad24(gy, src_step, (gx * 3 << 3) + src_offset);
        int src_bot = mad24(my, src_step, (mx * 3 << 3) + src_offset);
        int dst_top = mad24(gy, dst_step, (gx * 3 << 3) + dst_offset);
        int dst_bot = mad24(my, dst_step, (mx * 3 << 3) + dst_offset);

        for (int c = 0; c < 3; ++c)
        {
            int off = c << 3;   // 8 bytes per double channel
            double top = *((__global double *)((__global char *)src + src_top + off));
            double bot = *((__global double *)((__global char *)src + src_bot + off));
            *((__global double *)((__global char *)dst + dst_top + off)) = bot;
            *((__global double *)((__global char *)dst + dst_bot + off)) = top;
        }
    }
}
#endif
||||
__kernel void arithm_flip_rc_C4_D0 (__global uchar *src, int src_step, int src_offset,
                                    __global uchar *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    // Both-axis flip, 4-channel uchar: swap (gy, gx) with
    // (rows-gy-1, cols-gx-1).
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx < cols && gy < thread_rows)
    {
        int mx = cols - gx - 1;   // mirrored column
        int my = rows - gy - 1;   // mirrored row

        // uchar4 is 4 bytes per element (<< 2).
        int src_top = mad24(gy, src_step, (gx << 2) + src_offset);
        int src_bot = mad24(my, src_step, (mx << 2) + src_offset);
        int dst_top = mad24(gy, dst_step, (gx << 2) + dst_offset);
        int dst_bot = mad24(my, dst_step, (mx << 2) + dst_offset);

        uchar4 top = *((__global uchar4 *)(src + src_top));
        uchar4 bot = *((__global uchar4 *)(src + src_bot));

        *((__global uchar4 *)(dst + dst_top)) = bot;
        *((__global uchar4 *)(dst + dst_bot)) = top;
    }
}
||||
__kernel void arithm_flip_rc_C4_D1 (__global char *src, int src_step, int src_offset,
                                    __global char *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    // Both-axis flip, 4-channel signed char: swap (gy, gx) with
    // (rows-gy-1, cols-gx-1).
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx < cols && gy < thread_rows)
    {
        int mx = cols - gx - 1;   // mirrored column
        int my = rows - gy - 1;   // mirrored row

        // char4 is 4 bytes per element (<< 2).
        int src_top = mad24(gy, src_step, (gx << 2) + src_offset);
        int src_bot = mad24(my, src_step, (mx << 2) + src_offset);
        int dst_top = mad24(gy, dst_step, (gx << 2) + dst_offset);
        int dst_bot = mad24(my, dst_step, (mx << 2) + dst_offset);

        char4 top = *((__global char4 *)(src + src_top));
        char4 bot = *((__global char4 *)(src + src_bot));

        *((__global char4 *)(dst + dst_top)) = bot;
        *((__global char4 *)(dst + dst_bot)) = top;
    }
}
||||
__kernel void arithm_flip_rc_C4_D2 (__global ushort *src, int src_step, int src_offset,
                                    __global ushort *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    // Both-axis flip, 4-channel ushort: swap (gy, gx) with
    // (rows-gy-1, cols-gx-1).
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx < cols && gy < thread_rows)
    {
        int mx = cols - gx - 1;   // mirrored column
        int my = rows - gy - 1;   // mirrored row

        // ushort4 is 8 bytes per element (<< 3); byte addressing via char cast.
        int src_top = mad24(gy, src_step, (gx << 3) + src_offset);
        int src_bot = mad24(my, src_step, (mx << 3) + src_offset);
        int dst_top = mad24(gy, dst_step, (gx << 3) + dst_offset);
        int dst_bot = mad24(my, dst_step, (mx << 3) + dst_offset);

        ushort4 top = *((__global ushort4 *)((__global char *)src + src_top));
        ushort4 bot = *((__global ushort4 *)((__global char *)src + src_bot));

        *((__global ushort4 *)((__global char *)dst + dst_top)) = bot;
        *((__global ushort4 *)((__global char *)dst + dst_bot)) = top;
    }
}
||||
__kernel void arithm_flip_rc_C4_D3 (__global short *src, int src_step, int src_offset,
                                    __global short *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    // Both-axis flip, 4-channel short: swap (gy, gx) with
    // (rows-gy-1, cols-gx-1).
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx < cols && gy < thread_rows)
    {
        int mx = cols - gx - 1;   // mirrored column
        int my = rows - gy - 1;   // mirrored row

        // short4 is 8 bytes per element (<< 3); byte addressing via char cast.
        int src_top = mad24(gy, src_step, (gx << 3) + src_offset);
        int src_bot = mad24(my, src_step, (mx << 3) + src_offset);
        int dst_top = mad24(gy, dst_step, (gx << 3) + dst_offset);
        int dst_bot = mad24(my, dst_step, (mx << 3) + dst_offset);

        short4 top = *((__global short4 *)((__global char *)src + src_top));
        short4 bot = *((__global short4 *)((__global char *)src + src_bot));

        *((__global short4 *)((__global char *)dst + dst_top)) = bot;
        *((__global short4 *)((__global char *)dst + dst_bot)) = top;
    }
}
||||
__kernel void arithm_flip_rc_C4_D4 (__global int *src, int src_step, int src_offset,
                                    __global int *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    // Both-axis flip, 4-channel int: swap (gy, gx) with
    // (rows-gy-1, cols-gx-1).
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx < cols && gy < thread_rows)
    {
        int mx = cols - gx - 1;   // mirrored column
        int my = rows - gy - 1;   // mirrored row

        // int4 is 16 bytes per element (<< 4); byte addressing via char cast.
        int src_top = mad24(gy, src_step, (gx << 4) + src_offset);
        int src_bot = mad24(my, src_step, (mx << 4) + src_offset);
        int dst_top = mad24(gy, dst_step, (gx << 4) + dst_offset);
        int dst_bot = mad24(my, dst_step, (mx << 4) + dst_offset);

        int4 top = *((__global int4 *)((__global char *)src + src_top));
        int4 bot = *((__global int4 *)((__global char *)src + src_bot));

        *((__global int4 *)((__global char *)dst + dst_top)) = bot;
        *((__global int4 *)((__global char *)dst + dst_bot)) = top;
    }
}
||||
__kernel void arithm_flip_rc_C4_D5 (__global float *src, int src_step, int src_offset,
                                    __global float *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    // Both-axis flip, 4-channel float: swap (gy, gx) with
    // (rows-gy-1, cols-gx-1).
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx < cols && gy < thread_rows)
    {
        int mx = cols - gx - 1;   // mirrored column
        int my = rows - gy - 1;   // mirrored row

        // float4 is 16 bytes per element (<< 4); byte addressing via char cast.
        int src_top = mad24(gy, src_step, (gx << 4) + src_offset);
        int src_bot = mad24(my, src_step, (mx << 4) + src_offset);
        int dst_top = mad24(gy, dst_step, (gx << 4) + dst_offset);
        int dst_bot = mad24(my, dst_step, (mx << 4) + dst_offset);

        float4 top = *((__global float4 *)((__global char *)src + src_top));
        float4 bot = *((__global float4 *)((__global char *)src + src_bot));

        *((__global float4 *)((__global char *)dst + dst_top)) = bot;
        *((__global float4 *)((__global char *)dst + dst_bot)) = top;
    }
}
||||
|
||||
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_flip_rc_C4_D6 (__global double *src, int src_step, int src_offset,
                                    __global double *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int thread_rows, int dst_step1)
{
    // Both-axis flip, 4-channel double (fp64 builds only): swap (gy, gx)
    // with (rows-gy-1, cols-gx-1).
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx < cols && gy < thread_rows)
    {
        int mx = cols - gx - 1;   // mirrored column
        int my = rows - gy - 1;   // mirrored row

        // double4 is 32 bytes per element (<< 5); byte addressing via char cast.
        int src_top = mad24(gy, src_step, (gx << 5) + src_offset);
        int src_bot = mad24(my, src_step, (mx << 5) + src_offset);
        int dst_top = mad24(gy, dst_step, (gx << 5) + dst_offset);
        int dst_bot = mad24(my, dst_step, (mx << 5) + dst_offset);

        double4 top = *((__global double4 *)((__global char *)src + src_top));
        double4 bot = *((__global double4 *)((__global char *)src + src_bot));

        *((__global double4 *)((__global char *)dst + dst_top)) = bot;
        *((__global double4 *)((__global char *)dst + dst_bot)) = top;
    }
}
#endif
@ -1,196 +0,0 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Shengen Yan,yanshengen@gmail.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other GpuMaterials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
/**************************************PUBLICFUNC*************************************/ |
||||
// Enable the 64-bit floating point extension only when the host compiled
// this program with DOUBLE_SUPPORT defined (device advertises fp64).
#if defined (DOUBLE_SUPPORT)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif

// Depth-dependent configuration, selected by exactly one DEPTH_n define
// supplied at build time (n presumably mirrors the OpenCV depth codes:
// 0=8U, 1=8S, 2=16U, 3=16S, 4=32S, 5=32F, 6=64F - confirm against host code).
// Each depth supplies:
//   VEC_TYPE     - 8-lane vector type processed per work-item
//   TYPE         - scalar element type of the source buffer
//   CONVERT_TYPE - conversion of an 8-lane uchar mask load to VEC_TYPE
//   MIN_VAL/MAX_VAL - extreme values of TYPE, used as neutral elements
//                     for the max and min reductions respectively
#if defined (DEPTH_0)
#define VEC_TYPE uchar8
#define TYPE uchar
#define CONVERT_TYPE convert_uchar8
#define MIN_VAL 0
#define MAX_VAL 255
#endif
#if defined (DEPTH_1)
#define VEC_TYPE char8
#define TYPE char
#define CONVERT_TYPE convert_char8
#define MIN_VAL -128
#define MAX_VAL 127
#endif
#if defined (DEPTH_2)
#define VEC_TYPE ushort8
#define TYPE ushort
#define CONVERT_TYPE convert_ushort8
#define MIN_VAL 0
#define MAX_VAL 65535
#endif
#if defined (DEPTH_3)
#define VEC_TYPE short8
#define TYPE short
#define CONVERT_TYPE convert_short8
#define MIN_VAL -32768
#define MAX_VAL 32767
#endif
#if defined (DEPTH_4)
#define VEC_TYPE int8
#define TYPE int
#define CONVERT_TYPE convert_int8
#define MIN_VAL INT_MIN
#define MAX_VAL INT_MAX
#endif
#if defined (DEPTH_5)
#define VEC_TYPE float8
#define TYPE float
#define CONVERT_TYPE convert_float8
#define MIN_VAL (-FLT_MAX)
#define MAX_VAL FLT_MAX
#endif
#if defined (DEPTH_6)
#define VEC_TYPE double8
#define TYPE double
#define CONVERT_TYPE convert_double8
#define MIN_VAL (-DBL_MAX)
#define MAX_VAL DBL_MAX
#endif

// repeat_me(a) zeroes the last n lanes (REPEAT_En) of the 8-lane mask
// vector; callers apply it on the final vector of each row so lanes that
// fall past the row's end cannot contribute to the reduction.
// REPEAT_E0 is the no-op variant (row length is a multiple of 8).
#if defined (REPEAT_E0)
#define repeat_me(a) a = a;
#endif
#if defined (REPEAT_E1)
#define repeat_me(a) a.s7 = 0;
#endif
#if defined (REPEAT_E2)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;
#endif
#if defined (REPEAT_E3)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;
#endif
#if defined (REPEAT_E4)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;
#endif
#if defined (REPEAT_E5)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;
#endif
#if defined (REPEAT_E6)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;
#endif
#if defined (REPEAT_E7)
#define repeat_me(a) a.s7 = 0;a.s6 = 0;a.s5 = 0;a.s4 = 0;a.s3 = 0;a.s2 = 0;a.s1 = 0;
#endif

// NOTE(review): the kernel visible in this chunk uses no atomics; these
// extensions are presumably required by other kernels in the file - confirm
// before removing.
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics:enable
||||
|
||||
/**************************************Array minMax mask**************************************/ |
||||
// Masked min/max partial reduction.
// Each work-item loads 8-lane vectors (VEC_TYPE) from src, strided across
// the flattened input by the total thread count; elements whose mask byte
// is zero are replaced by the neutral value (MAX_VAL for the min pass,
// MIN_VAL for the max pass).  Each work-group then reduces its partials in
// local memory and emits one vector pair:
//   dst[gid]            = per-group minimum vector
//   dst[gid + groupnum] = per-group maximum vector
// (The host finishes the reduction across groups and lanes.)
// NOTE(review): the local reduction assumes a work-group size of exactly
// 256 (lids 128..255 seed local memory, lids 0..127 fold) - confirm that
// the launch configuration guarantees this.
// cols/invalid_cols and cols/minvalid_cols describe the row length and the
// padding vectors appended to each row of src and mask respectively.
__kernel void arithm_op_minMax_mask (int cols,int invalid_cols,int offset,int elemnum,int groupnum, __global TYPE *src,
                                     int minvalid_cols,int moffset, __global uchar *mask,__global VEC_TYPE *dst)
{
    unsigned int lid = get_local_id(0);
    unsigned int gid = get_group_id(0);
    unsigned int id = get_global_id(0);
    // Convert the linear element id into a vector index that skips the
    // padding vectors at the end of every row.
    unsigned int idx = id + (id / cols) * invalid_cols;
    unsigned int midx = id + (id / cols) * minvalid_cols;
    __local VEC_TYPE localmem_max[128],localmem_min[128];
    VEC_TYPE minval,maxval,temp,m_temp;
    if(id < elemnum)
    {
        temp = vload8(idx, &src[offset]);
        m_temp = CONVERT_TYPE(vload8(midx,&mask[moffset]));
        // Last vector of a row may extend past the valid data: zero the
        // tail lanes of the mask so they cannot win the reduction.
        if(id % cols == cols - 1)
        {
            repeat_me(m_temp);
        }
        // Masked-out lanes take the neutral element for each pass.
        minval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MAX_VAL;
        maxval = m_temp != (VEC_TYPE)0 ? temp : (VEC_TYPE)MIN_VAL;
    }
    else
    {
        // Work-items past the data contribute only neutral values.
        minval = MAX_VAL;
        maxval = MIN_VAL;
    }
    // Accumulate further vectors, striding by the total thread count
    // (groupnum groups * 256 work-items, hence groupnum << 8).
    for(id=id + (groupnum << 8); id < elemnum;id = id + (groupnum << 8))
    {
        idx = id + (id / cols) * invalid_cols;
        midx = id + (id / cols) * minvalid_cols;
        temp = vload8(idx, &src[offset]);
        m_temp = CONVERT_TYPE(vload8(midx,&mask[moffset]));
        if(id % cols == cols - 1)
        {
            repeat_me(m_temp);
        }
        minval = min(minval,m_temp != (VEC_TYPE)0 ? temp : minval);
        maxval = max(maxval,m_temp != (VEC_TYPE)0 ? temp : maxval);
    }
    // First local step: the upper half of the group parks its partials...
    if(lid > 127)
    {
        localmem_min[lid - 128] = minval;
        localmem_max[lid - 128] = maxval;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // ...and the lower half folds them into its own partials.
    if(lid < 128)
    {
        localmem_min[lid] = min(minval,localmem_min[lid]);
        localmem_max[lid] = max(maxval,localmem_max[lid]);
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    // Standard tree reduction over the remaining 128 local slots.
    for(int lsize = 64; lsize > 0; lsize >>= 1)
    {
        if(lid < lsize)
        {
            int lid2 = lsize + lid;
            localmem_min[lid] = min(localmem_min[lid] , localmem_min[lid2]);
            localmem_max[lid] = max(localmem_max[lid] , localmem_max[lid2]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    // Lane 0 publishes the group's result pair.
    if( lid == 0)
    {
        dst[gid] = localmem_min[0];
        dst[gid + groupnum] = localmem_max[0];
    }
}
@ -0,0 +1,370 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
// Border-address helpers, selected at build time by exactly one BORDER_* define.
// ADDR_L / ADDR_R remap a column index that falls left of 'l_edge' / at-or-right
// of 'r_edge'; ADDR_H / ADDR_B do the same for row indices above 't_edge' /
// at-or-below 'b_edge'. ADDR_R and ADDR_B take the previously computed 'addr'
// so the left/top correction is preserved when the index is already in range.
#ifdef BORDER_REPLICATE
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge)  ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge)  ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif

#ifdef BORDER_REFLECT
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
// NOTE: the left/top forms assume l_edge/t_edge == 0 (plain -(i)-1 reflection).
#define ADDR_L(i, l_edge, r_edge)  ((i) < (l_edge) ? -(i)-1 : (i))
#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge)  ((i) < (t_edge) ? -(i)-1 : (i))
#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
#endif

#ifdef BORDER_REFLECT_101
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
// Same as BORDER_REFLECT but the edge pixel itself is not duplicated.
#define ADDR_L(i, l_edge, r_edge)  ((i) < (l_edge) ? -(i) : (i))
#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge)  ((i) < (t_edge) ? -(i) : (i))
#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
#endif

//blur function does not support BORDER_WRAP
#ifdef BORDER_WRAP
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
#define ADDR_L(i, l_edge, r_edge)  ((i) < (l_edge) ? (i)+(r_edge) : (i))
#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
#define ADDR_H(i, t_edge, b_edge)  ((i) < (t_edge) ? (i)+(b_edge) : (i))
#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
#endif
||||
|
||||
// EXTRAPOLATE(x, y, minX, minY, maxX, maxY) maps an out-of-range (x, y) back
// into the half-open rectangle [minX, maxX) x [minY, maxY) according to the
// configured border mode.
//
// EXTRA_EXTRAPOLATION handles the case where the requested border is wider
// than the source image itself, so a single clamp/reflection is not enough
// and the index must be folded repeatedly until it lands inside the range.
#ifdef EXTRA_EXTRAPOLATION // border > src image size
#ifdef BORDER_CONSTANT
// None
#elif defined BORDER_REPLICATE
// Clamp to the nearest valid pixel; a single clamp is always sufficient.
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
    { \
        x = max(min(x, maxX - 1), minX); \
        y = max(min(y, maxY - 1), minY); \
    }
#elif defined BORDER_WRAP
// Modular wrap; the subtraction first shifts far-negative indices into
// non-negative territory so that the final % lands in [0, max).
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
    { \
        if (x < minX) \
            x -= ((x - maxX + 1) / maxX) * maxX; \
        if (x >= maxX) \
            x %= maxX; \
        if (y < minY) \
            y -= ((y - maxY + 1) / maxY) * maxY; \
        if (y >= maxY) \
            y %= maxY; \
    }
#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101)
// Shared reflection helper: delta == 0 duplicates the edge pixel (REFLECT),
// delta == 1 does not (REFLECT_101). The do/while folds repeatedly for
// borders wider than the image; a 1-pixel range degenerates to min.
#define EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, delta) \
    { \
        if (maxX - minX == 1) \
            x = minX; \
        else \
            do \
            { \
                if (x < minX) \
                    x = -(x - minX) - 1 + delta; \
                else \
                    x = maxX - 1 - (x - maxX) - delta; \
            } \
            while (x >= maxX || x < minX); \
        \
        if (maxY - minY == 1) \
            y = minY; \
        else \
            do \
            { \
                if (y < minY) \
                    y = -(y - minY) - 1 + delta; \
                else \
                    y = maxY - 1 - (y - maxY) - delta; \
            } \
            while (y >= maxY || y < minY); \
    }
#ifdef BORDER_REFLECT
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0)
#elif defined(BORDER_REFLECT_101)
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1)
#endif
#else
#error No extrapolation method
#endif
#else
// Regular case (border narrower than the image): one application of the
// ADDR_* helpers, working in range-relative coordinates, is enough.
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
    { \
        int _row = y - minY, _col = x - minX; \
        _row = ADDR_H(_row, 0, maxY - minY); \
        _row = ADDR_B(_row, maxY - minY, _row); \
        y = _row + minY; \
        \
        _col = ADDR_L(_col, 0, maxX - minX); \
        _col = ADDR_R(_col, maxX - minX, _col); \
        x = _col + minX; \
    }
#endif
||||
|
||||
// Type system for the filter2D kernel, driven by host-side defines:
//   DATA_DEPTH — CV depth code (0=uchar .. 6=double) selecting BASE_TYPE;
//   DATA_CHAN  — channel count, becomes VEC_SIZE;
//   USE_DOUBLE — accumulate in double instead of float.
// Note: several macros (e.g. CONVERT_TO_FPTYPE) mention VEC_SIZE before it is
// defined below — this is fine, preprocessor expansion happens at use site.
#if USE_DOUBLE
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#define FPTYPE double
#define CONVERT_TO_FPTYPE CAT(convert_double, VEC_SIZE)
#else
#define FPTYPE float
#define CONVERT_TO_FPTYPE CAT(convert_float, VEC_SIZE)
#endif

#if DATA_DEPTH == 0
#define BASE_TYPE uchar
#elif DATA_DEPTH == 1
#define BASE_TYPE char
#elif DATA_DEPTH == 2
#define BASE_TYPE ushort
#elif DATA_DEPTH == 3
#define BASE_TYPE short
#elif DATA_DEPTH == 4
#define BASE_TYPE int
#elif DATA_DEPTH == 5
#define BASE_TYPE float
#elif DATA_DEPTH == 6
#define BASE_TYPE double
#else
#error data_depth
#endif

// Two-level concatenation so that macro arguments are expanded before pasting.
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)

// Single-channel aliases so CAT(BASE_TYPE, 1) etc. still name valid types.
#define uchar1 uchar
#define char1 char
#define ushort1 ushort
#define short1 short
#define int1 int
#define float1 float
#define double1 double

// Same trick for the 1-channel saturating converters; the float/double
// "converters" expand to nothing (identity).
#define convert_uchar1_sat_rte convert_uchar_sat_rte
#define convert_char1_sat_rte convert_char_sat_rte
#define convert_ushort1_sat_rte convert_ushort_sat_rte
#define convert_short1_sat_rte convert_short_sat_rte
#define convert_int1_sat_rte convert_int_sat_rte
#define convert_float1
#define convert_double1

// Integer outputs saturate with round-to-nearest-even; float/double do not.
#if DATA_DEPTH == 5 || DATA_DEPTH == 6
#define CONVERT_TO_TYPE CAT(CAT(convert_, BASE_TYPE), VEC_SIZE)
#else
#define CONVERT_TO_TYPE CAT(CAT(CAT(convert_, BASE_TYPE), VEC_SIZE), _sat_rte)
#endif

#define VEC_SIZE DATA_CHAN

#define VEC_TYPE CAT(BASE_TYPE, VEC_SIZE)
#define TYPE VEC_TYPE

#define SCALAR_TYPE CAT(FPTYPE, VEC_SIZE)

// Accumulation type: floating point with the same channel count as the image.
#define INTERMEDIATE_TYPE CAT(FPTYPE, VEC_SIZE)

// Half-open rectangle [x1, x2) x [y1, y2).
struct RectCoords
{
    int x1, y1, x2, y2;
};

// Debug toggle: when DEBUG is defined, ASSERT prints the failing condition
// with the work-item's global id; otherwise both macros compile to no-ops.
//#define DEBUG
#ifdef DEBUG
#define DEBUG_ONLY(x) x
#define ASSERT(condition) do { if (!(condition)) { printf("BUG in boxFilter kernel (global=%d,%d): " #condition "\n", get_global_id(0), get_global_id(1)); } } while (0)
#else
#define DEBUG_ONLY(x) (void)0
#define ASSERT(condition) (void)0
#endif
||||
|
||||
|
||||
// Fetch one source pixel, converted to the floating-point intermediate type.
// 'pos' is an absolute position in the source buffer; 'srcCoords' is the
// valid rectangle {x1, y1, x2, y2}. Out-of-range positions are resolved
// either to 'borderValue' (BORDER_CONSTANT) or remapped back inside via
// EXTRAPOLATE(). srcStepBytes is a byte stride, hence the char* arithmetic.
inline INTERMEDIATE_TYPE readSrcPixel(int2 pos, __global TYPE *src, const unsigned int srcStepBytes, const struct RectCoords srcCoords
#ifdef BORDER_CONSTANT
                                      , SCALAR_TYPE borderValue
#endif
                                     )
{
#ifdef BORDER_ISOLATED
    // Isolated border: only the ROI itself may be sampled.
    if (pos.x >= srcCoords.x1 && pos.y >= srcCoords.y1 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
#else
    // Non-isolated border: real pixels of the whole image outside the ROI
    // may be read (x1/y1 of srcCoords are the ROI offsets here).
    if (pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
#endif
    {
        __global TYPE* ptr = (__global TYPE*)((__global char*)src + pos.x * sizeof(TYPE) + pos.y * srcStepBytes);
        return CONVERT_TO_FPTYPE(*ptr);
    }
    else
    {
#ifdef BORDER_CONSTANT
        return borderValue;
#else
        int selected_col = pos.x;
        int selected_row = pos.y;

        EXTRAPOLATE(selected_col, selected_row,
#ifdef BORDER_ISOLATED
                    srcCoords.x1, srcCoords.y1,
#else
                    0, 0,
#endif
                    srcCoords.x2, srcCoords.y2
                   );

        // debug border mapping
        //printf("pos=%d,%d --> %d, %d\n", pos.x, pos.y, selected_col, selected_row);

        pos = (int2)(selected_col, selected_row);
        if (pos.x >= 0 && pos.y >= 0 && pos.x < srcCoords.x2 && pos.y < srcCoords.y2)
        {
            __global TYPE* ptr = (__global TYPE*)((__global char*)src + pos.x * sizeof(TYPE) + pos.y * srcStepBytes);
            return CONVERT_TO_FPTYPE(*ptr);
        }
        else
        {
            // for debug only: EXTRAPOLATE is expected to always land inside
            DEBUG_ONLY(printf("BUG in boxFilter kernel\n"));
            return (FPTYPE)(0.0f);
        }
#endif
    }
}
||||
|
||||
// INPUT PARAMETER: BLOCK_SIZE_Y (via defines) |
||||
|
||||
// 2D correlation kernel (column-sum formulation).
//
// Work layout (enforced by reqd_work_group_size): LOCAL_SIZE x 1 threads.
// Each group produces LOCAL_SIZE - (KERNEL_SIZE_X - 1) output columns for up
// to BLOCK_SIZE_Y consecutive rows; the outermost threads only supply column
// sums for the horizontal apron and never write output (see writeResult).
//
// Per output row each thread keeps its column's KERNEL_SIZE_Y source pixels
// in the private array data[], publishes the vertical dot product in
// sumOfCols[], then each writing thread adds up KERNEL_SIZE_X neighbouring
// column sums to form its output pixel.
//
// kernelData is transposed: [KERNEL_SIZE_X][KERNEL_SIZE_Y2_ALIGNED].
// NOTE(review): with BLOCK_SIZE_Y > 1 the pointer below is offset by
// KERNEL_SIZE_Y - sy_index, which presumably requires
// KERNEL_SIZE_Y2_ALIGNED >= 2 * KERNEL_SIZE_Y — confirm on the host side.
__kernel
__attribute__((reqd_work_group_size(LOCAL_SIZE, 1, 1)))
void filter2D(__global TYPE *src, const unsigned int srcStepBytes, const int4 srcRC,
              __global TYPE *dst, const unsigned int dstStepBytes, const int4 dstRC,
#ifdef BORDER_CONSTANT
              SCALAR_TYPE borderValue,
#endif
              __constant FPTYPE* kernelData // transposed: [KERNEL_SIZE_X][KERNEL_SIZE_Y2_ALIGNED]
             )
{
    const struct RectCoords srcCoords = {srcRC.s0, srcRC.s1, srcRC.s2, srcRC.s3}; // for non-isolated border: offsetX, offsetY, wholeX, wholeY
    struct RectCoords dstCoords = {dstRC.s0, dstRC.s1, dstRC.s2, dstRC.s3};

    const int local_id = get_local_id(0);
    // x: ROI-relative column handled by this thread (shifted left by ANCHOR_X
    // so the group's apron threads cover the kernel overhang);
    // y: first ROI-relative output row of this thread's row block.
    const int x = local_id + (LOCAL_SIZE - (KERNEL_SIZE_X - 1)) * get_group_id(0) - ANCHOR_X;
    const int y = get_global_id(1) * BLOCK_SIZE_Y;

    INTERMEDIATE_TYPE data[KERNEL_SIZE_Y];           // this thread's column of source pixels
    __local INTERMEDIATE_TYPE sumOfCols[LOCAL_SIZE]; // per-thread vertical sums, shared in the group

    int2 srcPos = (int2)(srcCoords.x1 + x, srcCoords.y1 + y - ANCHOR_Y);

    int2 pos = (int2)(dstCoords.x1 + x, dstCoords.y1 + y);
    __global TYPE* dstPtr = (__global TYPE*)((__global char*)dst + pos.x * sizeof(TYPE) + pos.y * dstStepBytes); // Pointer can be out of bounds!
    // Apron threads (the first ANCHOR_X and last KERNEL_SIZE_X-1-ANCHOR_X)
    // and out-of-ROI columns never write a result.
    bool writeResult = (local_id >= ANCHOR_X && local_id < LOCAL_SIZE - (KERNEL_SIZE_X - 1 - ANCHOR_X) &&
                        pos.x >= dstCoords.x1 && pos.x < dstCoords.x2);

#if BLOCK_SIZE_Y > 1
    bool readAllpixels = true; // first pass loads the whole column, later passes one new row
    int sy_index = 0; // current index in data[] array

    dstCoords.y2 = min(dstCoords.y2, pos.y + BLOCK_SIZE_Y);
    for (;
         pos.y < dstCoords.y2;
         pos.y++,
         dstPtr = (__global TYPE*)((__global char*)dstPtr + dstStepBytes))
#endif
    {
        ASSERT(pos.y < dstCoords.y2);

        // Load source pixels for this row. First pass: all KERNEL_SIZE_Y
        // rows into data[sy_index..]. Later passes: sy == -1 only, i.e. the
        // single newly exposed bottom row goes to data[sy_index - 1].
        for (
#if BLOCK_SIZE_Y > 1
             int sy = readAllpixels ? 0 : -1; sy < (readAllpixels ? KERNEL_SIZE_Y : 0);
#else
             int sy = 0, sy_index = 0; sy < KERNEL_SIZE_Y;
#endif
             sy++, srcPos.y++)
        {
            data[sy + sy_index] = readSrcPixel(srcPos, src, srcStepBytes, srcCoords
#ifdef BORDER_CONSTANT
                                               , borderValue
#endif
                                              );
        }

        INTERMEDIATE_TYPE total_sum = 0;
        for (int sx = 0; sx < KERNEL_SIZE_X; sx++)
        {
            {
                // Kernel column sx; with BLOCK_SIZE_Y > 1 the row origin is
                // shifted by KERNEL_SIZE_Y - sy_index so k[sy] lines up with
                // the rotated contents of data[].
                __constant FPTYPE* k = &kernelData[KERNEL_SIZE_Y2_ALIGNED * sx
#if BLOCK_SIZE_Y > 1
                                                   + KERNEL_SIZE_Y - sy_index
#endif
                                                  ];
                INTERMEDIATE_TYPE tmp_sum = 0;
                for (int sy = 0; sy < KERNEL_SIZE_Y; sy++)
                {
                    tmp_sum += data[sy] * k[sy];
                }

                sumOfCols[local_id] = tmp_sum;
                barrier(CLK_LOCAL_MEM_FENCE);
            }

            // Pull the column sum of the neighbour holding kernel column sx
            // for this output pixel.
            int id = local_id + sx - ANCHOR_X;
            if (id >= 0 && id < LOCAL_SIZE)
                total_sum += sumOfCols[id];

            // sumOfCols is overwritten on the next sx iteration.
            barrier(CLK_LOCAL_MEM_FENCE);
        }

        if (writeResult)
        {
            ASSERT(pos.y >= dstCoords.y1 && pos.y < dstCoords.y2);
            *dstPtr = CONVERT_TO_TYPE(total_sum);
        }

#if BLOCK_SIZE_Y > 1
        readAllpixels = false;
#if BLOCK_SIZE_Y > KERNEL_SIZE_Y
        // Rotate within [1, KERNEL_SIZE_Y] once warm.
        sy_index = (sy_index + 1 <= KERNEL_SIZE_Y) ? sy_index + 1 : 1;
#else
        sy_index++;
#endif
#endif // BLOCK_SIZE_Y == 1
    }
}
@ -1,381 +0,0 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Pang Erping, erping@multicorewareinc.com |
||||
// Jia Haipeng, jiahaipeng95@gmail.com |
||||
// Peng Xiao, pengxiao@outlook.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
/////////////////////////////////Macro for border type//////////////////////////////////////////// |
||||
///////////////////////////////////////////////////////////////////////////////////////////////// |
||||
// Border-address helpers (see the kernels below): ADDR_L/ADDR_R remap column
// indices past the left/right edge, ADDR_H/ADDR_B remap row indices past the
// top/bottom edge. Unlike the boxFilter variant, the left/top reflections
// here take a non-zero edge into account (((l_edge)<<1) terms).
#ifdef BORDER_REPLICATE

//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge)  ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge)  ((i) < (t_edge) ? (t_edge) : (i))
#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif

#ifdef BORDER_REFLECT
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb (edge pixel duplicated)
#define ADDR_L(i, l_edge, r_edge)  ((i) < (l_edge) ? ((l_edge)<<1)-(i)-1 : (i))
#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge)  ((i) < (t_edge) ? ((t_edge)<<1)-(i)-1 : (i))
#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
#endif

#ifdef BORDER_REFLECT_101
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba (edge pixel not duplicated)
#define ADDR_L(i, l_edge, r_edge)  ((i) < (l_edge) ? ((l_edge)<<1)-(i) : (i))
#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge)  ((i) < (t_edge) ? ((t_edge)<<1)-(i) : (i))
#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
#endif

// Image type selection, driven by a host-side IMG_C_<channels>_<depth>
// define (depth 0 = uchar, depth 5 = float):
//   T_IMG    — pixel type, T_IMGx4 — four pixels, T_IMG_C1 — one channel.
#ifdef IMG_C_1_0
#define T_IMG   uchar
#define T_IMGx4 uchar4
#define T_IMG_C1 uchar
#define CONVERT_TYPE   convert_uchar_sat
#define CONVERT_TYPEx4 convert_uchar4_sat
#endif
#ifdef IMG_C_4_0
#define T_IMG   uchar4
#define T_IMGx4 uchar16
#define T_IMG_C1 uchar
#define CONVERT_TYPE   convert_uchar4_sat
#define CONVERT_TYPEx4 convert_uchar16_sat
#endif
#ifdef IMG_C_1_5
#define T_IMG   float
#define T_IMGx4 float4
#define T_IMG_C1 float
#define CONVERT_TYPE   convert_float
#define CONVERT_TYPEx4 convert_float4
#endif
#ifdef IMG_C_4_5
#define T_IMG   float4
#define T_IMGx4 float16
#define T_IMG_C1 float
#define CONVERT_TYPE   convert_float4
#define CONVERT_TYPEx4 convert_float16
#endif

#ifndef CN
#define CN 1
#endif

// Accumulator types matching the channel count; sums are always float-based.
#if CN == 1
#define T_SUM float
#define T_SUMx4 float4
#define CONVERT_TYPE_SUM convert_float
#define CONVERT_TYPE_SUMx4 convert_float4
#define SUM_ZERO (0.0f)
#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f)
#define VLOAD4 vload4
// Swizzles naming the four processed pixels within a T_SUMx4/T_IMGx4.
#define SX x
#define SY y
#define SZ z
#define SW w
#elif CN == 4
#define T_SUM float4
#define T_SUMx4 float16
#define CONVERT_TYPE_SUM convert_float4
#define CONVERT_TYPE_SUMx4 convert_float16
#define SUM_ZERO (0.0f, 0.0f, 0.0f, 0.0f)
#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f)
#define VLOAD4 vload16
#define SX s0123
#define SY s4567
#define SZ s89ab
#define SW scdef
#endif

#ifndef FILTER_SIZE
#define FILTER_SIZE 3
#endif

// Work-group tile is LOCAL_GROUP_SIZE square; the staged local-memory block
// additionally holds a FILTER_RADIUS-wide apron on every side.
#define LOCAL_GROUP_SIZE 16

#define LOCAL_WIDTH  ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE)
#define LOCAL_HEIGHT ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE)

#define FILTER_RADIUS (FILTER_SIZE >> 1)
||||
|
||||
__kernel void filter2D( |
||||
__global T_IMG *src, |
||||
__global T_IMG *dst, |
||||
int src_step, |
||||
int dst_step, |
||||
__constant float *mat_kernel, |
||||
__local T_IMG *local_data, |
||||
int wholerows, |
||||
int wholecols, |
||||
int src_offset_x, |
||||
int src_offset_y, |
||||
int dst_offset_x, |
||||
int dst_offset_y, |
||||
int cols, |
||||
int rows, |
||||
int operate_cols |
||||
) |
||||
{ |
||||
int groupStartCol = get_group_id(0) * get_local_size(0); |
||||
int groupStartRow = get_group_id(1) * get_local_size(1); |
||||
|
||||
int localCol = get_local_id(0); |
||||
int localRow = get_local_id(1); |
||||
int globalCol = groupStartCol + localCol; |
||||
int globalRow = groupStartRow + localRow; |
||||
const int src_offset = mad24(src_offset_y, src_step, src_offset_x); |
||||
const int dst_offset = mad24(dst_offset_y, dst_step, dst_offset_x); |
||||
|
||||
#ifdef BORDER_CONSTANT |
||||
for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1)) |
||||
{ |
||||
int curRow = groupStartRow + i; |
||||
for(int j = localCol; j < LOCAL_WIDTH; j += get_local_size(0)) |
||||
{ |
||||
int curCol = groupStartCol + j; |
||||
if(curRow < FILTER_RADIUS - src_offset_y || (curRow - FILTER_RADIUS) >= wholerows - src_offset_y|| |
||||
curCol < FILTER_RADIUS - src_offset_x || (curCol - FILTER_RADIUS) >= wholecols - src_offset_x) |
||||
{ |
||||
local_data[(i) * LOCAL_WIDTH + j] = 0; |
||||
} |
||||
else |
||||
{ |
||||
local_data[(i) * LOCAL_WIDTH + j] = src[(curRow - FILTER_RADIUS) * src_step + curCol - FILTER_RADIUS + src_offset]; |
||||
} |
||||
} |
||||
} |
||||
#else |
||||
for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1)) |
||||
{ |
||||
int curRow = groupStartRow + i; |
||||
|
||||
curRow = ADDR_H(curRow, FILTER_RADIUS - src_offset_y, wholerows - src_offset_y); |
||||
|
||||
curRow = ADDR_B(curRow - FILTER_RADIUS, wholerows - src_offset_y, curRow - FILTER_RADIUS); |
||||
|
||||
for(int j = localCol; j < LOCAL_WIDTH; j += get_local_size(0)) |
||||
{ |
||||
int curCol = groupStartCol + j; |
||||
curCol = ADDR_L(curCol, FILTER_RADIUS - src_offset_x, wholecols - src_offset_x); |
||||
curCol = ADDR_R(curCol - FILTER_RADIUS, wholecols - src_offset_x, curCol - FILTER_RADIUS); |
||||
if(curRow < wholerows && curCol < wholecols) |
||||
{ |
||||
local_data[(i) * LOCAL_WIDTH + j] = src[(curRow) * src_step + curCol + src_offset]; |
||||
} |
||||
} |
||||
} |
||||
#endif |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(globalRow < rows && globalCol < cols) |
||||
{ |
||||
T_SUM sum = (T_SUM)(SUM_ZERO); |
||||
int filterIdx = 0; |
||||
for(int i = 0; i < FILTER_SIZE; i++) |
||||
{ |
||||
int offset = (i + localRow) * LOCAL_WIDTH; |
||||
|
||||
for(int j = 0; j < FILTER_SIZE; j++) |
||||
{ |
||||
sum += CONVERT_TYPE_SUM(local_data[offset + j + localCol]) * mat_kernel[filterIdx++]; |
||||
} |
||||
} |
||||
dst[(globalRow)*dst_step + (globalCol) + dst_offset] = CONVERT_TYPE(sum); |
||||
} |
||||
} |
||||
|
||||
/// following is specific for 3x3 kernels |
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
/////////////////////////////Macro for define elements number per thread///////////////////////////// |
||||
//////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
|
||||
// Anchor (apron half-width) in x and y — fixed at 1 for the 3x3 fast path.
#define ANX 1
#define ANY 1

// Each work-group item row covers ROWS_PER_GROUP output rows; ROWS_FETCH adds
// the one-row apron above and below.
#define ROWS_PER_GROUP 4
#define ROWS_PER_GROUP_BITS 2
#define ROWS_FETCH (ROWS_PER_GROUP + ANY + ANY)   //(ROWS_PER_GROUP + anY * 2)

// THREADS_PER_ROW threads cooperate on one output row, each producing
// ELEMENTS_PER_THREAD (4) horizontally adjacent pixels.
#define THREADS_PER_ROW 64
#define THREADS_PER_ROW_BIT 6

#define ELEMENTS_PER_THREAD 4
#define ELEMENTS_PER_THREAD_BIT 2

// Row pitch of the staged local-memory block, padded to a multiple of 4.
#define LOCAL_MEM_STEP 260 //divup((get_local_size(0) + anX * 2), 4) * 4
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
/////////////////////////////////////////8uC1//////////////////////////////////////////////////////// |
||||
//////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
|
||||
// Fast path for 3x3 kernels: each thread produces 4 horizontally adjacent
// output pixels via vector loads/stores. Output stores are aligned down to a
// 4-pixel boundary (dst_align / & 0xfffffffc below); lanes that would fall
// outside [dst_offset_x, dst_offset_x + cols) keep the original destination
// values, which makes the unaligned edges safe.
// NOTE(review): this reads and rewrites destination pixels outside the ROI at
// the row edges — relies on dst being valid there; confirm host-side layout.
__kernel void filter2D_3x3(
    __global T_IMG *src,
    __global T_IMG *dst,
    int src_step,
    int dst_step,
    __constant float *mat_kernel,
    __local T_IMG *local_data,
    int wholerows,
    int wholecols,
    int src_offset_x,
    int src_offset_y,
    int dst_offset_x,
    int dst_offset_y,
    int cols,
    int rows,
    int operate_cols
)
{
    int gX = get_global_id(0);
    int gY = get_global_id(1);

    int lX = get_local_id(0);

    int groupX_size = get_local_size(0);
    int groupX_id   = get_group_id(0);

    // Misalignment of the destination ROI start within a 4-pixel unit.
#define dst_align (dst_offset_x & 3)
    // First source column staged by this group (shifted left by the anchor
    // and the alignment correction); first source row likewise.
    int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX;
    int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY;

    if ((gY << 2) < rows)
    {
        // Cooperative load of ROWS_FETCH staged rows into local memory;
        // each thread loads one column, the first 2*ANX threads also load
        // the right-hand overhang.
        for (int i = 0; i < ROWS_FETCH; ++i)
        {
            if ((rows_start_index - src_offset_y) + i < rows + ANY)
            {
#ifdef BORDER_CONSTANT
                // Out-of-image pixels read as zero.
                int selected_row  = rows_start_index + i;
                int selected_cols = cols_start_index_group + lX;

                T_IMG data = src[mad24(selected_row, src_step, selected_cols)];
                int con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
                data = con ? data : (T_IMG)(0);
                local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data;

                if (lX < (ANX << 1))
                {
                    selected_cols = cols_start_index_group + lX + groupX_size;

                    data = src[mad24(selected_row, src_step, selected_cols)];
                    con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols;
                    data = con ? data : (T_IMG)(0);
                    local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data;
                }
#else
                // Non-constant border: fold the coordinates back inside the
                // whole image with the ADDR_* helpers.
                int selected_row = ADDR_H(rows_start_index + i, 0, wholerows);
                selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row);

                int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols);
                selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols);

                T_IMG data = src[mad24(selected_row, src_step, selected_cols)];

                local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data;

                if (lX < (ANX << 1))
                {
                    selected_cols = cols_start_index_group + lX + groupX_size;
                    selected_cols = ADDR_R(selected_cols, wholecols, selected_cols);

                    data = src[mad24(selected_row, src_step, selected_cols)];
                    local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data;
                }
#endif
            }
        }
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    // First (4-aligned) output column handled by this thread.
    int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2);
    if (((gY << 2) < rows) && (process_col < operate_cols))
    {
        int dst_cols_start = dst_offset_x;
        int dst_cols_end   = dst_offset_x + cols;
        int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc; // align down to 4

        int dst_rows_end   = dst_offset_y + rows;
        int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT);
        dst = dst + mad24(dst_rows_index, dst_step, dst_cols_index);

        // Existing destination pixels, kept for the out-of-ROI lanes below.
        T_IMGx4 dst_data = *(__global T_IMGx4 *)dst;

        T_SUMx4 sum = (T_SUMx4)SUM_ZEROx4;
        T_IMGx4 data;

        for (int i = 0; i < FILTER_SIZE; i++)
        {
#pragma unroll
            for (int j = 0; j < FILTER_SIZE; j++)
            {
                if (dst_rows_index < dst_rows_end)
                {
                    int local_row  = (lX >> THREADS_PER_ROW_BIT) + i;
                    int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j;

                    // Unaligned 4-pixel window starting at kernel tap (i, j).
                    data = VLOAD4(0, (__local T_IMG_C1 *)(local_data + local_row * LOCAL_MEM_STEP + local_cols));
                    sum = sum + (mat_kernel[i * FILTER_SIZE + j] * CONVERT_TYPE_SUMx4(data));
                }
            }
        }

        if (dst_rows_index < dst_rows_end)
        {
            // Per-lane ROI masking: lanes outside [dst_cols_start, dst_cols_end)
            // are replaced by the previously read destination values.
            T_IMGx4 tmp_dst = CONVERT_TYPEx4(sum);
            tmp_dst.SX = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ?
                         tmp_dst.SX : dst_data.SX;
            tmp_dst.SY = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ?
                         tmp_dst.SY : dst_data.SY;
            tmp_dst.SZ = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ?
                         tmp_dst.SZ : dst_data.SZ;
            tmp_dst.SW = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ?
                         tmp_dst.SW : dst_data.SW;
            *(__global T_IMGx4 *)dst = tmp_dst;
        }
    }
}
@ -0,0 +1,108 @@ |
||||
/////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
/////////////////////////////////Macro for border type//////////////////////////////////////////// |
||||
///////////////////////////////////////////////////////////////////////////////////////////////// |
||||
// Border-address helpers for the sobel3 kernel; same contract as the other
// filter kernels (ADDR_L/ADDR_R for columns, ADDR_H/ADDR_B for rows).
// NOTE(review): this file spells the reflect-101 guard as BORDER_REFLECT101
// (no underscore), unlike BORDER_REFLECT_101 in the sibling kernels — verify
// the host passes the matching define.
#ifdef BORDER_REPLICATE
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge)  ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge)  ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#endif

#ifdef BORDER_REFLECT
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
// Left/top forms assume a zero edge (plain -(i)-1 reflection).
#define ADDR_L(i, l_edge, r_edge)  ((i) < (l_edge) ? -(i)-1 : (i))
#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge)  ((i) < (t_edge) ? -(i)-1 : (i))
#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
#endif

#ifdef BORDER_REFLECT101
//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba (edge pixel not duplicated)
#define ADDR_L(i, l_edge, r_edge)  ((i) < (l_edge) ? -(i) : (i))
#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge)  ((i) < (t_edge) ? -(i) : (i))
#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
#endif

#ifdef BORDER_WRAP
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
#define ADDR_L(i, l_edge, r_edge)  ((i) < (l_edge) ? (i)+(r_edge) : (i))
#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
#define ADDR_H(i, t_edge, b_edge)  ((i) < (t_edge) ? (i)+(b_edge) : (i))
#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
#endif
||||
|
||||
__kernel void sobel3( |
||||
__global uchar* Src, |
||||
__global float* DstX, |
||||
__global float* DstY, |
||||
int width, int height, |
||||
uint srcStride, uint dstStride, |
||||
float scale |
||||
) |
||||
{ |
||||
__local float lsmem[BLK_Y+2][BLK_X+2]; |
||||
|
||||
int lix = get_local_id(0); |
||||
int liy = get_local_id(1); |
||||
|
||||
int gix = get_group_id(0); |
||||
int giy = get_group_id(1); |
||||
|
||||
int id_x = get_global_id(0); |
||||
int id_y = get_global_id(1); |
||||
|
||||
lsmem[liy+1][lix+1] = convert_float(Src[ id_y * srcStride + id_x ]); |
||||
|
||||
int id_y_h = ADDR_H(id_y-1, 0,height); |
||||
int id_y_b = ADDR_B(id_y+1, height,id_y+1); |
||||
|
||||
int id_x_l = ADDR_L(id_x-1, 0,width); |
||||
int id_x_r = ADDR_R(id_x+1, width,id_x+1); |
||||
|
||||
if(liy==0) |
||||
{ |
||||
lsmem[0][lix+1]=convert_float(Src[ id_y_h * srcStride + id_x ]); |
||||
|
||||
if(lix==0) |
||||
lsmem[0][0]=convert_float(Src[ id_y_h * srcStride + id_x_l ]); |
||||
else if(lix==BLK_X-1) |
||||
lsmem[0][BLK_X+1]=convert_float(Src[ id_y_h * srcStride + id_x_r ]); |
||||
} |
||||
else if(liy==BLK_Y-1) |
||||
{ |
||||
lsmem[BLK_Y+1][lix+1]=convert_float(Src[ id_y_b * srcStride + id_x ]); |
||||
|
||||
if(lix==0) |
||||
lsmem[BLK_Y+1][0]=convert_float(Src[ id_y_b * srcStride + id_x_l ]); |
||||
else if(lix==BLK_X-1) |
||||
lsmem[BLK_Y+1][BLK_X+1]=convert_float(Src[ id_y_b * srcStride + id_x_r ]); |
||||
} |
||||
|
||||
if(lix==0) |
||||
lsmem[liy+1][0] = convert_float(Src[ id_y * srcStride + id_x_l ]); |
||||
else if(lix==BLK_X-1) |
||||
lsmem[liy+1][BLK_X+1] = convert_float(Src[ id_y * srcStride + id_x_r ]); |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
float u1 = lsmem[liy][lix]; |
||||
float u2 = lsmem[liy][lix+1]; |
||||
float u3 = lsmem[liy][lix+2]; |
||||
|
||||
float m1 = lsmem[liy+1][lix]; |
||||
float m2 = lsmem[liy+1][lix+1]; |
||||
float m3 = lsmem[liy+1][lix+2]; |
||||
|
||||
float b1 = lsmem[liy+2][lix]; |
||||
float b2 = lsmem[liy+2][lix+1]; |
||||
float b3 = lsmem[liy+2][lix+2]; |
||||
|
||||
//m2 * scale;// |
||||
float dx = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1 ); |
||||
DstX[ id_y * dstStride + id_x ] = dx * scale; |
||||
|
||||
float dy = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3); |
||||
DstY[ id_y * dstStride + id_x ] = dy * scale; |
||||
} |
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue