mirror of https://github.com/opencv/opencv.git
Merge pull request #907 from SpecLad:master
commit
b5c013682b
117 changed files with 6042 additions and 11399 deletions
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,5 +1,8 @@ |
||||
<?xml version="1.0" encoding="UTF-8"?> |
||||
<lint> |
||||
<issue id="InlinedApi"> |
||||
<ignore path="src\org\opencv\android\JavaCameraView.java" /> |
||||
</issue> |
||||
<issue id="NewApi"> |
||||
<ignore path="src\org\opencv\android\JavaCameraView.java" /> |
||||
</issue> |
||||
|
@ -1,966 +0,0 @@ |
||||
//////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Jiang Liyuan, jlyuan001.good@163.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other oclMaterials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
// |
||||
#if defined (DOUBLE_SUPPORT) |
||||
#ifdef cl_khr_fp64 |
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable |
||||
#elif defined (cl_amd_fp64) |
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable |
||||
#endif |
||||
#endif |
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
////////////////////////////////////////////BITWISE_AND//////////////////////////////////////////////////// |
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
/**************************************and with scalar without mask**************************************/ |
||||
__kernel void arithm_s_bitwise_and_C1_D0 ( |
||||
__global uchar *src1, int src1_step, int src1_offset, |
||||
__global uchar *dst, int dst_step, int dst_offset, |
||||
uchar4 src2, int rows, int cols, int dst_step1) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < rows) |
||||
{ |
||||
x = x << 2; |
||||
|
||||
#ifdef dst_align |
||||
#undef dst_align |
||||
#endif |
||||
#define dst_align (dst_offset & 3) |
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); |
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset); |
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); |
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); |
||||
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index); |
||||
uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x); |
||||
|
||||
uchar4 data = *((__global uchar4 *)(dst + dst_index)); |
||||
uchar4 tmp_data = src1_data & src2_data; |
||||
|
||||
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; |
||||
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; |
||||
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; |
||||
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; |
||||
|
||||
*((__global uchar4 *)(dst + dst_index)) = data; |
||||
} |
||||
} |
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_and_C1_D1 ( |
||||
__global char *src1, int src1_step, int src1_offset, |
||||
__global char *dst, int dst_step, int dst_offset, |
||||
char4 src2, int rows, int cols, int dst_step1) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < rows) |
||||
{ |
||||
x = x << 2; |
||||
|
||||
#ifdef dst_align |
||||
#undef dst_align |
||||
#endif |
||||
#define dst_align (dst_offset & 3) |
||||
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); |
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset); |
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); |
||||
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); |
||||
|
||||
char4 src1_data = vload4(0, src1 + src1_index); |
||||
char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x); |
||||
|
||||
char4 data = *((__global char4 *)(dst + dst_index)); |
||||
char4 tmp_data = src1_data & src2_data; |
||||
|
||||
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; |
||||
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; |
||||
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; |
||||
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; |
||||
|
||||
*((__global char4 *)(dst + dst_index)) = data; |
||||
} |
||||
} |
||||
|
||||
__kernel void arithm_s_bitwise_and_C1_D2 ( |
||||
__global ushort *src1, int src1_step, int src1_offset, |
||||
__global ushort *dst, int dst_step, int dst_offset, |
||||
ushort4 src2, int rows, int cols, int dst_step1) |
||||
{ |
||||
|
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < rows) |
||||
{ |
||||
x = x << 1; |
||||
|
||||
#ifdef dst_align |
||||
#undef dst_align |
||||
#endif |
||||
#define dst_align ((dst_offset >> 1) & 1) |
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); |
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset); |
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); |
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); |
||||
|
||||
ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index)); |
||||
ushort2 src2_data = (ushort2)(src2.x, src2.x); |
||||
|
||||
ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index)); |
||||
ushort2 tmp_data = src1_data & src2_data; |
||||
|
||||
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x; |
||||
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y; |
||||
|
||||
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; |
||||
} |
||||
} |
||||
__kernel void arithm_s_bitwise_and_C1_D3 ( |
||||
__global short *src1, int src1_step, int src1_offset, |
||||
__global short *dst, int dst_step, int dst_offset, |
||||
short4 src2, int rows, int cols, int dst_step1) |
||||
{ |
||||
|
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < rows) |
||||
{ |
||||
x = x << 1; |
||||
|
||||
#ifdef dst_align |
||||
#undef dst_align |
||||
#endif |
||||
#define dst_align ((dst_offset >> 1) & 1) |
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); |
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset); |
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); |
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); |
||||
|
||||
short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index)); |
||||
short2 src2_data = (short2)(src2.x, src2.x); |
||||
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); |
||||
|
||||
short2 tmp_data = src1_data & src2_data; |
||||
|
||||
data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x; |
||||
data.y = (dst_index + 2 < dst_end ) ? tmp_data.y : data.y; |
||||
|
||||
*((__global short2 *)((__global uchar *)dst + dst_index)) = data; |
||||
} |
||||
} |
||||
__kernel void arithm_s_bitwise_and_C1_D4 ( |
||||
__global int *src1, int src1_step, int src1_offset, |
||||
__global int *dst, int dst_step, int dst_offset, |
||||
int4 src2, int rows, int cols, int dst_step1) |
||||
{ |
||||
|
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < rows) |
||||
{ |
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); |
||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); |
||||
|
||||
int src_data1 = *((__global int *)((__global char *)src1 + src1_index)); |
||||
int src_data2 = src2.x; |
||||
|
||||
int data = src_data1 & src_data2; |
||||
|
||||
*((__global int *)((__global char *)dst + dst_index)) = data; |
||||
} |
||||
} |
||||
__kernel void arithm_s_bitwise_and_C1_D5 ( |
||||
__global char *src1, int src1_step, int src1_offset, |
||||
__global char *dst, int dst_step, int dst_offset, |
||||
char16 src2, int rows, int cols, int dst_step1) |
||||
{ |
||||
|
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < rows) |
||||
{ |
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); |
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset); |
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); |
||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); |
||||
|
||||
char4 src1_data = *((__global char4 *)((__global char *)src1 + src1_index)); |
||||
char4 src2_data = (char4)(src2.s0, src2.s1, src2.s2, src2.s3); |
||||
|
||||
char4 data = *((__global char4 *)((__global char *)dst + dst_index)); |
||||
char4 tmp_data = src1_data & src2_data; |
||||
|
||||
data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; |
||||
data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y; |
||||
data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z; |
||||
data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w; |
||||
|
||||
*((__global char4 *)((__global char *)dst + dst_index)) = data; |
||||
} |
||||
} |
||||
#if defined (DOUBLE_SUPPORT) |
||||
__kernel void arithm_s_bitwise_and_C1_D6 ( |
||||
__global short *src1, int src1_step, int src1_offset, |
||||
__global short *dst, int dst_step, int dst_offset, |
||||
short16 src2, int rows, int cols, int dst_step1) |
||||
{ |
||||
|
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < rows) |
||||
{ |
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); |
||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); |
||||
|
||||
short4 src1_data = *((__global short4 *)((__global char *)src1 + src1_index)); |
||||
short4 src2_data = (short4)(src2.s0, src2.s1, src2.s2, src2.s3); |
||||
|
||||
short4 tmp_data = src1_data & src2_data; |
||||
|
||||
*((__global short4 *)((__global char *)dst + dst_index)) = tmp_data; |
||||
} |
||||
} |
||||
#endif |
||||
__kernel void arithm_s_bitwise_and_C2_D0 ( |
||||
__global uchar *src1, int src1_step, int src1_offset, |
||||
__global uchar *dst, int dst_step, int dst_offset, |
||||
uchar4 src2, int rows, int cols, int dst_step1) |
||||
{ |
||||
|
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < rows) |
||||
{ |
||||
x = x << 1; |
||||
|
||||
#ifdef dst_align |
||||
#undef dst_align |
||||
#endif |
||||
#define dst_align ((dst_offset >> 1) & 1) |
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); |
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset); |
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); |
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); |
||||
|
||||
uchar4 src1_data = vload4(0, src1 + src1_index); |
||||
uchar4 src2_data = (uchar4)(src2.x, src2.y, src2.x, src2.y); |
||||
|
||||
uchar4 data = *((__global uchar4 *)(dst + dst_index)); |
||||
uchar4 tmp_data = src1_data & src2_data; |
||||
|
||||
|
||||
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy; |
||||
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw; |
||||
|
||||
*((__global uchar4 *)(dst + dst_index)) = data; |
||||
} |
||||
} |
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_and_C2_D1 ( |
||||
__global char *src1, int src1_step, int src1_offset, |
||||
__global char *dst, int dst_step, int dst_offset, |
||||
char4 src2, int rows, int cols, int dst_step1) |
||||
{ |
||||
|
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < rows) |
||||
{ |
||||
x = x << 1; |
||||
|
||||
#ifdef dst_align |
||||
#undef dst_align |
||||
#endif |
||||
#define dst_align ((dst_offset >> 1) & 1) |
||||
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); |
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset); |
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); |
||||
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc); |
||||
|
||||
char4 src1_data = vload4(0, src1 + src1_index); |
||||
char4 src2_data = (char4)(src2.x, src2.y, src2.x, src2.y); |
||||
|
||||
char4 data = *((__global char4 *)(dst + dst_index)); |
||||
char4 tmp_data = src1_data & src2_data; |
||||
|
||||
data.xy = (dst_index + 0 >= dst_start) ? tmp_data.xy : data.xy; |
||||
data.zw = (dst_index + 2 < dst_end ) ? tmp_data.zw : data.zw; |
||||
|
||||
*((__global char4 *)(dst + dst_index)) = data; |
||||
} |
||||
} |
||||
|
||||
__kernel void arithm_s_bitwise_and_C2_D2 ( |
||||
__global ushort *src1, int src1_step, int src1_offset, |
||||
__global ushort *dst, int dst_step, int dst_offset, |
||||
ushort4 src2, int rows, int cols, int dst_step1) |
||||
{ |
||||
|
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < rows) |
||||
{ |
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); |
||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); |
||||
|
||||
ushort2 src_data1 = *((__global ushort2 *)((__global char *)src1 + src1_index)); |
||||
ushort2 src_data2 = (ushort2)(src2.x, src2.y); |
||||
|
||||
ushort2 data = src_data1 & src_data2; |
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index)) = data; |
||||
} |
||||
} |
||||
__kernel void arithm_s_bitwise_and_C2_D3 ( |
||||
__global short *src1, int src1_step, int src1_offset, |
||||
__global short *dst, int dst_step, int dst_offset, |
||||
short4 src2, int rows, int cols, int dst_step1) |
||||
{ |
||||
|
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < rows) |
||||
{ |
||||
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset); |
||||
int dst_index = mad24(y, dst_step, (x << 2) + dst_offset); |
||||
|
||||
short2 src_data1 = *((__global short2 *)((__global char *)src1 + src1_index)); |
||||
short2 src_data2 = (short2)(src2.x, src2.y); |
||||
|
||||
short2 data = src_data1 & src_data2; |
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index)) = data; |
||||
} |
||||
} |
||||
__kernel void arithm_s_bitwise_and_C2_D4 ( |
||||
__global int *src1, int src1_step, int src1_offset, |
||||
__global int *dst, int dst_step, int dst_offset, |
||||
int4 src2, int rows, int cols, int dst_step1) |
||||
{ |
||||
|
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < rows) |
||||
{ |
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); |
||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); |
||||
|
||||
int2 src_data1 = *((__global int2 *)((__global char *)src1 + src1_index)); |
||||
int2 src_data2 = (int2)(src2.x, src2.y); |
||||
|
||||
int2 data = src_data1 & src_data2; |
||||
*((__global int2 *)((__global char *)dst + dst_index)) = data; |
||||
} |
||||
} |
||||
__kernel void arithm_s_bitwise_and_C2_D5 ( |
||||
__global char *src1, int src1_step, int src1_offset, |
||||
__global char *dst, int dst_step, int dst_offset, |
||||
char16 src2, int rows, int cols, int dst_step1) |
||||
{ |
||||
|
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < rows) |
||||
{ |
||||
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset); |
||||
int dst_index = mad24(y, dst_step, (x << 3) + dst_offset); |
||||
|
||||
char8 src1_data = *((__global char8 *)((__global char *)src1 + src1_index)); |
||||
char8 src2_data = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7); |
||||
|
||||
char8 tmp_data = src1_data & src2_data; |
||||
|
||||
*((__global char8 *)((__global char *)dst + dst_index)) = tmp_data; |
||||
} |
||||
} |
||||
#if defined (DOUBLE_SUPPORT) |
||||
__kernel void arithm_s_bitwise_and_C2_D6 ( |
||||
__global short *src1, int src1_step, int src1_offset, |
||||
__global short *dst, int dst_step, int dst_offset, |
||||
short16 src2, int rows, int cols, int dst_step1) |
||||
{ |
||||
|
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < rows) |
||||
{ |
||||
int src1_index = mad24(y, src1_step, (x << 4) + src1_offset); |
||||
int dst_index = mad24(y, dst_step, (x << 4) + dst_offset); |
||||
|
||||
short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index)); |
||||
short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7); |
||||
|
||||
short8 tmp_data = src1_data & src2_data; |
||||
|
||||
*((__global short8 *)((__global char *)dst + dst_index)) = tmp_data; |
||||
} |
||||
} |
||||
#endif |
||||
__kernel void arithm_s_bitwise_and_C3_D0 ( |
||||
__global uchar *src1, int src1_step, int src1_offset, |
||||
__global uchar *dst, int dst_step, int dst_offset, |
||||
uchar4 src2, int rows, int cols, int dst_step1) |
||||
{ |
||||
|
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < rows) |
||||
{ |
||||
x = x << 2; |
||||
|
||||
#ifdef dst_align |
||||
#undef dst_align |
||||
#endif |
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3) |
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); |
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset); |
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); |
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); |
||||
|
||||
uchar4 src1_data_0 = vload4(0, src1 + src1_index + 0); |
||||
uchar4 src1_data_1 = vload4(0, src1 + src1_index + 4); |
||||
uchar4 src1_data_2 = vload4(0, src1 + src1_index + 8); |
||||
|
||||
uchar4 src2_data_0 = (uchar4)(src2.x, src2.y, src2.z, src2.x); |
||||
uchar4 src2_data_1 = (uchar4)(src2.y, src2.z, src2.x, src2.y); |
||||
uchar4 src2_data_2 = (uchar4)(src2.z, src2.x, src2.y, src2.z); |
||||
|
||||
uchar4 data_0 = *((__global uchar4 *)(dst + dst_index + 0)); |
||||
uchar4 data_1 = *((__global uchar4 *)(dst + dst_index + 4)); |
||||
uchar4 data_2 = *((__global uchar4 *)(dst + dst_index + 8)); |
||||
|
||||
uchar4 tmp_data_0 = src1_data_0 & src2_data_0; |
||||
uchar4 tmp_data_1 = src1_data_1 & src2_data_1; |
||||
uchar4 tmp_data_2 = src1_data_2 & src2_data_2; |
||||
|
||||
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; |
||||
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) |
||||
? tmp_data_0.w : data_0.w; |
||||
|
||||
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) |
||||
? tmp_data_1.xy : data_1.xy; |
||||
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) |
||||
? tmp_data_1.zw : data_1.zw; |
||||
|
||||
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) |
||||
? tmp_data_2.x : data_2.x; |
||||
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) |
||||
? tmp_data_2.yzw : data_2.yzw; |
||||
|
||||
*((__global uchar4 *)(dst + dst_index + 0)) = data_0; |
||||
*((__global uchar4 *)(dst + dst_index + 4)) = data_1; |
||||
*((__global uchar4 *)(dst + dst_index + 8)) = data_2; |
||||
} |
||||
} |
||||
|
||||
|
||||
__kernel void arithm_s_bitwise_and_C3_D1 ( |
||||
__global char *src1, int src1_step, int src1_offset, |
||||
__global char *dst, int dst_step, int dst_offset, |
||||
char4 src2, int rows, int cols, int dst_step1) |
||||
{ |
||||
|
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < rows) |
||||
{ |
||||
x = x << 2; |
||||
|
||||
#ifdef dst_align |
||||
#undef dst_align |
||||
#endif |
||||
#define dst_align (((dst_offset % dst_step) / 3 ) & 3) |
||||
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); |
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset); |
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); |
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 3) - (dst_align * 3)); |
||||
|
||||
char4 src1_data_0 = vload4(0, src1 + src1_index + 0); |
||||
char4 src1_data_1 = vload4(0, src1 + src1_index + 4); |
||||
char4 src1_data_2 = vload4(0, src1 + src1_index + 8); |
||||
|
||||
char4 src2_data_0 = (char4)(src2.x, src2.y, src2.z, src2.x); |
||||
char4 src2_data_1 = (char4)(src2.y, src2.z, src2.x, src2.y); |
||||
char4 src2_data_2 = (char4)(src2.z, src2.x, src2.y, src2.z); |
||||
|
||||
char4 data_0 = *((__global char4 *)(dst + dst_index + 0)); |
||||
char4 data_1 = *((__global char4 *)(dst + dst_index + 4)); |
||||
char4 data_2 = *((__global char4 *)(dst + dst_index + 8)); |
||||
|
||||
char4 tmp_data_0 = convert_char4_sat(convert_uchar4_sat(src1_data_0) & convert_uchar4_sat(src2_data_0)); |
||||
char4 tmp_data_1 = convert_char4_sat(convert_uchar4_sat(src1_data_1) & convert_uchar4_sat(src2_data_1)); |
||||
char4 tmp_data_2 = convert_char4_sat(convert_uchar4_sat(src1_data_2) & convert_uchar4_sat(src2_data_2)); |
||||
|
||||
data_0.xyz = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xyz : data_0.xyz; |
||||
data_0.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) |
||||
? tmp_data_0.w : data_0.w; |
||||
|
||||
data_1.xy = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) |
||||
? tmp_data_1.xy : data_1.xy; |
||||
data_1.zw = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) |
||||
? tmp_data_1.zw : data_1.zw; |
||||
|
||||
data_2.x = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) |
||||
? tmp_data_2.x : data_2.x; |
||||
data_2.yzw = ((dst_index + 9 >= dst_start) && (dst_index + 9 < dst_end)) |
||||
? tmp_data_2.yzw : data_2.yzw; |
||||
|
||||
*((__global char4 *)(dst + dst_index + 0)) = data_0; |
||||
*((__global char4 *)(dst + dst_index + 4)) = data_1; |
||||
*((__global char4 *)(dst + dst_index + 8)) = data_2; |
||||
} |
||||
} |
||||
|
||||
__kernel void arithm_s_bitwise_and_C3_D2 ( |
||||
__global ushort *src1, int src1_step, int src1_offset, |
||||
__global ushort *dst, int dst_step, int dst_offset, |
||||
ushort4 src2, int rows, int cols, int dst_step1) |
||||
{ |
||||
|
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < rows) |
||||
{ |
||||
x = x << 1; |
||||
|
||||
#ifdef dst_align |
||||
#undef dst_align |
||||
#endif |
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1) |
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); |
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset); |
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); |
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); |
||||
|
||||
ushort2 src1_data_0 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 0)); |
||||
ushort2 src1_data_1 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 4)); |
||||
ushort2 src1_data_2 = vload2(0, (__global ushort *)((__global char *)src1 + src1_index + 8)); |
||||
|
||||
ushort2 src2_data_0 = (ushort2)(src2.x, src2.y); |
||||
ushort2 src2_data_1 = (ushort2)(src2.z, src2.x); |
||||
ushort2 src2_data_2 = (ushort2)(src2.y, src2.z); |
||||
|
||||
ushort2 data_0 = *((__global ushort2 *)((__global char *)dst + dst_index + 0)); |
||||
ushort2 data_1 = *((__global ushort2 *)((__global char *)dst + dst_index + 4)); |
||||
ushort2 data_2 = *((__global ushort2 *)((__global char *)dst + dst_index + 8)); |
||||
|
||||
ushort2 tmp_data_0 = src1_data_0 & src2_data_0; |
||||
ushort2 tmp_data_1 = src1_data_1 & src2_data_1; |
||||
ushort2 tmp_data_2 = src1_data_2 & src2_data_2; |
||||
|
||||
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; |
||||
|
||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) |
||||
? tmp_data_1.x : data_1.x; |
||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) |
||||
? tmp_data_1.y : data_1.y; |
||||
|
||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) |
||||
? tmp_data_2.xy : data_2.xy; |
||||
|
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; |
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; |
||||
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; |
||||
} |
||||
} |
||||
__kernel void arithm_s_bitwise_and_C3_D3 ( |
||||
__global short *src1, int src1_step, int src1_offset, |
||||
__global short *dst, int dst_step, int dst_offset, |
||||
short4 src2, int rows, int cols, int dst_step1) |
||||
{ |
||||
|
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < rows) |
||||
{ |
||||
x = x << 1; |
||||
|
||||
#ifdef dst_align |
||||
#undef dst_align |
||||
#endif |
||||
#define dst_align (((dst_offset % dst_step) / 6 ) & 1) |
||||
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); |
||||
|
||||
int dst_start = mad24(y, dst_step, dst_offset); |
||||
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); |
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 6) - (dst_align * 6)); |
||||
|
||||
short2 src1_data_0 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 0)); |
||||
short2 src1_data_1 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 4)); |
||||
short2 src1_data_2 = vload2(0, (__global short *)((__global char *)src1 + src1_index + 8)); |
||||
|
||||
short2 src2_data_0 = (short2)(src2.x, src2.y); |
||||
short2 src2_data_1 = (short2)(src2.z, src2.x); |
||||
short2 src2_data_2 = (short2)(src2.y, src2.z); |
||||
|
||||
short2 data_0 = *((__global short2 *)((__global char *)dst + dst_index + 0)); |
||||
short2 data_1 = *((__global short2 *)((__global char *)dst + dst_index + 4)); |
||||
short2 data_2 = *((__global short2 *)((__global char *)dst + dst_index + 8)); |
||||
|
||||
short2 tmp_data_0 = src1_data_0 & src2_data_0; |
||||
short2 tmp_data_1 = src1_data_1 & src2_data_1; |
||||
short2 tmp_data_2 = src1_data_2 & src2_data_2; |
||||
|
||||
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; |
||||
|
||||
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) |
||||
? tmp_data_1.x : data_1.x; |
||||
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) |
||||
? tmp_data_1.y : data_1.y; |
||||
|
||||
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) |
||||
? tmp_data_2.xy : data_2.xy; |
||||
|
||||
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; |
||||
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; |
||||
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; |
||||
} |
||||
} |
||||
__kernel void arithm_s_bitwise_and_C3_D4 ( |
||||
__global int *src1, int src1_step, int src1_offset, |
||||
__global int *dst, int dst_step, int dst_offset, |
||||
int4 src2, int rows, int cols, int dst_step1) |
||||
{ |
||||
|
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < cols && y < rows) |
||||
{ |
||||
int src1_index = mad24(y, src1_step, (x * 12) + src1_offset); |
||||
int dst_index = mad24(y, dst_step, dst_offset + (x * 12)); |
||||
|
||||
int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0)); |
||||
int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4)); |
||||
int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8)); |
||||
|
||||
int src2_data_0 = src2.x; |
||||
int src2_data_1 = src2.y; |
||||
int src2_data_2 = src2.z; |
||||
|
||||
int data_0 = *((__global int *)((__global char *)dst + dst_index + 0)); |
||||
int data_1 = *((__global int *)((__global char *)dst + dst_index + 4)); |
||||
int data_2 = *((__global int *)((__global char *)dst + dst_index + 8)); |
||||
|
||||
int tmp_data_0 = src1_data_0 & src2_data_0; |
||||
int tmp_data_1 = src1_data_1 & src2_data_1; |
||||
int tmp_data_2 = src1_data_2 & src2_data_2; |
||||
|
||||
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; |
||||
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; |
||||
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; |
||||
} |
||||
} |
||||
// Bitwise AND of a 3-channel 32-bit float (CV_32F) image with a scalar.
// Floats have no bitwise ops, so the data is handled as raw bytes (char4
// per channel); the scalar arrives pre-serialized as a char16.
// One work-item handles one 12-byte pixel triple.
__kernel void arithm_s_bitwise_and_C3_D5 (
        __global char *src1, int src1_step, int src1_offset,
        __global char *dst,  int dst_step,  int dst_offset,
        char16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int dst_index  = mad24(y, dst_step,  dst_offset + (x * 12));

        char4 src1_data_0 = *((__global char4 *)((__global char *)src1 + src1_index + 0));
        char4 src1_data_1 = *((__global char4 *)((__global char *)src1 + src1_index + 4));
        char4 src1_data_2 = *((__global char4 *)((__global char *)src1 + src1_index + 8));

        // First 12 bytes of the scalar = the three 4-byte channel values.
        char4 src2_data_0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
        char4 src2_data_1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
        char4 src2_data_2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);

        // Fix: the original also loaded the three destination char4 words
        // (data_0..data_2) and never used them; the dead reads are removed.
        *((__global char4 *)((__global char *)dst + dst_index + 0)) = src1_data_0 & src2_data_0;
        *((__global char4 *)((__global char *)dst + dst_index + 4)) = src1_data_1 & src2_data_1;
        *((__global char4 *)((__global char *)dst + dst_index + 8)) = src1_data_2 & src2_data_2;
    }
}
||||
#if defined (DOUBLE_SUPPORT)
// Bitwise AND of a 3-channel 64-bit (CV_64F) image with a scalar.
// Doubles have no bitwise ops, so each 8-byte channel is processed as a
// short4; the scalar arrives pre-serialized as a short16.
// One work-item handles one 24-byte pixel triple.
__kernel void arithm_s_bitwise_and_C3_D6 (
        __global short *src1, int src1_step, int src1_offset,
        __global short *dst,  int dst_step,  int dst_offset,
        short16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
        int dst_index  = mad24(y, dst_step,  dst_offset + (x * 24));

        short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
        short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
        short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));

        // First 24 bytes of the scalar = the three 8-byte channel values.
        short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
        short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
        short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);

        // Fix: the original also loaded the destination short4 words
        // (data_0..data_2) and never used them; the dead reads are removed.
        *((__global short4 *)((__global char *)dst + dst_index + 0 )) = src1_data_0 & src2_data_0;
        *((__global short4 *)((__global char *)dst + dst_index + 8 )) = src1_data_1 & src2_data_1;
        *((__global short4 *)((__global char *)dst + dst_index + 16)) = src1_data_2 & src2_data_2;
    }
}
#endif
||||
// Bitwise AND of a 4-channel 8-bit (CV_8UC4) image with a scalar.
// One work-item = one 4-byte pixel, processed as a single uchar4.
__kernel void arithm_s_bitwise_and_C4_D0 (
        __global uchar *src1, int src1_step, int src1_offset,
        __global uchar *dst,  int dst_step,  int dst_offset,
        uchar4 src2, int rows, int cols, int dst_step1)
{
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    int src_idx = mad24(gy, src1_step, (gx << 2) + src1_offset);
    int dst_idx = mad24(gy, dst_step,  (gx << 2) + dst_offset);

    uchar4 pixel = *((__global uchar4 *)(src1 + src_idx));
    *((__global uchar4 *)(dst + dst_idx)) = pixel & src2;
}
||||
|
||||
|
||||
// Bitwise AND of a 4-channel signed 8-bit (CV_8SC4) image with a scalar.
// One work-item = one 4-byte pixel, processed as a single char4.
__kernel void arithm_s_bitwise_and_C4_D1 (
        __global char *src1, int src1_step, int src1_offset,
        __global char *dst,  int dst_step,  int dst_offset,
        char4 src2, int rows, int cols, int dst_step1)
{
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    int src_idx = mad24(gy, src1_step, (gx << 2) + src1_offset);
    int dst_idx = mad24(gy, dst_step,  (gx << 2) + dst_offset);

    char4 pixel = *((__global char4 *)(src1 + src_idx));
    *((__global char4 *)(dst + dst_idx)) = pixel & src2;
}
||||
|
||||
// Bitwise AND of a 4-channel 16-bit (CV_16UC4) image with a scalar.
// One work-item = one 8-byte pixel, processed as a single ushort4.
__kernel void arithm_s_bitwise_and_C4_D2 (
        __global ushort *src1, int src1_step, int src1_offset,
        __global ushort *dst,  int dst_step,  int dst_offset,
        ushort4 src2, int rows, int cols, int dst_step1)
{
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    // Steps/offsets are in bytes, hence the char* casts.
    int src_idx = mad24(gy, src1_step, (gx << 3) + src1_offset);
    int dst_idx = mad24(gy, dst_step,  (gx << 3) + dst_offset);

    ushort4 pixel = *((__global ushort4 *)((__global char *)src1 + src_idx));
    *((__global ushort4 *)((__global char *)dst + dst_idx)) = pixel & src2;
}
||||
// Bitwise AND of a 4-channel signed 16-bit (CV_16SC4) image with a scalar.
// One work-item = one 8-byte pixel, processed as a single short4.
__kernel void arithm_s_bitwise_and_C4_D3 (
        __global short *src1, int src1_step, int src1_offset,
        __global short *dst,  int dst_step,  int dst_offset,
        short4 src2, int rows, int cols, int dst_step1)
{
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    // Steps/offsets are in bytes, hence the char* casts.
    int src_idx = mad24(gy, src1_step, (gx << 3) + src1_offset);
    int dst_idx = mad24(gy, dst_step,  (gx << 3) + dst_offset);

    short4 pixel = *((__global short4 *)((__global char *)src1 + src_idx));
    *((__global short4 *)((__global char *)dst + dst_idx)) = pixel & src2;
}
||||
// Bitwise AND of a 4-channel 32-bit (CV_32SC4) image with a scalar.
// One work-item = one 16-byte pixel, processed as a single int4.
__kernel void arithm_s_bitwise_and_C4_D4 (
        __global int *src1, int src1_step, int src1_offset,
        __global int *dst,  int dst_step,  int dst_offset,
        int4 src2, int rows, int cols, int dst_step1)
{
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    // Steps/offsets are in bytes, hence the char* casts.
    int src_idx = mad24(gy, src1_step, (gx << 4) + src1_offset);
    int dst_idx = mad24(gy, dst_step,  (gx << 4) + dst_offset);

    int4 pixel = *((__global int4 *)((__global char *)src1 + src_idx));
    *((__global int4 *)((__global char *)dst + dst_idx)) = pixel & src2;
}
||||
// Bitwise AND of a 4-channel 32-bit float (CV_32FC4) image with a scalar.
// Floats have no bitwise ops, so the 16-byte pixel is processed as a
// char16; the scalar arrives pre-serialized as a char16 as well.
__kernel void arithm_s_bitwise_and_C4_D5 (
        __global char *src1, int src1_step, int src1_offset,
        __global char *dst,  int dst_step,  int dst_offset,
        char16 src2, int rows, int cols, int dst_step1)
{
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    int src_idx = mad24(gy, src1_step, (gx << 4) + src1_offset);
    int dst_idx = mad24(gy, dst_step,  (gx << 4) + dst_offset);

    // src2 is already a char16; no per-component repacking is needed
    // (the original rebuilt an identical vector component by component).
    char16 pixel = *((__global char16 *)((__global char *)src1 + src_idx));
    *((__global char16 *)((__global char *)dst + dst_idx)) = pixel & src2;
}
||||
#if defined (DOUBLE_SUPPORT)
// Bitwise AND of a 4-channel 64-bit (CV_64FC4) image with a scalar.
// Doubles have no bitwise ops, so each 8-byte channel is handled as a
// short4; the scalar arrives pre-serialized as a short16.
// One work-item = one 32-byte pixel.
__kernel void arithm_s_bitwise_and_C4_D6 (
        __global short *src1, int src1_step, int src1_offset,
        __global short *dst,  int dst_step,  int dst_offset,
        short16 src2, int rows, int cols, int dst_step1)
{
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    int src_idx = mad24(gy, src1_step, (gx << 5) + src1_offset);
    int dst_idx = mad24(gy, dst_step,  (gx << 5) + dst_offset);

    short4 chan0 = *((__global short4 *)((__global char *)src1 + src_idx + 0));
    short4 chan1 = *((__global short4 *)((__global char *)src1 + src_idx + 8));
    short4 chan2 = *((__global short4 *)((__global char *)src1 + src_idx + 16));
    short4 chan3 = *((__global short4 *)((__global char *)src1 + src_idx + 24));

    // .lo/.hi slicing yields the same short4 sub-vectors the original
    // built component by component (s0123, s4567, s89ab, scdef).
    *((__global short4 *)((__global char *)dst + dst_idx + 0 )) = chan0 & src2.lo.lo;
    *((__global short4 *)((__global char *)dst + dst_idx + 8 )) = chan1 & src2.lo.hi;
    *((__global short4 *)((__global char *)dst + dst_idx + 16)) = chan2 & src2.hi.lo;
    *((__global short4 *)((__global char *)dst + dst_idx + 24)) = chan3 & src2.hi.hi;
}
#endif
@ -1,294 +0,0 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Jiang Liyuan, jlyuan001.good@163.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution.
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
#if defined (DOUBLE_SUPPORT) |
||||
#ifdef cl_khr_fp64 |
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable |
||||
#elif defined (cl_amd_fp64) |
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable |
||||
#endif |
||||
#endif |
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
////////////////////////////////////////////BITWISE_OR//////////////////////////////////////////////////// |
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
/**************************************bitwise_or without mask**************************************/ |
||||
// Element-wise bitwise OR of two 8-bit (CV_8U) images.
// Each work-item produces 4 consecutive pixels with a single aligned
// uchar4 store. When dst_offset is not 4-byte aligned the source reads are
// shifted back by the misalignment (dst_align) and rotated into place, and
// the per-lane dst_start/dst_end guards keep lanes outside the row's valid
// [offset, offset+dst_step1) byte range untouched.
__kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                    __global uchar *src2, int src2_step, int src2_offset,
                                    __global uchar *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        // Each work-item covers 4 pixels.
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
        // Source byte offsets, shifted back by the destination misalignment.
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
        // '+' binds tighter than '&': this rounds (dst_offset + x) down to a
        // 4-byte boundary so the uchar4 store below is aligned.
        int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
        // Clamp negative offsets (first vector of a misaligned row) so the
        // vload4 stays inside the buffer; the rotation below restores order.
        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
        if(src1_index < 0)
        {
            // Rotate the clamped load so each lane holds the byte it would
            // have held without clamping (index is -1, -2 or -3).
            uchar4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
        }
        if(src2_index < 0)
        {
            uchar4 tmp;
            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
        }
        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = src1_data | src2_data;

        // Keep lanes that fall outside the valid destination byte range.
        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global uchar4 *)(dst + dst_index)) = dst_data;
    }
}
||||
|
||||
|
||||
// Element-wise bitwise OR of two signed 8-bit (CV_8S) images.
// Each work-item produces 4 consecutive pixels with a single aligned
// char4 store; misaligned dst_offset is handled exactly as in the D0
// (uchar) kernel above.
__kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1_offset,
                                    __global char *src2, int src2_step, int src2_offset,
                                    __global char *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        // Each work-item covers 4 pixels.
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
        // '+' binds tighter than '&': rounds (dst_offset + x) down to a
        // 4-byte boundary so the char4 store is aligned.
        int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

        // Fix: the original read vload4(0, src1 + src1_index) directly; for
        // a misaligned dst_offset the first vector of a row has a negative
        // index, reading out of bounds. Clamp and rotate as the sibling
        // arithm_bitwise_or_D0 kernel does.
        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        char4 src1_data = vload4(0, src1 + src1_index_fix);
        char4 src2_data = vload4(0, src2 + src2_index_fix);
        if(src1_index < 0)
        {
            char4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy : src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz : tmp.xyzw;
        }
        if(src2_index < 0)
        {
            char4 tmp;
            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy : src2_data.yzwx;
            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz : tmp.xyzw;
        }

        char4 dst_data = *((__global char4 *)(dst + dst_index));
        char4 tmp_data = src1_data | src2_data;

        // Keep lanes that fall outside the valid destination byte range.
        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global char4 *)(dst + dst_index)) = dst_data;
    }
}
||||
|
||||
|
||||
// Element-wise bitwise OR of two 16-bit (CV_16U) images.
// Each work-item produces 4 consecutive pixels (8 bytes) with one aligned
// ushort4 store; per-lane dst_start/dst_end guards protect bytes outside
// the row's valid range (lane stride is 2 bytes, hence +0/+2/+4/+6).
__kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                    __global ushort *src2, int src2_step, int src2_offset,
                                    __global ushort *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int dst_step1)

{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        // Each work-item covers 4 pixels.
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
        // NOTE(review): unlike the D0 kernel, src1_index/src2_index are not
        // clamped to 0 here; for a dst_offset whose element part is
        // misaligned they can be negative and the vload4 reads below the
        // buffer start — confirm whether callers guarantee aligned offsets.
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
        // '+' binds tighter than '&': rounds down to an 8-byte boundary.
        int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index));
        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index));

        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
        ushort4 tmp_data = src1_data | src2_data;

        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
    }
}
||||
|
||||
|
||||
|
||||
// Element-wise bitwise OR of two signed 16-bit (CV_16S) images.
// Identical structure to the D2 (ushort) kernel above: 4 pixels per
// work-item, one aligned short4 store, per-lane byte-range guards.
__kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src1_offset,
                                    __global short *src2, int src2_step, int src2_offset,
                                    __global short *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int dst_step1)

{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        // Each work-item covers 4 pixels.
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
        // NOTE(review): as in the D2 kernel, negative src indices from a
        // misaligned dst_offset are not clamped before the vload4 — verify
        // callers always pass element-aligned offsets.
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
        // '+' binds tighter than '&': rounds down to an 8-byte boundary.
        int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index));
        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index));

        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
        short4 tmp_data = src1_data | src2_data;

        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
    }
}
||||
|
||||
|
||||
|
||||
// Element-wise bitwise OR of two 32-bit (CV_32S) images.
// One work-item = one pixel; 4-byte elements need no alignment handling.
__kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_offset,
                                    __global int *src2, int src2_step, int src2_offset,
                                    __global int *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int dst_step1)
{
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    int idx1 = mad24(gy, src1_step, (gx << 2) + src1_offset);
    int idx2 = mad24(gy, src2_step, (gx << 2) + src2_offset);
    int idxd = mad24(gy, dst_step,  (gx << 2) + dst_offset);

    int a = *((__global int *)((__global char *)src1 + idx1));
    int b = *((__global int *)((__global char *)src2 + idx2));

    *((__global int *)((__global char *)dst + idxd)) = a | b;
}
||||
|
||||
// Element-wise bitwise OR of two 32-bit float (CV_32F) images.
// Floats have no bitwise ops, so each 4-byte pixel is processed as char4.
__kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1_offset,
                                    __global char *src2, int src2_step, int src2_offset,
                                    __global char *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int dst_step1)
{
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    int idx1 = mad24(gy, src1_step, (gx << 2) + src1_offset);
    int idx2 = mad24(gy, src2_step, (gx << 2) + src2_offset);
    int idxd = mad24(gy, dst_step,  (gx << 2) + dst_offset);

    char4 a = *((__global char4 *)((__global char *)src1 + idx1));
    char4 b = *((__global char4 *)((__global char *)src2 + idx2));

    *((__global char4 *)((__global char *)dst + idxd)) = a | b;
}
||||
|
||||
#if defined (DOUBLE_SUPPORT)
// Element-wise bitwise OR of two 64-bit (CV_64F) images.
// Doubles have no bitwise ops, so each 8-byte pixel is processed as char8.
__kernel void arithm_bitwise_or_D6 (__global char *src1, int src1_step, int src1_offset,
                                    __global char *src2, int src2_step, int src2_offset,
                                    __global char *dst, int dst_step, int dst_offset,
                                    int rows, int cols, int dst_step1)
{
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    int idx1 = mad24(gy, src1_step, (gx << 3) + src1_offset);
    int idx2 = mad24(gy, src2_step, (gx << 3) + src2_offset);
    int idxd = mad24(gy, dst_step,  (gx << 3) + dst_offset);

    char8 a = *((__global char8 *)((__global char *)src1 + idx1));
    char8 b = *((__global char8 *)((__global char *)src2 + idx2));

    *((__global char8 *)((__global char *)dst + idxd)) = a | b;
}
#endif
File diff suppressed because it is too large
Load Diff
@ -1,973 +0,0 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Jiang Liyuan, jlyuan001.good@163.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution.
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
#if defined (DOUBLE_SUPPORT) |
||||
#ifdef cl_khr_fp64 |
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable |
||||
#elif defined (cl_amd_fp64) |
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable |
||||
#endif |
||||
#endif |
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
////////////////////////////////////////////BITWISE_OR//////////////////////////////////////////////////// |
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
/**************************************and with scalar without mask**************************************/ |
||||
// Bitwise OR of a single-channel 8-bit (CV_8U) image with a scalar.
// Each work-item produces 4 consecutive pixels with one aligned uchar4
// store; only src2.x is used (single channel). Source reads are shifted
// back by the destination misalignment (dst_align), and the per-lane
// dst_start/dst_end guards keep out-of-range lanes untouched.
__kernel void arithm_s_bitwise_or_C1_D0 (
        __global uchar *src1, int src1_step, int src1_offset,
        __global uchar *dst, int dst_step, int dst_offset,
        uchar4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        // Each work-item covers 4 pixels.
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
        // NOTE(review): src1_index can go negative for a misaligned
        // dst_offset and is not clamped before vload4 — confirm callers
        // guarantee alignment (the two-input OR kernel for D0 does clamp).
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
        // '+' binds tighter than '&': rounds down to a 4-byte boundary.
        int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

        uchar4 src1_data = vload4(0, src1 + src1_index);
        // Broadcast the single-channel scalar to all 4 lanes.
        uchar4 src2_data = (uchar4)(src2.x, src2.x, src2.x, src2.x);

        uchar4 data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = src1_data | src2_data;

        data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
        data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
        data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
        data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;

        *((__global uchar4 *)(dst + dst_index)) = data;
    }
}
||||
|
||||
|
||||
// Bitwise OR of a single-channel signed 8-bit (CV_8S) image with a scalar.
// Identical structure to the C1_D0 (uchar) kernel above: 4 pixels per
// work-item, one aligned char4 store, per-lane byte-range guards.
__kernel void arithm_s_bitwise_or_C1_D1 (
        __global char *src1, int src1_step, int src1_offset,
        __global char *dst, int dst_step, int dst_offset,
        char4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        // Each work-item covers 4 pixels.
        x = x << 2;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
        // NOTE(review): src1_index is not clamped and can be negative for a
        // misaligned dst_offset — confirm callers guarantee alignment.
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
        // '+' binds tighter than '&': rounds down to a 4-byte boundary.
        int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

        char4 src1_data = vload4(0, src1 + src1_index);
        // Broadcast the single-channel scalar to all 4 lanes.
        char4 src2_data = (char4)(src2.x, src2.x, src2.x, src2.x);

        char4 data = *((__global char4 *)(dst + dst_index));
        char4 tmp_data = src1_data | src2_data;

        data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
        data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : data.y;
        data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : data.z;
        data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : data.w;

        *((__global char4 *)(dst + dst_index)) = data;
    }
}
||||
|
||||
// Bitwise OR of a single-channel 16-bit (CV_16U) image with a scalar.
// Each work-item produces 2 consecutive pixels (4 bytes) with one aligned
// ushort2 store; only src2.x is used.
__kernel void arithm_s_bitwise_or_C1_D2 (
        __global ushort *src1, int src1_step, int src1_offset,
        __global ushort *dst, int dst_step, int dst_offset,
        ushort4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        // Each work-item covers 2 pixels.
        x = x << 1;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
        // '+' binds tighter than '&': rounds down to a 4-byte boundary.
        int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        ushort2 src1_data = vload2(0, (__global ushort *)((__global char *)src1 + src1_index));
        ushort2 src2_data = (ushort2)(src2.x, src2.x);

        ushort2 data = *((__global ushort2 *)((__global uchar *)dst + dst_index));
        ushort2 tmp_data = src1_data | src2_data;

        // Only the row-start guard applies to lane x and only the row-end
        // guard to lane y — the opposite bounds cannot be violated here.
        data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
        data.y = (dst_index + 2 <  dst_end  ) ? tmp_data.y : data.y;

        *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
    }
}
||||
// Bitwise OR of a single-channel signed 16-bit (CV_16S) image with a
// scalar. Identical structure to the C1_D2 (ushort) kernel above:
// 2 pixels per work-item, one aligned short2 store.
__kernel void arithm_s_bitwise_or_C1_D3 (
        __global short *src1, int src1_step, int src1_offset,
        __global short *dst, int dst_step, int dst_offset,
        short4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        // Each work-item covers 2 pixels.
        x = x << 1;

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end   = mad24(y, dst_step, dst_offset + dst_step1);
        // '+' binds tighter than '&': rounds down to a 4-byte boundary.
        int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffffc);

        short2 src1_data = vload2(0, (__global short *)((__global char *)src1 + src1_index));
        short2 src2_data = (short2)(src2.x, src2.x);
        short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));

        short2 tmp_data = src1_data | src2_data;

        // Start guard for lane x, end guard for lane y (see C1_D2).
        data.x = (dst_index + 0 >= dst_start) ? tmp_data.x : data.x;
        data.y = (dst_index + 2 <  dst_end  ) ? tmp_data.y : data.y;

        *((__global short2 *)((__global uchar *)dst + dst_index)) = data;
    }
}
||||
// Bitwise OR of a single-channel 32-bit (CV_32S) image with a scalar.
// One work-item = one pixel; only src2.x is used.
__kernel void arithm_s_bitwise_or_C1_D4 (
        __global int *src1, int src1_step, int src1_offset,
        __global int *dst,  int dst_step,  int dst_offset,
        int4 src2, int rows, int cols, int dst_step1)
{
    int gx = get_global_id(0);
    int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    int src_idx = mad24(gy, src1_step, (gx << 2) + src1_offset);
    int dst_idx = mad24(gy, dst_step,  (gx << 2) + dst_offset);

    int pixel = *((__global int *)((__global char *)src1 + src_idx));
    *((__global int *)((__global char *)dst + dst_idx)) = pixel | src2.x;
}
||||
// Bitwise OR with a scalar for a single-channel 4-byte-per-element matrix,
// operating on the raw bytes as four chars (the first four scalar lanes).
__kernel void arithm_s_bitwise_or_C1_D5 (__global char *src1, int src1_step, int src1_offset,
                                         __global char *dst,  int dst_step,  int dst_offset,
                                         char16 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    const int src_idx = mad24(row, src1_step, (col << 2) + src1_offset);
    const int dst_idx = mad24(row, dst_step,  (col << 2) + dst_offset);

    char4 pixel  = *((__global char4 *)(src1 + src_idx));
    char4 scalar = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);

    *((__global char4 *)(dst + dst_idx)) = pixel | scalar;
}
||||
#if defined (DOUBLE_SUPPORT)
// Bitwise OR with a scalar for a single-channel 8-byte-per-element matrix
// (double support required); the 8 bytes are processed as four shorts.
__kernel void arithm_s_bitwise_or_C1_D6 (__global short *src1, int src1_step, int src1_offset,
                                         __global short *dst,  int dst_step,  int dst_offset,
                                         short16 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    const int src_idx = mad24(row, src1_step, (col << 3) + src1_offset);
    const int dst_idx = mad24(row, dst_step,  (col << 3) + dst_offset);

    short4 pixel  = *((__global short4 *)((__global char *)src1 + src_idx));
    short4 scalar = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);

    *((__global short4 *)((__global char *)dst + dst_idx)) = pixel | scalar;
}
#endif
||||
// Bitwise OR of a 2-channel 8-bit unsigned matrix with a scalar.
// Two 2-channel pixels (4 bytes) per work-item; dst_index is snapped to a
// 4-byte boundary and lane guards keep the write inside the row.
__kernel void arithm_s_bitwise_or_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                         __global uchar *dst,  int dst_step,  int dst_offset,
                                         uchar4 src2, int rows, int cols, int dst_step1)
{
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    const int x2 = gx << 1;  // pixel index: two pixels per work-item

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
    const int src_idx   = mad24(gy, src1_step, (x2 << 1) + src1_offset - (dst_align << 1));
    const int row_begin = mad24(gy, dst_step, dst_offset);
    const int row_end   = mad24(gy, dst_step, dst_offset + dst_step1);
    const int dst_idx   = mad24(gy, dst_step, dst_offset + (x2 << 1) & (int)0xfffffffc);

    uchar4 pixels = vload4(0, src1 + src_idx);
    uchar4 scalar = (uchar4)(src2.x, src2.y, src2.x, src2.y);

    uchar4 result = *((__global uchar4 *)(dst + dst_idx));
    uchar4 ored   = pixels | scalar;

    // Only replace lanes that fall inside the destination row span.
    result.xy = (dst_idx + 0 >= row_begin) ? ored.xy : result.xy;
    result.zw = (dst_idx + 2 <  row_end)   ? ored.zw : result.zw;

    *((__global uchar4 *)(dst + dst_idx)) = result;
}
||||
|
||||
|
||||
// Bitwise OR of a 2-channel 8-bit signed matrix with a scalar.
// Two 2-channel pixels (4 bytes) per work-item; dst_index is snapped to a
// 4-byte boundary and lane guards keep the write inside the row.
__kernel void arithm_s_bitwise_or_C2_D1 (__global char *src1, int src1_step, int src1_offset,
                                         __global char *dst,  int dst_step,  int dst_offset,
                                         char4 src2, int rows, int cols, int dst_step1)
{
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    const int x2 = gx << 1;  // pixel index: two pixels per work-item

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
    const int src_idx   = mad24(gy, src1_step, (x2 << 1) + src1_offset - (dst_align << 1));
    const int row_begin = mad24(gy, dst_step, dst_offset);
    const int row_end   = mad24(gy, dst_step, dst_offset + dst_step1);
    const int dst_idx   = mad24(gy, dst_step, dst_offset + (x2 << 1) & (int)0xfffffffc);

    char4 pixels = vload4(0, src1 + src_idx);
    char4 scalar = (char4)(src2.x, src2.y, src2.x, src2.y);

    char4 result = *((__global char4 *)(dst + dst_idx));
    char4 ored   = pixels | scalar;

    // Only replace lanes that fall inside the destination row span.
    result.xy = (dst_idx + 0 >= row_begin) ? ored.xy : result.xy;
    result.zw = (dst_idx + 2 <  row_end)   ? ored.zw : result.zw;

    *((__global char4 *)(dst + dst_idx)) = result;
}
||||
|
||||
// Bitwise OR of a 2-channel 16-bit unsigned matrix with a scalar.
// One 2-channel pixel (4 bytes) per work-item.
__kernel void arithm_s_bitwise_or_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                         __global ushort *dst,  int dst_step,  int dst_offset,
                                         ushort4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    const int src_idx = mad24(row, src1_step, (col << 2) + src1_offset);
    const int dst_idx = mad24(row, dst_step,  (col << 2) + dst_offset);

    ushort2 pixel  = *((__global ushort2 *)((__global char *)src1 + src_idx));
    ushort2 scalar = (ushort2)(src2.x, src2.y);

    *((__global ushort2 *)((__global char *)dst + dst_idx)) = pixel | scalar;
}
||||
// Bitwise OR of a 2-channel 16-bit signed matrix with a scalar.
// One 2-channel pixel (4 bytes) per work-item.
__kernel void arithm_s_bitwise_or_C2_D3 (__global short *src1, int src1_step, int src1_offset,
                                         __global short *dst,  int dst_step,  int dst_offset,
                                         short4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    const int src_idx = mad24(row, src1_step, (col << 2) + src1_offset);
    const int dst_idx = mad24(row, dst_step,  (col << 2) + dst_offset);

    short2 pixel  = *((__global short2 *)((__global char *)src1 + src_idx));
    short2 scalar = (short2)(src2.x, src2.y);

    *((__global short2 *)((__global char *)dst + dst_idx)) = pixel | scalar;
}
||||
// Bitwise OR of a 2-channel 32-bit integer matrix with a scalar.
// One 2-channel pixel (8 bytes) per work-item.
__kernel void arithm_s_bitwise_or_C2_D4 (__global int *src1, int src1_step, int src1_offset,
                                         __global int *dst,  int dst_step,  int dst_offset,
                                         int4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    const int src_idx = mad24(row, src1_step, (col << 3) + src1_offset);
    const int dst_idx = mad24(row, dst_step,  (col << 3) + dst_offset);

    int2 pixel  = *((__global int2 *)((__global char *)src1 + src_idx));
    int2 scalar = (int2)(src2.x, src2.y);

    *((__global int2 *)((__global char *)dst + dst_idx)) = pixel | scalar;
}
||||
// Bitwise OR with a scalar for a 2-channel matrix with 4-byte elements,
// handled as eight raw chars (the first eight scalar lanes).
__kernel void arithm_s_bitwise_or_C2_D5 (__global char *src1, int src1_step, int src1_offset,
                                         __global char *dst,  int dst_step,  int dst_offset,
                                         char16 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    const int src_idx = mad24(row, src1_step, (col << 3) + src1_offset);
    const int dst_idx = mad24(row, dst_step,  (col << 3) + dst_offset);

    char8 pixel  = *((__global char8 *)((__global char *)src1 + src_idx));
    char8 scalar = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);

    *((__global char8 *)((__global char *)dst + dst_idx)) = pixel | scalar;
}
||||
#if defined (DOUBLE_SUPPORT)
// Bitwise OR with a scalar for a 2-channel matrix with 8-byte elements
// (double support required); the 16 bytes per pixel are processed as eight
// shorts.
//
// FIX: the original computed `src1_data & src2_data` — a bitwise AND — inside
// this OR kernel, an apparent copy/paste slip from the AND variant. Every
// sibling kernel in this family uses `|`; corrected here.
__kernel void arithm_s_bitwise_or_C2_D6 (
        __global short *src1, int src1_step, int src1_offset,
        __global short *dst, int dst_step, int dst_offset,
        short16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x << 4) + src1_offset);
        int dst_index  = mad24(y, dst_step,  (x << 4) + dst_offset);

        short8 src1_data = *((__global short8 *)((__global char *)src1 + src1_index));
        short8 src2_data = (short8)(src2.s0, src2.s1, src2.s2, src2.s3,
                                    src2.s4, src2.s5, src2.s6, src2.s7);

        // Bitwise OR (was '&' in the original — wrong operator for this kernel).
        short8 tmp_data = src1_data | src2_data;

        *((__global short8 *)((__global char *)dst + dst_index)) = tmp_data;
    }
}
#endif
||||
// Bitwise OR of a 3-channel 8-bit unsigned matrix with a scalar.
// Four 3-channel pixels (12 bytes) are handled per work-item as three uchar4
// loads; per-lane guards keep the (possibly shifted) 12-byte write inside
// the destination row.
__kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                         __global uchar *dst,  int dst_step,  int dst_offset,
                                         uchar4 src2, int rows, int cols, int dst_step1)
{
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    const int x4 = gx << 2;  // pixel index: four pixels per work-item

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
    const int src_idx   = mad24(gy, src1_step, (x4 * 3) + src1_offset - (dst_align * 3));
    const int row_begin = mad24(gy, dst_step, dst_offset);
    const int row_end   = mad24(gy, dst_step, dst_offset + dst_step1);
    const int dst_idx   = mad24(gy, dst_step, dst_offset + (x4 * 3) - (dst_align * 3));

    uchar4 s0 = vload4(0, src1 + src_idx + 0);
    uchar4 s1 = vload4(0, src1 + src_idx + 4);
    uchar4 s2 = vload4(0, src1 + src_idx + 8);

    // The scalar's three channels repeated across the 12 interleaved bytes.
    uchar4 k0 = (uchar4)(src2.x, src2.y, src2.z, src2.x);
    uchar4 k1 = (uchar4)(src2.y, src2.z, src2.x, src2.y);
    uchar4 k2 = (uchar4)(src2.z, src2.x, src2.y, src2.z);

    uchar4 d0 = *((__global uchar4 *)(dst + dst_idx + 0));
    uchar4 d1 = *((__global uchar4 *)(dst + dst_idx + 4));
    uchar4 d2 = *((__global uchar4 *)(dst + dst_idx + 8));

    uchar4 r0 = s0 | k0;
    uchar4 r1 = s1 | k1;
    uchar4 r2 = s2 | k2;

    // Merge only the lanes whose byte offsets land inside the row span.
    d0.xyz = ((dst_idx + 0 >= row_begin))                                  ? r0.xyz : d0.xyz;
    d0.w   = ((dst_idx + 3 >= row_begin) && (dst_idx + 3 < row_end))       ? r0.w   : d0.w;
    d1.xy  = ((dst_idx + 3 >= row_begin) && (dst_idx + 3 < row_end))       ? r1.xy  : d1.xy;
    d1.zw  = ((dst_idx + 6 >= row_begin) && (dst_idx + 6 < row_end))       ? r1.zw  : d1.zw;
    d2.x   = ((dst_idx + 6 >= row_begin) && (dst_idx + 6 < row_end))       ? r2.x   : d2.x;
    d2.yzw = ((dst_idx + 9 >= row_begin) && (dst_idx + 9 < row_end))       ? r2.yzw : d2.yzw;

    *((__global uchar4 *)(dst + dst_idx + 0)) = d0;
    *((__global uchar4 *)(dst + dst_idx + 4)) = d1;
    *((__global uchar4 *)(dst + dst_idx + 8)) = d2;
}
||||
|
||||
|
||||
// Bitwise OR of a 3-channel 8-bit signed matrix with a scalar.
// Four 3-channel pixels (12 bytes) are handled per work-item as three char4
// loads; per-lane guards keep the (possibly shifted) 12-byte write inside
// the destination row.
__kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, int src1_offset,
                                         __global char *dst,  int dst_step,  int dst_offset,
                                         char4 src2, int rows, int cols, int dst_step1)
{
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    const int x4 = gx << 2;  // pixel index: four pixels per work-item

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
    const int src_idx   = mad24(gy, src1_step, (x4 * 3) + src1_offset - (dst_align * 3));
    const int row_begin = mad24(gy, dst_step, dst_offset);
    const int row_end   = mad24(gy, dst_step, dst_offset + dst_step1);
    const int dst_idx   = mad24(gy, dst_step, dst_offset + (x4 * 3) - (dst_align * 3));

    char4 s0 = vload4(0, src1 + src_idx + 0);
    char4 s1 = vload4(0, src1 + src_idx + 4);
    char4 s2 = vload4(0, src1 + src_idx + 8);

    // The scalar's three channels repeated across the 12 interleaved bytes.
    char4 k0 = (char4)(src2.x, src2.y, src2.z, src2.x);
    char4 k1 = (char4)(src2.y, src2.z, src2.x, src2.y);
    char4 k2 = (char4)(src2.z, src2.x, src2.y, src2.z);

    char4 d0 = *((__global char4 *)(dst + dst_idx + 0));
    char4 d1 = *((__global char4 *)(dst + dst_idx + 4));
    char4 d2 = *((__global char4 *)(dst + dst_idx + 8));

    char4 r0 = s0 | k0;
    char4 r1 = s1 | k1;
    char4 r2 = s2 | k2;

    // Merge only the lanes whose byte offsets land inside the row span.
    d0.xyz = ((dst_idx + 0 >= row_begin))                                  ? r0.xyz : d0.xyz;
    d0.w   = ((dst_idx + 3 >= row_begin) && (dst_idx + 3 < row_end))       ? r0.w   : d0.w;
    d1.xy  = ((dst_idx + 3 >= row_begin) && (dst_idx + 3 < row_end))       ? r1.xy  : d1.xy;
    d1.zw  = ((dst_idx + 6 >= row_begin) && (dst_idx + 6 < row_end))       ? r1.zw  : d1.zw;
    d2.x   = ((dst_idx + 6 >= row_begin) && (dst_idx + 6 < row_end))       ? r2.x   : d2.x;
    d2.yzw = ((dst_idx + 9 >= row_begin) && (dst_idx + 9 < row_end))       ? r2.yzw : d2.yzw;

    *((__global char4 *)(dst + dst_idx + 0)) = d0;
    *((__global char4 *)(dst + dst_idx + 4)) = d1;
    *((__global char4 *)(dst + dst_idx + 8)) = d2;
}
||||
|
||||
// Bitwise OR of a 3-channel 16-bit unsigned matrix with a scalar.
// Two 3-channel pixels (12 bytes) are handled per work-item as three ushort2
// loads; per-lane guards keep the (possibly shifted) write inside the row.
__kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                         __global ushort *dst,  int dst_step,  int dst_offset,
                                         ushort4 src2, int rows, int cols, int dst_step1)
{
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    const int x2 = gx << 1;  // pixel index: two pixels per work-item

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
    const int src_idx   = mad24(gy, src1_step, (x2 * 6) + src1_offset - (dst_align * 6));
    const int row_begin = mad24(gy, dst_step, dst_offset);
    const int row_end   = mad24(gy, dst_step, dst_offset + dst_step1);
    const int dst_idx   = mad24(gy, dst_step, dst_offset + (x2 * 6) - (dst_align * 6));

    ushort2 s0 = vload2(0, (__global ushort *)((__global char *)src1 + src_idx + 0));
    ushort2 s1 = vload2(0, (__global ushort *)((__global char *)src1 + src_idx + 4));
    ushort2 s2 = vload2(0, (__global ushort *)((__global char *)src1 + src_idx + 8));

    // The scalar's three channels repeated across the six interleaved values.
    ushort2 k0 = (ushort2)(src2.x, src2.y);
    ushort2 k1 = (ushort2)(src2.z, src2.x);
    ushort2 k2 = (ushort2)(src2.y, src2.z);

    ushort2 d0 = *((__global ushort2 *)((__global char *)dst + dst_idx + 0));
    ushort2 d1 = *((__global ushort2 *)((__global char *)dst + dst_idx + 4));
    ushort2 d2 = *((__global ushort2 *)((__global char *)dst + dst_idx + 8));

    ushort2 r0 = s0 | k0;
    ushort2 r1 = s1 | k1;
    ushort2 r2 = s2 | k2;

    // Merge only the lanes whose byte offsets land inside the row span.
    d0.xy = ((dst_idx + 0 >= row_begin))                             ? r0.xy : d0.xy;
    d1.x  = ((dst_idx + 0 >= row_begin) && (dst_idx + 0 < row_end))  ? r1.x  : d1.x;
    d1.y  = ((dst_idx + 6 >= row_begin) && (dst_idx + 6 < row_end))  ? r1.y  : d1.y;
    d2.xy = ((dst_idx + 6 >= row_begin) && (dst_idx + 6 < row_end))  ? r2.xy : d2.xy;

    *((__global ushort2 *)((__global char *)dst + dst_idx + 0)) = d0;
    *((__global ushort2 *)((__global char *)dst + dst_idx + 4)) = d1;
    *((__global ushort2 *)((__global char *)dst + dst_idx + 8)) = d2;
}
||||
// Bitwise OR of a 3-channel 16-bit signed matrix with a scalar.
// Two 3-channel pixels (12 bytes) are handled per work-item as three short2
// loads; per-lane guards keep the (possibly shifted) write inside the row.
__kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step, int src1_offset,
                                         __global short *dst,  int dst_step,  int dst_offset,
                                         short4 src2, int rows, int cols, int dst_step1)
{
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    const int x2 = gx << 1;  // pixel index: two pixels per work-item

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
    const int src_idx   = mad24(gy, src1_step, (x2 * 6) + src1_offset - (dst_align * 6));
    const int row_begin = mad24(gy, dst_step, dst_offset);
    const int row_end   = mad24(gy, dst_step, dst_offset + dst_step1);
    const int dst_idx   = mad24(gy, dst_step, dst_offset + (x2 * 6) - (dst_align * 6));

    short2 s0 = vload2(0, (__global short *)((__global char *)src1 + src_idx + 0));
    short2 s1 = vload2(0, (__global short *)((__global char *)src1 + src_idx + 4));
    short2 s2 = vload2(0, (__global short *)((__global char *)src1 + src_idx + 8));

    // The scalar's three channels repeated across the six interleaved values.
    short2 k0 = (short2)(src2.x, src2.y);
    short2 k1 = (short2)(src2.z, src2.x);
    short2 k2 = (short2)(src2.y, src2.z);

    short2 d0 = *((__global short2 *)((__global char *)dst + dst_idx + 0));
    short2 d1 = *((__global short2 *)((__global char *)dst + dst_idx + 4));
    short2 d2 = *((__global short2 *)((__global char *)dst + dst_idx + 8));

    short2 r0 = s0 | k0;
    short2 r1 = s1 | k1;
    short2 r2 = s2 | k2;

    // Merge only the lanes whose byte offsets land inside the row span.
    d0.xy = ((dst_idx + 0 >= row_begin))                             ? r0.xy : d0.xy;
    d1.x  = ((dst_idx + 0 >= row_begin) && (dst_idx + 0 < row_end))  ? r1.x  : d1.x;
    d1.y  = ((dst_idx + 6 >= row_begin) && (dst_idx + 6 < row_end))  ? r1.y  : d1.y;
    d2.xy = ((dst_idx + 6 >= row_begin) && (dst_idx + 6 < row_end))  ? r2.xy : d2.xy;

    *((__global short2 *)((__global char *)dst + dst_idx + 0)) = d0;
    *((__global short2 *)((__global char *)dst + dst_idx + 4)) = d1;
    *((__global short2 *)((__global char *)dst + dst_idx + 8)) = d2;
}
||||
// Bitwise OR of a 3-channel 32-bit integer matrix with a scalar.
// One 3-channel pixel (12 bytes) per work-item, stored as three ints.
//
// FIX: removed three dead loads of the destination (data_0..data_2 were read
// from dst and never used before the unconditional stores below) — the whole
// pixel is always written, so the reads were pure wasted global traffic.
__kernel void arithm_s_bitwise_or_C3_D4 (
        __global int *src1, int src1_step, int src1_offset,
        __global int *dst, int dst_step, int dst_offset,
        int4 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 12) + src1_offset);
        int dst_index  = mad24(y, dst_step, dst_offset + (x * 12));

        int src1_data_0 = *((__global int *)((__global char *)src1 + src1_index + 0));
        int src1_data_1 = *((__global int *)((__global char *)src1 + src1_index + 4));
        int src1_data_2 = *((__global int *)((__global char *)src1 + src1_index + 8));

        // OR each channel with the corresponding scalar lane and store.
        *((__global int *)((__global char *)dst + dst_index + 0)) = src1_data_0 | src2.x;
        *((__global int *)((__global char *)dst + dst_index + 4)) = src1_data_1 | src2.y;
        *((__global int *)((__global char *)dst + dst_index + 8)) = src1_data_2 | src2.z;
    }
}
||||
// Bitwise OR with a scalar for a 3-channel matrix with 4-byte elements,
// handled as 12 raw chars per pixel (scalar lanes s0..sB).
__kernel void arithm_s_bitwise_or_C3_D5 (__global char *src1, int src1_step, int src1_offset,
                                         __global char *dst,  int dst_step,  int dst_offset,
                                         char16 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    const int src_idx = mad24(row, src1_step, (col * 12) + src1_offset);
    const int dst_idx = mad24(row, dst_step, dst_offset + (col * 12));

    char4 p0 = *((__global char4 *)((__global char *)src1 + src_idx + 0));
    char4 p1 = *((__global char4 *)((__global char *)src1 + src_idx + 4));
    char4 p2 = *((__global char4 *)((__global char *)src1 + src_idx + 8));

    char4 k0 = (char4)(src2.s0, src2.s1, src2.s2, src2.s3);
    char4 k1 = (char4)(src2.s4, src2.s5, src2.s6, src2.s7);
    char4 k2 = (char4)(src2.s8, src2.s9, src2.sA, src2.sB);

    *((__global char4 *)((__global char *)dst + dst_idx + 0)) = p0 | k0;
    *((__global char4 *)((__global char *)dst + dst_idx + 4)) = p1 | k1;
    *((__global char4 *)((__global char *)dst + dst_idx + 8)) = p2 | k2;
}
||||
#if defined (DOUBLE_SUPPORT)
// Bitwise OR with a scalar for a 3-channel matrix with 8-byte elements
// (double support required); 24 bytes per pixel, processed as three short4
// chunks.
//
// FIX: removed three dead loads of the destination (data_0..data_2 were read
// from dst and never used before the unconditional stores below) — wasted
// global memory traffic with no effect on the result.
__kernel void arithm_s_bitwise_or_C3_D6 (
        __global short *src1, int src1_step, int src1_offset,
        __global short *dst, int dst_step, int dst_offset,
        short16 src2, int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, src1_step, (x * 24) + src1_offset);
        int dst_index  = mad24(y, dst_step, dst_offset + (x * 24));

        short4 src1_data_0 = *((__global short4 *)((__global char *)src1 + src1_index + 0 ));
        short4 src1_data_1 = *((__global short4 *)((__global char *)src1 + src1_index + 8 ));
        short4 src1_data_2 = *((__global short4 *)((__global char *)src1 + src1_index + 16));

        short4 src2_data_0 = (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
        short4 src2_data_1 = (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
        short4 src2_data_2 = (short4)(src2.s8, src2.s9, src2.sa, src2.sb);

        *((__global short4 *)((__global char *)dst + dst_index + 0 )) = src1_data_0 | src2_data_0;
        *((__global short4 *)((__global char *)dst + dst_index + 8 )) = src1_data_1 | src2_data_1;
        *((__global short4 *)((__global char *)dst + dst_index + 16)) = src1_data_2 | src2_data_2;
    }
}
#endif
||||
// Bitwise OR of a 4-channel 8-bit unsigned matrix with a scalar.
// One 4-channel pixel (4 bytes) per work-item; the scalar vector maps
// directly onto the pixel, so no re-packing is needed.
__kernel void arithm_s_bitwise_or_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                         __global uchar *dst,  int dst_step,  int dst_offset,
                                         uchar4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    const int src_idx = mad24(row, src1_step, (col << 2) + src1_offset);
    const int dst_idx = mad24(row, dst_step,  (col << 2) + dst_offset);

    uchar4 pixel = *((__global uchar4 *)(src1 + src_idx));

    *((__global uchar4 *)(dst + dst_idx)) = pixel | src2;
}
||||
|
||||
|
||||
// Bitwise OR of a 4-channel 8-bit signed matrix with a scalar.
// One 4-channel pixel (4 bytes) per work-item; the scalar vector maps
// directly onto the pixel.
__kernel void arithm_s_bitwise_or_C4_D1 (__global char *src1, int src1_step, int src1_offset,
                                         __global char *dst,  int dst_step,  int dst_offset,
                                         char4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    const int src_idx = mad24(row, src1_step, (col << 2) + src1_offset);
    const int dst_idx = mad24(row, dst_step,  (col << 2) + dst_offset);

    char4 pixel = *((__global char4 *)(src1 + src_idx));

    *((__global char4 *)(dst + dst_idx)) = pixel | src2;
}
||||
|
||||
// Bitwise OR of a 4-channel 16-bit unsigned matrix with a scalar.
// One 4-channel pixel (8 bytes) per work-item.
__kernel void arithm_s_bitwise_or_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                         __global ushort *dst,  int dst_step,  int dst_offset,
                                         ushort4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    const int src_idx = mad24(row, src1_step, (col << 3) + src1_offset);
    const int dst_idx = mad24(row, dst_step,  (col << 3) + dst_offset);

    ushort4 pixel = *((__global ushort4 *)((__global char *)src1 + src_idx));

    *((__global ushort4 *)((__global char *)dst + dst_idx)) = pixel | src2;
}
||||
// Bitwise OR of a 4-channel 16-bit signed matrix with a scalar.
// One 4-channel pixel (8 bytes) per work-item.
__kernel void arithm_s_bitwise_or_C4_D3 (__global short *src1, int src1_step, int src1_offset,
                                         __global short *dst,  int dst_step,  int dst_offset,
                                         short4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    const int src_idx = mad24(row, src1_step, (col << 3) + src1_offset);
    const int dst_idx = mad24(row, dst_step,  (col << 3) + dst_offset);

    short4 pixel = *((__global short4 *)((__global char *)src1 + src_idx));

    *((__global short4 *)((__global char *)dst + dst_idx)) = pixel | src2;
}
||||
// Bitwise OR of a 4-channel 32-bit integer matrix with a scalar.
// One 4-channel pixel (16 bytes) per work-item.
__kernel void arithm_s_bitwise_or_C4_D4 (__global int *src1, int src1_step, int src1_offset,
                                         __global int *dst,  int dst_step,  int dst_offset,
                                         int4 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    const int src_idx = mad24(row, src1_step, (col << 4) + src1_offset);
    const int dst_idx = mad24(row, dst_step,  (col << 4) + dst_offset);

    int4 pixel = *((__global int4 *)((__global char *)src1 + src_idx));

    *((__global int4 *)((__global char *)dst + dst_idx)) = pixel | src2;
}
||||
// Bitwise OR with a scalar for a 4-channel matrix with 4-byte elements,
// handled as 16 raw chars per pixel (all sixteen scalar lanes).
__kernel void arithm_s_bitwise_or_C4_D5 (__global char *src1, int src1_step, int src1_offset,
                                         __global char *dst,  int dst_step,  int dst_offset,
                                         char16 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    const int src_idx = mad24(row, src1_step, (col << 4) + src1_offset);
    const int dst_idx = mad24(row, dst_step,  (col << 4) + dst_offset);

    char16 pixel  = *((__global char16 *)((__global char *)src1 + src_idx));
    char16 scalar = (char16)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7,
                             src2.s8, src2.s9, src2.sa, src2.sb, src2.sc, src2.sd, src2.se, src2.sf);

    *((__global char16 *)((__global char *)dst + dst_idx)) = pixel | scalar;
}
||||
#if defined (DOUBLE_SUPPORT)
// Bitwise OR with a scalar for a 4-channel matrix with 8-byte elements
// (double support required); 32 bytes per pixel, processed as four short4
// chunks covering all sixteen scalar lanes.
__kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, int src1_offset,
                                         __global short *dst,  int dst_step,  int dst_offset,
                                         short16 src2, int rows, int cols, int dst_step1)
{
    const int col = get_global_id(0);
    const int row = get_global_id(1);

    if (col >= cols || row >= rows)
        return;

    const int src_idx = mad24(row, src1_step, (col << 5) + src1_offset);
    const int dst_idx = mad24(row, dst_step,  (col << 5) + dst_offset);

    short4 p0 = *((__global short4 *)((__global char *)src1 + src_idx + 0));
    short4 p1 = *((__global short4 *)((__global char *)src1 + src_idx + 8));
    short4 p2 = *((__global short4 *)((__global char *)src1 + src_idx + 16));
    short4 p3 = *((__global short4 *)((__global char *)src1 + src_idx + 24));

    *((__global short4 *)((__global char *)dst + dst_idx + 0 )) = p0 | (short4)(src2.s0, src2.s1, src2.s2, src2.s3);
    *((__global short4 *)((__global char *)dst + dst_idx + 8 )) = p1 | (short4)(src2.s4, src2.s5, src2.s6, src2.s7);
    *((__global short4 *)((__global char *)dst + dst_idx + 16)) = p2 | (short4)(src2.s8, src2.s9, src2.sa, src2.sb);
    *((__global short4 *)((__global char *)dst + dst_idx + 24)) = p3 | (short4)(src2.sc, src2.sd, src2.se, src2.sf);
}
#endif
File diff suppressed because it is too large
Load Diff
@ -1,340 +0,0 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Jiang Liyuan, jlyuan001.good@163.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other GpuMaterials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
#if defined (DOUBLE_SUPPORT) |
||||
#ifdef cl_khr_fp64 |
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable |
||||
#elif defined (cl_amd_fp64) |
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable |
||||
#endif |
||||
#endif |
||||
////////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
////////////////////////////////////////////BITWISE_XOR//////////////////////////////////////////////////// |
||||
/////////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
/**************************************bitwise_xor without mask**************************************/ |
||||
// Per-element bitwise XOR of two 8-bit (CV_8U) buffers, no mask.
// Each work-item produces four consecutive output bytes: reads go through
// vload4 (shifted so they line up with the 4-byte-aligned store), and each
// output lane is merged only if it falls inside the valid row span.
__kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int src1_offset,
                                     __global uchar *src2, int src2_step, int src2_offset,
                                     __global uchar *dst, int dst_step, int dst_offset,
                                     int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;   // four output pixels per work-item

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
        // shift the source reads left so they line up with the aligned store below
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        // note: '+' binds tighter than '&', so this is (dst_offset + x) rounded
        // down to a multiple of 4
        int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
        uchar4 src2_data = vload4(0, src2 + src2_index_fix);

        // if the alignment shift moved a read before the buffer start, rotate
        // the loaded vector so the valid bytes end up in the correct lanes
        if(src1_index < 0)
        {
            uchar4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
        }
        if(src2_index < 0)
        {
            uchar4 tmp;
            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
        }
        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = src1_data ^ src2_data;

        // write each lane only if it lies inside [dst_start, dst_end)
        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global uchar4 *)(dst + dst_index)) = dst_data;
    }
}
||||
|
||||
|
||||
// Per-element bitwise XOR of two 8-bit signed (CV_8S) buffers, no mask.
// Same four-pixels-per-work-item vload4 scheme as the uchar variant above.
__kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src1_offset,
                                     __global char *src2, int src2_step, int src2_offset,
                                     __global char *dst, int dst_step, int dst_offset,
                                     int rows, int cols, int dst_step1)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;   // four output pixels per work-item

#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
        // shift the reads so they match the aligned store below
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        // (dst_offset + x) rounded down to a multiple of 4 ('+' binds tighter than '&')
        int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);

        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        char4 src1_data = vload4(0, src1 + src1_index_fix);
        char4 src2_data = vload4(0, src2 + src2_index_fix);

        // rotate the loaded vector when the shifted read started before the buffer
        if(src1_index < 0)
        {
            char4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
        }
        if(src2_index < 0)
        {
            char4 tmp;
            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
        }
        char4 dst_data = *((__global char4 *)(dst + dst_index));
        char4 tmp_data = src1_data ^ src2_data;

        // merge lanes that fall inside the valid row span [dst_start, dst_end)
        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global char4 *)(dst + dst_index)) = dst_data;
    }
}
||||
|
||||
|
||||
// Per-element bitwise XOR of two 16-bit unsigned (CV_16U) buffers, no mask.
// Four ushort elements per work-item; all *_step/*_offset values are in
// bytes, hence the (x << 1) scaling and the char* based addressing.
__kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int src1_offset,
                                     __global ushort *src2, int src2_step, int src2_offset,
                                     __global ushort *dst, int dst_step, int dst_offset,
                                     int rows, int cols, int dst_step1)

{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;   // four output elements per work-item

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
        // shift the reads (in bytes) so they match the 8-byte-aligned store below
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        // (dst_offset + 2*x) rounded down to a multiple of 8 bytes
        int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));

        // rotate the loaded vector when the shifted read started before the buffer
        if(src1_index < 0)
        {
            ushort4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
        }
        if(src2_index < 0)
        {
            ushort4 tmp;
            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
        }
        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
        ushort4 tmp_data = src1_data ^ src2_data;

        // lane validity is checked in bytes, so lane i lives at byte offset 2*i
        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global ushort4 *)((__global char *)dst + dst_index)) = dst_data;
    }
}
||||
|
||||
|
||||
|
||||
// Per-element bitwise XOR of two 16-bit signed (CV_16S) buffers, no mask.
// Identical addressing scheme to the ushort variant above: four elements per
// work-item, byte-based steps/offsets, misaligned rows handled via a shifted
// vload4 plus lane rotation.
__kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int src1_offset,
                                     __global short *src2, int src2_step, int src2_offset,
                                     __global short *dst, int dst_step, int dst_offset,
                                     int rows, int cols, int dst_step1)

{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        x = x << 2;   // four output elements per work-item

#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
        // shift the reads (in bytes) so they match the 8-byte-aligned store below
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));

        int dst_start = mad24(y, dst_step, dst_offset);
        int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
        // (dst_offset + 2*x) rounded down to a multiple of 8 bytes
        int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);

        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));

        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));

        // rotate the loaded vector when the shifted read started before the buffer
        if(src1_index < 0)
        {
            short4 tmp;
            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
        }
        if(src2_index < 0)
        {
            short4 tmp;
            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
        }

        short4 tmp_data = src1_data ^ src2_data;

        // lane validity is checked in bytes (lane i at byte offset 2*i)
        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.z : dst_data.z;
        dst_data.w = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) ? tmp_data.w : dst_data.w;

        *((__global short4 *)((__global char *)dst + dst_index)) = dst_data;
    }
}
||||
|
||||
|
||||
|
||||
// Per-element bitwise XOR of two 32-bit (CV_32S) buffers, no mask.
// One int per work-item; steps/offsets are in bytes, hence (x << 2).
__kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1_offset,
                                     __global int *src2, int src2_step, int src2_offset,
                                     __global int *dst, int dst_step, int dst_offset,
                                     int rows, int cols, int dst_step1)
{
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    const int byte_off = gx << 2;
    const int a_idx = mad24(gy, src1_step, byte_off + src1_offset);
    const int b_idx = mad24(gy, src2_step, byte_off + src2_offset);
    const int d_idx = mad24(gy, dst_step, byte_off + dst_offset);

    const int a = *((__global int *)((__global char *)src1 + a_idx));
    const int b = *((__global int *)((__global char *)src2 + b_idx));

    *((__global int *)((__global char *)dst + d_idx)) = a ^ b;
}
||||
|
||||
// Per-element bitwise XOR for 32-bit elements (CV_32F) handled as raw bytes:
// each work-item XORs one 4-byte lane group via char4.
__kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src1_offset,
                                     __global char *src2, int src2_step, int src2_offset,
                                     __global char *dst, int dst_step, int dst_offset,
                                     int rows, int cols, int dst_step1)
{
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    const int byte_off = gx << 2;
    const int a_idx = mad24(gy, src1_step, byte_off + src1_offset);
    const int b_idx = mad24(gy, src2_step, byte_off + src2_offset);
    const int d_idx = mad24(gy, dst_step, byte_off + dst_offset);

    const char4 a = *((__global char4 *)(src1 + a_idx));
    const char4 b = *((__global char4 *)(src2 + b_idx));

    *((__global char4 *)(dst + d_idx)) = a ^ b;
}
||||
#if defined (DOUBLE_SUPPORT)
// Per-element bitwise XOR for 64-bit elements (CV_64F) handled as raw bytes:
// each work-item XORs one 8-byte lane group via char8. Only compiled when
// the device supports double precision.
__kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src1_offset,
                                     __global char *src2, int src2_step, int src2_offset,
                                     __global char *dst, int dst_step, int dst_offset,
                                     int rows, int cols, int dst_step1)
{
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);

    if (gx >= cols || gy >= rows)
        return;

    const int byte_off = gx << 3;
    const int a_idx = mad24(gy, src1_step, byte_off + src1_offset);
    const int b_idx = mad24(gy, src2_step, byte_off + src2_offset);
    const int d_idx = mad24(gy, dst_step, byte_off + dst_offset);

    const char8 a = *((__global char8 *)(src1 + a_idx));
    const char8 b = *((__global char8 *)(src2 + b_idx));

    *((__global char8 *)(dst + d_idx)) = a ^ b;
}
#endif
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,407 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Jin Ma jin@multicorewareinc.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution.
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
// Central-difference gradient of a single-channel float image:
//   dx = 0.5 * (src[y][x+1] - src[y][x-1])
//   dy = 0.5 * (src[y+1][x] - src[y-1][x])
// with neighbour indices clamped at the image border. dx and dy share the
// same row stride dx_step. (Removed a block of commented-out debug printfs.)
__kernel void centeredGradientKernel(__global const float* src, int src_col, int src_row, int src_step,
    __global float* dx, __global float* dy, int dx_step)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if((x < src_col)&&(y < src_row))
    {
        // horizontal neighbours, clamped to the valid column range
        int src_x1 = (x + 1) < (src_col -1)? (x + 1) : (src_col - 1);
        int src_x2 = (x - 1) > 0 ? (x -1) : 0;
        dx[y * dx_step+ x] = 0.5f * (src[y * src_step + src_x1] - src[y * src_step+ src_x2]);

        // vertical neighbours, clamped to the valid row range
        int src_y1 = (y+1) < (src_row - 1) ? (y + 1) : (src_row - 1);
        int src_y2 = (y - 1) > 0 ? (y - 1) : 0;
        dy[y * dx_step+ x] = 0.5f * (src[src_y1 * src_step + x] - src[src_y2 * src_step+ x]);
    }
}
||||
|
||||
// Weight of the cubic interpolation kernel at (signed) distance x_ from the
// sample point; symmetric in x_ and zero outside |x_| < 2.
float bicubicCoeff(float x_)
{
    const float d = fabs(x_);

    if (d >= 2.0f)
        return 0.0f;

    if (d > 1.0f)
        return d * (d * (-0.5f * d + 2.5f) - 4.0f) + 2.0f;

    return d * d * (1.5f * d - 2.5f) + 1.0f;
}
||||
|
||||
// Warp I1 (and its x/y derivatives, supplied as 2D images) backwards through
// the current flow field (u1, u2) using 4x4 bicubic interpolation; also
// stores |grad(I1w)|^2 and the constant part of the TV-L1 rho function.
__kernel void warpBackwardKernel(__global const float* I0, int I0_step, int I0_col, int I0_row,
    image2d_t tex_I1, image2d_t tex_I1x, image2d_t tex_I1y,
    __global const float* u1, int u1_step,
    __global const float* u2,
    __global float* I1w,
    __global float* I1wx, /*int I1wx_step,*/
    __global float* I1wy, /*int I1wy_step,*/
    __global float* grad, /*int grad_step,*/
    __global float* rho,
    int I1w_step,
    int u2_step,
    int u1_offset_x,
    int u1_offset_y,
    int u2_offset_x,
    int u2_offset_y)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if(x < I0_col&&y < I0_row)
    {
        // flow vector at this pixel
        const float u1Val = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
        const float u2Val = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];

        // warped (sub-pixel) sample position in I1
        const float wx = x + u1Val;
        const float wy = y + u2Val;

        // 4x4 support window of the bicubic kernel around (wx, wy)
        const int xmin = ceil(wx - 2.0f);
        const int xmax = floor(wx + 2.0f);

        const int ymin = ceil(wy - 2.0f);
        const int ymax = floor(wy + 2.0f);

        float sum = 0.0f;
        float sumx = 0.0f;
        float sumy = 0.0f;
        float wsum = 0.0f;
        // out-of-range taps are clamped to the image edge by the sampler
        sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

        for (int cy = ymin; cy <= ymax; ++cy)
        {
            for (int cx = xmin; cx <= xmax; ++cx)
            {
                // separable bicubic weight for this tap
                const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);

                int2 cood = (int2)(cx, cy);
                sum += w * read_imagef(tex_I1, sampleri, cood).x;
                sumx += w * read_imagef(tex_I1x, sampleri, cood).x;
                sumy += w * read_imagef(tex_I1y, sampleri, cood).x;

                wsum += w;
            }
        }

        // normalise by the accumulated weight
        const float coeff = 1.0f / wsum;

        const float I1wVal = sum * coeff;
        const float I1wxVal = sumx * coeff;
        const float I1wyVal = sumy * coeff;

        I1w[y * I1w_step + x] = I1wVal;
        I1wx[y * I1w_step + x] = I1wxVal;
        I1wy[y * I1w_step + x] = I1wyVal;

        const float Ix2 = I1wxVal * I1wxVal;
        const float Iy2 = I1wyVal * I1wyVal;

        // store the |Grad(I1)|^2
        grad[y * I1w_step + x] = Ix2 + Iy2;

        // compute the constant part of the rho function
        const float I0Val = I0[y * I0_step + x];
        rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
    }

}
||||
|
||||
// Fetch image[y][x] with both coordinates clamped to the valid range
// (replicate-border behaviour). elemCntPerRow is the row stride in elements.
// Removed the unused i1/j1 locals the original computed and never read.
float readImage(__global const float *image, const int x, const int y, const int rows, const int cols, const int elemCntPerRow)
{
    const int i0 = clamp(x, 0, cols - 1);
    const int j0 = clamp(y, 0, rows - 1);

    return image[j0 * elemCntPerRow + i0];
}
||||
|
||||
// Fallback of warpBackwardKernel for devices without image2d support: the
// warped image and its derivatives are read through clamped buffer fetches
// (readImage) instead of texture sampling. Outputs are identical in meaning.
__kernel void warpBackwardKernelNoImage2d(__global const float* I0, int I0_step, int I0_col, int I0_row,
    __global const float* tex_I1, __global const float* tex_I1x, __global const float* tex_I1y,
    __global const float* u1, int u1_step,
    __global const float* u2,
    __global float* I1w,
    __global float* I1wx, /*int I1wx_step,*/
    __global float* I1wy, /*int I1wy_step,*/
    __global float* grad, /*int grad_step,*/
    __global float* rho,
    int I1w_step,
    int u2_step,
    int I1_step,
    int I1x_step)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if(x < I0_col&&y < I0_row)
    {
        // flow vector at this pixel
        const float u1Val = u1[y * u1_step + x];
        const float u2Val = u2[y * u2_step + x];

        // warped (sub-pixel) sample position in I1
        const float wx = x + u1Val;
        const float wy = y + u2Val;

        // 4x4 support window of the bicubic kernel around (wx, wy)
        const int xmin = ceil(wx - 2.0f);
        const int xmax = floor(wx + 2.0f);

        const int ymin = ceil(wy - 2.0f);
        const int ymax = floor(wy + 2.0f);

        float sum = 0.0f;
        float sumx = 0.0f;
        float sumy = 0.0f;
        float wsum = 0.0f;

        for (int cy = ymin; cy <= ymax; ++cy)
        {
            for (int cx = xmin; cx <= xmax; ++cx)
            {
                // separable bicubic weight for this tap
                const float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);

                int2 cood = (int2)(cx, cy);
                // NOTE(review): readImage receives (rows=I0_col, cols=I0_row)
                // in this argument order — looks swapped; confirm against the
                // host-side caller before relying on border behaviour.
                sum += w * readImage(tex_I1, cood.x, cood.y, I0_col, I0_row, I1_step);
                sumx += w * readImage(tex_I1x, cood.x, cood.y, I0_col, I0_row, I1x_step);
                sumy += w * readImage(tex_I1y, cood.x, cood.y, I0_col, I0_row, I1x_step);
                wsum += w;
            }
        }

        // normalise by the accumulated weight
        const float coeff = 1.0f / wsum;

        const float I1wVal = sum * coeff;
        const float I1wxVal = sumx * coeff;
        const float I1wyVal = sumy * coeff;

        I1w[y * I1w_step + x] = I1wVal;
        I1wx[y * I1w_step + x] = I1wxVal;
        I1wy[y * I1w_step + x] = I1wyVal;

        const float Ix2 = I1wxVal * I1wxVal;
        const float Iy2 = I1wyVal * I1wyVal;

        // store the |Grad(I1)|^2
        grad[y * I1w_step + x] = Ix2 + Iy2;

        // compute the constant part of the rho function
        const float I0Val = I0[y * I0_step + x];
        rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
    }

}
||||
|
||||
|
||||
// One update step of the dual variables (p11, p12, p21, p22) of the TV-L1
// solver: p := (p + taut * grad(u)) / (1 + taut * |grad(u)|), using forward
// differences with the neighbour index clamped at the right/bottom border.
// The original computed the identical clamped indices twice (src_x1==src_x2,
// src_y1==src_y2); they are now computed once and reused — behaviour is
// unchanged.
__kernel void estimateDualVariablesKernel(__global const float* u1, int u1_col, int u1_row, int u1_step,
    __global const float* u2,
    __global float* p11, int p11_step,
    __global float* p12,
    __global float* p21,
    __global float* p22,
    const float taut,
    int u2_step,
    int u1_offset_x,
    int u1_offset_y,
    int u2_offset_x,
    int u2_offset_y)
{
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    if(x < u1_col && y < u1_row)
    {
        // forward-difference neighbour indices, clamped at the border
        const int x1 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);
        const int y1 = (y + 1) < (u1_row - 1) ? (y + 1) : (u1_row - 1);

        // centre values, read once
        const float u1c = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
        const float u2c = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];

        // forward differences of u1 and u2
        const float u1x = u1[(y + u1_offset_y) * u1_step + x1 + u1_offset_x] - u1c;
        const float u1y = u1[(y1 + u1_offset_y) * u1_step + x + u1_offset_x] - u1c;
        const float u2x = u2[(y + u2_offset_y) * u2_step + x1 + u2_offset_x] - u2c;
        const float u2y = u2[(y1 + u2_offset_y) * u2_step + x + u2_offset_x] - u2c;

        // gradient magnitudes and normalisation factors
        const float g1 = hypot(u1x, u1y);
        const float g2 = hypot(u2x, u2y);

        const float ng1 = 1.0f + taut * g1;
        const float ng2 = 1.0f + taut * g2;

        p11[y * p11_step + x] = (p11[y * p11_step + x] + taut * u1x) / ng1;
        p12[y * p11_step + x] = (p12[y * p11_step + x] + taut * u1y) / ng1;
        p21[y * p11_step + x] = (p21[y * p11_step + x] + taut * u2x) / ng2;
        p22[y * p11_step + x] = (p22[y * p11_step + x] + taut * u2y) / ng2;
    }
}
||||
|
||||
// Backward-difference divergence of the dual field (v1, v2) at (y, x);
// the first row and first column fall back to one-sided differences.
float divergence(__global const float* v1, __global const float* v2, int y, int x, int v1_step, int v2_step)
{
    if (x > 0 && y > 0)
    {
        const float dv1 = v1[y * v1_step + x] - v1[y * v1_step + x - 1];
        const float dv2 = v2[y * v2_step + x] - v2[(y - 1) * v2_step + x];
        return dv1 + dv2;
    }

    if (y > 0)  // first column, interior row
        return v1[y * v1_step + 0] + v2[y * v2_step + 0] - v2[(y - 1) * v2_step + 0];

    if (x > 0)  // first row, interior column
        return v1[0 * v1_step + x] - v1[0 * v1_step + x - 1] + v2[0 * v2_step + x];

    // top-left corner
    return v1[0 * v1_step + 0] + v2[0 * v2_step + 0];
}
||||
|
||||
// One primal update step of the TV-L1 solver: applies the thresholding
// operator TH to get (v1, v2), adds theta times the divergence of the dual
// variables, writes the new flow back into (u1, u2) and stores the squared
// per-pixel change in `error` for the convergence check.
__kernel void estimateUKernel(__global const float* I1wx, int I1wx_col, int I1wx_row, int I1wx_step,
    __global const float* I1wy, /*int I1wy_step,*/
    __global const float* grad, /*int grad_step,*/
    __global const float* rho_c, /*int rho_c_step,*/
    __global const float* p11, /*int p11_step,*/
    __global const float* p12, /*int p12_step,*/
    __global const float* p21, /*int p21_step,*/
    __global const float* p22, /*int p22_step,*/
    __global float* u1, int u1_step,
    __global float* u2,
    __global float* error, const float l_t, const float theta, int u2_step,
    int u1_offset_x,
    int u1_offset_y,
    int u2_offset_x,
    int u2_offset_y)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if(x < I1wx_col && y < I1wx_row)
    {
        const float I1wxVal = I1wx[y * I1wx_step + x];
        const float I1wyVal = I1wy[y * I1wx_step + x];
        const float gradVal = grad[y * I1wx_step + x];
        const float u1OldVal = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];
        const float u2OldVal = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];

        // full rho: constant part plus the linearised data term at (u1, u2)
        const float rho = rho_c[y * I1wx_step + x] + (I1wxVal * u1OldVal + I1wyVal * u2OldVal);

        // estimate the values of the variable (v1, v2) (thresholding operator TH)
        float d1 = 0.0f;
        float d2 = 0.0f;

        if (rho < -l_t * gradVal)
        {
            d1 = l_t * I1wxVal;
            d2 = l_t * I1wyVal;
        }
        else if (rho > l_t * gradVal)
        {
            d1 = -l_t * I1wxVal;
            d2 = -l_t * I1wyVal;
        }
        else if (gradVal > 1.192092896e-07f)   // FLT_EPSILON: avoid divide-by-zero
        {
            const float fi = -rho / gradVal;
            d1 = fi * I1wxVal;
            d2 = fi * I1wyVal;
        }

        const float v1 = u1OldVal + d1;
        const float v2 = u2OldVal + d2;

        // compute the divergence of the dual variable (p1, p2)
        const float div_p1 = divergence(p11, p12, y, x, I1wx_step, I1wx_step);
        const float div_p2 = divergence(p21, p22, y, x, I1wx_step, I1wx_step);

        // estimate the values of the optical flow (u1, u2)
        const float u1NewVal = v1 + theta * div_p1;
        const float u2NewVal = v2 + theta * div_p2;

        u1[(y + u1_offset_y) * u1_step + x + u1_offset_x] = u1NewVal;
        u2[(y + u2_offset_y) * u2_step + x + u2_offset_x] = u2NewVal;

        // squared magnitude of the update, used as the convergence criterion
        const float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal);
        const float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal);
        error[y * I1wx_step + x] = n1 + n2;
    }

}
@ -0,0 +1,756 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Jia Haipeng, jiahaipeng95@gmail.com
|
||||
// Jin Ma, jin@multicorewareinc.com
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "precomp.hpp" |
||||
|
||||
using namespace cv; |
||||
using namespace cv::ocl; |
||||
using namespace std; |
||||
|
||||
#if !defined (HAVE_OPENCL) |
||||
|
||||
namespace cv |
||||
{ |
||||
namespace ocl |
||||
{ |
||||
|
||||
// CPU-only stubs used when OpenCL support is compiled out (HAVE_OPENCL not
// defined): every public entry point of StereoConstantSpaceBP immediately
// reports the missing GPU backend via throw_nogpu().

// Stub: parameter-estimation helper, unavailable without OpenCL.
void cv::ocl::StereoConstantSpaceBP::estimateRecommendedParams(int, int, int &, int &, int &, int &)
{
    throw_nogpu();
}
// Stub constructor (ndisp/iters/levels/nr_plane/msg_type form).
cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, int)
{
    throw_nogpu();
}
// Stub constructor (full-parameter form with term weights/thresholds).
cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int, int, int, int, float, float,
        float, float, int, int)
{
    throw_nogpu();
}

// Stub of the disparity-computation operator.
void cv::ocl::StereoConstantSpaceBP::operator()(const oclMat &, const oclMat &, oclMat &)
{
    throw_nogpu();
}
||||
} |
||||
} |
||||
|
||||
#else /* !defined (HAVE_OPENCL) */ |
||||
|
||||
namespace cv |
||||
{ |
||||
namespace ocl |
||||
{ |
||||
|
||||
///////////////////////////OpenCL kernel Strings///////////////////////////
|
||||
extern const char *stereocsbp; |
||||
} |
||||
|
||||
} |
||||
namespace cv |
||||
{ |
||||
namespace ocl |
||||
{ |
||||
namespace stereoCSBP |
||||
{ |
||||
//////////////////////////////////////////////////////////////////////////
|
||||
//////////////////////////////common////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// Integer ceiling division: the number of grains needed to cover `total`
// items (both arguments assumed non-negative, grain > 0).
static inline int divUp(int item_count, int grain_size)
{
    const int rounded = item_count + grain_size - 1;
    return rounded / grain_size;
}
||||
// Pick the kernel-name suffix by message-storage type: "<base>0" for CV_16S
// messages, "<base>1" for the float variant.
static String get_kernel_name(String kernel_name, int data_type)
{
    if (data_type == CV_16S)
        return kernel_name + "0";

    return kernel_name + "1";
}
||||
using cv::ocl::StereoConstantSpaceBP; |
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
/////////////////////////////////init_data_cost//////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////////
|
||||
// Launches the "init_data_cost_{0|1}" kernel: computes the per-pixel data cost
// for every disparity at pyramid `level`, writing into `temp`. Used for the
// two finest levels (see init_data_cost); coarser levels use the reduce variant.
// cdisp_step1 is the per-disparity plane stride (in elements) inside `temp`.
static void init_data_cost_caller(const oclMat &left, const oclMat &right, oclMat &temp,
                                  StereoConstantSpaceBP &rthis,
                                  int msg_step, int h, int w, int level)
{
    Context *clCxt = left.clCxt;
    int data_type = rthis.msg_type;
    int channels = left.oclchannels();

    // Kernel name suffix selects the CV_16S ("0") or CV_32F ("1") specialization.
    String kernelName = get_kernel_name("init_data_cost_", data_type);

    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);

    //size_t blockSize = 256;
    // One work-item per output pixel, 32x8 work-groups, grid rounded up.
    size_t localThreads[] = {32, 8 ,1};
    size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
                              divUp(h, localThreads[1]) *localThreads[1],
                              1
                             };

    int cdisp_step1 = msg_step * h;
    openCLVerifyKernel(clCxt, kernel, localThreads);
    // Argument indices must match the kernel's parameter order exactly.
    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&temp.data));
    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&left.data));
    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&right.data));
    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&h));
    openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&w));
    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&level));
    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&channels));
    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&msg_step));
    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_float), (void *)&rthis.data_weight));
    openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_float), (void *)&rthis.max_data_term));
    openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&cdisp_step1));
    openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&rthis.min_disp_th));
    openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&left.step));
    openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&rthis.ndisp));
    openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
                                          globalThreads, localThreads, 0, NULL, NULL));

    // Synchronous execution: block until the kernel finishes before releasing it.
    clFinish(*(cl_command_queue*)getoclCommandQueue());
    openCLSafeCall(clReleaseKernel(kernel));
}
||||
|
||||
// Launches the "init_data_cost_reduce_{0|1}" kernel: like init_data_cost_caller
// but aggregates the cost over a (2^level)-wide window per output pixel using a
// local-memory reduction, for coarse pyramid levels (level > 1).
static void init_data_cost_reduce_caller(const oclMat &left, const oclMat &right, oclMat &temp,
        StereoConstantSpaceBP &rthis,
        int msg_step, int h, int w, int level)
{

    Context *clCxt = left.clCxt;
    int data_type = rthis.msg_type;
    int channels = left.oclchannels();
    // Reduction window width shrinks the effective resolution at this level.
    int win_size = (int)std::pow(2.f, level);

    String kernelName = get_kernel_name("init_data_cost_reduce_", data_type);

    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);

    const int threadsNum = 256;
    //size_t blockSize = threadsNum;
    // x: one lane per window column; z: disparities handled in parallel.
    size_t localThreads[3] = {win_size, 1, threadsNum / win_size};
    size_t globalThreads[3] = {w *localThreads[0],
                               h * divUp(rthis.ndisp, localThreads[2]) *localThreads[1], 1 * localThreads[2]
                              };

    // Scratch local memory for the per-group reduction (one float per thread).
    int local_mem_size = threadsNum * sizeof(float);
    int cdisp_step1 = msg_step * h;

    openCLVerifyKernel(clCxt, kernel, localThreads);

    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&temp.data));
    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&left.data));
    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&right.data));
    openCLSafeCall(clSetKernelArg(kernel, 3, local_mem_size, (void *)NULL));
    openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&level));
    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&left.rows));
    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.cols));
    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&h));
    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&win_size));
    openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&channels));
    openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&rthis.ndisp));
    openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&left.step));
    openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_float), (void *)&rthis.data_weight));
    openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_float), (void *)&rthis.max_data_term));
    openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int), (void *)&rthis.min_disp_th));
    openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&cdisp_step1));
    openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&msg_step));
    openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 3, NULL,
                                          globalThreads, localThreads, 0, NULL, NULL));
    clFinish(*(cl_command_queue*)getoclCommandQueue());
    openCLSafeCall(clReleaseKernel(kernel));
}
||||
|
||||
// Launches "get_first_k_initial_local_{0|1}": selects the first nr_plane
// disparity planes per pixel from the costs in `temp` (local minima variant),
// writing the selected costs and disparities into the given buffers.
static void get_first_initial_local_caller(uchar *data_cost_selected, uchar *disp_selected_pyr,
        oclMat &temp, StereoConstantSpaceBP &rthis,
        int h, int w, int nr_plane, int msg_step)
{
    Context *clCxt = temp.clCxt;
    int data_type = rthis.msg_type;

    String kernelName = get_kernel_name("get_first_k_initial_local_", data_type);

    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);

    //size_t blockSize = 256;
    // One work-item per pixel, 32x8 work-groups.
    size_t localThreads[] = {32, 8 ,1};
    size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
                              divUp(h, localThreads[1]) *localThreads[1],
                              1
                             };

    // Per-plane stride (in elements) inside the selection buffers.
    int disp_step = msg_step * h;
    openCLVerifyKernel(clCxt, kernel, localThreads);
    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&data_cost_selected));
    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&disp_selected_pyr));
    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&temp.data));
    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&h));
    openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&w));
    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&nr_plane));
    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&msg_step));
    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&disp_step));
    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&rthis.ndisp));
    openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
                                          globalThreads, localThreads, 0, NULL, NULL));

    clFinish(*(cl_command_queue*)getoclCommandQueue());
    openCLSafeCall(clReleaseKernel(kernel));
}
||||
// Launches "get_first_k_initial_global_{0|1}": like the local variant, but
// picks the globally cheapest nr_plane disparity planes per pixel.
// Kernel argument layout is identical to get_first_initial_local_caller.
static void get_first_initial_global_caller(uchar *data_cost_selected, uchar *disp_selected_pyr,
        oclMat &temp, StereoConstantSpaceBP &rthis,
        int h, int w, int nr_plane, int msg_step)
{
    Context *clCxt = temp.clCxt;
    int data_type = rthis.msg_type;

    String kernelName = get_kernel_name("get_first_k_initial_global_", data_type);

    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);

    //size_t blockSize = 256;
    size_t localThreads[] = {32, 8, 1};
    size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
                              divUp(h, localThreads[1]) *localThreads[1],
                              1
                             };

    // Per-plane stride (in elements) inside the selection buffers.
    int disp_step = msg_step * h;
    openCLVerifyKernel(clCxt, kernel, localThreads);
    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&data_cost_selected));
    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&disp_selected_pyr));
    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&temp.data));
    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&h));
    openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&w));
    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&nr_plane));
    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&msg_step));
    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&disp_step));
    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&rthis.ndisp));
    openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
                                          globalThreads, localThreads, 0, NULL, NULL));

    clFinish(*(cl_command_queue*)getoclCommandQueue());
    openCLSafeCall(clReleaseKernel(kernel));
}
||||
|
||||
static void init_data_cost(const oclMat &left, const oclMat &right, oclMat &temp, StereoConstantSpaceBP &rthis, |
||||
uchar *disp_selected_pyr, uchar *data_cost_selected, |
||||
size_t msg_step, int h, int w, int level, int nr_plane) |
||||
{ |
||||
|
||||
if(level <= 1) |
||||
init_data_cost_caller(left, right, temp, rthis, msg_step, h, w, level); |
||||
else |
||||
init_data_cost_reduce_caller(left, right, temp, rthis, msg_step, h, w, level); |
||||
|
||||
if(rthis.use_local_init_data_cost == true) |
||||
{ |
||||
get_first_initial_local_caller(data_cost_selected, disp_selected_pyr, temp, rthis, h, w, nr_plane, msg_step); |
||||
} |
||||
else |
||||
{ |
||||
get_first_initial_global_caller(data_cost_selected, disp_selected_pyr, temp, rthis, h, w, |
||||
nr_plane, msg_step); |
||||
} |
||||
} |
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
///////////////////////////////////compute_data_cost//////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Launches "compute_data_cost_{0|1}": recomputes the data cost at level
// `level` (h x w) restricted to the disparity planes previously selected at
// the coarser level (h2 rows), for the two finest levels.
static void compute_data_cost_caller(uchar *disp_selected_pyr, uchar *data_cost,
                                     StereoConstantSpaceBP &rthis, int msg_step1,
                                     int msg_step2, const oclMat &left, const oclMat &right, int h,
                                     int w, int h2, int level, int nr_plane)
{
    Context *clCxt = left.clCxt;
    int channels = left.oclchannels();
    int data_type = rthis.msg_type;

    String kernelName = get_kernel_name("compute_data_cost_", data_type);

    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);

    //size_t blockSize = 256;
    size_t localThreads[] = {32, 8, 1};
    size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0],
                              divUp(h, localThreads[1]) *localThreads[1],
                              1
                             };

    // Plane strides for the current (1) and coarser (2) level buffers.
    int disp_step1 = msg_step1 * h;
    int disp_step2 = msg_step2 * h2;
    openCLVerifyKernel(clCxt, kernel, localThreads);
    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&disp_selected_pyr));
    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&data_cost));
    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&left.data));
    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&right.data));
    openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&h));
    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&w));
    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&level));
    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&nr_plane));
    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&channels));
    openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&msg_step1));
    openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&msg_step2));
    openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&disp_step1));
    openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step2));
    openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_float), (void *)&rthis.data_weight));
    openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_float), (void *)&rthis.max_data_term));
    openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&left.step));
    openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&rthis.min_disp_th));
    openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
                                          globalThreads, localThreads, 0, NULL, NULL));

    clFinish(*(cl_command_queue*)getoclCommandQueue());
    openCLSafeCall(clReleaseKernel(kernel));
}
||||
// Launches "compute_data_cost_reduce_{0|1}": reduce variant of
// compute_data_cost_caller that aggregates the cost over a (2^level)-wide
// window with a local-memory reduction, for coarse pyramid levels.
static void compute_data_cost_reduce_caller(uchar *disp_selected_pyr, uchar *data_cost,
        StereoConstantSpaceBP &rthis, int msg_step1,
        int msg_step2, const oclMat &left, const oclMat &right, int h,
        int w, int h2, int level, int nr_plane)
{
    Context *clCxt = left.clCxt;
    int data_type = rthis.msg_type;
    int channels = left.oclchannels();
    int win_size = (int)std::pow(2.f, level);

    String kernelName = get_kernel_name("compute_data_cost_reduce_", data_type);

    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);

    const size_t threadsNum = 256;
    //size_t blockSize = threadsNum;
    // x: one lane per window column; z: selected planes handled in parallel.
    size_t localThreads[3] = {win_size, 1, threadsNum / win_size};
    size_t globalThreads[3] = {w *localThreads[0],
                               h * divUp(nr_plane, localThreads[2]) *localThreads[1], 1 * localThreads[2]
                              };

    // Plane strides for the current (1) and coarser (2) level buffers.
    int disp_step1 = msg_step1 * h;
    int disp_step2 = msg_step2 * h2;
    // Scratch local memory for the per-group reduction (one float per thread).
    size_t local_mem_size = threadsNum * sizeof(float);
    openCLVerifyKernel(clCxt, kernel, localThreads);
    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&disp_selected_pyr));
    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&data_cost));
    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&left.data));
    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&right.data));
    openCLSafeCall(clSetKernelArg(kernel, 4, local_mem_size, (void *)NULL));
    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&level));
    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&left.rows));
    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&left.cols));
    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&h));
    openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&nr_plane));
    openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&channels));
    openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&win_size));
    openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&msg_step1));
    openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&msg_step2));
    openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int), (void *)&disp_step1));
    openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&disp_step2));
    openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_float), (void *)&rthis.data_weight));
    openCLSafeCall(clSetKernelArg(kernel, 17, sizeof(cl_float), (void *)&rthis.max_data_term));
    openCLSafeCall(clSetKernelArg(kernel, 18, sizeof(cl_int), (void *)&left.step));
    openCLSafeCall(clSetKernelArg(kernel, 19, sizeof(cl_int), (void *)&rthis.min_disp_th));
    openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 3, NULL,
                                          globalThreads, localThreads, 0, NULL, NULL));

    clFinish(*(cl_command_queue*)getoclCommandQueue());
    openCLSafeCall(clReleaseKernel(kernel));
}
||||
// Recomputes the data cost at pyramid level `level`, restricted to the planes
// selected at the coarser level. Same level-based dispatch as init_data_cost:
// direct kernel for levels 0/1, windowed reduction for coarser levels.
static void compute_data_cost(uchar *disp_selected_pyr, uchar *data_cost, StereoConstantSpaceBP &rthis,
                              int msg_step1, int msg_step2, const oclMat &left, const oclMat &right, int h, int w,
                              int h2, int level, int nr_plane)
{
    if (level > 1)
    {
        compute_data_cost_reduce_caller(disp_selected_pyr, data_cost, rthis, msg_step1, msg_step2,
                                        left, right, h, w, h2, level, nr_plane);
    }
    else
    {
        compute_data_cost_caller(disp_selected_pyr, data_cost, rthis, msg_step1, msg_step2,
                                 left, right, h, w, h2, level, nr_plane);
    }
}
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
//////////////////////////////////////init message//////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
static void init_message(uchar *u_new, uchar *d_new, uchar *l_new, uchar *r_new, |
||||
uchar *u_cur, uchar *d_cur, uchar *l_cur, uchar *r_cur, |
||||
uchar *disp_selected_pyr_new, uchar *disp_selected_pyr_cur, |
||||
uchar *data_cost_selected, uchar *data_cost, oclMat &temp, StereoConstantSpaceBP rthis, |
||||
size_t msg_step1, size_t msg_step2, int h, int w, int nr_plane, |
||||
int h2, int w2, int nr_plane2) |
||||
{ |
||||
Context *clCxt = temp.clCxt; |
||||
int data_type = rthis.msg_type; |
||||
|
||||
String kernelName = get_kernel_name("init_message_", data_type); |
||||
|
||||
cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName); |
||||
|
||||
//size_t blockSize = 256;
|
||||
size_t localThreads[] = {32, 8, 1}; |
||||
size_t globalThreads[] = {divUp(w, localThreads[0]) *localThreads[0], |
||||
divUp(h, localThreads[1]) *localThreads[1], |
||||
1 |
||||
}; |
||||
|
||||
int disp_step1 = msg_step1 * h; |
||||
int disp_step2 = msg_step2 * h2; |
||||
openCLVerifyKernel(clCxt, kernel, localThreads); |
||||
openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&u_new)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_new)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&l_new)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&r_new)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&u_cur)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&d_cur)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&l_cur)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_mem), (void *)&r_cur)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_mem), (void *)&temp.data)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_mem), (void *)&disp_selected_pyr_new)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_mem), (void *)&disp_selected_pyr_cur)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_mem), (void *)&data_cost_selected)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_mem), (void *)&data_cost)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&h)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_int), (void *)&w)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 15, sizeof(cl_int), (void *)&nr_plane)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 16, sizeof(cl_int), (void *)&h2)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 17, sizeof(cl_int), (void *)&w2)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 18, sizeof(cl_int), (void *)&nr_plane2)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 19, sizeof(cl_int), (void *)&disp_step1)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 20, sizeof(cl_int), (void *)&disp_step2)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 21, sizeof(cl_int), (void *)&msg_step1)); |
||||
openCLSafeCall(clSetKernelArg(kernel, 22, sizeof(cl_int), (void *)&msg_step2)); |
||||
openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL, |
||||
globalThreads, localThreads, 0, NULL, NULL)); |
||||
|
||||
clFinish(*(cl_command_queue*)getoclCommandQueue()); |
||||
openCLSafeCall(clReleaseKernel(kernel)); |
||||
} |
||||
////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
///////////////////////////calc_all_iterations////////////////////////////////////////////////
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Launches "compute_message_{0|1}": one belief-propagation message-passing
// sweep over the pixels whose parity matches `i` (each work-item covers two
// columns, hence the x-grid is halved via << 1).
// NOTE(review): rthis is copied by value here, unlike the sibling callers
// which take a reference — presumably unintentional; harmless but wasteful.
static void calc_all_iterations_caller(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected,
                                       uchar *disp_selected_pyr, oclMat &temp, StereoConstantSpaceBP rthis,
                                       int msg_step, int h, int w, int nr_plane, int i)
{
    Context *clCxt = temp.clCxt;
    int data_type = rthis.msg_type;

    String kernelName = get_kernel_name("compute_message_", data_type);

    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);
    size_t localThreads[] = {32, 8, 1};
    size_t globalThreads[] = {divUp(w, (localThreads[0]) << 1) *localThreads[0],
                              divUp(h, localThreads[1]) *localThreads[1],
                              1
                             };

    // Per-plane stride (in elements) inside the message buffers.
    int disp_step = msg_step * h;
    openCLVerifyKernel(clCxt, kernel, localThreads);
    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&u));
    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d));
    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&l));
    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&r));
    openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&data_cost_selected));
    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&disp_selected_pyr));
    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&temp.data));
    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&h));
    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&w));
    openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&nr_plane));
    openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&i));
    openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_float), (void *)&rthis.max_disc_term));
    openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step));
    openCLSafeCall(clSetKernelArg(kernel, 13, sizeof(cl_int), (void *)&msg_step));
    openCLSafeCall(clSetKernelArg(kernel, 14, sizeof(cl_float), (void *)&rthis.disc_single_jump));
    openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
                                          globalThreads, localThreads, 0, NULL, NULL));

    clFinish(*(cl_command_queue*)getoclCommandQueue());
    openCLSafeCall(clReleaseKernel(kernel));
}
||||
// Runs the message-passing kernel rthis.iters times at the current pyramid
// level, alternating the 0/1 phase (t & 1) passed to the kernel each pass.
static void calc_all_iterations(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected,
                                uchar *disp_selected_pyr, oclMat &temp, StereoConstantSpaceBP rthis,
                                int msg_step, int h, int w, int nr_plane)
{
    const int total_iters = rthis.iters;
    for (int iter = 0; iter < total_iters; ++iter)
    {
        const int phase = iter & 1;
        calc_all_iterations_caller(u, d, l, r, data_cost_selected, disp_selected_pyr, temp, rthis,
                                   msg_step, h, w, nr_plane, phase);
    }
}
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////
|
||||
//////////////////////////compute_disp////////////////////////////////////////////////////////
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Launches "compute_disp_{0|1}": for each pixel, sums the four incoming
// messages with the selected data costs and writes the winning disparity
// into `disp`.
// NOTE(review): msg_step is size_t but is bound at arg 11 with sizeof(cl_int)
// and the address of the size_t — this only reads the intended low 32 bits on
// little-endian hosts; verify/normalize to an int local as in init_message.
static void compute_disp(uchar *u, uchar *d, uchar *l, uchar *r, uchar *data_cost_selected,
                         uchar *disp_selected_pyr, StereoConstantSpaceBP &rthis, size_t msg_step,
                         oclMat &disp, int nr_plane)
{
    Context *clCxt = disp.clCxt;
    int data_type = rthis.msg_type;

    String kernelName = get_kernel_name("compute_disp_", data_type);

    cl_kernel kernel = openCLGetKernelFromSource(clCxt, &stereocsbp, kernelName);

    //size_t blockSize = 256;
    size_t localThreads[] = {32, 8, 1};
    size_t globalThreads[] = {divUp(disp.cols, localThreads[0]) *localThreads[0],
                              divUp(disp.rows, localThreads[1]) *localThreads[1],
                              1
                             };

    // Row stride of the output in elements (disp.step is in bytes).
    int step_size = disp.step / disp.elemSize();
    int disp_step = disp.rows * msg_step;
    openCLVerifyKernel(clCxt, kernel, localThreads);
    openCLSafeCall(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&u));
    openCLSafeCall(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d));
    openCLSafeCall(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&l));
    openCLSafeCall(clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&r));
    openCLSafeCall(clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&data_cost_selected));
    openCLSafeCall(clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&disp_selected_pyr));
    openCLSafeCall(clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&disp.data));
    openCLSafeCall(clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&step_size));
    openCLSafeCall(clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&disp.cols));
    openCLSafeCall(clSetKernelArg(kernel, 9, sizeof(cl_int), (void *)&disp.rows));
    openCLSafeCall(clSetKernelArg(kernel, 10, sizeof(cl_int), (void *)&nr_plane));
    openCLSafeCall(clSetKernelArg(kernel, 11, sizeof(cl_int), (void *)&msg_step));
    openCLSafeCall(clSetKernelArg(kernel, 12, sizeof(cl_int), (void *)&disp_step));
    openCLSafeCall(clEnqueueNDRangeKernel(*(cl_command_queue*)getoclCommandQueue(), kernel, 2, NULL,
                                          globalThreads, localThreads, 0, NULL, NULL));

    clFinish(*(cl_command_queue*)getoclCommandQueue());
    openCLSafeCall(clReleaseKernel(kernel));
}
||||
} |
||||
} |
||||
} |
||||
namespace
{
    // Default tuning parameters used by the short StereoConstantSpaceBP
    // constructor (data term truncation/weight, discontinuity term
    // truncation, and per-step discontinuity cost).
    const float DEFAULT_MAX_DATA_TERM = 30.0f;
    const float DEFAULT_DATA_WEIGHT = 1.0f;
    const float DEFAULT_MAX_DISC_TERM = 160.0f;
    const float DEFAULT_DISC_SINGLE_JUMP = 10.0f;
}
||||
|
||||
// Heuristically derives algorithm parameters from the image size:
// ndisp grows with width (rounded up to even), iters/levels with the larger
// dimension, and nr_plane shrinks as more pyramid levels are used.
void cv::ocl::StereoConstantSpaceBP::estimateRecommendedParams(int width, int height, int &ndisp, int &iters, int &levels, int &nr_plane)
{
    // Roughly width / pi disparities, forced even.
    ndisp = (int) ((float) width / 3.14f);
    if ((ndisp & 1) != 0)
        ndisp++;

    int mm = ::max(width, height);
    iters = mm / 100 + ((mm > 1200) ? - 4 : 4);

    levels = (int)::log(static_cast<double>(mm)) * 2 / 3;
    if (levels == 0) levels++;

    // Planes kept per pixel at the finest level, halved for each extra level.
    nr_plane = (int) ((float) ndisp / std::pow(2.0, levels + 1));
}
||||
|
||||
// Short constructor: caller supplies the structural parameters (disparity
// count, iterations, pyramid levels, planes per pixel, message type); the
// cost-term parameters fall back to the DEFAULT_* constants above.
cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, int levels_, int nr_plane_,
        int msg_type_)

    : ndisp(ndisp_), iters(iters_), levels(levels_), nr_plane(nr_plane_),
      max_data_term(DEFAULT_MAX_DATA_TERM), data_weight(DEFAULT_DATA_WEIGHT),
      max_disc_term(DEFAULT_MAX_DISC_TERM), disc_single_jump(DEFAULT_DISC_SINGLE_JUMP), min_disp_th(0),
      msg_type(msg_type_), use_local_init_data_cost(true)
{
    // Only short (CV_16S) and float (CV_32F) message buffers are implemented.
    CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S);
}
||||
|
||||
|
||||
// Full constructor: caller supplies every algorithm parameter explicitly,
// including the data/discontinuity cost terms and the minimum disparity
// threshold.
cv::ocl::StereoConstantSpaceBP::StereoConstantSpaceBP(int ndisp_, int iters_, int levels_, int nr_plane_,
        float max_data_term_, float data_weight_, float max_disc_term_, float disc_single_jump_,
        int min_disp_th_, int msg_type_)
    : ndisp(ndisp_), iters(iters_), levels(levels_), nr_plane(nr_plane_),
      max_data_term(max_data_term_), data_weight(data_weight_),
      max_disc_term(max_disc_term_), disc_single_jump(disc_single_jump_), min_disp_th(min_disp_th_),
      msg_type(msg_type_), use_local_init_data_cost(true)
{
    // Only short (CV_16S) and float (CV_32F) message buffers are implemented.
    CV_Assert(msg_type_ == CV_32F || msg_type_ == CV_16S);
}
||||
|
||||
// Full constant-space belief-propagation pipeline for message type T
// (short or float): builds the pyramid geometry, allocates/zeroes all
// working buffers, runs the coarse-to-fine cost/message passes, and writes
// the final disparity into `disp`.
template<class T>
static void csbp_operator(StereoConstantSpaceBP &rthis, oclMat u[2], oclMat d[2], oclMat l[2], oclMat r[2],
                          oclMat disp_selected_pyr[2], oclMat &data_cost, oclMat &data_cost_selected,
                          oclMat &temp, oclMat &out, const oclMat &left, const oclMat &right, oclMat &disp)
{
    CV_DbgAssert(0 < rthis.ndisp && 0 < rthis.iters && 0 < rthis.levels && 0 < rthis.nr_plane
                 && left.rows == right.rows && left.cols == right.cols && left.type() == right.type());

    CV_Assert(rthis.levels <= 8 && (left.type() == CV_8UC1 || left.type() == CV_8UC3));

    const Scalar zero = Scalar::all(0);

    ////////////////////////////////////Init///////////////////////////////////////////////////
    int rows = left.rows;
    int cols = left.cols;

    // Cap the pyramid depth so the coarsest level still has >= 1 disparity.
    rthis.levels = min(rthis.levels, int(log((double)rthis.ndisp) / log(2.0)));
    int levels = rthis.levels;

    // Per-level geometry: width, height, planes kept, and row step (elements).
    AutoBuffer<int> buf(levels * 4);

    int *cols_pyr = buf;
    int *rows_pyr = cols_pyr + levels;
    int *nr_plane_pyr = rows_pyr + levels;
    int *step_pyr = nr_plane_pyr + levels;

    cols_pyr[0] = cols;
    rows_pyr[0] = rows;
    nr_plane_pyr[0] = rthis.nr_plane;

    // Rows are 64-byte aligned; each coarser level halves the resolution and
    // doubles the number of candidate planes.
    const int n = 64;
    step_pyr[0] = alignSize(cols * sizeof(T), n) / sizeof(T);
    for (int i = 1; i < levels; i++)
    {
        cols_pyr[i] = cols_pyr[i - 1] / 2;
        rows_pyr[i] = rows_pyr[i - 1]/ 2;

        nr_plane_pyr[i] = nr_plane_pyr[i - 1] * 2;

        step_pyr[i] = alignSize(cols_pyr[i] * sizeof(T), n) / sizeof(T);
    }

    Size msg_size(step_pyr[0], rows * nr_plane_pyr[0]);
    Size data_cost_size(step_pyr[0], rows * nr_plane_pyr[0] * 2);

    // Double-buffered directional messages (ping-pong between [0] and [1]).
    u[0].create(msg_size, DataType<T>::type);
    d[0].create(msg_size, DataType<T>::type);
    l[0].create(msg_size, DataType<T>::type);
    r[0].create(msg_size, DataType<T>::type);

    u[1].create(msg_size, DataType<T>::type);
    d[1].create(msg_size, DataType<T>::type);
    l[1].create(msg_size, DataType<T>::type);
    r[1].create(msg_size, DataType<T>::type);

    disp_selected_pyr[0].create(msg_size, DataType<T>::type);
    disp_selected_pyr[1].create(msg_size, DataType<T>::type);

    data_cost.create(data_cost_size, DataType<T>::type);
    data_cost_selected.create(msg_size, DataType<T>::type);

    // Scratch buffer must also fit the full-cost layout of the coarsest level.
    Size temp_size = data_cost_size;
    if (data_cost_size.width * data_cost_size.height < step_pyr[0] * rows_pyr[levels - 1] * rthis.ndisp)
        temp_size = Size(step_pyr[0], rows_pyr[levels - 1] * rthis.ndisp);

    temp.create(temp_size, DataType<T>::type);
    temp = zero;

    ///////////////////////////////// Compute////////////////////////////////////////////////

    //csbp::load_constants(rthis.ndisp, rthis.max_data_term, rthis.data_weight,
    //    rthis.max_disc_term, rthis.disc_single_jump, rthis.min_disp_th, left, right, temp);

    l[0] = zero;
    d[0] = zero;
    r[0] = zero;
    u[0] = zero;
    disp_selected_pyr[0] = zero;

    l[1] = zero;
    d[1] = zero;
    r[1] = zero;
    u[1] = zero;
    disp_selected_pyr[1] = zero;

    data_cost = zero;

    data_cost_selected = zero;

    int cur_idx = 0;

    // Coarse-to-fine: seed at the coarsest level, then refine plane selection
    // and messages at each finer level, iterating BP at every level.
    for (int i = levels - 1; i >= 0; i--)
    {
        if (i == levels - 1)
        {
            cv::ocl::stereoCSBP::init_data_cost(left, right, temp, rthis, disp_selected_pyr[cur_idx].data,
                                                data_cost_selected.data, step_pyr[0], rows_pyr[i], cols_pyr[i],
                                                i, nr_plane_pyr[i]);
        }
        else
        {
            cv::ocl::stereoCSBP::compute_data_cost(
                disp_selected_pyr[cur_idx].data, data_cost.data, rthis, step_pyr[0],
                step_pyr[0], left, right, rows_pyr[i], cols_pyr[i], rows_pyr[i + 1], i,
                nr_plane_pyr[i + 1]);

            int new_idx = (cur_idx + 1) & 1;

            cv::ocl::stereoCSBP::init_message(u[new_idx].data, d[new_idx].data, l[new_idx].data, r[new_idx].data,
                                              u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data,
                                              disp_selected_pyr[new_idx].data, disp_selected_pyr[cur_idx].data,
                                              data_cost_selected.data, data_cost.data, temp, rthis, step_pyr[0],
                                              step_pyr[0], rows_pyr[i], cols_pyr[i], nr_plane_pyr[i], rows_pyr[i + 1],
                                              cols_pyr[i + 1], nr_plane_pyr[i + 1]);
            cur_idx = new_idx;
        }
        cv::ocl::stereoCSBP::calc_all_iterations(u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data,
                data_cost_selected.data, disp_selected_pyr[cur_idx].data, temp,
                rthis, step_pyr[0], rows_pyr[i], cols_pyr[i], nr_plane_pyr[i]);
    }

    if (disp.empty())
        disp.create(rows, cols, CV_16S);

    // Compute into a CV_16S buffer; reuse `disp` directly when it already is.
    out = ((disp.type() == CV_16S) ? disp : (out.create(rows, cols, CV_16S), out));
    out = zero;

    stereoCSBP::compute_disp(u[cur_idx].data, d[cur_idx].data, l[cur_idx].data, r[cur_idx].data,
                             data_cost_selected.data, disp_selected_pyr[cur_idx].data, rthis, step_pyr[0],
                             out, nr_plane_pyr[0]);
    if (disp.type() != CV_16S)
        out.convertTo(disp, disp.type());
}
||||
|
||||
|
||||
typedef void (*csbp_operator_t)(StereoConstantSpaceBP &rthis, oclMat u[2], oclMat d[2], oclMat l[2], oclMat r[2], |
||||
oclMat disp_selected_pyr[2], oclMat &data_cost, oclMat &data_cost_selected, |
||||
oclMat &temp, oclMat &out, const oclMat &left, const oclMat &right, oclMat &disp); |
||||
|
||||
const static csbp_operator_t operators[] = {0, 0, 0, csbp_operator<short>, 0, csbp_operator<float>, 0, 0}; |
||||
|
||||
void cv::ocl::StereoConstantSpaceBP::operator()(const oclMat &left, const oclMat &right, oclMat &disp) |
||||
{ |
||||
|
||||
CV_Assert(msg_type == CV_32F || msg_type == CV_16S); |
||||
operators[msg_type](*this, u, d, l, r, disp_selected_pyr, data_cost, data_cost_selected, temp, out, |
||||
left, right, disp); |
||||
} |
||||
|
||||
#endif /* !defined (HAVE_OPENCL) */ |
@ -0,0 +1,475 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
|
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// @Authors
|
||||
// Jin Ma, jin@multicorewareinc.com
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other oclMaterials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
|
||||
#include "precomp.hpp" |
||||
using namespace std; |
||||
using namespace cv; |
||||
using namespace cv::ocl; |
||||
|
||||
namespace cv |
||||
{ |
||||
namespace ocl |
||||
{ |
||||
///////////////////////////OpenCL kernel strings///////////////////////////
|
||||
extern const char* tvl1flow; |
||||
} |
||||
} |
||||
|
||||
// Construct the solver with the default TV-L1 parameters.
cv::ocl::OpticalFlowDual_TVL1_OCL::OpticalFlowDual_TVL1_OCL()
{
    tau            = 0.25;  // time step of the numerical scheme
    lambda         = 0.15;  // weight of the data attachment term
    theta          = 0.3;   // tightness of the u/v coupling term
    nscales        = 5;     // number of pyramid scales
    warps          = 5;     // warpings per scale
    epsilon        = 0.01;  // stopping tolerance of the iterative process
    iterations     = 300;   // maximum fixed-point iterations per warping
    useInitialFlow = false; // by default, start from zero flow
}
||||
|
||||
void cv::ocl::OpticalFlowDual_TVL1_OCL::operator()(const oclMat& I0, const oclMat& I1, oclMat& flowx, oclMat& flowy) |
||||
{ |
||||
CV_Assert( I0.type() == CV_8UC1 || I0.type() == CV_32FC1 ); |
||||
CV_Assert( I0.size() == I1.size() ); |
||||
CV_Assert( I0.type() == I1.type() ); |
||||
CV_Assert( !useInitialFlow || (flowx.size() == I0.size() && flowx.type() == CV_32FC1 && flowy.size() == flowx.size() && flowy.type() == flowx.type()) ); |
||||
CV_Assert( nscales > 0 ); |
||||
|
||||
// allocate memory for the pyramid structure
|
||||
I0s.resize(nscales); |
||||
I1s.resize(nscales); |
||||
u1s.resize(nscales); |
||||
u2s.resize(nscales); |
||||
//I0s_step == I1s_step
|
||||
I0.convertTo(I0s[0], CV_32F, I0.depth() == CV_8U ? 1.0 : 255.0); |
||||
I1.convertTo(I1s[0], CV_32F, I1.depth() == CV_8U ? 1.0 : 255.0); |
||||
|
||||
|
||||
if (!useInitialFlow) |
||||
{ |
||||
flowx.create(I0.size(), CV_32FC1); |
||||
flowy.create(I0.size(), CV_32FC1); |
||||
} |
||||
//u1s_step != u2s_step
|
||||
u1s[0] = flowx; |
||||
u2s[0] = flowy; |
||||
|
||||
I1x_buf.create(I0.size(), CV_32FC1); |
||||
I1y_buf.create(I0.size(), CV_32FC1); |
||||
|
||||
I1w_buf.create(I0.size(), CV_32FC1); |
||||
I1wx_buf.create(I0.size(), CV_32FC1); |
||||
I1wy_buf.create(I0.size(), CV_32FC1); |
||||
|
||||
grad_buf.create(I0.size(), CV_32FC1); |
||||
rho_c_buf.create(I0.size(), CV_32FC1); |
||||
|
||||
p11_buf.create(I0.size(), CV_32FC1); |
||||
p12_buf.create(I0.size(), CV_32FC1); |
||||
p21_buf.create(I0.size(), CV_32FC1); |
||||
p22_buf.create(I0.size(), CV_32FC1); |
||||
|
||||
diff_buf.create(I0.size(), CV_32FC1); |
||||
|
||||
// create the scales
|
||||
for (int s = 1; s < nscales; ++s) |
||||
{ |
||||
ocl::pyrDown(I0s[s - 1], I0s[s]); |
||||
ocl::pyrDown(I1s[s - 1], I1s[s]); |
||||
|
||||
if (I0s[s].cols < 16 || I0s[s].rows < 16) |
||||
{ |
||||
nscales = s; |
||||
break; |
||||
} |
||||
|
||||
if (useInitialFlow) |
||||
{ |
||||
ocl::pyrDown(u1s[s - 1], u1s[s]); |
||||
ocl::pyrDown(u2s[s - 1], u2s[s]); |
||||
|
||||
//ocl::multiply(u1s[s], Scalar::all(0.5), u1s[s]);
|
||||
multiply(0.5, u1s[s], u1s[s]); |
||||
//ocl::multiply(u2s[s], Scalar::all(0.5), u2s[s]);
|
||||
multiply(0.5, u1s[s], u2s[s]); |
||||
} |
||||
} |
||||
|
||||
// pyramidal structure for computing the optical flow
|
||||
for (int s = nscales - 1; s >= 0; --s) |
||||
{ |
||||
// compute the optical flow at the current scale
|
||||
procOneScale(I0s[s], I1s[s], u1s[s], u2s[s]); |
||||
|
||||
// if this was the last scale, finish now
|
||||
if (s == 0) |
||||
break; |
||||
|
||||
// otherwise, upsample the optical flow
|
||||
|
||||
// zoom the optical flow for the next finer scale
|
||||
ocl::resize(u1s[s], u1s[s - 1], I0s[s - 1].size()); |
||||
ocl::resize(u2s[s], u2s[s - 1], I0s[s - 1].size()); |
||||
|
||||
// scale the optical flow with the appropriate zoom factor
|
||||
multiply(2, u1s[s - 1], u1s[s - 1]); |
||||
multiply(2, u2s[s - 1], u2s[s - 1]); |
||||
|
||||
} |
||||
|
||||
} |
||||
|
||||
namespace ocl_tvl1flow
{
    // Forward declarations of the per-step OpenCL kernel launchers
    // defined later in this file.

    // Central-difference spatial gradient of src written to dx and dy.
    void centeredGradient(const oclMat &src, oclMat &dx, oclMat &dy);

    // Warp I1 (and its gradients I1x, I1y) by the current flow (u1, u2),
    // producing the warped image/gradients and the grad / rho terms used
    // by the fixed-point iteration.
    void warpBackward(const oclMat &I0, const oclMat &I1, oclMat &I1x, oclMat &I1y,
                      oclMat &u1, oclMat &u2, oclMat &I1w, oclMat &I1wx, oclMat &I1wy,
                      oclMat &grad, oclMat &rho);

    // One primal update of the flow (u1, u2); per-pixel change is written
    // to error so the caller can test convergence.
    void estimateU(oclMat &I1wx, oclMat &I1wy, oclMat &grad,
                   oclMat &rho_c, oclMat &p11, oclMat &p12,
                   oclMat &p21, oclMat &p22, oclMat &u1,
                   oclMat &u2, oclMat &error, float l_t, float theta);

    // One dual update of the variables p11..p22 from the current flow.
    void estimateDualVariables(oclMat &u1, oclMat &u2,
                               oclMat &p11, oclMat &p12, oclMat &p21, oclMat &p22, float taut);
}
||||
|
||||
void cv::ocl::OpticalFlowDual_TVL1_OCL::procOneScale(const oclMat &I0, const oclMat &I1, oclMat &u1, oclMat &u2) |
||||
{ |
||||
using namespace ocl_tvl1flow; |
||||
|
||||
const double scaledEpsilon = epsilon * epsilon * I0.size().area(); |
||||
|
||||
CV_DbgAssert( I1.size() == I0.size() ); |
||||
CV_DbgAssert( I1.type() == I0.type() ); |
||||
CV_DbgAssert( u1.empty() || u1.size() == I0.size() ); |
||||
CV_DbgAssert( u2.size() == u1.size() ); |
||||
|
||||
if (u1.empty()) |
||||
{ |
||||
u1.create(I0.size(), CV_32FC1); |
||||
u1.setTo(Scalar::all(0)); |
||||
|
||||
u2.create(I0.size(), CV_32FC1); |
||||
u2.setTo(Scalar::all(0)); |
||||
} |
||||
|
||||
oclMat I1x = I1x_buf(Rect(0, 0, I0.cols, I0.rows)); |
||||
oclMat I1y = I1y_buf(Rect(0, 0, I0.cols, I0.rows)); |
||||
|
||||
centeredGradient(I1, I1x, I1y); |
||||
|
||||
oclMat I1w = I1w_buf(Rect(0, 0, I0.cols, I0.rows)); |
||||
oclMat I1wx = I1wx_buf(Rect(0, 0, I0.cols, I0.rows)); |
||||
oclMat I1wy = I1wy_buf(Rect(0, 0, I0.cols, I0.rows)); |
||||
|
||||
oclMat grad = grad_buf(Rect(0, 0, I0.cols, I0.rows)); |
||||
oclMat rho_c = rho_c_buf(Rect(0, 0, I0.cols, I0.rows)); |
||||
|
||||
oclMat p11 = p11_buf(Rect(0, 0, I0.cols, I0.rows)); |
||||
oclMat p12 = p12_buf(Rect(0, 0, I0.cols, I0.rows)); |
||||
oclMat p21 = p21_buf(Rect(0, 0, I0.cols, I0.rows)); |
||||
oclMat p22 = p22_buf(Rect(0, 0, I0.cols, I0.rows)); |
||||
p11.setTo(Scalar::all(0)); |
||||
p12.setTo(Scalar::all(0)); |
||||
p21.setTo(Scalar::all(0)); |
||||
p22.setTo(Scalar::all(0)); |
||||
|
||||
oclMat diff = diff_buf(Rect(0, 0, I0.cols, I0.rows)); |
||||
|
||||
const float l_t = static_cast<float>(lambda * theta); |
||||
const float taut = static_cast<float>(tau / theta); |
||||
|
||||
for (int warpings = 0; warpings < warps; ++warpings) |
||||
{ |
||||
warpBackward(I0, I1, I1x, I1y, u1, u2, I1w, I1wx, I1wy, grad, rho_c); |
||||
|
||||
double error = numeric_limits<double>::max(); |
||||
for (int n = 0; error > scaledEpsilon && n < iterations; ++n) |
||||
{ |
||||
estimateU(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, |
||||
u1, u2, diff, l_t, static_cast<float>(theta)); |
||||
|
||||
error = ocl::sum(diff)[0]; |
||||
|
||||
estimateDualVariables(u1, u2, p11, p12, p21, p22, taut); |
||||
|
||||
} |
||||
} |
||||
|
||||
} |
||||
|
||||
void cv::ocl::OpticalFlowDual_TVL1_OCL::collectGarbage() |
||||
{ |
||||
I0s.clear(); |
||||
I1s.clear(); |
||||
u1s.clear(); |
||||
u2s.clear(); |
||||
|
||||
I1x_buf.release(); |
||||
I1y_buf.release(); |
||||
|
||||
I1w_buf.release(); |
||||
I1wx_buf.release(); |
||||
I1wy_buf.release(); |
||||
|
||||
grad_buf.release(); |
||||
rho_c_buf.release(); |
||||
|
||||
p11_buf.release(); |
||||
p12_buf.release(); |
||||
p21_buf.release(); |
||||
p22_buf.release(); |
||||
|
||||
diff_buf.release(); |
||||
norm_buf.release(); |
||||
} |
||||
|
||||
void ocl_tvl1flow::centeredGradient(const oclMat &src, oclMat &dx, oclMat &dy) |
||||
{ |
||||
Context *clCxt = src.clCxt; |
||||
size_t localThreads[3] = {32, 8, 1}; |
||||
size_t globalThreads[3] = {src.cols, src.rows, 1}; |
||||
|
||||
int srcElementSize = src.elemSize(); |
||||
int src_step = src.step/srcElementSize; |
||||
|
||||
int dElememntSize = dx.elemSize(); |
||||
int dx_step = dx.step/dElememntSize; |
||||
|
||||
String kernelName = "centeredGradientKernel"; |
||||
vector< pair<size_t, const void *> > args; |
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&src.data)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&src.cols)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&src.rows)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&src_step)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&dx.data)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&dy.data)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&dx_step)); |
||||
openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThreads, localThreads, args, -1, -1); |
||||
|
||||
} |
||||
|
||||
// Launch one dual-variable update: p11..p22 are advanced from the current
// flow (u1, u2) with step taut = tau / theta.
void ocl_tvl1flow::estimateDualVariables(oclMat &u1, oclMat &u2, oclMat &p11, oclMat &p12, oclMat &p21, oclMat &p22, float taut)
{
    Context *clCxt = u1.clCxt;

    size_t localThread[] = {32, 8, 1};
    size_t globalThread[] =
    {
        u1.cols,
        u1.rows,
        1
    };

    // Steps are converted from bytes to elements for the kernel.
    int u1_element_size = u1.elemSize();
    int u1_step = u1.step/u1_element_size;

    int u2_element_size = u2.elemSize();
    int u2_step = u2.step/u2_element_size;

    // All four p buffers share p11's step (same size/type sub-views).
    int p11_element_size = p11.elemSize();
    int p11_step = p11.step/p11_element_size;

    // u1/u2 may be ROI views, so pass their (x, y) element offsets.
    int u1_offset_y = u1.offset/u1.step;
    int u1_offset_x = u1.offset%u1.step;
    u1_offset_x = u1_offset_x/u1.elemSize();

    int u2_offset_y = u2.offset/u2.step;
    int u2_offset_x = u2.offset%u2.step;
    u2_offset_x = u2_offset_x/u2.elemSize();

    String kernelName = "estimateDualVariablesKernel";
    vector< pair<size_t, const void *> > args;
    // NOTE: argument order must match estimateDualVariablesKernel in
    // tvl1flow.cl exactly; do not reorder.
    args.push_back( make_pair( sizeof(cl_mem), (void*)&u1.data));
    args.push_back( make_pair( sizeof(cl_int), (void*)&u1.cols));
    args.push_back( make_pair( sizeof(cl_int), (void*)&u1.rows));
    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_step));
    args.push_back( make_pair( sizeof(cl_mem), (void*)&u2.data));
    args.push_back( make_pair( sizeof(cl_mem), (void*)&p11.data));
    args.push_back( make_pair( sizeof(cl_int), (void*)&p11_step));
    args.push_back( make_pair( sizeof(cl_mem), (void*)&p12.data));
    args.push_back( make_pair( sizeof(cl_mem), (void*)&p21.data));
    args.push_back( make_pair( sizeof(cl_mem), (void*)&p22.data));
    args.push_back( make_pair( sizeof(cl_float), (void*)&taut));
    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_step));
    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_x));
    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_y));
    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_x));
    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_y));

    openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1);
}
||||
|
||||
// Launch one primal update: advance the flow (u1, u2) using the warped
// gradients and dual variables; the per-pixel change is written to error
// so the caller can measure convergence.
void ocl_tvl1flow::estimateU(oclMat &I1wx, oclMat &I1wy, oclMat &grad,
                             oclMat &rho_c, oclMat &p11, oclMat &p12,
                             oclMat &p21, oclMat &p22, oclMat &u1,
                             oclMat &u2, oclMat &error, float l_t, float theta)
{
    Context* clCxt = I1wx.clCxt;

    size_t localThread[] = {32, 8, 1};
    size_t globalThread[] =
    {
        I1wx.cols,
        I1wx.rows,
        1
    };

    // Steps converted from bytes to elements; I1wx's step stands in for
    // all same-shaped intermediate buffers.
    int I1wx_element_size = I1wx.elemSize();
    int I1wx_step = I1wx.step/I1wx_element_size;

    int u1_element_size = u1.elemSize();
    int u1_step = u1.step/u1_element_size;

    int u2_element_size = u2.elemSize();
    int u2_step = u2.step/u2_element_size;

    // u1/u2 may be ROI views, so pass their (x, y) element offsets.
    int u1_offset_y = u1.offset/u1.step;
    int u1_offset_x = u1.offset%u1.step;
    u1_offset_x = u1_offset_x/u1.elemSize();

    int u2_offset_y = u2.offset/u2.step;
    int u2_offset_x = u2.offset%u2.step;
    u2_offset_x = u2_offset_x/u2.elemSize();

    String kernelName = "estimateUKernel";
    vector< pair<size_t, const void *> > args;
    // NOTE: argument order must match estimateUKernel in tvl1flow.cl
    // exactly; do not reorder.
    args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wx.data));
    args.push_back( make_pair( sizeof(cl_int), (void*)&I1wx.cols));
    args.push_back( make_pair( sizeof(cl_int), (void*)&I1wx.rows));
    args.push_back( make_pair( sizeof(cl_int), (void*)&I1wx_step));
    args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wy.data));
    args.push_back( make_pair( sizeof(cl_mem), (void*)&grad.data));
    args.push_back( make_pair( sizeof(cl_mem), (void*)&rho_c.data));
    args.push_back( make_pair( sizeof(cl_mem), (void*)&p11.data));
    args.push_back( make_pair( sizeof(cl_mem), (void*)&p12.data));
    args.push_back( make_pair( sizeof(cl_mem), (void*)&p21.data));
    args.push_back( make_pair( sizeof(cl_mem), (void*)&p22.data));
    args.push_back( make_pair( sizeof(cl_mem), (void*)&u1.data));
    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_step));
    args.push_back( make_pair( sizeof(cl_mem), (void*)&u2.data));
    args.push_back( make_pair( sizeof(cl_mem), (void*)&error.data));
    args.push_back( make_pair( sizeof(cl_float), (void*)&l_t));
    args.push_back( make_pair( sizeof(cl_float), (void*)&theta));
    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_step));
    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_x));
    args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_y));
    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_x));
    args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_y));

    openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1);
}
||||
|
||||
void ocl_tvl1flow::warpBackward(const oclMat &I0, const oclMat &I1, oclMat &I1x, oclMat &I1y, oclMat &u1, oclMat &u2, oclMat &I1w, oclMat &I1wx, oclMat &I1wy, oclMat &grad, oclMat &rho) |
||||
{ |
||||
Context* clCxt = I0.clCxt; |
||||
const bool isImgSupported = support_image2d(clCxt); |
||||
|
||||
CV_Assert(isImgSupported); |
||||
|
||||
int u1ElementSize = u1.elemSize(); |
||||
int u1Step = u1.step/u1ElementSize; |
||||
|
||||
int u2ElementSize = u2.elemSize(); |
||||
int u2Step = u2.step/u2ElementSize; |
||||
|
||||
int I0ElementSize = I0.elemSize(); |
||||
int I0Step = I0.step/I0ElementSize; |
||||
|
||||
int I1w_element_size = I1w.elemSize(); |
||||
int I1w_step = I1w.step/I1w_element_size; |
||||
|
||||
int u1_offset_y = u1.offset/u1.step; |
||||
int u1_offset_x = u1.offset%u1.step; |
||||
u1_offset_x = u1_offset_x/u1.elemSize(); |
||||
|
||||
int u2_offset_y = u2.offset/u2.step; |
||||
int u2_offset_x = u2.offset%u2.step; |
||||
u2_offset_x = u2_offset_x/u2.elemSize(); |
||||
|
||||
size_t localThread[] = {32, 8, 1}; |
||||
size_t globalThread[] = |
||||
{ |
||||
I0.cols, |
||||
I0.rows, |
||||
1 |
||||
}; |
||||
|
||||
cl_mem I1_tex; |
||||
cl_mem I1x_tex; |
||||
cl_mem I1y_tex; |
||||
I1_tex = bindTexture(I1); |
||||
I1x_tex = bindTexture(I1x); |
||||
I1y_tex = bindTexture(I1y); |
||||
|
||||
String kernelName = "warpBackwardKernel"; |
||||
vector< pair<size_t, const void *> > args; |
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&I0.data)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&I0Step)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&I0.cols)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&I0.rows)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&I1_tex)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&I1x_tex)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&I1y_tex)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&u1.data)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u1Step)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&u2.data)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&I1w.data)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wx.data)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&I1wy.data)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&grad.data)); |
||||
args.push_back( make_pair( sizeof(cl_mem), (void*)&rho.data)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&I1w_step)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u2Step)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_x)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u1_offset_y)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_x)); |
||||
args.push_back( make_pair( sizeof(cl_int), (void*)&u2_offset_y)); |
||||
|
||||
openCLExecuteKernel(clCxt, &tvl1flow, kernelName, globalThread, localThread, args, -1, -1); |
||||
} |
@ -1,120 +0,0 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// Intel License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000, Intel Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of Intel Corporation may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#ifndef __OPENCV_TEST_INTERPOLATION_HPP__ |
||||
#define __OPENCV_TEST_INTERPOLATION_HPP__ |
||||
|
||||
// Read channel c of pixel (y, x) from src, applying the requested border
// handling when the coordinates fall outside the image.
template <typename T> T readVal(const cv::Mat &src, int y, int x, int c, int border_type, cv::Scalar borderVal = cv::Scalar())
{
    if (border_type == cv::BORDER_CONSTANT)
    {
        const bool inside = y >= 0 && y < src.rows && x >= 0 && x < src.cols;
        return inside ? src.at<T>(y, x * src.channels() + c)
                      : cv::saturate_cast<T>(borderVal.val[c]);
    }

    const int yy = cv::borderInterpolate(y, src.rows, border_type);
    const int xx = cv::borderInterpolate(x, src.cols, border_type);
    return src.at<T>(yy, xx * src.channels() + c);
}
||||
|
||||
template <typename T> struct NearestInterpolator |
||||
{ |
||||
static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar()) |
||||
{ |
||||
return readVal<T>(src, cvFloor(y), cvFloor(x), c, border_type, borderVal); |
||||
} |
||||
}; |
||||
|
||||
template <typename T> struct LinearInterpolator |
||||
{ |
||||
static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar()) |
||||
{ |
||||
x -= 0.5f; |
||||
y -= 0.5f; |
||||
|
||||
int x1 = cvFloor(x); |
||||
int y1 = cvFloor(y); |
||||
int x2 = x1 + 1; |
||||
int y2 = y1 + 1; |
||||
|
||||
float res = 0; |
||||
|
||||
res += readVal<T>(src, y1, x1, c, border_type, borderVal) * ((x2 - x) * (y2 - y)); |
||||
res += readVal<T>(src, y1, x2, c, border_type, borderVal) * ((x - x1) * (y2 - y)); |
||||
res += readVal<T>(src, y2, x1, c, border_type, borderVal) * ((x2 - x) * (y - y1)); |
||||
res += readVal<T>(src, y2, x2, c, border_type, borderVal) * ((x - x1) * (y - y1)); |
||||
|
||||
return cv::saturate_cast<T>(res); |
||||
} |
||||
}; |
||||
|
||||
template <typename T> struct CubicInterpolator |
||||
{ |
||||
static float getValue(float p[4], float x) |
||||
{ |
||||
return p[1] + 0.5 * x * (p[2] - p[0] + x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] + x * (3.0 * (p[1] - p[2]) + p[3] - p[0]))); |
||||
} |
||||
|
||||
static float getValue(float p[4][4], float x, float y) |
||||
{ |
||||
float arr[4]; |
||||
|
||||
arr[0] = getValue(p[0], x); |
||||
arr[1] = getValue(p[1], x); |
||||
arr[2] = getValue(p[2], x); |
||||
arr[3] = getValue(p[3], x); |
||||
|
||||
return getValue(arr, y); |
||||
} |
||||
|
||||
static T getValue(const cv::Mat &src, float y, float x, int c, int border_type, cv::Scalar borderVal = cv::Scalar()) |
||||
{ |
||||
int ix = cvRound(x); |
||||
int iy = cvRound(y); |
||||
|
||||
float vals[4][4] = |
||||
{ |
||||
{readVal<T>(src, iy - 2, ix - 2, c, border_type, borderVal), readVal<T>(src, iy - 2, ix - 1, c, border_type, borderVal), readVal<T>(src, iy - 2, ix, c, border_type, borderVal), readVal<T>(src, iy - 2, ix + 1, c, border_type, borderVal)}, |
||||
{readVal<T>(src, iy - 1, ix - 2, c, border_type, borderVal), readVal<T>(src, iy - 1, ix - 1, c, border_type, borderVal), readVal<T>(src, iy - 1, ix, c, border_type, borderVal), readVal<T>(src, iy - 1, ix + 1, c, border_type, borderVal)}, |
||||
{readVal<T>(src, iy , ix - 2, c, border_type, borderVal), readVal<T>(src, iy , ix - 1, c, border_type, borderVal), readVal<T>(src, iy , ix, c, border_type, borderVal), readVal<T>(src, iy , ix + 1, c, border_type, borderVal)}, |
||||
{readVal<T>(src, iy + 1, ix - 2, c, border_type, borderVal), readVal<T>(src, iy + 1, ix - 1, c, border_type, borderVal), readVal<T>(src, iy + 1, ix, c, border_type, borderVal), readVal<T>(src, iy + 1, ix + 1, c, border_type, borderVal)}, |
||||
}; |
||||
|
||||
return cv::saturate_cast<T>(getValue(vals, (x - ix + 2.0) / 4.0, (y - iy + 2.0) / 4.0)); |
||||
} |
||||
}; |
||||
|
||||
#endif // __OPENCV_TEST_INTERPOLATION_HPP__
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue