fix all redefine build errors on some Intel OCL

pull/775/head
yao 12 years ago
parent 656594ad4f
commit bcc086baa9
  1. 102
      modules/ocl/src/opencl/arithm_absdiff.cl
  2. 96
      modules/ocl/src/opencl/arithm_add.cl
  3. 230
      modules/ocl/src/opencl/arithm_addWeighted.cl
  4. 81
      modules/ocl/src/opencl/arithm_add_scalar.cl
  5. 81
      modules/ocl/src/opencl/arithm_add_scalar_mask.cl
  6. 190
      modules/ocl/src/opencl/arithm_bitwise_and.cl
  7. 415
      modules/ocl/src/opencl/arithm_bitwise_and_mask.cl
  8. 307
      modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl
  9. 354
      modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl
  10. 57
      modules/ocl/src/opencl/arithm_bitwise_not.cl
  11. 94
      modules/ocl/src/opencl/arithm_bitwise_or.cl
  12. 413
      modules/ocl/src/opencl/arithm_bitwise_or_mask.cl
  13. 298
      modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl
  14. 356
      modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl
  15. 180
      modules/ocl/src/opencl/arithm_bitwise_xor.cl
  16. 413
      modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl
  17. 307
      modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl
  18. 354
      modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl
  19. 215
      modules/ocl/src/opencl/arithm_compare_eq.cl
  20. 213
      modules/ocl/src/opencl/arithm_compare_ne.cl
  21. 113
      modules/ocl/src/opencl/arithm_div.cl
  22. 24
      modules/ocl/src/opencl/arithm_flip.cl
  23. 35
      modules/ocl/src/opencl/arithm_mul.cl

@ -44,7 +44,11 @@
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
@ -63,6 +67,9 @@ __kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_o
{
x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@ -111,7 +118,10 @@ __kernel void arithm_absdiff_D2 (__global ushort *src1, int src1_step, int src1_
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -145,7 +155,10 @@ __kernel void arithm_absdiff_D3 (__global short *src1, int src1_step, int src1_o
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -249,7 +262,10 @@ __kernel void arithm_s_absdiff_C1_D0 (__global uchar *src1, int src1_step, int
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@ -288,7 +304,10 @@ __kernel void arithm_s_absdiff_C1_D2 (__global ushort *src1, int src1_step, in
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -319,7 +338,10 @@ __kernel void arithm_s_absdiff_C1_D3 (__global short *src1, int src1_step, int
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -387,8 +409,8 @@ __kernel void arithm_s_absdiff_C1_D5 (__global float *src1, int src1_step, int
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_absdiff_C1_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1)
__global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -422,7 +444,10 @@ __kernel void arithm_s_absdiff_C2_D0 (__global uchar *src1, int src1_step, int
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -465,7 +490,7 @@ __kernel void arithm_s_absdiff_C2_D2 (__global ushort *src1, int src1_step, in
}
__kernel void arithm_s_absdiff_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -509,7 +534,7 @@ __kernel void arithm_s_absdiff_C2_D4 (__global int *src1, int src1_step, int s
}
__kernel void arithm_s_absdiff_C2_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1)
float4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -564,7 +589,10 @@ __kernel void arithm_s_absdiff_C3_D0 (__global uchar *src1, int src1_step, int
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
@ -618,7 +646,10 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
@ -644,16 +675,16 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int src1_offset,
@ -668,7 +699,10 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
@ -694,16 +728,16 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_absdiff_C3_D4 (__global int *src1, int src1_step, int src1_offset,
@ -735,9 +769,9 @@ __kernel void arithm_s_absdiff_C3_D4 (__global int *src1, int src1_step, int s
int tmp_data_1 = convert_int_sat(abs_diff(src1_data_1, src2_data_1));
int tmp_data_2 = convert_int_sat(abs_diff(src1_data_2, src2_data_2));
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
__kernel void arithm_s_absdiff_C3_D5 (__global float *src1, int src1_step, int src1_offset,
@ -769,9 +803,9 @@ __kernel void arithm_s_absdiff_C3_D5 (__global float *src1, int src1_step, int
float tmp_data_1 = fabs(src1_data_1 - src2_data_1);
float tmp_data_2 = fabs(src1_data_2 - src2_data_2);
*((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
*((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
@ -805,9 +839,9 @@ __kernel void arithm_s_absdiff_C3_D6 (__global double *src1, int src1_step, in
double tmp_data_1 = fabs(src1_data_1 - src2_data_1);
double tmp_data_2 = fabs(src1_data_2 - src2_data_2);
*((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
}
}
#endif

@ -45,7 +45,11 @@
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
@ -64,7 +68,10 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@ -112,7 +119,10 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -147,7 +157,10 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -252,7 +265,10 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -311,7 +327,10 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -348,7 +367,10 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -477,7 +499,10 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -664,7 +689,10 @@ __kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -724,7 +752,10 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -754,16 +785,16 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
@ -780,7 +811,10 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -810,16 +844,16 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
@ -861,9 +895,9 @@ __kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, i
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset,
@ -905,9 +939,9 @@ __kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, i
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global float *)((__global char *)dst + dst_index + 0))= data_0;
*((__global float *)((__global char *)dst + dst_index + 4))= data_1;
*((__global float *)((__global char *)dst + dst_index + 8))= data_2;
*((__global float *)((__global char *)dst + dst_index + 0))= data_0;
*((__global float *)((__global char *)dst + dst_index + 4))= data_1;
*((__global float *)((__global char *)dst + dst_index + 8))= data_2;
}
}
@ -951,9 +985,9 @@ __kernel void arithm_add_with_mask_C3_D6 (__global double *src1, int src1_step,
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global double *)((__global char *)dst + dst_index + 16))= data_2;
*((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global double *)((__global char *)dst + dst_index + 16))= data_2;
}
}
#endif

@ -42,8 +42,12 @@
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined DOUBLE_SUPPORT
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
typedef double F;
#else
typedef float F;
@ -52,10 +56,10 @@ typedef float F;
/////////////////////////////////////////////addWeighted//////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset,
__global uchar *src2, int src2_step,int src2_offset,
F alpha,F beta,F gama,
__global uchar *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
__global uchar *src2, int src2_step,int src2_offset,
F alpha,F beta,F gama,
__global uchar *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -65,7 +69,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@ -87,7 +94,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
// short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
short4 tmp;
short4 tmp;
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
@ -100,7 +107,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
// dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama;
// dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama;
}
}
@ -108,10 +115,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
__kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offset,
__global ushort *src2, int src2_step,int src2_offset,
F alpha,F beta,F gama,
__global ushort *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
__global ushort *src2, int src2_step,int src2_offset,
F alpha,F beta,F gama,
__global ushort *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -122,34 +129,37 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) & (int)0xfffffff8);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
// int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
int4 tmp;
// int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
int4 tmp;
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
@ -182,7 +192,10 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
@ -190,26 +203,26 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) - (dst_align << 1 ));
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
// int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
int4 tmp;
// int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
int4 tmp;
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
@ -228,7 +241,7 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
__kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
__global int *src2, int src2_step,int src2_offset,
F alpha,F beta, F gama,
F alpha,F beta, F gama,
__global int *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
@ -241,9 +254,12 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
x = x << 2;
#define bitOfInt (sizeof(int)== 4 ? 2: 3)
#define bitOfInt (sizeof(int)== 4 ? 2: 3)
#define dst_align ((dst_offset >> bitOfInt) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> bitOfInt) & 3)
int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
@ -252,26 +268,26 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
int4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
int4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
if(src1_index < 0)
{
int4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
int4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
// double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
float4 tmp;
// double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
float4 tmp;
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
@ -291,7 +307,7 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
__kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset,
__global float *src2, int src2_step,int src2_offset,
F alpha,F beta, F gama,
F alpha,F beta, F gama,
__global float *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
@ -304,7 +320,10 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@ -313,32 +332,32 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
if(src1_index < 0)
{
float4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
float4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
// double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
// float4 tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
float4 tmp_data;
if(src1_index < 0)
{
float4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
float4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
// double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
// float4 tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
float4 tmp_data;
tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
// float4 tmp_data = convert_float4(tmp);
// float4 tmp_data = convert_float4(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
@ -353,7 +372,7 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
#if defined (DOUBLE_SUPPORT)
__kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offset,
__global double *src2, int src2_step,int src2_offset,
F alpha,F beta, F gama,
F alpha,F beta, F gama,
__global double *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1)
{
@ -366,7 +385,10 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
@ -375,25 +397,25 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
if(src1_index < 0)
{
double4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
double4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
// double4 tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
double4 tmp_data;
if(src1_index < 0)
{
double4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
double4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
// double4 tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
double4 tmp_data;
tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;

@ -44,9 +44,13 @@
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
/**************************************add with scalar without mask**************************************/
__kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
@ -59,7 +63,10 @@ __kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@ -99,7 +106,10 @@ __kernel void arithm_s_add_C1_D2 (__global ushort *src1, int src1_step, int sr
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -131,7 +141,10 @@ __kernel void arithm_s_add_C1_D3 (__global short *src1, int src1_step, int src
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -233,7 +246,10 @@ __kernel void arithm_s_add_C2_D0 (__global uchar *src1, int src1_step, int src
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -378,7 +394,10 @@ __kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
@ -432,7 +451,10 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
@ -458,16 +480,16 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src1_offset,
@ -482,7 +504,10 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
@ -508,16 +533,16 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_add_C3_D4 (__global int *src1, int src1_step, int src1_offset,
@ -549,9 +574,9 @@ __kernel void arithm_s_add_C3_D4 (__global int *src1, int src1_step, int src1_
int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1);
int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2);
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
__kernel void arithm_s_add_C3_D5 (__global float *src1, int src1_step, int src1_offset,
@ -583,9 +608,9 @@ __kernel void arithm_s_add_C3_D5 (__global float *src1, int src1_step, int src
float tmp_data_1 = src1_data_1 + src2_data_1;
float tmp_data_2 = src1_data_2 + src2_data_2;
*((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
*((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
@ -619,9 +644,9 @@ __kernel void arithm_s_add_C3_D6 (__global double *src1, int src1_step, int sr
double tmp_data_1 = src1_data_1 + src2_data_1;
double tmp_data_2 = src1_data_2 + src2_data_2;
*((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
}
}
#endif

@ -44,7 +44,11 @@
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
/**************************************add with scalar with mask**************************************/
@ -61,7 +65,10 @@ __kernel void arithm_s_add_with_mask_C1_D0 (__global uchar *src1, int src1_ste
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -111,7 +118,10 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global ushort *src1, int src1_st
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -146,7 +156,10 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global short *src1, int src1_ste
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -267,7 +280,10 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global uchar *src1, int src1_ste
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -443,7 +459,10 @@ __kernel void arithm_s_add_with_mask_C3_D0 (__global uchar *src1, int src1_ste
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -501,7 +520,10 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -530,16 +552,16 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
@ -555,7 +577,10 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -584,16 +609,16 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
@ -633,9 +658,9 @@ __kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step,
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset,
@ -675,9 +700,9 @@ __kernel void arithm_s_add_with_mask_C3_D5 (__global float *src1, int src1_ste
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global float *)((__global char *)dst + dst_index + 0))= data_0;
*((__global float *)((__global char *)dst + dst_index + 4))= data_1;
*((__global float *)((__global char *)dst + dst_index + 8))= data_2;
*((__global float *)((__global char *)dst + dst_index + 0))= data_0;
*((__global float *)((__global char *)dst + dst_index + 4))= data_1;
*((__global float *)((__global char *)dst + dst_index + 8))= data_2;
}
}
@ -719,9 +744,9 @@ __kernel void arithm_s_add_with_mask_C3_D6 (__global double *src1, int src1_st
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global double *)((__global char *)dst + dst_index + 16))= data_2;
*((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global double *)((__global char *)dst + dst_index + 16))= data_2;
}
}
#endif

@ -43,7 +43,11 @@
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
@ -51,9 +55,9 @@
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_and without mask**************************************/
__kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -62,30 +66,33 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data & src2_data;
@ -101,9 +108,9 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
__kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -112,7 +119,10 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@ -120,23 +130,23 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
char4 src1_data = vload4(0, src1 + src1_index_fix);
char4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
char4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
char4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
char4 src1_data = vload4(0, src1 + src1_index_fix);
char4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
char4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
char4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data & src2_data;
@ -151,9 +161,9 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
__kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -163,7 +173,10 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -171,23 +184,23 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 tmp_data = src1_data & src2_data;
@ -203,9 +216,9 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
__kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -215,7 +228,10 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -223,23 +239,23 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 tmp_data = src1_data & src2_data;
@ -255,9 +271,9 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
__kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -277,9 +293,9 @@ __kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1
}
__kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -300,9 +316,9 @@ __kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);

@ -43,18 +43,22 @@
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_and with mask**************************************/
__kernel void arithm_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C1_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -64,7 +68,10 @@ __kernel void arithm_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -91,11 +98,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1
__kernel void arithm_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C1_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -105,7 +113,10 @@ __kernel void arithm_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -132,11 +143,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_
__kernel void arithm_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C1_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -146,7 +158,10 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -171,11 +186,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src
__kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C1_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -185,7 +201,10 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -198,8 +217,8 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1
short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data & src2_data;
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data & src2_data;
data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
@ -210,11 +229,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1
__kernel void arithm_bitwise_and_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C1_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -242,11 +262,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D4 (__global int *src1, int src1
__kernel void arithm_bitwise_and_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C1_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -274,12 +295,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D5 (__global char *src1, int src1_
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_and_with_mask_C1_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C1_D6 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -305,15 +326,15 @@ __kernel void arithm_bitwise_and_with_mask_C1_D6 (__global char *src1, int src1_
}
}
#endif
__kernel void arithm_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C2_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -323,7 +344,10 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -347,11 +371,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1
}
__kernel void arithm_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C2_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -361,7 +386,10 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -384,11 +412,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_
}
}
__kernel void arithm_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C2_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -413,11 +442,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int src
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_and_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C2_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -442,11 +472,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D3 (__global short *src1, int src1
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_and_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C2_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -471,11 +502,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D4 (__global int *src1, int src1
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_and_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C2_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -500,12 +532,13 @@ __kernel void arithm_bitwise_and_with_mask_C2_D5 (__global char *src1, int src1_
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_and_with_mask_C2_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C2_D6 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -530,15 +563,15 @@ __kernel void arithm_bitwise_and_with_mask_C2_D6 (__global char *src1, int src1_
*((__global char16 *)((__global char *)dst + dst_index)) = data;
}
}
#endif
__kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C3_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -548,7 +581,10 @@ __kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -596,11 +632,12 @@ __kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1
}
__kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C3_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -610,7 +647,10 @@ __kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -657,11 +697,12 @@ __kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_
}
}
__kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C3_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -671,7 +712,10 @@ __kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -701,23 +745,24 @@ __kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C3_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -727,7 +772,10 @@ __kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -757,23 +805,24 @@ __kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_bitwise_and_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C3_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -808,16 +857,17 @@ __kernel void arithm_bitwise_and_with_mask_C3_D4 (__global int *src1, int src1
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_bitwise_and_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C3_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -852,17 +902,18 @@ __kernel void arithm_bitwise_and_with_mask_C3_D5 (__global char *src1, int src1_
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_and_with_mask_C3_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C3_D6 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -897,20 +948,20 @@ __kernel void arithm_bitwise_and_with_mask_C3_D6 (__global char *src1, int src1_
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
*((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
}
}
#endif
__kernel void arithm_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -937,11 +988,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int src1
}
__kernel void arithm_bitwise_and_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C4_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -967,11 +1019,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D1 (__global char *src1, int src1_
}
}
__kernel void arithm_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C4_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -996,11 +1049,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int src
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_and_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C4_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -1025,11 +1079,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D3 (__global short *src1, int src1
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_and_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C4_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -1054,11 +1109,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D4 (__global int *src1, int src1
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_and_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C4_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -1084,11 +1140,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D5 (__global char *src1, int src1_
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_and_with_mask_C4_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_and_with_mask_C4_D6 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);

@ -42,19 +42,22 @@
// the use of this software, even if advised of the possibility of such damage.
//
//
#if defined (__ATI__)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (__NVIDIA__)
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************and with scalar without mask**************************************/
__kernel void arithm_s_bitwise_and_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C1_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -63,7 +66,10 @@ __kernel void arithm_s_bitwise_and_C1_D0 (__global uchar *src1, int src1_step,
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@ -86,9 +92,10 @@ __kernel void arithm_s_bitwise_and_C1_D0 (__global uchar *src1, int src1_step,
}
__kernel void arithm_s_bitwise_and_C1_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C1_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -97,7 +104,10 @@ __kernel void arithm_s_bitwise_and_C1_D1 (__global char *src1, int src1_step,
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@ -119,9 +129,10 @@ __kernel void arithm_s_bitwise_and_C1_D1 (__global char *src1, int src1_step,
}
}
__kernel void arithm_s_bitwise_and_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C1_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -131,7 +142,10 @@ __kernel void arithm_s_bitwise_and_C1_D2 (__global ushort *src1, int src1_step
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -150,9 +164,10 @@ __kernel void arithm_s_bitwise_and_C1_D2 (__global ushort *src1, int src1_step
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C1_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -162,7 +177,10 @@ __kernel void arithm_s_bitwise_and_C1_D3 (__global short *src1, int src1_step,
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -181,9 +199,10 @@ __kernel void arithm_s_bitwise_and_C1_D3 (__global short *src1, int src1_step,
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C1_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -202,9 +221,10 @@ __kernel void arithm_s_bitwise_and_C1_D4 (__global int *src1, int src1_step, i
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C1_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C1_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -232,11 +252,11 @@ __kernel void arithm_s_bitwise_and_C1_D5 (__global char *src1, int src1_step,
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_and_C1_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C1_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -256,9 +276,10 @@ __kernel void arithm_s_bitwise_and_C1_D6 (__global short *src1, int src1_step, i
}
}
#endif
__kernel void arithm_s_bitwise_and_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C2_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -268,7 +289,10 @@ __kernel void arithm_s_bitwise_and_C2_D0 (__global uchar *src1, int src1_step,
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -290,9 +314,10 @@ __kernel void arithm_s_bitwise_and_C2_D0 (__global uchar *src1, int src1_step,
}
__kernel void arithm_s_bitwise_and_C2_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C2_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -302,7 +327,10 @@ __kernel void arithm_s_bitwise_and_C2_D1 (__global char *src1, int src1_step,
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -322,9 +350,10 @@ __kernel void arithm_s_bitwise_and_C2_D1 (__global char *src1, int src1_step,
}
}
__kernel void arithm_s_bitwise_and_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C2_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -343,9 +372,10 @@ __kernel void arithm_s_bitwise_and_C2_D2 (__global ushort *src1, int src1_step
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C2_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -364,9 +394,10 @@ __kernel void arithm_s_bitwise_and_C2_D3 (__global short *src1, int src1_step,
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C2_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -384,9 +415,10 @@ __kernel void arithm_s_bitwise_and_C2_D4 (__global int *src1, int src1_step, i
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C2_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C2_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -403,12 +435,13 @@ __kernel void arithm_s_bitwise_and_C2_D5 (__global char *src1, int src1_step,
char8 tmp_data = src1_data & src2_data;
*((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_and_C2_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C2_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -428,9 +461,10 @@ __kernel void arithm_s_bitwise_and_C2_D6 (__global short *src1, int src1_step, i
}
}
#endif
__kernel void arithm_s_bitwise_and_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C3_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -440,7 +474,10 @@ __kernel void arithm_s_bitwise_and_C3_D0 (__global uchar *src1, int src1_step,
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
@ -484,9 +521,10 @@ __kernel void arithm_s_bitwise_and_C3_D0 (__global uchar *src1, int src1_step,
}
__kernel void arithm_s_bitwise_and_C3_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C3_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -496,7 +534,10 @@ __kernel void arithm_s_bitwise_and_C3_D1 (__global char *src1, int src1_step,
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
@ -539,9 +580,10 @@ __kernel void arithm_s_bitwise_and_C3_D1 (__global char *src1, int src1_step,
}
}
__kernel void arithm_s_bitwise_and_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C3_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -551,7 +593,10 @@ __kernel void arithm_s_bitwise_and_C3_D2 (__global ushort *src1, int src1_step
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
@ -577,21 +622,22 @@ __kernel void arithm_s_bitwise_and_C3_D2 (__global ushort *src1, int src1_step
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_and_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C3_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -601,7 +647,10 @@ __kernel void arithm_s_bitwise_and_C3_D3 (__global short *src1, int src1_step,
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
@ -627,21 +676,22 @@ __kernel void arithm_s_bitwise_and_C3_D3 (__global short *src1, int src1_step,
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_and_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C3_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -668,14 +718,15 @@ __kernel void arithm_s_bitwise_and_C3_D4 (__global int *src1, int src1_step, i
int tmp_data_1 = src1_data_1 & src2_data_1;
int tmp_data_2 = src1_data_2 & src2_data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
__kernel void arithm_s_bitwise_and_C3_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C3_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -702,15 +753,16 @@ __kernel void arithm_s_bitwise_and_C3_D5 (__global char *src1, int src1_step,
char4 tmp_data_1 = src1_data_1 & src2_data_1;
char4 tmp_data_2 = src1_data_2 & src2_data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_and_C3_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C3_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -737,15 +789,16 @@ __kernel void arithm_s_bitwise_and_C3_D6 (__global short *src1, int src1_step, i
short4 tmp_data_1 = src1_data_1 & src2_data_1;
short4 tmp_data_2 = src1_data_2 & src2_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
}
}
#endif
__kernel void arithm_s_bitwise_and_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -765,9 +818,10 @@ __kernel void arithm_s_bitwise_and_C4_D0 (__global uchar *src1, int src1_step,
}
__kernel void arithm_s_bitwise_and_C4_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C4_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -786,9 +840,10 @@ __kernel void arithm_s_bitwise_and_C4_D1 (__global char *src1, int src1_step,
}
}
__kernel void arithm_s_bitwise_and_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C4_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -806,9 +861,10 @@ __kernel void arithm_s_bitwise_and_C4_D2 (__global ushort *src1, int src1_step
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C4_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -826,9 +882,10 @@ __kernel void arithm_s_bitwise_and_C4_D3 (__global short *src1, int src1_step,
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C4_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -846,9 +903,10 @@ __kernel void arithm_s_bitwise_and_C4_D4 (__global int *src1, int src1_step, i
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_C4_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C4_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -869,9 +927,10 @@ __kernel void arithm_s_bitwise_and_C4_D5 (__global char *src1, int src1_step,
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_and_C4_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_C4_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -897,10 +956,10 @@ __kernel void arithm_s_bitwise_and_C4_D6 (__global short *src1, int src1_step, i
short4 tmp_data_2 = src1_data_2 & src2_data_2;
short4 tmp_data_3 = src1_data_3 & src2_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
}
}

@ -42,20 +42,22 @@
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (__ATI__)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (__NVIDIA__)
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_AND////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_and with scalar with mask**************************************/
__kernel void arithm_s_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C1_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -65,7 +67,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -90,10 +95,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int
}
__kernel void arithm_s_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C1_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -103,7 +109,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D1 (__global char *src1, int s
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -127,10 +136,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D1 (__global char *src1, int s
}
}
__kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C1_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -140,7 +150,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -161,10 +174,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C1_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -174,7 +188,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 (__global short *src1, int
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -195,10 +212,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 (__global short *src1, int
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C1_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -223,10 +241,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D4 (__global int *src1, int
}
}
__kernel void arithm_s_bitwise_and_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C1_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -252,10 +271,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D5 (__global char *src1, int src
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_and_with_mask_C1_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C1_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -280,10 +300,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D6 (__global short *src1, int sr
}
}
#endif
__kernel void arithm_s_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C2_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -293,7 +314,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -316,10 +340,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int
}
__kernel void arithm_s_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C2_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -329,7 +354,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 (__global char *src1, int s
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -351,10 +379,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 (__global char *src1, int s
}
}
__kernel void arithm_s_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C2_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -378,10 +407,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C2_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -405,10 +435,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D3 (__global short *src1, int
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C2_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -432,10 +463,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D4 (__global int *src1, int sr
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C2_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -461,10 +493,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D5 (__global char *src1, int s
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_and_with_mask_C2_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C2_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -489,10 +522,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D6 (__global short *src1, int sr
}
}
#endif
__kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C3_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -502,7 +536,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -549,10 +586,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int
}
__kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C3_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -562,7 +600,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global char *src1, int s
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -608,10 +649,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global char *src1, int s
}
}
__kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C3_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -621,7 +663,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -650,22 +695,23 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C3_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -675,7 +721,10 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global short *src1, int
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -704,22 +753,23 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global short *src1, int
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C3_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -753,15 +803,16 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D4 (__global int *src1, int sr
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C3_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -795,16 +846,17 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D5 (__global char *src1, int s
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_and_with_mask_C3_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C3_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -838,16 +890,17 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D6 (__global short *src1, int sr
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
}
}
#endif
__kernel void arithm_s_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -872,10 +925,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int
}
__kernel void arithm_s_bitwise_and_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C4_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -899,10 +953,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D1 (__global char *src1, int s
}
}
__kernel void arithm_s_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C4_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -925,10 +980,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C4_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -951,10 +1007,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D3 (__global short *src1, int
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C4_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -977,10 +1034,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D4 (__global int *src1, int sr
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_and_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C4_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -1006,10 +1064,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D5 (__global char *src1, int s
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_and_with_mask_C4_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_and_with_mask_C4_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -1055,3 +1114,4 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D6 (__global short *src1, int sr
}
}
#endif

@ -43,9 +43,12 @@
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_NOT////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
@ -61,25 +64,28 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = ~ src1_data;
/* if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
*/
/* if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
*/
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
@ -91,8 +97,8 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
__kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -101,7 +107,10 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@ -124,8 +133,8 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
__kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -135,7 +144,10 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -159,8 +171,8 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
__kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -170,7 +182,10 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -194,8 +209,8 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
__kernel void arithm_bitwise_not_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);

@ -43,7 +43,11 @@
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
@ -51,9 +55,9 @@
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_or without mask**************************************/
__kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -62,29 +66,32 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data | src2_data;
@ -99,9 +106,9 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
__kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -110,7 +117,10 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@ -135,9 +145,9 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
__kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -147,7 +157,10 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -173,9 +186,9 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
__kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -185,7 +198,10 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -211,9 +227,9 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
__kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -233,9 +249,9 @@ __kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_
}
__kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -256,9 +272,9 @@ __kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_or_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);

@ -43,18 +43,22 @@
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_OR////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_or with mask**************************************/
__kernel void arithm_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C1_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -64,7 +68,10 @@ __kernel void arithm_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -91,11 +98,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_
__kernel void arithm_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C1_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -105,7 +113,10 @@ __kernel void arithm_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_s
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -132,11 +143,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_s
__kernel void arithm_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C1_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -146,7 +158,10 @@ __kernel void arithm_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -171,11 +186,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1
__kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C1_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -185,7 +201,10 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -198,8 +217,8 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_
short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data | src2_data;
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data | src2_data;
data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
@ -210,11 +229,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_
__kernel void arithm_bitwise_or_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C1_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -242,11 +262,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D4 (__global int *src1, int src1_
__kernel void arithm_bitwise_or_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C1_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -273,13 +294,13 @@ __kernel void arithm_bitwise_or_with_mask_C1_D5 (__global char *src1, int src1_s
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_or_with_mask_C1_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C1_D6 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -308,12 +329,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D6 (__global char *src1, int src1_s
#endif
__kernel void arithm_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C2_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -323,7 +344,10 @@ __kernel void arithm_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -347,11 +371,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_
}
__kernel void arithm_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C2_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -361,7 +386,10 @@ __kernel void arithm_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_s
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -384,11 +412,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_s
}
}
__kernel void arithm_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C2_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -413,11 +442,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int src1
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_or_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C2_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -442,11 +472,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D3 (__global short *src1, int src1_
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_or_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C2_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -471,11 +502,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D4 (__global int *src1, int src1_
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_or_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C2_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -501,11 +533,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D5 (__global char *src1, int src1_s
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_or_with_mask_C2_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C2_D6 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -533,12 +566,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D6 (__global char *src1, int src1_s
#endif
__kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C3_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -548,7 +581,10 @@ __kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -596,11 +632,12 @@ __kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_
}
__kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C3_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -610,7 +647,10 @@ __kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_s
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -657,11 +697,12 @@ __kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_s
}
}
__kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C3_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -671,7 +712,10 @@ __kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -701,23 +745,24 @@ __kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C3_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -727,7 +772,10 @@ __kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -757,23 +805,24 @@ __kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_bitwise_or_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C3_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -808,16 +857,17 @@ __kernel void arithm_bitwise_or_with_mask_C3_D4 (__global int *src1, int src1_
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_bitwise_or_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C3_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -852,17 +902,18 @@ __kernel void arithm_bitwise_or_with_mask_C3_D5 (__global char *src1, int src1_s
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_or_with_mask_C3_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C3_D6 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -897,20 +948,20 @@ __kernel void arithm_bitwise_or_with_mask_C3_D6 (__global char *src1, int src1_s
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
*((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
}
}
#endif
__kernel void arithm_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -937,11 +988,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int src1_
}
__kernel void arithm_bitwise_or_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C4_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -967,11 +1019,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D1 (__global char *src1, int src1_s
}
}
__kernel void arithm_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C4_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -996,11 +1049,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int src1
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_or_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C4_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -1025,11 +1079,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D3 (__global short *src1, int src1_
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_or_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C4_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -1054,11 +1109,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D4 (__global int *src1, int src1_
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_or_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C4_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -1084,11 +1140,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D5 (__global char *src1, int src1_s
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_or_with_mask_C4_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_or_with_mask_C4_D6 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);

@ -43,16 +43,21 @@
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_OR////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************and with scalar without mask**************************************/
__kernel void arithm_s_bitwise_or_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C1_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -61,7 +66,10 @@ __kernel void arithm_s_bitwise_or_C1_D0 (__global uchar *src1, int src1_step,
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@ -84,9 +92,10 @@ __kernel void arithm_s_bitwise_or_C1_D0 (__global uchar *src1, int src1_step,
}
__kernel void arithm_s_bitwise_or_C1_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C1_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -95,7 +104,10 @@ __kernel void arithm_s_bitwise_or_C1_D1 (__global char *src1, int src1_step, i
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@ -117,9 +129,10 @@ __kernel void arithm_s_bitwise_or_C1_D1 (__global char *src1, int src1_step, i
}
}
__kernel void arithm_s_bitwise_or_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C1_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -129,7 +142,10 @@ __kernel void arithm_s_bitwise_or_C1_D2 (__global ushort *src1, int src1_step,
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -148,9 +164,10 @@ __kernel void arithm_s_bitwise_or_C1_D2 (__global ushort *src1, int src1_step,
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C1_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -160,7 +177,10 @@ __kernel void arithm_s_bitwise_or_C1_D3 (__global short *src1, int src1_step,
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -179,9 +199,10 @@ __kernel void arithm_s_bitwise_or_C1_D3 (__global short *src1, int src1_step,
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C1_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -200,9 +221,10 @@ __kernel void arithm_s_bitwise_or_C1_D4 (__global int *src1, int src1_step, in
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C1_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C1_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -222,9 +244,10 @@ __kernel void arithm_s_bitwise_or_C1_D5 (__global char *src1, int src1_step, i
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_or_C1_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C1_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
@ -245,10 +268,10 @@ __kernel void arithm_s_bitwise_or_C1_D6 (__global short *src1, int src1_step, in
}
}
#endif
__kernel void arithm_s_bitwise_or_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C2_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
@ -259,7 +282,10 @@ __kernel void arithm_s_bitwise_or_C2_D0 (__global uchar *src1, int src1_step,
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -280,9 +306,10 @@ __kernel void arithm_s_bitwise_or_C2_D0 (__global uchar *src1, int src1_step,
}
__kernel void arithm_s_bitwise_or_C2_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C2_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
@ -293,7 +320,10 @@ __kernel void arithm_s_bitwise_or_C2_D1 (__global char *src1, int src1_step, i
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -313,9 +343,10 @@ __kernel void arithm_s_bitwise_or_C2_D1 (__global char *src1, int src1_step, i
}
}
__kernel void arithm_s_bitwise_or_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C2_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
@ -335,9 +366,10 @@ __kernel void arithm_s_bitwise_or_C2_D2 (__global ushort *src1, int src1_step,
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C2_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
@ -358,8 +390,8 @@ __kernel void arithm_s_bitwise_or_C2_D3 (__global short *src1, int src1_step,
}
}
__kernel void arithm_s_bitwise_or_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
@ -378,9 +410,10 @@ __kernel void arithm_s_bitwise_or_C2_D4 (__global int *src1, int src1_step, in
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C2_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C2_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
@ -400,9 +433,10 @@ __kernel void arithm_s_bitwise_or_C2_D5 (__global char *src1, int src1_step, i
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_or_C2_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C2_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
@ -423,9 +457,10 @@ __kernel void arithm_s_bitwise_or_C2_D6 (__global short *src1, int src1_step, in
}
}
#endif
__kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C3_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
@ -436,7 +471,10 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step,
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
@ -480,9 +518,10 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step,
}
__kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C3_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
@ -493,7 +532,10 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, i
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
@ -536,9 +578,10 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, i
}
}
__kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C3_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
@ -549,7 +592,10 @@ __kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step,
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
@ -575,21 +621,22 @@ __kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step,
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C3_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
@ -600,7 +647,10 @@ __kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step,
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
@ -626,21 +676,22 @@ __kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step,
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_or_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C3_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
@ -668,14 +719,15 @@ __kernel void arithm_s_bitwise_or_C3_D4 (__global int *src1, int src1_step, in
int tmp_data_1 = src1_data_1 | src2_data_1;
int tmp_data_2 = src1_data_2 | src2_data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
__kernel void arithm_s_bitwise_or_C3_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C3_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
@ -700,15 +752,16 @@ __kernel void arithm_s_bitwise_or_C3_D5 (__global char *src1, int src1_step, i
char4 tmp_data_1 = src1_data_1 | src2_data_1;
char4 tmp_data_2 = src1_data_2 | src2_data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C3_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
@ -736,15 +789,16 @@ __kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, in
short4 tmp_data_1 = src1_data_1 | src2_data_1;
short4 tmp_data_2 = src1_data_2 | src2_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
}
}
#endif
__kernel void arithm_s_bitwise_or_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
@ -765,9 +819,10 @@ __kernel void arithm_s_bitwise_or_C4_D0 (__global uchar *src1, int src1_step,
}
__kernel void arithm_s_bitwise_or_C4_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C4_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
@ -787,9 +842,10 @@ __kernel void arithm_s_bitwise_or_C4_D1 (__global char *src1, int src1_step, i
}
}
__kernel void arithm_s_bitwise_or_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C4_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
@ -808,9 +864,10 @@ __kernel void arithm_s_bitwise_or_C4_D2 (__global ushort *src1, int src1_step,
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C4_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
@ -829,9 +886,10 @@ __kernel void arithm_s_bitwise_or_C4_D3 (__global short *src1, int src1_step,
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C4_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
@ -850,9 +908,10 @@ __kernel void arithm_s_bitwise_or_C4_D4 (__global int *src1, int src1_step, in
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_C4_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C4_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
@ -874,9 +933,10 @@ __kernel void arithm_s_bitwise_or_C4_D5 (__global char *src1, int src1_step, i
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_C4_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
@ -903,10 +963,10 @@ __kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, in
short4 tmp_data_2 = src1_data_2 | src2_data_2;
short4 tmp_data_3 = src1_data_3 | src2_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
}
}

@ -43,17 +43,21 @@
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_OR////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_or with scalar with mask**************************************/
__kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C1_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
@ -64,7 +68,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int s
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -89,10 +96,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int s
}
__kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C1_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
@ -103,7 +111,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global char *src1, int sr
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -127,10 +138,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global char *src1, int sr
}
}
__kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C1_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
@ -141,7 +153,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -162,10 +177,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C1_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
@ -176,7 +192,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global short *src1, int s
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -197,10 +216,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global short *src1, int s
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C1_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
@ -226,10 +246,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D4 (__global int *src1, int s
}
}
__kernel void arithm_s_bitwise_or_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C1_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
@ -254,12 +275,12 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D5 (__global char *src1, int
*((__global char4 *)((__global char *)dst + dst_index)) = data;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_or_with_mask_C1_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C1_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
{
@ -285,10 +306,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D6 (__global short *src1, int src
}
}
#endif
__kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C2_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
@ -299,7 +321,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int s
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -322,10 +347,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int s
}
__kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C2_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
@ -336,7 +362,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global char *src1, int sr
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -358,10 +387,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global char *src1, int sr
}
}
__kernel void arithm_s_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C2_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
@ -386,10 +416,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C2_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
@ -414,10 +445,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D3 (__global short *src1, int s
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C2_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
@ -442,10 +474,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D4 (__global int *src1, int src
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C2_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
@ -463,17 +496,18 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D5 (__global char *src1, int sr
char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index));
char8 src_data2 = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7);
char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index));
char8 data = src_data1 | src_data2;
char8 data = src_data1 | src_data2;
data = mask_data ? data : dst_data;
*((__global char8 *)((__global char *)dst + dst_index)) = data;
}
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_or_with_mask_C2_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C2_D6 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
{
@ -499,10 +533,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D6 (__global char *src1, int sr
}
}
#endif
__kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C3_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
@ -513,7 +548,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int s
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -560,10 +598,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int s
}
__kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C3_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
@ -574,7 +613,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int sr
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -620,10 +662,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int sr
}
}
__kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C3_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
@ -634,7 +677,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -663,22 +709,23 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C3_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
@ -689,7 +736,10 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global short *src1, int s
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -718,22 +768,23 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global short *src1, int s
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_or_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C3_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
@ -768,15 +819,16 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D4 (__global int *src1, int src
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_or_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C3_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
@ -811,17 +863,18 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D5 (__global char *src1, int sr
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_or_with_mask_C3_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C3_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -855,16 +908,17 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D6 (__global short *src1, int src
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
}
}
#endif
__kernel void arithm_s_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
@ -890,10 +944,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int s
}
__kernel void arithm_s_bitwise_or_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C4_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
@ -918,10 +973,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D1 (__global char *src1, int sr
}
}
__kernel void arithm_s_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C4_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
@ -945,10 +1001,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C4_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
@ -972,10 +1029,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D3 (__global short *src1, int s
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C4_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
@ -999,10 +1057,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D4 (__global int *src1, int src
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_or_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C4_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
@ -1029,10 +1088,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D5 (__global char *src1, int sr
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_or_with_mask_C4_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_or_with_mask_C4_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);

@ -43,17 +43,20 @@
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_xor without mask**************************************/
__kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -62,7 +65,10 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@ -70,23 +76,23 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
if(src1_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data ^ src2_data;
@ -101,9 +107,9 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
__kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -112,7 +118,10 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@ -120,23 +129,23 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
char4 src1_data = vload4(0, src1 + src1_index_fix);
char4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0)
{
char4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
char4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
if(src1_index < 0)
{
char4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
char4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data ^ src2_data;
@ -151,9 +160,9 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
__kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -163,7 +172,10 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -171,23 +183,23 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
if(src1_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 tmp_data = src1_data ^ src2_data;
@ -203,9 +215,9 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
__kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -215,7 +227,10 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -223,25 +238,25 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
if(src1_index < 0)
{
short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
}
if(src2_index < 0)
{
short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
@ -259,9 +274,9 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
__kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -281,9 +296,9 @@ __kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1
}
__kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -301,12 +316,11 @@ __kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src
*((__global char4 *)((__global char *)dst + dst_index)) = tmp;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);

@ -43,18 +43,22 @@
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_xor with mask**************************************/
__kernel void arithm_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C1_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -64,7 +68,10 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -91,11 +98,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1
__kernel void arithm_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C1_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -105,7 +113,10 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -132,11 +143,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_
__kernel void arithm_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C1_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -146,7 +158,10 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -171,11 +186,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src
__kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C1_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -185,7 +201,10 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -198,8 +217,8 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1
short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index));
uchar2 mask_data = vload2(0, mask + mask_index);
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data ^ src2_data;
short2 data = *((__global short2 *)((__global uchar *)dst + dst_index));
short2 tmp_data = src1_data ^ src2_data;
data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x;
data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y;
@ -210,11 +229,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1
__kernel void arithm_bitwise_xor_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C1_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -242,11 +262,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D4 (__global int *src1, int src1
__kernel void arithm_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C1_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -273,13 +294,13 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src1_
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_xor_with_mask_C1_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C1_D6 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -308,12 +329,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D6 (__global char *src1, int src1_
__kernel void arithm_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C2_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -323,7 +344,10 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -347,11 +371,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1
}
__kernel void arithm_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C2_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -361,7 +386,10 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -384,11 +412,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_
}
}
__kernel void arithm_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C2_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -413,11 +442,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int src
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_xor_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C2_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -442,11 +472,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D3 (__global short *src1, int src1
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_xor_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C2_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -471,11 +502,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D4 (__global int *src1, int src1
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_xor_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C2_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -501,11 +533,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D5 (__global char *src1, int src1_
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_xor_with_mask_C2_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C2_D6 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -533,12 +566,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D6 (__global char *src1, int src1_
#endif
__kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C3_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -548,7 +581,10 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -596,11 +632,12 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1
}
__kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C3_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -610,7 +647,10 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -657,11 +697,12 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_
}
}
__kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C3_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -671,7 +712,10 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -701,23 +745,24 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C3_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -727,7 +772,10 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -757,23 +805,24 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_bitwise_xor_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C3_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -808,16 +857,17 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D4 (__global int *src1, int src1
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_bitwise_xor_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C3_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -852,17 +902,18 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D5 (__global char *src1, int src1_
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_xor_with_mask_C3_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C3_D6 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -897,20 +948,20 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D6 (__global char *src1, int src1_
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
*((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global char8 *)((__global char *)dst + dst_index + 16))= data_2;
}
}
#endif
__kernel void arithm_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -937,11 +988,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int src1
}
__kernel void arithm_bitwise_xor_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C4_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -967,11 +1019,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D1 (__global char *src1, int src1_
}
}
__kernel void arithm_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C4_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -996,11 +1049,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int src
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_xor_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C4_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -1025,11 +1079,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D3 (__global short *src1, int src1
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_xor_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C4_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -1054,11 +1109,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D4 (__global int *src1, int src1
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_bitwise_xor_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C4_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -1084,11 +1140,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D5 (__global char *src1, int src1_
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_xor_with_mask_C4_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__kernel void arithm_bitwise_xor_with_mask_C4_D6 (
__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global uchar *mask, int mask_step, int mask_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);

@ -42,19 +42,21 @@
// the use of this software, even if advised of the possibility of such damage.
//
//
#if defined (__ATI__)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (__NVIDIA__)
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************xor with scalar without mask**************************************/
__kernel void arithm_s_bitwise_xor_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C1_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -63,7 +65,10 @@ __kernel void arithm_s_bitwise_xor_C1_D0 (__global uchar *src1, int src1_step,
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@ -86,9 +91,10 @@ __kernel void arithm_s_bitwise_xor_C1_D0 (__global uchar *src1, int src1_step,
}
__kernel void arithm_s_bitwise_xor_C1_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C1_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -97,7 +103,10 @@ __kernel void arithm_s_bitwise_xor_C1_D1 (__global char *src1, int src1_step,
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@ -119,9 +128,10 @@ __kernel void arithm_s_bitwise_xor_C1_D1 (__global char *src1, int src1_step,
}
}
__kernel void arithm_s_bitwise_xor_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C1_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -131,7 +141,10 @@ __kernel void arithm_s_bitwise_xor_C1_D2 (__global ushort *src1, int src1_step
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -150,9 +163,10 @@ __kernel void arithm_s_bitwise_xor_C1_D2 (__global ushort *src1, int src1_step
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C1_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -162,7 +176,10 @@ __kernel void arithm_s_bitwise_xor_C1_D3 (__global short *src1, int src1_step,
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -181,9 +198,10 @@ __kernel void arithm_s_bitwise_xor_C1_D3 (__global short *src1, int src1_step,
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C1_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -202,9 +220,10 @@ __kernel void arithm_s_bitwise_xor_C1_D4 (__global int *src1, int src1_step, i
*((__global int *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C1_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C1_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -234,9 +253,10 @@ __kernel void arithm_s_bitwise_xor_C1_D5 (__global char *src1, int src1_step,
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_xor_C1_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C1_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -256,9 +276,10 @@ __kernel void arithm_s_bitwise_xor_C1_D6 (__global short *src1, int src1_step, i
}
}
#endif
__kernel void arithm_s_bitwise_xor_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C2_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -268,7 +289,10 @@ __kernel void arithm_s_bitwise_xor_C2_D0 (__global uchar *src1, int src1_step,
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -290,9 +314,10 @@ __kernel void arithm_s_bitwise_xor_C2_D0 (__global uchar *src1, int src1_step,
}
__kernel void arithm_s_bitwise_xor_C2_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C2_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -302,7 +327,10 @@ __kernel void arithm_s_bitwise_xor_C2_D1 (__global char *src1, int src1_step,
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -322,9 +350,10 @@ __kernel void arithm_s_bitwise_xor_C2_D1 (__global char *src1, int src1_step,
}
}
__kernel void arithm_s_bitwise_xor_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C2_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -343,9 +372,10 @@ __kernel void arithm_s_bitwise_xor_C2_D2 (__global ushort *src1, int src1_step
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C2_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -364,9 +394,10 @@ __kernel void arithm_s_bitwise_xor_C2_D3 (__global short *src1, int src1_step,
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C2_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -384,9 +415,10 @@ __kernel void arithm_s_bitwise_xor_C2_D4 (__global int *src1, int src1_step, i
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C2_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C2_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -403,12 +435,13 @@ __kernel void arithm_s_bitwise_xor_C2_D5 (__global char *src1, int src1_step,
char8 tmp_data = src1_data ^ src2_data;
*((__global char8 *)((__global char *)dst + dst_index)) = tmp_data;
}
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_xor_C2_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C2_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -428,9 +461,10 @@ __kernel void arithm_s_bitwise_xor_C2_D6 (__global short *src1, int src1_step, i
}
}
#endif
__kernel void arithm_s_bitwise_xor_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C3_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -440,7 +474,10 @@ __kernel void arithm_s_bitwise_xor_C3_D0 (__global uchar *src1, int src1_step,
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
@ -484,9 +521,10 @@ __kernel void arithm_s_bitwise_xor_C3_D0 (__global uchar *src1, int src1_step,
}
__kernel void arithm_s_bitwise_xor_C3_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C3_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -496,7 +534,10 @@ __kernel void arithm_s_bitwise_xor_C3_D1 (__global char *src1, int src1_step,
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset);
@ -539,9 +580,10 @@ __kernel void arithm_s_bitwise_xor_C3_D1 (__global char *src1, int src1_step,
}
}
__kernel void arithm_s_bitwise_xor_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C3_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -551,7 +593,10 @@ __kernel void arithm_s_bitwise_xor_C3_D2 (__global ushort *src1, int src1_step
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
@ -577,21 +622,22 @@ __kernel void arithm_s_bitwise_xor_C3_D2 (__global ushort *src1, int src1_step
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_xor_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C3_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -601,7 +647,10 @@ __kernel void arithm_s_bitwise_xor_C3_D3 (__global short *src1, int src1_step,
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset);
@ -627,21 +676,22 @@ __kernel void arithm_s_bitwise_xor_C3_D3 (__global short *src1, int src1_step,
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_xor_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C3_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -668,14 +718,15 @@ __kernel void arithm_s_bitwise_xor_C3_D4 (__global int *src1, int src1_step, i
int tmp_data_1 = src1_data_1 ^ src2_data_1;
int tmp_data_2 = src1_data_2 ^ src2_data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
__kernel void arithm_s_bitwise_xor_C3_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C3_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -702,15 +753,16 @@ __kernel void arithm_s_bitwise_xor_C3_D5 (__global char *src1, int src1_step,
char4 tmp_data_1 = src1_data_1 ^ src2_data_1;
char4 tmp_data_2 = src1_data_2 ^ src2_data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_xor_C3_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C3_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -737,15 +789,16 @@ __kernel void arithm_s_bitwise_xor_C3_D6 (__global short *src1, int src1_step, i
short4 tmp_data_1 = src1_data_1 ^ src2_data_1;
short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
}
}
#endif
__kernel void arithm_s_bitwise_xor_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -765,9 +818,10 @@ __kernel void arithm_s_bitwise_xor_C4_D0 (__global uchar *src1, int src1_step,
}
__kernel void arithm_s_bitwise_xor_C4_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C4_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -786,9 +840,10 @@ __kernel void arithm_s_bitwise_xor_C4_D1 (__global char *src1, int src1_step,
}
}
__kernel void arithm_s_bitwise_xor_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C4_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -806,9 +861,10 @@ __kernel void arithm_s_bitwise_xor_C4_D2 (__global ushort *src1, int src1_step
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C4_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -826,9 +882,10 @@ __kernel void arithm_s_bitwise_xor_C4_D3 (__global short *src1, int src1_step,
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C4_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -846,9 +903,10 @@ __kernel void arithm_s_bitwise_xor_C4_D4 (__global int *src1, int src1_step, i
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_C4_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C4_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -869,9 +927,10 @@ __kernel void arithm_s_bitwise_xor_C4_D5 (__global char *src1, int src1_step,
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_xor_C4_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_C4_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -897,10 +956,10 @@ __kernel void arithm_s_bitwise_xor_C4_D6 (__global short *src1, int src1_step, i
short4 tmp_data_2 = src1_data_2 ^ src2_data_2;
short4 tmp_data_3 = src1_data_3 ^ src2_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2;
*((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3;
}
}

@ -42,20 +42,23 @@
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#if defined (__ATI__)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (__NVIDIA__)
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_xor with scalar with mask**************************************/
__kernel void arithm_s_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C1_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -65,7 +68,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -90,10 +96,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int
}
__kernel void arithm_s_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C1_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -103,7 +110,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D1 (__global char *src1, int s
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -127,10 +137,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D1 (__global char *src1, int s
}
}
__kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -140,7 +151,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -161,10 +175,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int
*((__global ushort2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C1_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -174,7 +189,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D3 (__global short *src1, int
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -195,10 +213,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D3 (__global short *src1, int
*((__global short2 *)((__global uchar *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C1_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -223,10 +242,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D4 (__global int *src1, int
}
}
__kernel void arithm_s_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C1_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -252,10 +272,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_xor_with_mask_C1_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C1_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -280,10 +301,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D6 (__global short *src1, int sr
}
}
#endif
__kernel void arithm_s_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C2_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -293,7 +315,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -316,10 +341,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int
}
__kernel void arithm_s_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C2_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -329,7 +355,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D1 (__global char *src1, int s
{
x = x << 1;
#define dst_align ((dst_offset >> 1) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -351,10 +380,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D1 (__global char *src1, int s
}
}
__kernel void arithm_s_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C2_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -378,10 +408,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int
*((__global ushort2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C2_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -405,10 +436,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D3 (__global short *src1, int
*((__global short2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C2_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -432,10 +464,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D4 (__global int *src1, int sr
*((__global int2 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C2_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -461,10 +494,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D5 (__global char *src1, int s
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_xor_with_mask_C2_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C2_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -489,10 +523,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D6 (__global short *src1, int sr
}
}
#endif
__kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -502,7 +537,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -549,10 +587,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int
}
__kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -562,7 +601,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global char *src1, int s
{
x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -608,10 +650,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global char *src1, int s
}
}
__kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -621,7 +664,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -650,22 +696,23 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -675,7 +722,10 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (__global short *src1, int
{
x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@ -704,22 +754,23 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (__global short *src1, int
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x;
? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y;
? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy;
? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_xor_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C3_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -753,15 +804,16 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D4 (__global int *src1, int sr
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2;
}
}
__kernel void arithm_s_bitwise_xor_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C3_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -795,16 +847,17 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D5 (__global char *src1, int s
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
*((__global char4 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global char4 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global char4 *)((__global char *)dst + dst_index + 8))= data_2;
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_xor_with_mask_C3_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C3_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -838,16 +891,17 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D6 (__global short *src1, int sr
data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
*((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global short4 *)((__global char *)dst + dst_index + 16))= data_2;
}
}
#endif
__kernel void arithm_s_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C4_D0 (
__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
uchar4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -872,10 +926,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int
}
__kernel void arithm_s_bitwise_xor_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C4_D1 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -899,10 +954,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D1 (__global char *src1, int s
}
}
__kernel void arithm_s_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C4_D2 (
__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
ushort4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -925,10 +981,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int
*((__global ushort4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C4_D3 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -951,10 +1008,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D3 (__global short *src1, int
*((__global short4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C4_D4 (
__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
int4 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -977,10 +1035,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D4 (__global int *src1, int sr
*((__global int4 *)((__global char *)dst + dst_index)) = data;
}
}
__kernel void arithm_s_bitwise_xor_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C4_D5 (
__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
char16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -1006,10 +1065,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D5 (__global char *src1, int s
}
}
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_bitwise_xor_with_mask_C4_D6 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
__kernel void arithm_s_bitwise_xor_with_mask_C4_D6 (
__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset,
__global uchar *mask, int mask_step, int mask_offset,
short16 src2, int rows, int cols, int dst_step1)
{
int x = get_global_id(0);

@ -43,7 +43,11 @@
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
@ -51,9 +55,9 @@
///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -62,7 +66,10 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@ -102,9 +109,9 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src
__kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -114,7 +121,10 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 1)& 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1)& 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -153,9 +163,9 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
__kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global short *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -165,7 +175,10 @@ __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -207,9 +220,9 @@ __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src
__kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global int *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -217,7 +230,10 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@ -227,7 +243,7 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
@ -255,9 +271,9 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_
}
__kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global float *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -265,7 +281,10 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@ -275,7 +294,8 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src2_index < 0)
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
if(src2_index < 0)
{
float4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
@ -297,9 +317,9 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global double *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -307,7 +327,10 @@ __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int sr
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
@ -347,9 +370,9 @@ __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int sr
/***********************************Compare GT**************************/
__kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -358,7 +381,10 @@ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@ -397,9 +423,9 @@ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src
}
__kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -409,7 +435,10 @@ __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -450,9 +479,9 @@ __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int sr
__kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global short *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -462,7 +491,10 @@ __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -501,9 +533,9 @@ __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src
}
__kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global int *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -511,7 +543,10 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@ -521,7 +556,7 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index));
if(src1_index < 0)
{
@ -550,9 +585,9 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_
}
__kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global float *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -560,7 +595,10 @@ __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@ -599,9 +637,9 @@ __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global double *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -609,7 +647,10 @@ __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int sr
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
@ -649,9 +690,9 @@ __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int sr
/***********************************Compare GE**************************/
__kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -660,7 +701,10 @@ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@ -702,9 +746,9 @@ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src
__kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -714,7 +758,10 @@ __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -757,9 +804,9 @@ __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int sr
__kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global short *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -769,7 +816,10 @@ __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 1)& 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1)& 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -809,9 +859,9 @@ __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src
}
__kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global int *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -820,7 +870,10 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_
{
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@ -845,7 +898,7 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@ -858,9 +911,9 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_
}
__kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global float *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -869,7 +922,10 @@ __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@ -909,9 +965,9 @@ __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global double *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -920,7 +976,10 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 3)& 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 3)& 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
@ -942,7 +1001,8 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
double4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data >= src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@ -954,3 +1014,4 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr
}
}
#endif

@ -43,13 +43,17 @@
//
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
/***********************************Compare NE*******************************/
__kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -58,7 +62,10 @@ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@ -98,9 +105,9 @@ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src
__kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -110,7 +117,10 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 1)& 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1)& 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -150,9 +160,9 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr
__kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global short *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -162,7 +172,10 @@ __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 1)& 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1)& 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -200,9 +213,9 @@ __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src
}
__kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global int *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -210,7 +223,10 @@ __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@ -249,9 +265,9 @@ __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_
}
__kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global float *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -259,7 +275,10 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@ -269,7 +288,8 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src1_index < 0)
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0)
{
float4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
@ -282,7 +302,7 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data != src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@ -296,9 +316,9 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global double *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -306,7 +326,10 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
@ -347,9 +370,9 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr
/***********************************Compare LT*******************************/
__kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -358,7 +381,10 @@ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@ -398,9 +424,9 @@ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src
__kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -410,7 +436,10 @@ __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -451,9 +480,9 @@ __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int sr
__kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global short *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -463,7 +492,10 @@ __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -502,9 +534,9 @@ __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src
}
__kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global int *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -512,7 +544,10 @@ __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@ -554,9 +589,9 @@ __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_
}
__kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global float *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -564,7 +599,10 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@ -589,7 +627,7 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@ -603,9 +641,9 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global double *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -613,7 +651,10 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 3) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
@ -638,7 +679,7 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr
}
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = convert_uchar4((src1_data < src2_data));
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
@ -653,9 +694,9 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr
/***********************************Compare LE*******************************/
__kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -664,7 +705,10 @@ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@ -705,9 +749,9 @@ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src
__kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global ushort *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -717,7 +761,10 @@ __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int sr
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -758,9 +805,9 @@ __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int sr
__kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global short *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
@ -770,7 +817,10 @@ __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -809,9 +859,9 @@ __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src
}
__kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global int *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -819,7 +869,10 @@ __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@ -857,9 +910,9 @@ __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_
}
__kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global float *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -867,7 +920,10 @@ __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 2)& 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 2)& 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@ -905,9 +961,9 @@ __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src
#if defined (DOUBLE_SUPPORT)
__kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global double *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
@ -915,7 +971,10 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 3)& 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 3)& 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
@ -952,3 +1011,5 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr
}
}
#endif

@ -44,7 +44,11 @@
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
typedef double F ;
typedef double4 F4;
#define convert_F4 convert_double4
@ -56,34 +60,24 @@ typedef float4 F4;
#define convert_F float
#endif
uchar round2_uchar(F v){
uchar v1 = convert_uchar_sat(round(v));
//uchar v2 = convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5));
return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
inline uchar round2_uchar(F v)
{
return convert_uchar_sat(round(v));
}
ushort round2_ushort(F v){
ushort v1 = convert_ushort_sat(round(v));
//ushort v2 = convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5));
return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
inline ushort round2_ushort(F v)
{
return convert_ushort_sat(round(v));
}
short round2_short(F v){
short v1 = convert_short_sat(round(v));
//short v2 = convert_short_sat(v+(v>=0 ? 0.5 : -0.5));
return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
inline short round2_short(F v)
{
return convert_short_sat(round(v));
}
int round2_int(F v){
int v1 = convert_int_sat(round(v));
//int v2 = convert_int_sat(v+(v>=0 ? 0.5 : -0.5));
return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
inline int round2_int(F v)
{
return convert_int_sat(round(v));
}
///////////////////////////////////////////////////////////////////////////////////////
////////////////////////////divide///////////////////////////////////////////////////
@ -94,39 +88,41 @@ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offse
__global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);
int2 coor = (int2)(get_global_id(0), get_global_id(1));
if (x < cols && y < rows)
if (coor.x < cols && coor.y < rows)
{
x = x << 2;
coor.x = coor.x << 2;
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int2 src_index = (int2)(mad24(coor.y, src1_step, coor.x + src1_offset - dst_align),
mad24(coor.y, src2_step, coor.x + src2_offset - dst_align));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int4 dst_args = (int4)(mad24(coor.y, dst_step, dst_offset),
mad24(coor.y, dst_step, dst_offset + dst_step1),
mad24(coor.y, dst_step, dst_offset + coor.x & (int)0xfffffffc),
0);
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 src1_data = vload4(0, src1 + src_index.x);
uchar4 src2_data = vload4(0, src2 + src_index.y);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_args.z));
F4 tmp = convert_F4(src1_data) * scalar;
uchar4 tmp_data;
tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / (F)src2_data.x);
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / (F)src2_data.y);
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / (F)src2_data.z);
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / (F)src2_data.w);
tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / src2_data.x);
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / src2_data.y);
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / src2_data.z);
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / src2_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
dst_data.x = ((dst_args.z + 0 >= dst_args.x) && (dst_args.z + 0 < dst_args.y)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_args.z + 1 >= dst_args.x) && (dst_args.z + 1 < dst_args.y)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_args.z + 2 >= dst_args.x) && (dst_args.z + 2 < dst_args.y)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_args.z + 3 >= dst_args.x) && (dst_args.z + 3 < dst_args.y)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data;
*((__global uchar4 *)(dst + dst_args.z)) = dst_data;
}
}
@ -142,7 +138,10 @@ __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offs
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -182,7 +181,10 @@ __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offse
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -297,7 +299,10 @@ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src_index = mad24(y, src_step, x + src_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
@ -333,7 +338,10 @@ __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offse
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -368,7 +376,10 @@ __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
@ -455,3 +466,5 @@ __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offse
}
}
#endif

@ -44,7 +44,11 @@
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
@ -61,7 +65,10 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src_index_0 = mad24(y, src_step, x + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
@ -116,7 +123,10 @@ __kernel void arithm_flip_rows_D1 (__global char *src, int src_step, int src_off
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src_index_0 = mad24(y, src_step, x + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
@ -158,7 +168,10 @@ __kernel void arithm_flip_rows_D2 (__global ushort *src, int src_step, int src_o
{
x = x << 2;
#define dst_align (((dst_offset >> 1) & 3) << 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset >> 1) & 3) << 1)
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
@ -200,7 +213,10 @@ __kernel void arithm_flip_rows_D3 (__global short *src, int src_step, int src_of
{
x = x << 2;
#define dst_align (((dst_offset >> 1) & 3) << 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset >> 1) & 3) << 1)
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);

@ -16,7 +16,6 @@
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
// Dachuan Zhao, dachuan@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
@ -44,11 +43,16 @@
//
//M*/
#if defined DOUBLE_SUPPORT
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
int4 round_int4(float4 v){
int4 round_int4(float4 v)
{
v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
@ -56,7 +60,8 @@ int4 round_int4(float4 v){
return convert_int4_sat(v);
}
uint4 round_uint4(float4 v){
uint4 round_uint4(float4 v)
{
v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5);
v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5);
v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5);
@ -64,7 +69,8 @@ uint4 round_uint4(float4 v){
return convert_uint4_sat(v);
}
long round_int(float v){
long round_int(float v)
{
v = v + (v > 0 ? 0.5 : -0.5);
return convert_int_sat(v);
@ -85,7 +91,10 @@ __kernel void arithm_mul_D0 (__global uchar *src1, int src1_step, int src1_offse
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@ -130,7 +139,10 @@ __kernel void arithm_mul_D2 (__global ushort *src1, int src1_step, int src1_offs
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -166,7 +178,10 @@ __kernel void arithm_mul_D3 (__global short *src1, int src1_step, int src1_offse
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@ -263,8 +278,8 @@ __kernel void arithm_mul_D6 (__global double *src1, int src1_step, int src1_offs
#endif
__kernel void arithm_muls_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, float scalar)
__global float *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, float scalar)
{
int x = get_global_id(0);
int y = get_global_id(1);

Loading…
Cancel
Save