parent bb4bf7a1f9
commit d914f20a4c
64 changed files with 13349 additions and 312 deletions
@@ -0,0 +1,60 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the OpenCV Foundation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_OPENCL_GENBASE_HPP__
#define __OPENCV_OPENCL_GENBASE_HPP__

namespace cv
{
namespace ocl
{

struct ProgramEntry
{
    const char* name;
    const char* programStr;
    const char* programHash;
};

}
}

#endif
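
ProgramEntry appears intended to describe one embedded OpenCL program: a short name, the program source text, and a hash of that text. A hypothetical auto-generated translation unit that registers such an entry might look like the sketch below; the entry name, source string and hash are invented purely for illustration and are not part of this commit.

// Hypothetical auto-generated file (entry name, source and hash invented for illustration).
#include "genbase.hpp"

namespace cv { namespace ocl {

// Embedded OpenCL source text, normally produced by a code-generation step.
static const char arithm_source[] = "__kernel void KF( /* ... */ ) { /* ... */ }";

// One ProgramEntry per embedded .cl file: name, source string, source hash.
ProgramEntry arithm_program = { "arithm", arithm_source, "0123456789abcdef" };

}} // namespace cv::ocl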
@@ -0,0 +1,307 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
//
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the copyright holders or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/*
  Usage:
  After compiling this program the user gets a single kernel called KF.
  The following flags should be passed:
  1) one of "-D BINARY_OP", "-D UNARY_OP", "-D MASK_BINARY_OP" or "-D MASK_UNARY_OP"
  2) the actual operation performed, one of "-D OP_..."; see the list of operations below.
  2a) "-D dstDepth=<destination depth> [-D cn=<num channels>]"
      for some operations, like min/max/and/or/xor, this is enough
  2b) "-D srcDepth1=<source1 depth> -D srcDepth2=<source2 depth> -D dstDepth=<destination depth>
      -D workDepth=<work depth> [-D cn=<num channels>]" - for mixed-type operations
*/
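
For concreteness, a minimal host-side sketch of how such a build-options string could be assembled is given below. The kernel body itself consumes type macros (dstT, srcT1, srcT2, workT, convertToWT1, convertToWT2, convertToDT), so the exact macro spellings and the buildKernelOptions helper here are illustrative assumptions, not the committed host code.

#include <string>

// Hypothetical helper: assemble "-D ..." options for the KF kernel.
// The macro names mirror the ones used by the kernel body; the helper
// itself and the chosen types are assumptions for illustration only.
static std::string buildKernelOptions()
{
    std::string opts;
    opts += "-D BINARY_OP -D OP_ADD ";          // kernel kind + operation
    opts += "-D srcT1=uchar -D srcT2=uchar ";   // source element types
    opts += "-D workT=int -D dstT=uchar ";      // work and destination types
    opts += "-D convertToWT1=convert_int ";     // srcT1 -> workT conversion
    opts += "-D convertToWT2=convert_int ";     // srcT2 -> workT conversion
    opts += "-D convertToDT=convert_uchar_sat"; // workT -> dstT conversion
    return opts;                                // passed to the OpenCL program build
}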

#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif

#define CV_32S 4
#define CV_32F 5

#define dstelem *(dstT*)(dstptr + dst_index)
#define noconvert(x) x

#ifndef workT

#define srcT1 dstT
#define srcT2 dstT
#define workT dstT
#define srcelem1 *(dstT*)(srcptr1 + src1_index)
#define srcelem2 *(dstT*)(srcptr2 + src2_index)
#define convertToDT noconvert

#else

#define srcelem1 convertToWT1(*(srcT1*)(srcptr1 + src1_index))
#define srcelem2 convertToWT2(*(srcT2*)(srcptr2 + src2_index))

#endif

#define EXTRA_PARAMS

#if defined OP_ADD_SAT
#define PROCESS_ELEM dstelem = add_sat(srcelem1, srcelem2)

#elif defined OP_ADD
#define PROCESS_ELEM dstelem = convertToDT(srcelem1 + srcelem2)

#elif defined OP_SUB_SAT
#define PROCESS_ELEM dstelem = sub_sat(srcelem1, srcelem2)

#elif defined OP_SUB
#define PROCESS_ELEM dstelem = convertToDT(srcelem1 - srcelem2)

#elif defined OP_RSUB_SAT
#define PROCESS_ELEM dstelem = sub_sat(srcelem2, srcelem1)

#elif defined OP_RSUB
#define PROCESS_ELEM dstelem = convertToDT(srcelem2 - srcelem1)

#elif defined OP_ABSDIFF
#define PROCESS_ELEM dstelem = abs_diff(srcelem1, srcelem2)

#elif defined OP_AND
#define PROCESS_ELEM dstelem = srcelem1 & srcelem2

#elif defined OP_OR
#define PROCESS_ELEM dstelem = srcelem1 | srcelem2

#elif defined OP_XOR
#define PROCESS_ELEM dstelem = srcelem1 ^ srcelem2

#elif defined OP_NOT
#define PROCESS_ELEM dstelem = ~srcelem1

#elif defined OP_MIN
#define PROCESS_ELEM dstelem = min(srcelem1, srcelem2)

#elif defined OP_MAX
#define PROCESS_ELEM dstelem = max(srcelem1, srcelem2)

#elif defined OP_MUL
#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * srcelem2)

#elif defined OP_MUL_SCALE
#undef EXTRA_PARAMS
#define EXTRA_PARAMS , workT scale
#define PROCESS_ELEM dstelem = convertToDT(srcelem1 * srcelem2 * scale)

#elif defined OP_DIV
#define PROCESS_ELEM \
    workT e2 = srcelem2, zero = (workT)(0); \
    dstelem = convertToDT(e2 != zero ? srcelem1 / e2 : zero)

#elif defined OP_DIV_SCALE
#undef EXTRA_PARAMS
#define EXTRA_PARAMS , workT scale
#define PROCESS_ELEM \
    workT e2 = srcelem2, zero = (workT)(0); \
    dstelem = convertToDT(e2 != zero ? srcelem1 * scale / e2 : zero)

#elif defined OP_RECIP_SCALE
#undef EXTRA_PARAMS
#define EXTRA_PARAMS , workT scale
#define PROCESS_ELEM \
    workT e1 = srcelem1, zero = (workT)(0); \
    dstelem = convertToDT(e1 != zero ? scale / e1 : zero)

#elif defined OP_ADDW
#undef EXTRA_PARAMS
#define EXTRA_PARAMS , workT alpha, workT beta, workT gamma
#define PROCESS_ELEM dstelem = convertToDT(srcelem1*alpha + srcelem2*beta + gamma)

#elif defined OP_MAG
#define PROCESS_ELEM dstelem = hypot(srcelem1, srcelem2)

#elif defined OP_PHASE_RADIANS
#define PROCESS_ELEM \
    workT tmp = atan2(srcelem2, srcelem1); \
    if(tmp < 0) tmp += 6.283185307179586232; \
    dstelem = tmp

#elif defined OP_PHASE_DEGREES
#define PROCESS_ELEM \
    workT tmp = atan2(srcelem2, srcelem1)*57.29577951308232286465; \
    if(tmp < 0) tmp += 360; \
    dstelem = tmp

#elif defined OP_EXP
#define PROCESS_ELEM dstelem = exp(srcelem1)

#elif defined OP_SQRT
#define PROCESS_ELEM dstelem = sqrt(srcelem1)

#elif defined OP_LOG
#define PROCESS_ELEM dstelem = log(abs(srcelem1))

#elif defined OP_CMP
#define PROCESS_ELEM dstelem = convert_uchar(srcelem1 CMP_OPERATOR srcelem2 ? 255 : 0)

#elif defined OP_CONVERT
#define PROCESS_ELEM dstelem = convertToDT(srcelem1)

#elif defined OP_CONVERT_SCALE
#undef EXTRA_PARAMS
#define EXTRA_PARAMS , workT alpha, workT beta
#define PROCESS_ELEM dstelem = convertToDT(srcelem1*alpha + beta)

#else
#error "unknown op type"
#endif
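
As a reading aid, the sketch below mirrors in plain C++ what PROCESS_ELEM amounts to for one element when OP_DIV is selected with uchar sources, workT = float and dstT = uchar. The helper name and the concrete type choices are assumptions for illustration; convertToDT is modelled as a saturating round-to-nearest conversion.

#include <algorithm>
#include <cmath>

// Plain C++ mirror of PROCESS_ELEM for OP_DIV (illustrative only):
// guarded division, result 0 where the divisor is 0, then converted to uchar.
static unsigned char process_elem_div(unsigned char src1, unsigned char src2)
{
    float e1 = (float)src1;                            // convertToWT1
    float e2 = (float)src2;                            // convertToWT2
    float zero = 0.0f;
    float result = (e2 != zero) ? e1 / e2 : zero;      // guarded division
    float r = std::round(result);                      // convertToDT: round ...
    return (unsigned char)std::min(255.0f, std::max(0.0f, r));   // ... and saturate
}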

#if defined UNARY_OP || defined MASK_UNARY_OP
#undef srcelem2
#if defined OP_AND || defined OP_OR || defined OP_XOR || defined OP_ADD || defined OP_ADD_SAT || \
    defined OP_SUB || defined OP_SUB_SAT || defined OP_RSUB || defined OP_RSUB_SAT || \
    defined OP_ABSDIFF || defined OP_CMP || defined OP_MIN || defined OP_MAX
#undef EXTRA_PARAMS
#define EXTRA_PARAMS , workT srcelem2
#endif
#endif

#if defined BINARY_OP

__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
                 __global const uchar* srcptr2, int srcstep2, int srcoffset2,
                 __global uchar* dstptr, int dststep, int dstoffset,
                 int rows, int cols EXTRA_PARAMS )
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, srcstep1, x*sizeof(srcT1) + srcoffset1);
        int src2_index = mad24(y, srcstep2, x*sizeof(srcT2) + srcoffset2);
        int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset);

        PROCESS_ELEM;
        //printf("(x=%d, y=%d). %d, %d, %d\n", x, y, (int)srcelem1, (int)srcelem2, (int)dstelem);
    }
}
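
The mad24(y, step, x*sizeof(T) + offset) pattern used throughout these kernels computes a byte offset into a row-padded buffer. A small host-side mirror of that addressing, with hypothetical names, might look like this:

#include <cstddef>
#include <cstdint>

// Host-side mirror of the kernel's byte addressing (illustrative only):
// element (x, y) of a row-padded matrix whose rows are 'step' bytes apart,
// starting 'offset' bytes into the buffer.
template <typename T>
T& element_at(uint8_t* base, int x, int y, int step, int offset)
{
    std::size_t byte_index = (std::size_t)y * step + (std::size_t)x * sizeof(T) + offset;
    return *reinterpret_cast<T*>(base + byte_index);   // same cast the kernel performs
}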

#elif defined MASK_BINARY_OP

__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
                 __global const uchar* srcptr2, int srcstep2, int srcoffset2,
                 __global const uchar* mask, int maskstep, int maskoffset,
                 __global uchar* dstptr, int dststep, int dstoffset,
                 int rows, int cols EXTRA_PARAMS )
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int mask_index = mad24(y, maskstep, x + maskoffset);
        if( mask[mask_index] )
        {
            int src1_index = mad24(y, srcstep1, x*sizeof(srcT1) + srcoffset1);
            int src2_index = mad24(y, srcstep2, x*sizeof(srcT2) + srcoffset2);
            int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset);

            PROCESS_ELEM;
        }
    }
}

#elif defined UNARY_OP

__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
                 __global uchar* dstptr, int dststep, int dstoffset,
                 int rows, int cols EXTRA_PARAMS )
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int src1_index = mad24(y, srcstep1, x*sizeof(srcT1) + srcoffset1);
        int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset);

        PROCESS_ELEM;
    }
}

#elif defined MASK_UNARY_OP

__kernel void KF(__global const uchar* srcptr1, int srcstep1, int srcoffset1,
                 __global const uchar* mask, int maskstep, int maskoffset,
                 __global uchar* dstptr, int dststep, int dstoffset,
                 int rows, int cols EXTRA_PARAMS )
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int mask_index = mad24(y, maskstep, x + maskoffset);
        if( mask[mask_index] )
        {
            int src1_index = mad24(y, srcstep1, x*sizeof(srcT1) + srcoffset1);
            int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset);

            PROCESS_ELEM;
        }
    }
}

#else

#error "Unknown operation type"

#endif


@@ -0,0 +1,74 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the copyright holders or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

__kernel void setMask(__global const uchar* mask, int maskstep, int maskoffset,
                      __global uchar* dstptr, int dststep, int dstoffset,
                      int rows, int cols, dstT value )
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int mask_index = mad24(y, maskstep, x + maskoffset);
        if( mask[mask_index] )
        {
            int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset);
            *(dstT*)(dstptr + dst_index) = value;
        }
    }
}

__kernel void set(__global uchar* dstptr, int dststep, int dstoffset,
                  int rows, int cols, dstT value )
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset);
        *(dstT*)(dstptr + dst_index) = value;
    }
}
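
For reference, the CPU-side semantics of these two kernels (set every pixel, or only pixels where the mask is non-zero) fit in one short loop. The function below is an illustrative sketch under those assumptions, not part of the commit.

#include <cstdint>
#include <vector>

// Illustrative CPU reference for set/setMask: write 'value' into every element
// of a rows x cols matrix, optionally only where mask != 0.
template <typename T>
void set_with_optional_mask(std::vector<T>& dst, const std::vector<uint8_t>* mask,
                            int rows, int cols, T value)
{
    for (int y = 0; y < rows; ++y)
        for (int x = 0; x < cols; ++x)
            if (!mask || (*mask)[y * cols + x])   // no mask: unconditional set
                dst[y * cols + x] = value;
}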

@@ -0,0 +1,96 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

typedef float2 cfloat;
inline cfloat cmulf(cfloat a, cfloat b)
{
    return (cfloat)( a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
}

inline cfloat conjf(cfloat a)
{
    return (cfloat)( a.x, - a.y );
}

__kernel void
mulAndScaleSpectrumsKernel(
    __global const cfloat* a,
    __global const cfloat* b,
    float scale,
    __global cfloat* dst,
    uint cols,
    uint rows,
    uint mstep
)
{
    const uint x = get_global_id(0);
    const uint y = get_global_id(1);
    const uint idx = mad24(y, mstep / sizeof(cfloat), x);
    if (x < cols && y < rows)
    {
        cfloat v = cmulf(a[idx], b[idx]);
        dst[idx] = (cfloat)( v.x * scale, v.y * scale );
    }
}

__kernel void
mulAndScaleSpectrumsKernel_CONJ(
    __global const cfloat* a,
    __global const cfloat* b,
    float scale,
    __global cfloat* dst,
    uint cols,
    uint rows,
    uint mstep
)
{
    const uint x = get_global_id(0);
    const uint y = get_global_id(1);
    const uint idx = mad24(y, mstep / sizeof(cfloat), x);
    if (x < cols && y < rows)
    {
        cfloat v = cmulf(a[idx], conjf(b[idx]));
        dst[idx] = (cfloat)( v.x * scale, v.y * scale );
    }
}
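
Both kernels compute a per-element complex product, with the second operand conjugated in the _CONJ variant, and scale the result by a constant. A CPU reference of the same arithmetic, using std::complex for clarity, is sketched below; the function name and signature are invented for illustration.

#include <complex>
#include <vector>

// Illustrative CPU reference for mulAndScaleSpectrums:
// dst[i] = a[i] * b[i] * scale, with b[i] conjugated when conjB is true.
void mul_and_scale_spectrums(const std::vector<std::complex<float>>& a,
                             const std::vector<std::complex<float>>& b,
                             std::vector<std::complex<float>>& dst,
                             float scale, bool conjB)
{
    for (std::size_t i = 0; i < a.size(); ++i)
    {
        std::complex<float> bi = conjB ? std::conj(b[i]) : b[i];
        dst[i] = a[i] * bi * scale;
    }
}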
@@ -0,0 +1,73 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the copyright holders or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

__kernel void polarToCart(__global const uchar* mask, int maskstep, int maskoffset,
                          __global uchar* dstptr, int dststep, int dstoffset,
                          int rows, int cols, dstT value )
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int mask_index = mad24(y, maskstep, x + maskoffset);
        if( mask[mask_index] )
        {
            int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset);
            *(dstT*)(dstptr + dst_index) = value;
        }
    }
}

__kernel void cartToPolar(__global uchar* dstptr, int dststep, int dstoffset,
                          int rows, int cols, dstT value )
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (x < cols && y < rows)
    {
        int dst_index = mad24(y, dststep, x*sizeof(dstT) + dstoffset);
        *(dstT*)(dstptr + dst_index) = value;
    }
}
@@ -0,0 +1,104 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Shengen Yan, yanshengen@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif

#if FUNC_SUM
#define FUNC(a, b) b += a;
#elif FUNC_ABS_SUM
#define FUNC(a, b) b += a >= (dstT)(0) ? a : -a;
#elif FUNC_SQR_SUM
#define FUNC(a, b) b += a * a;
#else
#error No sum function
#endif

/**************************************Array buffer SUM**************************************/

__kernel void arithm_op_sum(int cols, int invalid_cols, int offset, int elemnum, int groupnum,
                            __global srcT *src, __global dstT *dst)
{
    unsigned int lid = get_local_id(0);
    unsigned int gid = get_group_id(0);
    unsigned int id = get_global_id(0);
    unsigned int idx = offset + id + (id / cols) * invalid_cols;

    __local dstT localmem_sum[128];
    dstT sum = (dstT)(0), temp;

    for (int grainSize = groupnum << 8; id < elemnum; id += grainSize)
    {
        idx = offset + id + (id / cols) * invalid_cols;
        temp = convertToDstT(src[idx]);
        FUNC(temp, sum);
    }

    if (lid > 127)
        localmem_sum[lid - 128] = sum;
    barrier(CLK_LOCAL_MEM_FENCE);

    if (lid < 128)
        localmem_sum[lid] = sum + localmem_sum[lid];
    barrier(CLK_LOCAL_MEM_FENCE);

    for (int lsize = 64; lsize > 0; lsize >>= 1)
    {
        if (lid < lsize)
        {
            int lid2 = lsize + lid;
            localmem_sum[lid] = localmem_sum[lid] + localmem_sum[lid2];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (lid == 0)
        dst[gid] = localmem_sum[0];
}
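
The kernel reduces in two phases: a grid-stride accumulation loop in which each work-item sums a strided subset of the input, followed by a shared-memory tree reduction in which the 256 work-items of a group first fold into 128 partial sums and then halve repeatedly. A serial C++ sketch of the same idea, with invented names, is shown below (the number of workers is assumed to be a power of two, like the 256-thread work-group).

#include <vector>

// Illustrative two-phase reduction mirroring arithm_op_sum:
// phase 1: each "work-item" accumulates a strided subset of the input;
// phase 2: the partial sums are folded pairwise until one value remains.
double strided_tree_sum(const std::vector<double>& src, int workers)
{
    std::vector<double> partial(workers, 0.0);
    for (int w = 0; w < workers; ++w)                       // phase 1: strided accumulation
        for (std::size_t i = w; i < src.size(); i += workers)
            partial[w] += src[i];

    for (int lsize = workers / 2; lsize > 0; lsize >>= 1)   // phase 2: tree reduction
        for (int lid = 0; lid < lsize; ++lid)
            partial[lid] += partial[lid + lsize];

    return partial[0];
}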
@@ -0,0 +1,145 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Rock Li, Rock.li@amd.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.

__kernel void bilateral_C1_D0(__global uchar *dst,
                              __global const uchar *src,
                              const int dst_rows,
                              const int dst_cols,
                              const int maxk,
                              const int radius,
                              const int dst_step,
                              const int dst_offset,
                              const int src_step,
                              const int src_rows,
                              const int src_cols,
                              __constant float *color_weight,
                              __constant float *space_weight,
                              __constant int *space_ofs)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (y < dst_rows && x < dst_cols)
    {
        int src_index = mad24(y + radius, src_step, x + radius);
        int dst_index = mad24(y, dst_step, x + dst_offset);
        float sum = 0.f, wsum = 0.f;

        int val0 = (int)src[src_index];
        for(int k = 0; k < maxk; k++ )
        {
            int val = (int)src[src_index + space_ofs[k]];
            float w = space_weight[k] * color_weight[abs(val - val0)];
            sum += (float)(val) * w;
            wsum += w;
        }
        dst[dst_index] = convert_uchar_rtz(sum / wsum + 0.5f);
    }
}

__kernel void bilateral2_C1_D0(__global uchar *dst,
                               __global const uchar *src,
                               const int dst_rows,
                               const int dst_cols,
                               const int maxk,
                               const int radius,
                               const int dst_step,
                               const int dst_offset,
                               const int src_step,
                               const int src_rows,
                               const int src_cols,
                               __constant float *color_weight,
                               __constant float *space_weight,
                               __constant int *space_ofs)
{
    int x = get_global_id(0) << 2;
    int y = get_global_id(1);

    if (y < dst_rows && x < dst_cols)
    {
        int src_index = mad24(y + radius, src_step, x + radius);
        int dst_index = mad24(y, dst_step, x + dst_offset);
        float4 sum = (float4)(0.f), wsum = (float4)(0.f);

        int4 val0 = convert_int4(vload4(0,src + src_index));
        for(int k = 0; k < maxk; k++ )
        {
            int4 val = convert_int4(vload4(0,src+src_index + space_ofs[k]));
            float4 w = (float4)(space_weight[k]) * (float4)(color_weight[abs(val.x - val0.x)], color_weight[abs(val.y - val0.y)],
                                                            color_weight[abs(val.z - val0.z)], color_weight[abs(val.w - val0.w)]);
            sum += convert_float4(val) * w;
            wsum += w;
        }
        *(__global uchar4*)(dst+dst_index) = convert_uchar4_rtz(sum/wsum+0.5f);
    }
}

__kernel void bilateral_C4_D0(__global uchar4 *dst,
                              __global const uchar4 *src,
                              const int dst_rows,
                              const int dst_cols,
                              const int maxk,
                              const int radius,
                              const int dst_step,
                              const int dst_offset,
                              const int src_step,
                              const int src_rows,
                              const int src_cols,
                              __constant float *color_weight,
                              __constant float *space_weight,
                              __constant int *space_ofs)
{
    int x = get_global_id(0);
    int y = get_global_id(1);

    if (y < dst_rows && x < dst_cols)
    {
        int src_index = mad24(y + radius, src_step, x + radius);
        int dst_index = mad24(y, dst_step, x + dst_offset);
        float4 sum = (float4)0.f;
        float wsum = 0.f;

        int4 val0 = convert_int4(src[src_index]);
        for(int k = 0; k < maxk; k++ )
        {
            int4 val = convert_int4(src[src_index + space_ofs[k]]);
            float w = space_weight[k] * color_weight[abs(val.x - val0.x) + abs(val.y - val0.y) + abs(val.z - val0.z)];
            sum += convert_float4(val) * (float4)w;
            wsum += w;
        }

        wsum = 1.f / wsum;
        dst[dst_index] = convert_uchar4_rtz(sum * (float4)wsum + (float4)0.5f);
    }
}
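
These kernels assume the host has precomputed a range-weight table (color_weight, indexed by absolute intensity difference), a spatial-weight table (space_weight) and the corresponding sample offsets (space_ofs) for every position inside the filter radius. A sketch of how such tables are typically built for a single-channel image is shown below; the Gaussian coefficients and all names are assumptions based on the usual bilateral-filter formulation, not taken from this commit.

#include <cmath>
#include <vector>

// Illustrative precomputation of the bilateral filter tables consumed above
// (assumed Gaussian weights; names invented for illustration).
void build_bilateral_tables(int radius, double sigma_color, double sigma_space, int src_step,
                            std::vector<float>& color_weight,   // indexed by |val - val0|
                            std::vector<float>& space_weight,   // one entry per sample offset
                            std::vector<int>& space_ofs)        // offset per sample, row-major
{
    double gauss_color_coeff = -0.5 / (sigma_color * sigma_color);
    double gauss_space_coeff = -0.5 / (sigma_space * sigma_space);

    color_weight.resize(256);
    for (int i = 0; i < 256; ++i)
        color_weight[i] = (float)std::exp(i * i * gauss_color_coeff);

    space_weight.clear();
    space_ofs.clear();
    for (int i = -radius; i <= radius; ++i)
        for (int j = -radius; j <= radius; ++j)
        {
            double r = std::sqrt((double)i * i + (double)j * j);
            if (r > radius)
                continue;                  // keep only samples inside the circular window
            space_weight.push_back((float)std::exp(r * r * gauss_space_coeff));
            space_ofs.push_back(i * src_step + j);
        }
}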
@@ -0,0 +1,478 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Zhang Ying, zhangying913@gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro for border type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef BORDER_REPLICATE
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge)  ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge)  ((i) < (t_edge) ? (t_edge) : (i))
#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (b_edge)-1 : (addr))
#endif

#ifdef BORDER_REFLECT
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
#define ADDR_L(i, l_edge, r_edge)  ((i) < (l_edge) ? -(i)-1 : (i))
#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge)  ((i) < (t_edge) ? -(i)-1 : (i))
#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
#endif

#ifdef BORDER_REFLECT_101
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba
#define ADDR_L(i, l_edge, r_edge)  ((i) < (l_edge) ? -(i) : (i))
#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge)  ((i) < (t_edge) ? -(i) : (i))
#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
#endif

//blur function does not support BORDER_WRAP
#ifdef BORDER_WRAP
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
#define ADDR_L(i, l_edge, r_edge)  ((i) < (l_edge) ? (i)+(r_edge) : (i))
#define ADDR_R(i, r_edge, addr)    ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
#define ADDR_H(i, t_edge, b_edge)  ((i) < (t_edge) ? (i)+(b_edge) : (i))
#define ADDR_B(i, b_edge, addr)    ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
#endif
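
The ADDR_* macros remap an out-of-range column or row index to a valid one according to the selected border mode; ADDR_L/ADDR_H handle the low edge and ADDR_R/ADDR_B the high edge. The small C++ helper below mirrors the BORDER_REFLECT_101 pair for a column index, purely as an illustration of the arithmetic.

// Illustrative C++ mirror of the BORDER_REFLECT_101 column remapping
// (gfedcb|abcdefgh|gfedcba): indices below 0 reflect around 0, indices at or
// beyond 'cols' reflect around cols-1.
inline int reflect101(int i, int cols)
{
    int addr = (i < 0) ? -i : i;                        // ADDR_L(i, 0, cols)
    addr = (i >= cols) ? -i - 2 + (cols << 1) : addr;   // ADDR_R(i, cols, addr)
    return addr;
}
// Example: reflect101(-1, 8) == 1 and reflect101(8, 8) == 6.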

#define THREADS 256
#define ELEM(i, l_edge, r_edge, elem1, elem2) (i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2)

inline void update_dst_C1_D0(__global uchar *dst, __local uint* temp,
                             int dst_rows, int dst_cols,
                             int dst_startX, int dst_x_off,
                             float alpha)
{
    if(get_local_id(0) < anX || get_local_id(0) >= (THREADS-ksX+anX+1))
    {
        return;
    }

    uint4 tmp_sum = 0;
    int posX = dst_startX - dst_x_off + (get_local_id(0)-anX)*4;
    int posY = (get_group_id(1) << 1);

    for(int i=-anX; i<=anX; i++)
    {
        tmp_sum += vload4(get_local_id(0), temp+i);
    }

    if(posY < dst_rows && posX < dst_cols)
    {
        tmp_sum /= (uint4) alpha;
        if(posX >= 0 && posX < dst_cols)
            *(dst) = tmp_sum.x;
        if(posX+1 >= 0 && posX+1 < dst_cols)
            *(dst + 1) = tmp_sum.y;
        if(posX+2 >= 0 && posX+2 < dst_cols)
            *(dst + 2) = tmp_sum.z;
        if(posX+3 >= 0 && posX+3 < dst_cols)
            *(dst + 3) = tmp_sum.w;
    }
}


inline void update_dst_C4_D0(__global uchar4 *dst, __local uint4* temp,
                             int dst_rows, int dst_cols,
                             int dst_startX, int dst_x_off,
                             float alpha)
{
    if(get_local_id(0) >= (THREADS-ksX+1))
    {
        return;
    }

    int posX = dst_startX - dst_x_off + get_local_id(0);
    int posY = (get_group_id(1) << 1);

    uint4 temp_sum = 0;
    for(int i=-anX; i<=anX; i++)
    {
        temp_sum += temp[get_local_id(0) + anX + i];
    }

    if(posX >= 0 && posX < dst_cols && posY >= 0 && posY < dst_rows)
        *dst = convert_uchar4(convert_float4(temp_sum)/alpha);
}

///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void boxFilter_C1_D0(__global const uchar * restrict src, __global uchar *dst, float alpha,
                              int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
                              int dst_offset, int dst_rows, int dst_cols, int dst_step
                              )
{

    int col = get_local_id(0);
    const int gX = get_group_id(0);
    const int gY = get_group_id(1);
    int src_x_off = src_offset % src_step;
    int src_y_off = src_offset / src_step;
    int dst_x_off = dst_offset % dst_step;
    int dst_y_off = dst_offset / dst_step;

    int head_off = dst_x_off%4;
    int startX = ((gX * (THREADS-ksX+1)-anX) * 4) - head_off + src_x_off;
    int startY = (gY << 1) - anY + src_y_off;
    int dst_startX = (gX * (THREADS-ksX+1) * 4) - head_off + dst_x_off;
    int dst_startY = (gY << 1) + dst_y_off;

    uint4 data[ksY+1];
    __local uint4 temp[2][THREADS];

#ifdef BORDER_CONSTANT

    for(int i=0; i < ksY+1; i++)
    {
        if(startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4+3<src_whole_cols)
        {
            data[i].x = *(src+(startY+i)*src_step + startX + col * 4);
            data[i].y = *(src+(startY+i)*src_step + startX + col * 4 + 1);
            data[i].z = *(src+(startY+i)*src_step + startX + col * 4 + 2);
            data[i].w = *(src+(startY+i)*src_step + startX + col * 4 + 3);
        }
        else
        {
            data[i]=0;
            int con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4 >=0 && startX+col*4<src_whole_cols;
            if(con)data[i].s0 = *(src+(startY+i)*src_step + startX + col*4);
            con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+1 >=0 && startX+col*4+1<src_whole_cols;
            if(con)data[i].s1 = *(src+(startY+i)*src_step + startX + col*4+1) ;
            con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+2 >=0 && startX+col*4+2<src_whole_cols;
            if(con)data[i].s2 = *(src+(startY+i)*src_step + startX + col*4+2);
            con = startY+i >=0 && startY+i < src_whole_rows && startX+col*4+3 >=0 && startX+col*4+3<src_whole_cols;
            if(con)data[i].s3 = *(src+(startY+i)*src_step + startX + col*4+3);
        }
    }

#else
    int not_all_in_range;
    for(int i=0; i < ksY+1; i++)
    {
        not_all_in_range = (startX+col*4<0) | (startX+col*4+3>src_whole_cols-1)
                           | (startY+i<0) | (startY+i>src_whole_rows-1);
        if(not_all_in_range)
        {
            int selected_row;
            int4 selected_col;
            selected_row = ADDR_H(startY+i, 0, src_whole_rows);
            selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);

            selected_col.x = ADDR_L(startX+col*4, 0, src_whole_cols);
            selected_col.x = ADDR_R(startX+col*4, src_whole_cols, selected_col.x);

            selected_col.y = ADDR_L(startX+col*4+1, 0, src_whole_cols);
            selected_col.y = ADDR_R(startX+col*4+1, src_whole_cols, selected_col.y);

            selected_col.z = ADDR_L(startX+col*4+2, 0, src_whole_cols);
            selected_col.z = ADDR_R(startX+col*4+2, src_whole_cols, selected_col.z);

            selected_col.w = ADDR_L(startX+col*4+3, 0, src_whole_cols);
            selected_col.w = ADDR_R(startX+col*4+3, src_whole_cols, selected_col.w);

            data[i].x = *(src + selected_row * src_step + selected_col.x);
            data[i].y = *(src + selected_row * src_step + selected_col.y);
            data[i].z = *(src + selected_row * src_step + selected_col.z);
            data[i].w = *(src + selected_row * src_step + selected_col.w);
        }
        else
        {
            data[i] = convert_uint4(vload4(col,(__global uchar*)(src+(startY+i)*src_step + startX)));
        }
    }
#endif
    uint4 tmp_sum = 0;
    for(int i=1; i < ksY; i++)
    {
        tmp_sum += (data[i]);
    }

    int index = dst_startY * dst_step + dst_startX + (col-anX)*4;

    temp[0][col] = tmp_sum + (data[0]);
    temp[1][col] = tmp_sum + (data[ksY]);
    barrier(CLK_LOCAL_MEM_FENCE);
    update_dst_C1_D0(dst+index, (__local uint *)(temp[0]),
                     dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
    update_dst_C1_D0(dst+index+dst_step, (__local uint *)(temp[1]),
                     dst_rows, dst_cols, dst_startX, dst_x_off, alpha);

}

///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////8uC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void boxFilter_C4_D0(__global const uchar4 * restrict src, __global uchar4 *dst, float alpha,
                              int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
                              int dst_offset, int dst_rows, int dst_cols, int dst_step
                              )
{
    int col = get_local_id(0);
    const int gX = get_group_id(0);
    const int gY = get_group_id(1);

    int src_x_off = (src_offset % src_step) >> 2;
    int src_y_off = src_offset / src_step;
    int dst_x_off = (dst_offset % dst_step) >> 2;
    int dst_y_off = dst_offset / dst_step;

    int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
    int startY = (gY << 1) - anY + src_y_off;
    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
    int dst_startY = (gY << 1) + dst_y_off;

    uint4 data[ksY+1];
    __local uint4 temp[2][THREADS];

#ifdef BORDER_CONSTANT
    bool con;
    for(int i=0; i < ksY+1; i++)
    {
        con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;
        int cur_col = clamp(startX + col, 0, src_whole_cols);

        data[i].x = con ? src[(startY+i)*(src_step>>2) + cur_col].x : 0;
        data[i].y = con ? src[(startY+i)*(src_step>>2) + cur_col].y : 0;
        data[i].z = con ? src[(startY+i)*(src_step>>2) + cur_col].z : 0;
        data[i].w = con ? src[(startY+i)*(src_step>>2) + cur_col].w : 0;
    }
#else
    for(int i=0; i < ksY+1; i++)
    {
        int selected_row;
        int selected_col;
        selected_row = ADDR_H(startY+i, 0, src_whole_rows);
        selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);

        selected_col = ADDR_L(startX+col, 0, src_whole_cols);
        selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);

        data[i] = convert_uint4(src[selected_row * (src_step>>2) + selected_col]);
    }

#endif
    uint4 tmp_sum = 0;
    for(int i=1; i < ksY; i++)
    {
        tmp_sum += (data[i]);
    }

    int index = dst_startY * (dst_step>>2)+ dst_startX + col;

    temp[0][col] = tmp_sum + (data[0]);
    temp[1][col] = tmp_sum + (data[ksY]);
    barrier(CLK_LOCAL_MEM_FENCE);
    update_dst_C4_D0(dst+index, (__local uint4 *)(temp[0]),
                     dst_rows, dst_cols, dst_startX, dst_x_off, alpha);
    update_dst_C4_D0(dst+index+(dst_step>>2), (__local uint4 *)(temp[1]),
                     dst_rows, dst_cols, dst_startX, dst_x_off, alpha);

}

///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32fC1////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void boxFilter_C1_D5(__global const float *restrict src, __global float *dst, float alpha,
                              int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
                              int dst_offset, int dst_rows, int dst_cols, int dst_step
                              )
{
    int col = get_local_id(0);
    const int gX = get_group_id(0);
    const int gY = get_group_id(1);

    int src_x_off = (src_offset % src_step) >> 2;
    int src_y_off = src_offset / src_step;
    int dst_x_off = (dst_offset % dst_step) >> 2;
    int dst_y_off = dst_offset / dst_step;

    int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
    int startY = (gY << 1) - anY + src_y_off;
    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
    int dst_startY = (gY << 1) + dst_y_off;
    float data[ksY+1];
    __local float temp[2][THREADS];
#ifdef BORDER_CONSTANT
    bool con;
    float ss;
    for(int i=0; i < ksY+1; i++)
    {
        con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;

        int cur_col = clamp(startX + col, 0, src_whole_cols);
        ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>2) + cur_col]:(float)0;

        data[i] = con ? ss : 0.f;
    }
#else
    for(int i=0; i < ksY+1; i++)
    {
        int selected_row;
        int selected_col;
        selected_row = ADDR_H(startY+i, 0, src_whole_rows);
        selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);

        selected_col = ADDR_L(startX+col, 0, src_whole_cols);
        selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);

        data[i] = src[selected_row * (src_step>>2) + selected_col];
    }

#endif
    float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
    for(int i=1; i < ksY; i++)
    {
        sum0 += (data[i]);
    }
    sum1 = sum0 + (data[0]);
    sum2 = sum0 + (data[ksY]);
    temp[0][col] = sum1;
    temp[1][col] = sum2;
    barrier(CLK_LOCAL_MEM_FENCE);
    if(col < (THREADS-(ksX-1)))
    {
        col += anX;
        int posX = dst_startX - dst_x_off + col - anX;
        int posY = (gY << 1);

        float tmp_sum[2]= {0.0, 0.0};
        for(int k=0; k<2; k++)
            for(int i=-anX; i<=anX; i++)
            {
                tmp_sum[k] += temp[k][col+i];
            }
        for(int i=0; i<2; i++)
        {
            if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
                dst[(dst_startY+i) * (dst_step>>2)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
        }

    }
}

///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////32fC4////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void boxFilter_C4_D5(__global const float4 *restrict src, __global float4 *dst, float alpha,
                              int src_offset, int src_whole_rows, int src_whole_cols, int src_step,
                              int dst_offset, int dst_rows, int dst_cols, int dst_step
                              )
{
    int col = get_local_id(0);
    const int gX = get_group_id(0);
    const int gY = get_group_id(1);

    int src_x_off = (src_offset % src_step) >> 4;
    int src_y_off = src_offset / src_step;
    int dst_x_off = (dst_offset % dst_step) >> 4;
    int dst_y_off = dst_offset / dst_step;

    int startX = gX * (THREADS-ksX+1) - anX + src_x_off;
    int startY = (gY << 1) - anY + src_y_off;
    int dst_startX = gX * (THREADS-ksX+1) + dst_x_off;
    int dst_startY = (gY << 1) + dst_y_off;
    float4 data[ksY+1];
    __local float4 temp[2][THREADS];
#ifdef BORDER_CONSTANT
    bool con;
    float4 ss;
    for(int i=0; i < ksY+1; i++)
    {
        con = startX+col >= 0 && startX+col < src_whole_cols && startY+i >= 0 && startY+i < src_whole_rows;

        int cur_col = clamp(startX + col, 0, src_whole_cols);
        ss = (startY+i)<src_whole_rows&&(startY+i)>=0&&cur_col>=0&&cur_col<src_whole_cols?src[(startY+i)*(src_step>>4) + cur_col]:(float4)0;

        data[i] = con ? ss : (float4)(0.0,0.0,0.0,0.0);
    }
#else
    for(int i=0; i < ksY+1; i++)
    {
        int selected_row;
        int selected_col;
        selected_row = ADDR_H(startY+i, 0, src_whole_rows);
        selected_row = ADDR_B(startY+i, src_whole_rows, selected_row);

        selected_col = ADDR_L(startX+col, 0, src_whole_cols);
        selected_col = ADDR_R(startX+col, src_whole_cols, selected_col);

        data[i] = src[selected_row * (src_step>>4) + selected_col];
    }

#endif
    float4 sum0 = 0.0, sum1 = 0.0, sum2 = 0.0;
    for(int i=1; i < ksY; i++)
    {
        sum0 += (data[i]);
    }
    sum1 = sum0 + (data[0]);
    sum2 = sum0 + (data[ksY]);
    temp[0][col] = sum1;
    temp[1][col] = sum2;
    barrier(CLK_LOCAL_MEM_FENCE);
    if(col < (THREADS-(ksX-1)))
    {
        col += anX;
        int posX = dst_startX - dst_x_off + col - anX;
        int posY = (gY << 1);

        float4 tmp_sum[2]= {(float4)(0.0,0.0,0.0,0.0), (float4)(0.0,0.0,0.0,0.0)};
        for(int k=0; k<2; k++)
            for(int i=-anX; i<=anX; i++)
            {
                tmp_sum[k] += temp[k][col+i];
            }
        for(int i=0; i<2; i++)
        {
            if(posX >= 0 && posX < dst_cols && (posY+i) >= 0 && (posY+i) < dst_rows)
                dst[(dst_startY+i) * (dst_step>>4)+ dst_startX + col - anX] = tmp_sum[i]/alpha;
        }

    }
}
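
All four variants share one scheme: each work-item loads a column of ksY+1 input rows, keeps two running vertical sums (for the two output rows a work-group produces), shares them through local memory, and then each output pixel sums 2*anX+1 neighbouring columns and divides by alpha. A plain C++ reference of that separable accumulation is sketched below; it assumes alpha is the kernel area (ksX*ksY) for a normalized filter and ignores border handling for brevity.

#include <vector>

// Illustrative CPU reference for the box filter: vertical sums per column,
// then horizontal sums per pixel, finally divided by alpha (assumed ksX*ksY
// for a normalized filter). Border pixels are skipped for brevity.
void box_filter_reference(const std::vector<float>& src, std::vector<float>& dst,
                          int rows, int cols, int anX, int anY, float alpha)
{
    std::vector<float> colsum(rows * cols, 0.f);
    for (int y = anY; y < rows - anY; ++y)          // vertical pass
        for (int x = 0; x < cols; ++x)
            for (int i = -anY; i <= anY; ++i)
                colsum[y * cols + x] += src[(y + i) * cols + x];

    for (int y = anY; y < rows - anY; ++y)          // horizontal pass + normalization
        for (int x = anX; x < cols - anX; ++x)
        {
            float s = 0.f;
            for (int i = -anX; i <= anX; ++i)
                s += colsum[y * cols + x + i];
            dst[y * cols + x] = s / alpha;
        }
}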
@@ -0,0 +1,636 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Peng Xiao, pengxiao@multicorewareinc.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable |
||||
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable |
||||
|
||||
#ifdef L2GRAD |
||||
inline float calc(int x, int y) |
||||
{ |
||||
return sqrt((float)(x * x + y * y)); |
||||
} |
||||
#else |
||||
inline float calc(int x, int y) |
||||
{ |
||||
return (float)abs(x) + abs(y); |
||||
} |
||||
#endif // L2GRAD |
||||
|
||||
// Smoothing perpendicular to the derivative direction with a triangle filter |
||||
// only supports a 3x3 Sobel kernel |
||||
// h (-1) = 1, h (0) = 2, h (1) = 1 |
||||
// h'(-1) = -1, h'(0) = 0, h'(1) = 1 |
||||
// thus the Sobel 2D operator can be calculated as: |
||||
// h'(x, y) = h'(x)h(y) for the x direction, and h(x)h'(y) for the y direction |
||||
// |
||||
// src input 8bit single channel image data |
||||
// dx_buf output dx buffer |
||||
// dy_buf output dy buffer |
||||
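// Illustrative sketch (assumption, not referenced by the kernels in this file):
// the full 3x3 Sobel response at a pixel can be composed from the row-pass
// values of the rows above, at and below it, in the same way calcMagnitude_buf
// combines them further down. rdy_mid is unused and kept only for symmetry.
inline int2 sobel3x3_from_row_pass(int rdx_up, int rdx_mid, int rdx_down,
                                   int rdy_up, int rdy_mid, int rdy_down)
{
    // x derivative: smooth the row-pass dx vertically with the triangle filter [1 2 1]
    int gx = rdx_up + 2 * rdx_mid + rdx_down;
    // y derivative: differentiate the row-pass dy vertically with [-1 0 1]
    int gy = -rdy_up + rdy_down;
    return (int2)(gx, gy);
}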
__kernel |
||||
void |
||||
__attribute__((reqd_work_group_size(16,16,1))) |
||||
calcSobelRowPass |
||||
( |
||||
__global const uchar * src, |
||||
__global int * dx_buf, |
||||
__global int * dy_buf, |
||||
int rows, |
||||
int cols, |
||||
int src_step, |
||||
int src_offset, |
||||
int dx_buf_step, |
||||
int dx_buf_offset, |
||||
int dy_buf_step, |
||||
int dy_buf_offset |
||||
) |
||||
{ |
||||
dx_buf_step /= sizeof(*dx_buf); |
||||
dx_buf_offset /= sizeof(*dx_buf); |
||||
dy_buf_step /= sizeof(*dy_buf); |
||||
dy_buf_offset /= sizeof(*dy_buf); |
||||
|
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
|
||||
int lidx = get_local_id(0); |
||||
int lidy = get_local_id(1); |
||||
|
||||
__local int smem[16][18]; |
||||
|
||||
smem[lidy][lidx + 1] = |
||||
src[gidx + min(gidy, rows - 1) * src_step + src_offset]; |
||||
if(lidx == 0) |
||||
{ |
||||
smem[lidy][0] = |
||||
src[max(gidx - 1, 0) + min(gidy, rows - 1) * src_step + src_offset]; |
||||
smem[lidy][17] = |
||||
src[min(gidx + 16, cols - 1) + min(gidy, rows - 1) * src_step + src_offset]; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if(gidy < rows && gidx < cols) |
||||
{ |
||||
dx_buf[gidx + gidy * dx_buf_step + dx_buf_offset] = |
||||
-smem[lidy][lidx] + smem[lidy][lidx + 2]; |
||||
dy_buf[gidx + gidy * dy_buf_step + dy_buf_offset] = |
||||
smem[lidy][lidx] + 2 * smem[lidy][lidx + 1] + smem[lidy][lidx + 2]; |
||||
} |
||||
} |
||||
|
||||
// calculate the magnitude of the filter pass combining both x and y directions |
||||
// This is the buffered version (3x3 Sobel) |
||||
// |
||||
// dx_buf dx buffer, calculated from calcSobelRowPass |
||||
// dy_buf dy buffer, calculated from calcSobelRowPass |
||||
// dx derivative in x direction, output |
||||
// dy derivative in y direction, output |
||||
// mag gradient magnitude, output |
||||
__kernel |
||||
void |
||||
__attribute__((reqd_work_group_size(16,16,1))) |
||||
calcMagnitude_buf |
||||
( |
||||
__global const int * dx_buf, |
||||
__global const int * dy_buf, |
||||
__global int * dx, |
||||
__global int * dy, |
||||
__global float * mag, |
||||
int rows, |
||||
int cols, |
||||
int dx_buf_step, |
||||
int dx_buf_offset, |
||||
int dy_buf_step, |
||||
int dy_buf_offset, |
||||
int dx_step, |
||||
int dx_offset, |
||||
int dy_step, |
||||
int dy_offset, |
||||
int mag_step, |
||||
int mag_offset |
||||
) |
||||
{ |
||||
dx_buf_step /= sizeof(*dx_buf); |
||||
dx_buf_offset /= sizeof(*dx_buf); |
||||
dy_buf_step /= sizeof(*dy_buf); |
||||
dy_buf_offset /= sizeof(*dy_buf); |
||||
dx_step /= sizeof(*dx); |
||||
dx_offset /= sizeof(*dx); |
||||
dy_step /= sizeof(*dy); |
||||
dy_offset /= sizeof(*dy); |
||||
mag_step /= sizeof(*mag); |
||||
mag_offset /= sizeof(*mag); |
||||
|
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
|
||||
int lidx = get_local_id(0); |
||||
int lidy = get_local_id(1); |
||||
|
||||
__local int sdx[18][16]; |
||||
__local int sdy[18][16]; |
||||
|
||||
sdx[lidy + 1][lidx] = |
||||
dx_buf[gidx + min(gidy, rows - 1) * dx_buf_step + dx_buf_offset]; |
||||
sdy[lidy + 1][lidx] = |
||||
dy_buf[gidx + min(gidy, rows - 1) * dy_buf_step + dy_buf_offset]; |
||||
if(lidy == 0) |
||||
{ |
||||
sdx[0][lidx] = |
||||
dx_buf[gidx + min(max(gidy-1,0),rows-1) * dx_buf_step + dx_buf_offset]; |
||||
sdx[17][lidx] = |
||||
dx_buf[gidx + min(gidy + 16, rows - 1) * dx_buf_step + dx_buf_offset]; |
||||
|
||||
sdy[0][lidx] = |
||||
dy_buf[gidx + min(max(gidy-1,0),rows-1) * dy_buf_step + dy_buf_offset]; |
||||
sdy[17][lidx] = |
||||
dy_buf[gidx + min(gidy + 16, rows - 1) * dy_buf_step + dy_buf_offset]; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if(gidx < cols && gidy < rows) |
||||
{ |
||||
int x = sdx[lidy][lidx] + 2 * sdx[lidy + 1][lidx] + sdx[lidy + 2][lidx]; |
||||
int y = -sdy[lidy][lidx] + sdy[lidy + 2][lidx]; |
||||
|
||||
dx[gidx + gidy * dx_step + dx_offset] = x; |
||||
dy[gidx + gidy * dy_step + dy_offset] = y; |
||||
|
||||
mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] = calc(x, y); |
||||
} |
||||
} |
||||
|
||||
// calculate the magnitude of the filter pass combining both x and y directions |
||||
// This is the non-buffered version (non-3x3 Sobel) |
||||
// |
||||
// dx derivative in x direction, input |
||||
// dy derivative in y direction, input |
||||
// mag gradient magnitude, output |
||||
__kernel |
||||
void calcMagnitude |
||||
( |
||||
__global const int * dx, |
||||
__global const int * dy, |
||||
__global float * mag, |
||||
int rows, |
||||
int cols, |
||||
int dx_step, |
||||
int dx_offset, |
||||
int dy_step, |
||||
int dy_offset, |
||||
int mag_step, |
||||
int mag_offset |
||||
) |
||||
{ |
||||
dx_step /= sizeof(*dx); |
||||
dx_offset /= sizeof(*dx); |
||||
dy_step /= sizeof(*dy); |
||||
dy_offset /= sizeof(*dy); |
||||
mag_step /= sizeof(*mag); |
||||
mag_offset /= sizeof(*mag); |
||||
|
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
|
||||
if(gidy < rows && gidx < cols) |
||||
{ |
||||
mag[(gidx + 1) + (gidy + 1) * mag_step + mag_offset] = |
||||
calc( |
||||
dx[gidx + gidy * dx_step + dx_offset], |
||||
dy[gidx + gidy * dy_step + dy_offset] |
||||
); |
||||
} |
||||
} |
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////// |
||||
// 0.4142135623730950488016887242097 is tan(22.5) |
||||
#define CANNY_SHIFT 15 |
||||
#define TG22 (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5) |
||||
|
||||
//First pass of edge detection and non-maximum suppression |
||||
// edge type is set for each pixel: |
||||
// 0 - below low threshold, not an edge |
||||
// 1 - maybe an edge |
||||
// 2 - is an edge (magnitude is greater than the high threshold) |
||||
// Given estimates of the image gradients, a search is then carried out |
||||
// to determine if the gradient magnitude assumes a local maximum in the gradient direction. |
||||
// if the rounded gradient angle is zero degrees (i.e. the edge is in the north-south direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the west and east directions, |
||||
// if the rounded gradient angle is 90 degrees (i.e. the edge is in the east-west direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north and south directions, |
||||
// if the rounded gradient angle is 135 degrees (i.e. the edge is in the north east-south west direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north west and south east directions, |
||||
// if the rounded gradient angle is 45 degrees (i.e. the edge is in the north west-south east direction) the point will be considered to be on the edge if its gradient magnitude is greater than the magnitudes in the north east and south west directions. |
||||
// |
||||
// dx, dy derivatives in the x and y directions |
||||
// mag magnitudes calculated from calcMagnitude function |
||||
// map output containing raw edge types |
||||
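// Illustrative sketch (assumption, not referenced below): the sector test that
// calcMap performs, written as a helper. tan(67.5) == tan(22.5) + 2, which is
// why tg67x is obtained from tg22x plus x shifted left by (1 + CANNY_SHIFT).
inline int cannySector(int dx, int dy)
{
    int x = abs(dx);
    int y = abs(dy) << CANNY_SHIFT;
    const int tg22x = x * TG22;
    const int tg67x = tg22x + (x << (1 + CANNY_SHIFT));
    if (y < tg22x)
        return 0;   // near-horizontal gradient: compare with west/east neighbours
    if (y > tg67x)
        return 1;   // near-vertical gradient: compare with north/south neighbours
    return 2;       // diagonal: compare along the signed diagonal
}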
__kernel |
||||
void |
||||
__attribute__((reqd_work_group_size(16,16,1))) |
||||
calcMap |
||||
( |
||||
__global const int * dx, |
||||
__global const int * dy, |
||||
__global const float * mag, |
||||
__global int * map, |
||||
int rows, |
||||
int cols, |
||||
float low_thresh, |
||||
float high_thresh, |
||||
int dx_step, |
||||
int dx_offset, |
||||
int dy_step, |
||||
int dy_offset, |
||||
int mag_step, |
||||
int mag_offset, |
||||
int map_step, |
||||
int map_offset |
||||
) |
||||
{ |
||||
dx_step /= sizeof(*dx); |
||||
dx_offset /= sizeof(*dx); |
||||
dy_step /= sizeof(*dy); |
||||
dy_offset /= sizeof(*dy); |
||||
mag_step /= sizeof(*mag); |
||||
mag_offset /= sizeof(*mag); |
||||
map_step /= sizeof(*map); |
||||
map_offset /= sizeof(*map); |
||||
|
||||
mag += mag_offset; |
||||
map += map_offset; |
||||
|
||||
__local float smem[18][18]; |
||||
|
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
|
||||
int lidx = get_local_id(0); |
||||
int lidy = get_local_id(1); |
||||
|
||||
int grp_idx = get_global_id(0) & 0xFFFFF0; |
||||
int grp_idy = get_global_id(1) & 0xFFFFF0; |
||||
|
||||
int tid = lidx + lidy * 16; |
||||
int lx = tid % 18; |
||||
int ly = tid / 18; |
||||
if(ly < 14) |
||||
{ |
||||
smem[ly][lx] = |
||||
mag[grp_idx + lx + min(grp_idy + ly, rows - 1) * mag_step]; |
||||
} |
||||
if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols) |
||||
{ |
||||
smem[ly + 14][lx] = |
||||
mag[grp_idx + lx + min(grp_idy + ly + 14, rows -1) * mag_step]; |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if(gidy < rows && gidx < cols) |
||||
{ |
||||
int x = dx[gidx + gidy * dx_step]; |
||||
int y = dy[gidx + gidy * dy_step]; |
||||
const int s = (x ^ y) < 0 ? -1 : 1; |
||||
const float m = smem[lidy + 1][lidx + 1]; |
||||
x = abs(x); |
||||
y = abs(y); |
||||
|
||||
// 0 - the pixel can not belong to an edge |
||||
// 1 - the pixel might belong to an edge |
||||
// 2 - the pixel does belong to an edge |
||||
int edge_type = 0; |
||||
if(m > low_thresh) |
||||
{ |
||||
const int tg22x = x * TG22; |
||||
const int tg67x = tg22x + (x << (1 + CANNY_SHIFT)); |
||||
y <<= CANNY_SHIFT; |
||||
if(y < tg22x) |
||||
{ |
||||
if(m > smem[lidy + 1][lidx] && m >= smem[lidy + 1][lidx + 2]) |
||||
{ |
||||
edge_type = 1 + (int)(m > high_thresh); |
||||
} |
||||
} |
||||
else if (y > tg67x) |
||||
{ |
||||
if(m > smem[lidy][lidx + 1]&& m >= smem[lidy + 2][lidx + 1]) |
||||
{ |
||||
edge_type = 1 + (int)(m > high_thresh); |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
if(m > smem[lidy][lidx + 1 - s]&& m > smem[lidy + 2][lidx + 1 + s]) |
||||
{ |
||||
edge_type = 1 + (int)(m > high_thresh); |
||||
} |
||||
} |
||||
} |
||||
map[gidx + 1 + (gidy + 1) * map_step] = edge_type; |
||||
} |
||||
} |
||||
|
||||
#undef CANNY_SHIFT |
||||
#undef TG22 |
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////// |
||||
// do Hysteresis for pixel whose edge type is 1 |
||||
// |
||||
// If a candidate pixel (edge type 1) has a neighbouring pixel (in its 3x3 area) of type 2, it is considered part of an edge and |
||||
// marked as such. Each thread iterates 16 times to connect local edges. |
||||
// A candidate pixel that has been identified as an edge is then tested for nearby potential edge points. If any are found, the counter |
||||
// is incremented and the point location is stored. These potential candidates are processed further in the next kernel. |
||||
// |
||||
// map raw edge type results calculated from calcMap. |
||||
// st the potential edge points found in this kernel call |
||||
// counter the number of potential edge points |
||||
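// Illustrative sketch (assumption, not referenced below): the 3x3 neighbourhood
// test that edgesHysteresisLocal unrolls by hand. 'pitch' is the row length of
// the local tile (18 here) and 'label' is the edge type being counted
// (2 for confirmed edges, 1 for candidates).
inline int countNeighbours(__local const int * tile, int pitch, int y, int x, int label)
{
    int n = 0;
    for (int dy = -1; dy <= 1; ++dy)
        for (int dx = -1; dx <= 1; ++dx)
            if (dy != 0 || dx != 0)
                n += tile[mad24(y + dy, pitch, x + dx)] == label;
    return n;
}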
__kernel |
||||
void |
||||
__attribute__((reqd_work_group_size(16,16,1))) |
||||
edgesHysteresisLocal |
||||
( |
||||
__global int * map, |
||||
__global ushort2 * st, |
||||
__global unsigned int * counter, |
||||
int rows, |
||||
int cols, |
||||
int map_step, |
||||
int map_offset |
||||
) |
||||
{ |
||||
map_step /= sizeof(*map); |
||||
map_offset /= sizeof(*map); |
||||
|
||||
map += map_offset; |
||||
|
||||
__local int smem[18][18]; |
||||
|
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
|
||||
int lidx = get_local_id(0); |
||||
int lidy = get_local_id(1); |
||||
|
||||
int grp_idx = get_global_id(0) & 0xFFFFF0; |
||||
int grp_idy = get_global_id(1) & 0xFFFFF0; |
||||
|
||||
int tid = lidx + lidy * 16; |
||||
int lx = tid % 18; |
||||
int ly = tid / 18; |
||||
if(ly < 14) |
||||
{ |
||||
smem[ly][lx] = |
||||
map[grp_idx + lx + min(grp_idy + ly, rows - 1) * map_step]; |
||||
} |
||||
if(ly < 4 && grp_idy + ly + 14 <= rows && grp_idx + lx <= cols) |
||||
{ |
||||
smem[ly + 14][lx] = |
||||
map[grp_idx + lx + min(grp_idy + ly + 14, rows - 1) * map_step]; |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if(gidy < rows && gidx < cols) |
||||
{ |
||||
int n; |
||||
|
||||
#pragma unroll |
||||
for (int k = 0; k < 16; ++k) |
||||
{ |
||||
n = 0; |
||||
|
||||
if (smem[lidy + 1][lidx + 1] == 1) |
||||
{ |
||||
n += smem[lidy ][lidx ] == 2; |
||||
n += smem[lidy ][lidx + 1] == 2; |
||||
n += smem[lidy ][lidx + 2] == 2; |
||||
|
||||
n += smem[lidy + 1][lidx ] == 2; |
||||
n += smem[lidy + 1][lidx + 2] == 2; |
||||
|
||||
n += smem[lidy + 2][lidx ] == 2; |
||||
n += smem[lidy + 2][lidx + 1] == 2; |
||||
n += smem[lidy + 2][lidx + 2] == 2; |
||||
} |
||||
|
||||
if (n > 0) |
||||
smem[lidy + 1][lidx + 1] = 2; |
||||
} |
||||
|
||||
const int e = smem[lidy + 1][lidx + 1]; |
||||
map[gidx + 1 + (gidy + 1) * map_step] = e; |
||||
|
||||
n = 0; |
||||
if(e == 2) |
||||
{ |
||||
n += smem[lidy ][lidx ] == 1; |
||||
n += smem[lidy ][lidx + 1] == 1; |
||||
n += smem[lidy ][lidx + 2] == 1; |
||||
|
||||
n += smem[lidy + 1][lidx ] == 1; |
||||
n += smem[lidy + 1][lidx + 2] == 1; |
||||
|
||||
n += smem[lidy + 2][lidx ] == 1; |
||||
n += smem[lidy + 2][lidx + 1] == 1; |
||||
n += smem[lidy + 2][lidx + 2] == 1; |
||||
} |
||||
|
||||
if(n > 0) |
||||
{ |
||||
unsigned int ind = atomic_inc(counter); |
||||
st[ind] = (ushort2)(gidx + 1, gidy + 1); |
||||
} |
||||
} |
||||
} |
||||
|
||||
__constant int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; |
||||
__constant int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; |
||||
|
||||
|
||||
#define stack_size 512 |
||||
__kernel |
||||
void |
||||
__attribute__((reqd_work_group_size(128,1,1))) |
||||
edgesHysteresisGlobal |
||||
( |
||||
__global int * map, |
||||
__global ushort2 * st1, |
||||
__global ushort2 * st2, |
||||
__global int * counter, |
||||
int rows, |
||||
int cols, |
||||
int count, |
||||
int map_step, |
||||
int map_offset |
||||
) |
||||
{ |
||||
|
||||
map_step /= sizeof(*map); |
||||
map_offset /= sizeof(*map); |
||||
|
||||
map += map_offset; |
||||
|
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
|
||||
int lidx = get_local_id(0); |
||||
int lidy = get_local_id(1); |
||||
|
||||
int grp_idx = get_group_id(0); |
||||
int grp_idy = get_group_id(1); |
||||
|
||||
__local unsigned int s_counter; |
||||
__local unsigned int s_ind; |
||||
|
||||
__local ushort2 s_st[stack_size]; |
||||
|
||||
if(lidx == 0) |
||||
{ |
||||
s_counter = 0; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int ind = mad24(grp_idy, (int)get_local_size(0), grp_idx); |
||||
|
||||
if(ind < count) |
||||
{ |
||||
ushort2 pos = st1[ind]; |
||||
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) |
||||
{ |
||||
if (lidx < 8) |
||||
{ |
||||
pos.x += c_dx[lidx]; |
||||
pos.y += c_dy[lidx]; |
||||
|
||||
if (map[pos.x + pos.y * map_step] == 1) |
||||
{ |
||||
map[pos.x + pos.y * map_step] = 2; |
||||
|
||||
ind = atomic_inc(&s_counter); |
||||
|
||||
s_st[ind] = pos; |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
while (s_counter > 0 && s_counter <= stack_size - get_local_size(0)) |
||||
{ |
||||
const int subTaskIdx = lidx >> 3; |
||||
const int portion = min(s_counter, (uint)(get_local_size(0)>> 3)); |
||||
|
||||
pos.x = pos.y = 0; |
||||
|
||||
if (subTaskIdx < portion) |
||||
pos = s_st[s_counter - 1 - subTaskIdx]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (lidx == 0) |
||||
s_counter -= portion; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (pos.x > 0 && pos.x <= cols && pos.y > 0 && pos.y <= rows) |
||||
{ |
||||
pos.x += c_dx[lidx & 7]; |
||||
pos.y += c_dy[lidx & 7]; |
||||
|
||||
if (map[pos.x + pos.y * map_step] == 1) |
||||
{ |
||||
map[pos.x + pos.y * map_step] = 2; |
||||
|
||||
ind = atomic_inc(&s_counter); |
||||
|
||||
s_st[ind] = pos; |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
|
||||
if (s_counter > 0) |
||||
{ |
||||
if (lidx == 0) |
||||
{ |
||||
ind = atomic_add(counter, s_counter); |
||||
s_ind = ind - s_counter; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
ind = s_ind; |
||||
|
||||
for (int i = lidx; i < s_counter; i += get_local_size(0)) |
||||
{ |
||||
st2[ind + i] = s_st[i]; |
||||
} |
||||
} |
||||
} |
||||
} |
||||
} |
||||
#undef stack_size |
||||
|
||||
//Get the edge result. Pixels whose edge type is 2 are marked as edge points and set to 255; all others are set to 0. |
||||
// map edge type mappings |
||||
// dst edge output |
||||
__kernel |
||||
void getEdges |
||||
( |
||||
__global const int * map, |
||||
__global uchar * dst, |
||||
int rows, |
||||
int cols, |
||||
int map_step, |
||||
int map_offset, |
||||
int dst_step, |
||||
int dst_offset |
||||
) |
||||
{ |
||||
map_step /= sizeof(*map); |
||||
map_offset /= sizeof(*map); |
||||
|
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
|
||||
if(gidy < rows && gidx < cols) |
||||
{ |
||||
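// Note (illustrative): the expression below maps the final edge type to the
// output value without a branch:
//   type 2: 2 >> 1 == 1, -(1) == -1, and the cast to uchar yields 255
//   type 0 or 1: the shift yields 0, so the output stays 0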
dst[gidx + gidy * dst_step] = (uchar)(-(map[gidx + 1 + (gidy + 1) * map_step + map_offset] >> 1)); |
||||
} |
||||
} |
@ -0,0 +1,255 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Sen Liu, swjtuls1987@126.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#ifndef WAVE_SIZE |
||||
#define WAVE_SIZE 1 |
||||
#endif |
||||
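// Illustrative note (assumption): WAVE_SIZE is expected to be defined by the
// host as the SIMD width the kernel is compiled for. In the GPU variant of
// reduce() below, the intermediate barriers are only compiled in while the
// number of active work-items still exceeds the wavefront width; once the
// remaining reduction fits inside one wavefront the work-items run in
// lock-step and the barriers are omitted.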
|
||||
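// Illustrative note (assumption about intent): calc_lut turns the 256-bin
// histogram held in local memory into an inclusive prefix sum (the cumulative
// distribution). Work-item 0 performs the scan serially, after which every
// work-item reads back the accumulated value for its own bin.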
int calc_lut(__local int* smem, int val, int tid) |
||||
{ |
||||
smem[tid] = val; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (tid == 0) |
||||
for (int i = 1; i < 256; ++i) |
||||
smem[i] += smem[i - 1]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
return smem[tid]; |
||||
} |
||||
|
||||
#ifdef CPU |
||||
void reduce(volatile __local int* smem, int val, int tid) |
||||
{ |
||||
smem[tid] = val; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (tid < 128) |
||||
smem[tid] = val += smem[tid + 128]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (tid < 64) |
||||
smem[tid] = val += smem[tid + 64]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (tid < 32) |
||||
smem[tid] += smem[tid + 32]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (tid < 16) |
||||
smem[tid] += smem[tid + 16]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (tid < 8) |
||||
smem[tid] += smem[tid + 8]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (tid < 4) |
||||
smem[tid] += smem[tid + 4]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (tid < 2) |
||||
smem[tid] += smem[tid + 2]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (tid < 1) |
||||
smem[256] = smem[tid] + smem[tid + 1]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
|
||||
#else |
||||
|
||||
void reduce(__local volatile int* smem, int val, int tid) |
||||
{ |
||||
smem[tid] = val; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (tid < 128) |
||||
smem[tid] = val += smem[tid + 128]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (tid < 64) |
||||
smem[tid] = val += smem[tid + 64]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (tid < 32) |
||||
{ |
||||
smem[tid] += smem[tid + 32]; |
||||
#if WAVE_SIZE < 32 |
||||
} barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (tid < 16) |
||||
{ |
||||
#endif |
||||
smem[tid] += smem[tid + 16]; |
||||
#if WAVE_SIZE < 16 |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (tid < 8) |
||||
{ |
||||
#endif |
||||
smem[tid] += smem[tid + 8]; |
||||
smem[tid] += smem[tid + 4]; |
||||
smem[tid] += smem[tid + 2]; |
||||
smem[tid] += smem[tid + 1]; |
||||
} |
||||
} |
||||
#endif |
||||
|
||||
__kernel void calcLut(__global __const uchar * src, __global uchar * lut, |
||||
const int srcStep, const int dstStep, |
||||
const int2 tileSize, const int tilesX, |
||||
const int clipLimit, const float lutScale, |
||||
const int src_offset, const int dst_offset) |
||||
{ |
||||
__local int smem[512]; |
||||
|
||||
const int tx = get_group_id(0); |
||||
const int ty = get_group_id(1); |
||||
const unsigned int tid = get_local_id(1) * get_local_size(0) |
||||
+ get_local_id(0); |
||||
|
||||
smem[tid] = 0; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
for (int i = get_local_id(1); i < tileSize.y; i += get_local_size(1)) |
||||
{ |
||||
__global const uchar* srcPtr = src + mad24(ty * tileSize.y + i, srcStep, tx * tileSize.x + src_offset); |
||||
for (int j = get_local_id(0); j < tileSize.x; j += get_local_size(0)) |
||||
{ |
||||
const int data = srcPtr[j]; |
||||
atomic_inc(&smem[data]); |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int tHistVal = smem[tid]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (clipLimit > 0) |
||||
{ |
||||
// clip histogram bar |
||||
int clipped = 0; |
||||
if (tHistVal > clipLimit) |
||||
{ |
||||
clipped = tHistVal - clipLimit; |
||||
tHistVal = clipLimit; |
||||
} |
||||
|
||||
// find number of overall clipped samples |
||||
reduce(smem, clipped, tid); |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
#ifdef CPU |
||||
clipped = smem[256]; |
||||
#else |
||||
clipped = smem[0]; |
||||
#endif |
||||
|
||||
// broadcast evaluated value |
||||
|
||||
__local int totalClipped; |
||||
|
||||
if (tid == 0) |
||||
totalClipped = clipped; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
// redistribute clipped samples evenly |
||||
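// Worked example (illustrative): with totalClipped == 1000 every bin gains
// 1000 / 256 == 3, and the remaining 1000 - 3 * 256 == 232 samples go to the
// first 232 bins (tid < residual below), which each gain one more.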
|
||||
int redistBatch = totalClipped / 256; |
||||
tHistVal += redistBatch; |
||||
|
||||
int residual = totalClipped - redistBatch * 256; |
||||
if (tid < residual) |
||||
++tHistVal; |
||||
} |
||||
|
||||
const int lutVal = calc_lut(smem, tHistVal, tid); |
||||
uint ires = (uint)convert_int_rte(lutScale * lutVal); |
||||
lut[(ty * tilesX + tx) * dstStep + tid + dst_offset] = |
||||
convert_uchar(clamp(ires, (uint)0, (uint)255)); |
||||
} |
||||
|
||||
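// Illustrative note (assumption about intent): the transform kernel below maps
// each pixel through the LUTs of the four tiles surrounding it and blends the
// four results bilinearly,
//   res = (1-xa)*(1-ya)*lut[tile_y1x1] + xa*(1-ya)*lut[tile_y1x2]
//       + (1-xa)*   ya *lut[tile_y2x1] + xa*   ya *lut[tile_y2x2],
// where xa and ya are the fractional distances of the pixel from the tile
// centres (tile_* are hypothetical labels for the four neighbouring tiles).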
__kernel void transform(__global __const uchar * src, |
||||
__global uchar * dst, |
||||
__global uchar * lut, |
||||
const int srcStep, const int dstStep, const int lutStep, |
||||
const int cols, const int rows, |
||||
const int2 tileSize, |
||||
const int tilesX, const int tilesY, |
||||
const int src_offset, const int dst_offset, int lut_offset) |
||||
{ |
||||
const int x = get_global_id(0); |
||||
const int y = get_global_id(1); |
||||
|
||||
if (x >= cols || y >= rows) |
||||
return; |
||||
|
||||
const float tyf = (convert_float(y) / tileSize.y) - 0.5f; |
||||
int ty1 = convert_int_rtn(tyf); |
||||
int ty2 = ty1 + 1; |
||||
const float ya = tyf - ty1; |
||||
ty1 = max(ty1, 0); |
||||
ty2 = min(ty2, tilesY - 1); |
||||
|
||||
const float txf = (convert_float(x) / tileSize.x) - 0.5f; |
||||
int tx1 = convert_int_rtn(txf); |
||||
int tx2 = tx1 + 1; |
||||
const float xa = txf - tx1; |
||||
tx1 = max(tx1, 0); |
||||
tx2 = min(tx2, tilesX - 1); |
||||
|
||||
const int srcVal = src[mad24(y, srcStep, x + src_offset)]; |
||||
|
||||
float res = 0; |
||||
|
||||
res += lut[mad24(ty1 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (1.0f - ya)); |
||||
res += lut[mad24(ty1 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (1.0f - ya)); |
||||
res += lut[mad24(ty2 * tilesX + tx1, lutStep, srcVal + lut_offset)] * ((1.0f - xa) * (ya)); |
||||
res += lut[mad24(ty2 * tilesX + tx2, lutStep, srcVal + lut_offset)] * ((xa) * (ya)); |
||||
|
||||
uint ires = (uint)convert_int_rte(res); |
||||
dst[mad24(y, dstStep, x + dst_offset)] = convert_uchar(clamp(ires, (uint)0, (uint)255)); |
||||
} |
@ -0,0 +1,109 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Jiang Liyuan, jlyuan001.good@163.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if defined (__ATI__) |
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable |
||||
#elif defined (__NVIDIA__) |
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable |
||||
#endif |
||||
|
||||
/************************************** convolve **************************************/ |
||||
|
||||
__kernel void convolve_D5(__global float *src, __global float *temp1, __global float *dst, |
||||
int rows, int cols, int src_step, int dst_step,int k_step, int kWidth, int kHeight, |
||||
int src_offset, int dst_offset, int koffset) |
||||
{ |
||||
__local float smem[16 + 2 * 8][16 + 2 * 8]; |
||||
|
||||
int x = get_local_id(0); |
||||
int y = get_local_id(1); |
||||
int gx = get_global_id(0); |
||||
int gy = get_global_id(1); |
||||
|
||||
// x | x 0 | 0 |
||||
// ----------- |
||||
// x | x 0 | 0 |
||||
// 0 | 0 0 | 0 |
||||
// ----------- |
||||
// 0 | 0 0 | 0 |
||||
smem[y][x] = src[min(max(gy - 8, 0), rows - 1) * src_step + min(max(gx - 8, 0), cols - 1) + src_offset]; |
||||
|
||||
// 0 | 0 x | x |
||||
// ----------- |
||||
// 0 | 0 x | x |
||||
// 0 | 0 0 | 0 |
||||
// ----------- |
||||
// 0 | 0 0 | 0 |
||||
smem[y][x + 16] = src[min(max(gy - 8, 0), rows - 1) * src_step + min(gx + 8, cols - 1) + src_offset]; |
||||
|
||||
// 0 | 0 0 | 0 |
||||
// ----------- |
||||
// 0 | 0 0 | 0 |
||||
// x | x 0 | 0 |
||||
// ----------- |
||||
// x | x 0 | 0 |
||||
smem[y + 16][x] = src[min(gy + 8, rows - 1) * src_step + min(max(gx - 8, 0), cols - 1) + src_offset]; |
||||
|
||||
// 0 | 0 0 | 0 |
||||
// ----------- |
||||
// 0 | 0 0 | 0 |
||||
// 0 | 0 x | x |
||||
// ----------- |
||||
// 0 | 0 x | x |
||||
smem[y + 16][x + 16] = src[min(gy + 8, rows - 1) * src_step + min(gx + 8, cols - 1) + src_offset]; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (gx < cols && gy < rows) |
||||
{ |
||||
float res = 0; |
||||
|
||||
for (int i = 0; i < kHeight; ++i) |
||||
for (int j = 0; j < kWidth; ++j) |
||||
res += smem[y + 8 - kHeight / 2 + i][x + 8 - kWidth / 2 + j] * temp1[i * k_step + j + koffset]; |
||||
|
||||
dst[gy * dst_step + gx + dst_offset] = res; |
||||
} |
||||
} |
@ -0,0 +1,134 @@ |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Niko Li, newlife20080214@gmail.com |
||||
// Zero Lin zero.lin@amd.com |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
// |
||||
|
||||
#if defined (DOUBLE_SUPPORT) |
||||
#ifdef cl_amd_fp64 |
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable |
||||
#elif defined (cl_khr_fp64) |
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable |
||||
#endif |
||||
#endif |
||||
|
||||
#ifdef BORDER_CONSTANT |
||||
#define EXTRAPOLATE(x, y, v) v = scalar; |
||||
#elif defined BORDER_REPLICATE |
||||
#define EXTRAPOLATE(x, y, v) \ |
||||
{ \ |
||||
x = max(min(x, src_cols - 1), 0); \ |
||||
y = max(min(y, src_rows - 1), 0); \ |
||||
v = src[mad24(y, src_step, x + src_offset)]; \ |
||||
} |
||||
#elif defined BORDER_WRAP |
||||
#define EXTRAPOLATE(x, y, v) \ |
||||
{ \ |
||||
if (x < 0) \ |
||||
x -= ((x - src_cols + 1) / src_cols) * src_cols; \ |
||||
if (x >= src_cols) \ |
||||
x %= src_cols; \ |
||||
\ |
||||
if (y < 0) \ |
||||
y -= ((y - src_rows + 1) / src_rows) * src_rows; \ |
||||
if( y >= src_rows ) \ |
||||
y %= src_rows; \ |
||||
v = src[mad24(y, src_step, x + src_offset)]; \ |
||||
} |
||||
#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101) |
||||
#ifdef BORDER_REFLECT |
||||
#define DELTA int delta = 0 |
||||
#else |
||||
#define DELTA int delta = 1 |
||||
#endif |
||||
#define EXTRAPOLATE(x, y, v) \ |
||||
{ \ |
||||
DELTA; \ |
||||
if (src_cols == 1) \ |
||||
x = 0; \ |
||||
else \ |
||||
do \ |
||||
{ \ |
||||
if( x < 0 ) \ |
||||
x = -x - 1 + delta; \ |
||||
else \ |
||||
x = src_cols - 1 - (x - src_cols) - delta; \ |
||||
} \ |
||||
while (x >= src_cols || x < 0); \ |
||||
\ |
||||
if (src_rows == 1) \ |
||||
y = 0; \ |
||||
else \ |
||||
do \ |
||||
{ \ |
||||
if( y < 0 ) \ |
||||
y = -y - 1 + delta; \ |
||||
else \ |
||||
y = src_rows - 1 - (y - src_rows) - delta; \ |
||||
} \ |
||||
while (y >= src_rows || y < 0); \ |
||||
v = src[mad24(y, src_step, x + src_offset)]; \ |
||||
} |
||||
#else |
||||
#error No extrapolation method |
||||
#endif |
||||
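// Worked example (illustrative): for src_cols == 5 and x == -2 the reflect
// macros map the coordinate back inside the image as follows:
//   BORDER_REFLECT     (delta == 0): x = -(-2) - 1 + 0 == 1   ("fedcba|abcdefgh")
//   BORDER_REFLECT_101 (delta == 1): x = -(-2) - 1 + 1 == 2   ("gfedcb|abcdefgh", border pixel not repeated)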
|
||||
#define NEED_EXTRAPOLATION(gx, gy) (gx >= src_cols || gy >= src_rows || gx < 0 || gy < 0) |
||||
|
||||
__kernel void copymakeborder |
||||
(__global const GENTYPE *src, |
||||
__global GENTYPE *dst, |
||||
int dst_cols, int dst_rows, |
||||
int src_cols, int src_rows, |
||||
int src_step, int src_offset, |
||||
int dst_step, int dst_offset, |
||||
int top, int left, GENTYPE scalar) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < dst_cols && y < dst_rows) |
||||
{ |
||||
int src_x = x - left; |
||||
int src_y = y - top; |
||||
int dst_index = mad24(y, dst_step, x + dst_offset); |
||||
|
||||
if (NEED_EXTRAPOLATION(src_x, src_y)) |
||||
EXTRAPOLATE(src_x, src_y, dst[dst_index]) |
||||
else |
||||
{ |
||||
int src_index = mad24(src_y, src_step, src_x + src_offset); |
||||
dst[dst_index] = src[src_index]; |
||||
} |
||||
} |
||||
} |
@ -0,0 +1,306 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Jia Haipeng, jiahaipeng95@gmail.com |
||||
// Peng Xiao, pengxiao@multicorewareinc.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
/**************************************PUBLICFUNC*************************************/ |
||||
|
||||
#if defined (DOUBLE_SUPPORT) |
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable |
||||
#endif |
||||
|
||||
#if depth == 0 |
||||
#define DATA_TYPE uchar |
||||
#define MAX_NUM 255 |
||||
#define HALF_MAX 128 |
||||
#define SAT_CAST(num) convert_uchar_sat(num) |
||||
#define DEPTH_0 |
||||
#elif depth == 2 |
||||
#define DATA_TYPE ushort |
||||
#define MAX_NUM 65535 |
||||
#define HALF_MAX 32768 |
||||
#define SAT_CAST(num) convert_ushort_sat(num) |
||||
#define DEPTH_2 |
||||
#elif depth == 5 |
||||
#define DATA_TYPE float |
||||
#define MAX_NUM 1.0f |
||||
#define HALF_MAX 0.5f |
||||
#define SAT_CAST(num) (num) |
||||
#define DEPTH_5 |
||||
#else |
||||
#error "invalid depth: should be 0 (CV_8U), 2 (CV_16U) or 5 (CV_32F)" |
||||
#endif |
||||
|
||||
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) |
||||
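// Worked example (illustrative): CV_DESCALE(v, n) divides by 2^n with rounding
// to nearest, e.g. CV_DESCALE(10, 2) == (10 + 2) >> 2 == 3 while
// CV_DESCALE(9, 2) == (9 + 2) >> 2 == 2.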
|
||||
enum |
||||
{ |
||||
yuv_shift = 14, |
||||
xyz_shift = 12, |
||||
R2Y = 4899, |
||||
G2Y = 9617, |
||||
B2Y = 1868, |
||||
BLOCK_SIZE = 256 |
||||
}; |
||||
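// Illustrative note: R2Y, G2Y and B2Y are the usual grayscale weights
// 0.299 / 0.587 / 0.114 scaled by 2^yuv_shift (e.g. 0.587 * 16384 ~= 9617),
// so the integer conversions can use CV_DESCALE instead of floating point.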
|
||||
#define scnbytes ((int)sizeof(DATA_TYPE)*scn) |
||||
#define dcnbytes ((int)sizeof(DATA_TYPE)*dcn) |
||||
|
||||
///////////////////////////////////// RGB <-> GRAY ////////////////////////////////////// |
||||
|
||||
__kernel void RGB2Gray(__global const uchar* srcptr, int srcstep, int srcoffset, |
||||
__global uchar* dstptr, int dststep, int dstoffset, |
||||
int rows, int cols) |
||||
{ |
||||
const int x = get_global_id(0); |
||||
const int y = get_global_id(1); |
||||
|
||||
if (y < rows && x < cols) |
||||
{ |
||||
const DATA_TYPE* src = (const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); |
||||
DATA_TYPE* dst = (DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); |
||||
#if defined (DEPTH_5) |
||||
dst[0] = src[bidx] * 0.114f + src[1] * 0.587f + src[(bidx^2)] * 0.299f; |
||||
#else |
||||
dst[0] = (DATA_TYPE)CV_DESCALE((src[bidx] * B2Y + src[1] * G2Y + src[(bidx^2)] * R2Y), yuv_shift); |
||||
#endif |
||||
} |
||||
} |
||||
|
||||
__kernel void Gray2RGB(__global const uchar* srcptr, int srcstep, int srcoffset, |
||||
__global uchar* dstptr, int dststep, int dstoffset, |
||||
int rows, int cols) |
||||
{ |
||||
const int x = get_global_id(0); |
||||
const int y = get_global_id(1); |
||||
|
||||
if (y < rows && x < cols) |
||||
{ |
||||
const DATA_TYPE* src = (const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); |
||||
DATA_TYPE* dst = (DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); |
||||
DATA_TYPE val = src[0]; |
||||
dst[0] = dst[1] = dst[2] = val; |
||||
#if dcn == 4 |
||||
dst[3] = MAX_NUM; |
||||
#endif |
||||
} |
||||
} |
||||
|
||||
///////////////////////////////////// RGB <-> YUV ////////////////////////////////////// |
||||
|
||||
__constant float c_RGB2YUVCoeffs_f[5] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f }; |
||||
__constant int c_RGB2YUVCoeffs_i[5] = { B2Y, G2Y, R2Y, 8061, 14369 }; |
||||
|
||||
__kernel void RGB2YUV(__global const uchar* srcptr, int srcstep, int srcoffset, |
||||
__global uchar* dstptr, int dststep, int dstoffset, |
||||
int rows, int cols) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (y < rows && x < cols) |
||||
{ |
||||
const DATA_TYPE* src = (const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); |
||||
DATA_TYPE* dst = (DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); |
||||
DATA_TYPE b=src[bidx], g=src[1], r=src[bidx^2]; |
||||
|
||||
#if defined (DEPTH_5) |
||||
__constant float * coeffs = c_RGB2YUVCoeffs_f; |
||||
const DATA_TYPE Y = b * coeffs[0] + g * coeffs[1] + r * coeffs[2]; |
||||
const DATA_TYPE U = (b - Y) * coeffs[3] + HALF_MAX; |
||||
const DATA_TYPE V = (r - Y) * coeffs[4] + HALF_MAX; |
||||
#else |
||||
__constant int * coeffs = c_RGB2YUVCoeffs_i; |
||||
const int delta = HALF_MAX * (1 << yuv_shift); |
||||
const int Y = CV_DESCALE(b * coeffs[0] + g * coeffs[1] + r * coeffs[2], yuv_shift); |
||||
const int U = CV_DESCALE((b - Y) * coeffs[3] + delta, yuv_shift); |
||||
const int V = CV_DESCALE((r - Y) * coeffs[4] + delta, yuv_shift); |
||||
#endif |
||||
|
||||
dst[0] = SAT_CAST( Y ); |
||||
dst[1] = SAT_CAST( U ); |
||||
dst[2] = SAT_CAST( V ); |
||||
} |
||||
} |
||||
|
||||
__constant float c_YUV2RGBCoeffs_f[5] = { 2.032f, -0.395f, -0.581f, 1.140f }; |
||||
__constant int c_YUV2RGBCoeffs_i[5] = { 33292, -6472, -9519, 18678 }; |
||||
|
||||
__kernel void YUV2RGB(__global const uchar* srcptr, int srcstep, int srcoffset, |
||||
__global uchar* dstptr, int dststep, int dstoffset, |
||||
int rows, int cols) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (y < rows && x < cols) |
||||
{ |
||||
const DATA_TYPE* src = (const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); |
||||
DATA_TYPE* dst = (DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); |
||||
DATA_TYPE Y = src[0], U = src[1], V = src[2]; |
||||
|
||||
#if defined (DEPTH_5) |
||||
__constant float * coeffs = c_YUV2RGBCoeffs_f; |
||||
const float r = Y + (V - HALF_MAX) * coeffs[3]; |
||||
const float g = Y + (V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1]; |
||||
const float b = Y + (U - HALF_MAX) * coeffs[0]; |
||||
#else |
||||
__constant int * coeffs = c_YUV2RGBCoeffs_i; |
||||
const int r = Y + CV_DESCALE((V - HALF_MAX) * coeffs[3], yuv_shift); |
||||
const int g = Y + CV_DESCALE((V - HALF_MAX) * coeffs[2] + (U - HALF_MAX) * coeffs[1], yuv_shift); |
||||
const int b = Y + CV_DESCALE((U - HALF_MAX) * coeffs[0], yuv_shift); |
||||
#endif |
||||
|
||||
dst[bidx] = SAT_CAST( b ); |
||||
dst[1] = SAT_CAST( g ); |
||||
dst[bidx^2] = SAT_CAST( r ); |
||||
#if dcn == 4 |
||||
dst[3] = MAX_NUM; |
||||
#endif |
||||
} |
||||
} |
||||
|
||||
__constant int ITUR_BT_601_CY = 1220542; |
||||
__constant int ITUR_BT_601_CUB = 2116026; |
||||
__constant int ITUR_BT_601_CUG = 409993; |
||||
__constant int ITUR_BT_601_CVG = 852492; |
||||
__constant int ITUR_BT_601_CVR = 1673527; |
||||
__constant int ITUR_BT_601_SHIFT = 20; |
||||
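// Illustrative note (assumption): the constants above are BT.601 video-range
// YUV -> RGB factors in Q20 fixed point, e.g. ITUR_BT_601_CY ~= 1.164 * (1 << 20)
// and ITUR_BT_601_CVR ~= 1.596 * (1 << 20); each accumulated term is shifted
// down by ITUR_BT_601_SHIFT after the multiply.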
|
||||
__kernel void YUV2RGBA_NV12(__global const uchar* srcptr, int srcstep, int srcoffset, |
||||
__global uchar* dstptr, int dststep, int dstoffset, |
||||
int rows, int cols) |
||||
{ |
||||
const int x = get_global_id(0); // max_x = width / 2 |
||||
const int y = get_global_id(1); // max_y = height/ 2 |
||||
|
||||
if (y < rows / 2 && x < cols / 2 ) |
||||
{ |
||||
__global const uchar* ysrc = srcptr + mad24(y << 1, srcstep, (x << 1) + srcoffset); |
||||
__global const uchar* usrc = srcptr + mad24(rows + y, srcstep, (x << 1) + srcoffset); |
||||
__global uchar* dst1 = dstptr + mad24(y << 1, dststep, x*(dcn*2) + dstoffset); |
||||
__global uchar* dst2 = dstptr + mad24((y << 1) + 1, dststep, x*(dcn*2) + dstoffset); |
||||
|
||||
int Y1 = ysrc[0]; |
||||
int Y2 = ysrc[1]; |
||||
int Y3 = ysrc[srcstep]; |
||||
int Y4 = ysrc[srcstep + 1]; |
||||
|
||||
int U = usrc[0] - 128; |
||||
int V = usrc[1] - 128; |
||||
|
||||
int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * V; |
||||
int guv = (1 << (ITUR_BT_601_SHIFT - 1)) - ITUR_BT_601_CVG * V - ITUR_BT_601_CUG * U; |
||||
int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * U; |
||||
|
||||
Y1 = max(0, Y1 - 16) * ITUR_BT_601_CY; |
||||
dst1[2 - bidx] = convert_uchar_sat((Y1 + ruv) >> ITUR_BT_601_SHIFT); |
||||
dst1[1] = convert_uchar_sat((Y1 + guv) >> ITUR_BT_601_SHIFT); |
||||
dst1[bidx] = convert_uchar_sat((Y1 + buv) >> ITUR_BT_601_SHIFT); |
||||
#if dcn == 4 |
||||
dst1[3] = 255; |
||||
#endif |
||||
|
||||
Y2 = max(0, Y2 - 16) * ITUR_BT_601_CY; |
||||
dst1[(dcn + 2) - bidx] = convert_uchar_sat((Y2 + ruv) >> ITUR_BT_601_SHIFT); |
||||
dst1[dcn + 1] = convert_uchar_sat((Y2 + guv) >> ITUR_BT_601_SHIFT); |
||||
dst1[dcn + bidx] = convert_uchar_sat((Y2 + buv) >> ITUR_BT_601_SHIFT); |
||||
#if dcn == 4 |
||||
dst1[7] = 255; |
||||
#endif |
||||
|
||||
Y3 = max(0, Y3 - 16) * ITUR_BT_601_CY; |
||||
dst2[2 - bidx] = convert_uchar_sat((Y3 + ruv) >> ITUR_BT_601_SHIFT); |
||||
dst2[1] = convert_uchar_sat((Y3 + guv) >> ITUR_BT_601_SHIFT); |
||||
dst2[bidx] = convert_uchar_sat((Y3 + buv) >> ITUR_BT_601_SHIFT); |
||||
#if dcn == 4 |
||||
dst2[3] = 255; |
||||
#endif |
||||
|
||||
Y4 = max(0, Y4 - 16) * ITUR_BT_601_CY; |
||||
dst2[(dcn + 2) - bidx] = convert_uchar_sat((Y4 + ruv) >> ITUR_BT_601_SHIFT); |
||||
dst2[dcn + 1] = convert_uchar_sat((Y4 + guv) >> ITUR_BT_601_SHIFT); |
||||
dst2[dcn + bidx] = convert_uchar_sat((Y4 + buv) >> ITUR_BT_601_SHIFT); |
||||
#if dcn == 4 |
||||
dst2[7] = 255; |
||||
#endif |
||||
} |
||||
} |
||||
|
||||
///////////////////////////////////// RGB <-> YCrCb ////////////////////////////////////// |
||||
|
||||
__constant float c_RGB2YCrCbCoeffs_f[5] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f}; |
||||
__constant int c_RGB2YCrCbCoeffs_i[5] = {R2Y, G2Y, B2Y, 11682, 9241}; |
||||
|
||||
__kernel void RGB2YCrCb(__global const uchar* srcptr, int srcstep, int srcoffset, |
||||
__global uchar* dstptr, int dststep, int dstoffset, |
||||
int rows, int cols) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (y < rows && x < cols) |
||||
{ |
||||
const DATA_TYPE* src = (const DATA_TYPE*)(srcptr + mad24(y, srcstep, srcoffset + x * scnbytes)); |
||||
DATA_TYPE* dst = (DATA_TYPE*)(dstptr + mad24(y, dststep, dstoffset + x * dcnbytes)); |
||||
DATA_TYPE b=src[bidx], g=src[1], r=src[bidx^2]; |
||||
|
||||
#if defined (DEPTH_5) |
||||
__constant float * coeffs = c_RGB2YCrCbCoeffs_f; |
||||
const DATA_TYPE Y = b * coeffs[0] + g * coeffs[1] + r * coeffs[2]; |
||||
const DATA_TYPE Cr = (r - Y) * coeffs[3] + HALF_MAX; |
||||
const DATA_TYPE Cb = (b - Y) * coeffs[4] + HALF_MAX; |
||||
#else |
||||
__constant int * coeffs = c_RGB2YCrCbCoeffs_i; |
||||
const int delta = HALF_MAX * (1 << yuv_shift); |
||||
const int Y = CV_DESCALE(b * coeffs[0] + g * coeffs[1] + r * coeffs[2], yuv_shift); |
||||
const int Cr = CV_DESCALE((r - Y) * coeffs[3] + delta, yuv_shift); |
||||
const int Cb = CV_DESCALE((b - Y) * coeffs[4] + delta, yuv_shift); |
||||
#endif |
||||
|
||||
dst[0] = SAT_CAST( Y ); |
||||
dst[1] = SAT_CAST( Cr ); |
||||
dst[2] = SAT_CAST( Cb ); |
||||
} |
||||
} |
@ -0,0 +1,275 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Peng Xiao, pengxiao@outlook.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#ifndef WITH_MASK |
||||
#define WITH_MASK 0 |
||||
#endif |
||||
|
||||
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; |
||||
|
||||
inline float ELEM_INT2(image2d_t _eig, int _x, int _y) |
||||
{ |
||||
return read_imagef(_eig, sampler, (int2)(_x, _y)).x; |
||||
} |
||||
|
||||
inline float ELEM_FLT2(image2d_t _eig, float2 pt) |
||||
{ |
||||
return read_imagef(_eig, sampler, pt).x; |
||||
} |
||||
|
||||
__kernel |
||||
void findCorners |
||||
( |
||||
image2d_t eig, |
||||
__global const char * mask, |
||||
__global float2 * corners, |
||||
const int mask_strip,// in pixels |
||||
const float threshold, |
||||
const int rows, |
||||
const int cols, |
||||
const int max_count, |
||||
__global int * g_counter |
||||
) |
||||
{ |
||||
const int j = get_global_id(0); |
||||
const int i = get_global_id(1); |
||||
|
||||
if (i > 0 && i < rows - 1 && j > 0 && j < cols - 1 |
||||
#if WITH_MASK |
||||
&& mask[i * mask_strip + j] != 0 |
||||
#endif |
||||
) |
||||
{ |
||||
const float val = ELEM_INT2(eig, j, i); |
||||
|
||||
if (val > threshold) |
||||
{ |
||||
float maxVal = val; |
||||
|
||||
maxVal = fmax(ELEM_INT2(eig, j - 1, i - 1), maxVal); |
||||
maxVal = fmax(ELEM_INT2(eig, j , i - 1), maxVal); |
||||
maxVal = fmax(ELEM_INT2(eig, j + 1, i - 1), maxVal); |
||||
|
||||
maxVal = fmax(ELEM_INT2(eig, j - 1, i), maxVal); |
||||
maxVal = fmax(ELEM_INT2(eig, j + 1, i), maxVal); |
||||
|
||||
maxVal = fmax(ELEM_INT2(eig, j - 1, i + 1), maxVal); |
||||
maxVal = fmax(ELEM_INT2(eig, j , i + 1), maxVal); |
||||
maxVal = fmax(ELEM_INT2(eig, j + 1, i + 1), maxVal); |
||||
|
||||
if (val == maxVal) |
||||
{ |
||||
const int ind = atomic_inc(g_counter); |
||||
|
||||
if (ind < max_count) |
||||
corners[ind] = (float2)(j, i); |
||||
} |
||||
} |
||||
} |
||||
} |
||||
|
||||
//bitonic sort |
||||
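// Illustrative note (assumption about host usage): the kernel below performs a
// single pass of a bitonic sorting network, so the host is expected to enqueue
// it in a loop roughly like
//   for (stage = 0; (1 << stage) < count; ++stage)
//       for (pass = 0; pass <= stage; ++pass)
//           enqueue sortCorners_bitonicSort(eig, corners, count, stage, pass);
// typically with count / 2 work-items and count behaving as a power of two.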
__kernel |
||||
void sortCorners_bitonicSort |
||||
( |
||||
image2d_t eig, |
||||
__global float2 * corners, |
||||
const int count, |
||||
const int stage, |
||||
const int passOfStage |
||||
) |
||||
{ |
||||
const int threadId = get_global_id(0); |
||||
if(threadId >= count / 2) |
||||
{ |
||||
return; |
||||
} |
||||
|
||||
const int sortOrder = (((threadId/(1 << stage)) % 2)) == 1 ? 1 : 0; // 0 means descending order |
||||
|
||||
const int pairDistance = 1 << (stage - passOfStage); |
||||
const int blockWidth = 2 * pairDistance; |
||||
|
||||
const int leftId = min( (threadId % pairDistance) |
||||
+ (threadId / pairDistance) * blockWidth, count ); |
||||
|
||||
const int rightId = min( leftId + pairDistance, count ); |
||||
|
||||
const float2 leftPt = corners[leftId]; |
||||
const float2 rightPt = corners[rightId]; |
||||
|
||||
const float leftVal = ELEM_FLT2(eig, leftPt); |
||||
const float rightVal = ELEM_FLT2(eig, rightPt); |
||||
|
||||
const bool compareResult = leftVal > rightVal; |
||||
|
||||
float2 greater = compareResult ? leftPt:rightPt; |
||||
float2 lesser = compareResult ? rightPt:leftPt; |
||||
|
||||
corners[leftId] = sortOrder ? lesser : greater; |
||||
corners[rightId] = sortOrder ? greater : lesser; |
||||
} |
||||
|
||||
//selection sort for gfft |
||||
//kernel is ported from Bolt library: |
||||
//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl |
||||
// Local sort first sorts the elements of each workgroup using a rank-based selection sort; |
||||
// each work-item computes the rank of its own element in O(n) |
||||
__kernel |
||||
void sortCorners_selectionSortLocal |
||||
( |
||||
image2d_t eig, |
||||
__global float2 * corners, |
||||
const int count, |
||||
__local float2 * scratch |
||||
) |
||||
{ |
||||
int i = get_local_id(0); // index in workgroup |
||||
int numOfGroups = get_num_groups(0); // number of workgroups |
||||
int groupID = get_group_id(0); |
||||
int wg = get_local_size(0); // workgroup size = block size |
||||
int n; // number of elements to be processed for this work group |
||||
|
||||
int offset = groupID * wg; |
||||
int same = 0; |
||||
corners += offset; |
||||
n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg; |
||||
float2 pt1, pt2; |
||||
|
||||
pt1 = corners[min(i, n)]; |
||||
scratch[i] = pt1; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if(i >= n) |
||||
{ |
||||
return; |
||||
} |
||||
|
||||
float val1 = ELEM_FLT2(eig, pt1); |
||||
float val2; |
||||
|
||||
int pos = 0; |
||||
for (int j=0;j<n;++j) |
||||
{ |
||||
pt2 = scratch[j]; |
||||
val2 = ELEM_FLT2(eig, pt2); |
||||
if(val2 > val1) |
||||
pos++;//calculate the rank of this element in this work group |
||||
else |
||||
{ |
||||
if(val1 > val2) |
||||
continue; |
||||
else |
||||
{ |
||||
// val1 and val2 are same |
||||
same++; |
||||
} |
||||
} |
||||
} |
||||
for (int j=0; j< same; j++) |
||||
corners[pos + j] = pt1; |
||||
} |
||||
__kernel |
||||
void sortCorners_selectionSortFinal |
||||
( |
||||
image2d_t eig, |
||||
__global float2 * corners, |
||||
const int count |
||||
) |
||||
{ |
||||
const int i = get_local_id(0); // index in workgroup |
||||
const int numOfGroups = get_num_groups(0); // number of workgroups |
||||
const int groupID = get_group_id(0); |
||||
const int wg = get_local_size(0); // workgroup size = block size |
||||
int pos = 0, same = 0; |
||||
const int offset = get_group_id(0) * wg; |
||||
const int remainder = count - wg*(numOfGroups-1); |
||||
|
||||
if((offset + i ) >= count) |
||||
return; |
||||
float2 pt1, pt2; |
||||
pt1 = corners[groupID*wg + i]; |
||||
|
||||
float val1 = ELEM_FLT2(eig, pt1); |
||||
float val2; |
||||
|
||||
for(int j=0; j<numOfGroups-1; j++ ) |
||||
{ |
||||
for(int k=0; k<wg; k++) |
||||
{ |
||||
pt2 = corners[j*wg + k]; |
||||
val2 = ELEM_FLT2(eig, pt2); |
||||
if(val1 > val2) |
||||
break; |
||||
else |
||||
{ |
||||
//Increment only if the value is not the same. |
||||
if( val2 > val1 ) |
||||
pos++; |
||||
else |
||||
same++; |
||||
} |
||||
} |
||||
} |
||||
|
||||
for(int k=0; k<remainder; k++) |
||||
{ |
||||
pt2 = corners[(numOfGroups-1)*wg + k]; |
||||
val2 = ELEM_FLT2(eig, pt2); |
||||
if(val1 > val2) |
||||
break; |
||||
else |
||||
{ |
||||
//Don't increment if the value is the same. |
||||
//Two elements are same if (*userComp)(jData, iData) and (*userComp)(iData, jData) are both false |
||||
if(val2 > val1) |
||||
pos++; |
||||
else |
||||
same++; |
||||
} |
||||
} |
||||
for (int j=0; j< same; j++) |
||||
corners[pos + j] = pt1; |
||||
} |
@ -0,0 +1,202 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Shengen Yan,yanshengen@gmail.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if defined (DOUBLE_SUPPORT) |
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable |
||||
#endif |
||||
/////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
/////////////////////////////////Macro for border type//////////////////////////////////////////// |
||||
///////////////////////////////////////////////////////////////////////////////////////////////// |
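// Example: with BORDER_REFLECT101, an out-of-range column i = -2 maps to ADDR_L(-2, 0, cols) = 2,
// and i = cols + 1 maps through ADDR_R to 2*cols - (cols + 1) - 2 = cols - 3.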
||||
#ifdef BORDER_REPLICATE |
||||
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh |
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) |
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) |
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i)) |
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr)) |
||||
#endif |
||||
|
||||
#ifdef BORDER_REFLECT |
||||
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb |
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i)) |
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) |
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i)) |
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) |
||||
#endif |
||||
|
||||
#ifdef BORDER_REFLECT101 |
||||
//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba |
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) |
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) |
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i)) |
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr)) |
||||
#endif |
||||
|
||||
#ifdef BORDER_WRAP |
||||
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg |
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i)) |
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) |
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i)) |
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr)) |
||||
#endif |
||||
|
||||
#define THREADS 256 |
||||
#define ELEM(i, l_edge, r_edge, elem1, elem2) ((i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2)) |
||||
/////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
/////////////////////////////////////calcHarris//////////////////////////////////////////////////// |
||||
/////////////////////////////////////////////////////////////////////////////////////////////////// |
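// Computes the Harris response R = det(M) - k * trace(M)^2 per pixel, where M is the
// ksX x ksY box-filtered structure tensor built from the Sobel derivatives Dx and Dy.
// Each workgroup loads a THREADS-wide strip, accumulates Dx*Dx, Dx*Dy and Dy*Dy column
// sums in local memory, then finishes the horizontal sum and writes two output rows.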
||||
__kernel void calcHarris(__global const float *Dx,__global const float *Dy, __global float *dst, |
||||
int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step, |
||||
int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step, |
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step, |
||||
float k) |
||||
{ |
||||
int col = get_local_id(0); |
||||
const int gX = get_group_id(0); |
||||
const int gY = get_group_id(1); |
||||
const int glx = get_global_id(0); |
||||
const int gly = get_global_id(1); |
||||
|
||||
int dx_x_off = (dx_offset % dx_step) >> 2; |
||||
int dx_y_off = dx_offset / dx_step; |
||||
int dy_x_off = (dy_offset % dy_step) >> 2; |
||||
int dy_y_off = dy_offset / dy_step; |
||||
int dst_x_off = (dst_offset % dst_step) >> 2; |
||||
int dst_y_off = dst_offset / dst_step; |
||||
|
||||
int dx_startX = gX * (THREADS-ksX+1) - anX + dx_x_off; |
||||
int dx_startY = (gY << 1) - anY + dx_y_off; |
||||
int dy_startX = gX * (THREADS-ksX+1) - anX + dy_x_off; |
||||
int dy_startY = (gY << 1) - anY + dy_y_off; |
||||
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; |
||||
int dst_startY = (gY << 1) + dst_y_off; |
||||
|
||||
float dx_data[ksY+1],dy_data[ksY+1],data[3][ksY+1]; |
||||
__local float temp[6][THREADS]; |
||||
#ifdef BORDER_CONSTANT |
||||
bool dx_con,dy_con; |
||||
float dx_s,dy_s; |
||||
for(int i=0; i < ksY+1; i++) |
||||
{ |
||||
dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows; |
||||
dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)]; |
||||
dx_data[i] = dx_con ? dx_s : 0.0; |
||||
dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows; |
||||
dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)]; |
||||
dy_data[i] = dy_con ? dy_s : 0.0; |
||||
data[0][i] = dx_data[i] * dx_data[i]; |
||||
data[1][i] = dx_data[i] * dy_data[i]; |
||||
data[2][i] = dy_data[i] * dy_data[i]; |
||||
} |
||||
#else |
||||
int clamped_col = min(dst_cols, col); |
||||
for(int i=0; i < ksY+1; i++) |
||||
{ |
||||
int dx_selected_row; |
||||
int dx_selected_col; |
||||
dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows); |
||||
dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row); |
||||
dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols); |
||||
dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col); |
||||
dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col]; |
||||
|
||||
int dy_selected_row; |
||||
int dy_selected_col; |
||||
dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows); |
||||
dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row); |
||||
dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols); |
||||
dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col); |
||||
dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col]; |
||||
|
||||
data[0][i] = dx_data[i] * dx_data[i]; |
||||
data[1][i] = dx_data[i] * dy_data[i]; |
||||
data[2][i] = dy_data[i] * dy_data[i]; |
||||
} |
||||
#endif |
||||
float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; |
||||
for(int i=1; i < ksY; i++) |
||||
{ |
||||
sum0 += (data[0][i]); |
||||
sum1 += (data[1][i]); |
||||
sum2 += (data[2][i]); |
||||
} |
||||
float sum01,sum02,sum11,sum12,sum21,sum22; |
||||
sum01 = sum0 + (data[0][0]); |
||||
sum02 = sum0 + (data[0][ksY]); |
||||
temp[0][col] = sum01; |
||||
temp[1][col] = sum02; |
||||
sum11 = sum1 + (data[1][0]); |
||||
sum12 = sum1 + (data[1][ksY]); |
||||
temp[2][col] = sum11; |
||||
temp[3][col] = sum12; |
||||
sum21 = sum2 + (data[2][0]); |
||||
sum22 = sum2 + (data[2][ksY]); |
||||
temp[4][col] = sum21; |
||||
temp[5][col] = sum22; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(col < (THREADS-(ksX-1))) |
||||
{ |
||||
col += anX; |
||||
int posX = dst_startX - dst_x_off + col - anX; |
||||
int posY = (gly << 1); |
||||
int till = (ksX + 1)%2; |
||||
float tmp_sum[6]={ 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 }; |
||||
for(int k=0; k<6; k++) |
||||
for(int i=-anX; i<=anX - till; i++) |
||||
{ |
||||
tmp_sum[k] += temp[k][col+i]; |
||||
} |
||||
|
||||
if(posX < dst_cols && (posY) < dst_rows) |
||||
{ |
||||
dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] = |
||||
tmp_sum[0] * tmp_sum[4] - tmp_sum[2] * tmp_sum[2] - k * (tmp_sum[0] + tmp_sum[4]) * (tmp_sum[0] + tmp_sum[4]); |
||||
} |
||||
if(posX < dst_cols && (posY + 1) < dst_rows) |
||||
{ |
||||
dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] = |
||||
tmp_sum[1] * tmp_sum[5] - tmp_sum[3] * tmp_sum[3] - k * (tmp_sum[1] + tmp_sum[5]) * (tmp_sum[1] + tmp_sum[5]); |
||||
} |
||||
} |
||||
} |
@ -0,0 +1,279 @@ |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Niko Li, newlife20080214@gmail.com |
||||
// Jia Haipeng, jiahaipeng95@gmail.com |
||||
// Xu Pang, pangxu010@163.com |
||||
// Wenju He, wenju@multicorewareinc.com |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
// |
||||
#define PARTIAL_HISTOGRAM256_COUNT (256) |
||||
#define HISTOGRAM256_BIN_COUNT (256) |
||||
|
||||
#define HISTOGRAM256_WORK_GROUP_SIZE (256) |
||||
#define HISTOGRAM256_LOCAL_MEM_SIZE (HISTOGRAM256_BIN_COUNT) |
||||
|
||||
#define NBANKS (16) |
||||
#define NBANKS_BIT (4) |
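// The 256-bin local histogram is replicated across NBANKS banks so that neighbouring
// work-items hit different banks (bank = lid % NBANKS), reducing atomic contention;
// the banks are summed back into a single per-workgroup histogram at the end.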
||||
|
||||
|
||||
__kernel __attribute__((reqd_work_group_size(HISTOGRAM256_BIN_COUNT,1,1)))void calc_sub_hist_D0( |
||||
__global const uint4* src, |
||||
int src_step, int src_offset, |
||||
__global int* globalHist, |
||||
int dataCount, int cols, |
||||
int inc_x, int inc_y, |
||||
int hist_step) |
||||
{ |
||||
__local int subhist[(HISTOGRAM256_BIN_COUNT << NBANKS_BIT)]; // NBINS*NBANKS |
||||
int gid = get_global_id(0); |
||||
int lid = get_local_id(0); |
||||
int gx = get_group_id(0); |
||||
int gsize = get_global_size(0); |
||||
int lsize = get_local_size(0); |
||||
const int shift = 8; |
||||
const int mask = HISTOGRAM256_BIN_COUNT-1; |
||||
int offset = (lid & (NBANKS-1));// lid % NBANKS |
||||
uint4 data, temp1, temp2, temp3, temp4; |
||||
src += src_offset; |
||||
|
||||
//clear LDS |
||||
for(int i=0, idx=lid; i<(NBANKS >> 2); i++, idx += lsize) |
||||
{ |
||||
subhist[idx] = 0; |
||||
subhist[idx+=lsize] = 0; |
||||
subhist[idx+=lsize] = 0; |
||||
subhist[idx+=lsize] = 0; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
//read and scatter |
||||
int y = gid/cols; |
||||
int x = gid - mul24(y, cols); |
||||
for(int idx=gid; idx<dataCount; idx+=gsize) |
||||
{ |
||||
data = src[mad24(y, src_step, x)]; |
||||
temp1 = ((data & mask) << NBANKS_BIT) + offset; |
||||
data >>= shift; |
||||
temp2 = ((data & mask) << NBANKS_BIT) + offset; |
||||
data >>= shift; |
||||
temp3 = ((data & mask) << NBANKS_BIT) + offset; |
||||
data >>= shift; |
||||
temp4 = ((data & mask) << NBANKS_BIT) + offset; |
||||
|
||||
atomic_inc(subhist + temp1.x); |
||||
atomic_inc(subhist + temp1.y); |
||||
atomic_inc(subhist + temp1.z); |
||||
atomic_inc(subhist + temp1.w); |
||||
|
||||
atomic_inc(subhist + temp2.x); |
||||
atomic_inc(subhist + temp2.y); |
||||
atomic_inc(subhist + temp2.z); |
||||
atomic_inc(subhist + temp2.w); |
||||
|
||||
atomic_inc(subhist + temp3.x); |
||||
atomic_inc(subhist + temp3.y); |
||||
atomic_inc(subhist + temp3.z); |
||||
atomic_inc(subhist + temp3.w); |
||||
|
||||
atomic_inc(subhist + temp4.x); |
||||
atomic_inc(subhist + temp4.y); |
||||
atomic_inc(subhist + temp4.z); |
||||
atomic_inc(subhist + temp4.w); |
||||
|
||||
x += inc_x; |
||||
int off = ((x>=cols) ? -1 : 0); |
||||
x = mad24(off, cols, x); |
||||
y += inc_y - off; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
//reduce local banks to single histogram per workgroup |
||||
int bin1=0, bin2=0, bin3=0, bin4=0; |
||||
for(int i=0; i<NBANKS; i+=4) |
||||
{ |
||||
bin1 += subhist[(lid << NBANKS_BIT) + i]; |
||||
bin2 += subhist[(lid << NBANKS_BIT) + i+1]; |
||||
bin3 += subhist[(lid << NBANKS_BIT) + i+2]; |
||||
bin4 += subhist[(lid << NBANKS_BIT) + i+3]; |
||||
} |
||||
|
||||
globalHist[mad24(gx, hist_step, lid)] = bin1+bin2+bin3+bin4; |
||||
} |
||||
|
||||
__kernel void __attribute__((reqd_work_group_size(1,HISTOGRAM256_BIN_COUNT,1))) |
||||
calc_sub_hist_border_D0(__global const uchar* src, int src_step, int src_offset, |
||||
__global int* globalHist, int left_col, int cols, |
||||
int rows, int hist_step) |
||||
{ |
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
int lidy = get_local_id(1); |
||||
int gx = get_group_id(0); |
||||
int gy = get_group_id(1); |
||||
int gn = get_num_groups(0); |
||||
int rowIndex = mad24(gy, gn, gx); |
||||
// rowIndex &= (PARTIAL_HISTOGRAM256_COUNT - 1); |
||||
|
||||
__local int subhist[HISTOGRAM256_LOCAL_MEM_SIZE]; |
||||
subhist[lidy] = 0; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
gidx = ((gidx>=left_col) ? (gidx+cols) : gidx); |
||||
if(gidy<rows) |
||||
{ |
||||
int src_index = src_offset + mad24(gidy, src_step, gidx); |
||||
int p = (int)src[src_index]; |
||||
// p = gidy >= rows ? HISTOGRAM256_LOCAL_MEM_SIZE : p; |
||||
atomic_inc(subhist + p); |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
globalHist[mad24(rowIndex, hist_step, lidy)] += subhist[lidy]; |
||||
} |
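// merge_hist: reduce the PARTIAL_HISTOGRAM256_COUNT per-workgroup histograms into the final
// 256-bin histogram with a strided read followed by a tree reduction in local memory.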
||||
|
||||
__kernel __attribute__((reqd_work_group_size(256,1,1)))void merge_hist(__global int* buf, |
||||
__global int* hist, |
||||
int src_step) |
||||
{ |
||||
int lx = get_local_id(0); |
||||
int gx = get_group_id(0); |
||||
|
||||
int sum = 0; |
||||
|
||||
for(int i = lx; i < PARTIAL_HISTOGRAM256_COUNT; i += HISTOGRAM256_WORK_GROUP_SIZE) |
||||
sum += buf[ mad24(i, src_step, gx)]; |
||||
|
||||
__local int data[HISTOGRAM256_WORK_GROUP_SIZE]; |
||||
data[lx] = sum; |
||||
|
||||
for(int stride = HISTOGRAM256_WORK_GROUP_SIZE /2; stride > 0; stride >>= 1) |
||||
{ |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(lx < stride) |
||||
data[lx] += data[lx + stride]; |
||||
} |
||||
|
||||
if(lx == 0) |
||||
hist[gx] = data[0]; |
||||
} |
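// calLUT: turn the 256-bin histogram into the equalizeHist lookup table: find the first
// non-empty bin, build the cumulative histogram, and scale it to the 0..255 range.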
||||
|
||||
__kernel __attribute__((reqd_work_group_size(256,1,1))) |
||||
void calLUT(__global uchar * dst, __constant int * hist, int total) |
||||
{ |
||||
int lid = get_local_id(0); |
||||
__local int sumhist[HISTOGRAM256_BIN_COUNT]; |
||||
__local float scale; |
||||
|
||||
sumhist[lid] = hist[lid]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if (lid == 0) |
||||
{ |
||||
int sum = 0, i = 0; |
||||
while (!sumhist[i]) |
||||
++i; |
||||
|
||||
if (total == sumhist[i]) |
||||
{ |
||||
scale = 1; |
||||
for (int j = 0; j < HISTOGRAM256_BIN_COUNT; ++j) |
||||
sumhist[j] = j; // identity LUT: an image with a single grey level is left unchanged |
||||
} |
||||
else |
||||
{ |
||||
scale = 255.f/(total - sumhist[i]); |
||||
|
||||
for (sumhist[i++] = 0; i < HISTOGRAM256_BIN_COUNT; i++) |
||||
{ |
||||
sum += sumhist[i]; |
||||
sumhist[i] = sum; |
||||
} |
||||
} |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
dst[lid]= convert_uchar_sat_rte(convert_float(sumhist[lid])*scale); |
||||
} |
||||
|
||||
/* |
||||
///////////////////////////////equalizeHist////////////////////////////////////////////////// |
||||
__kernel __attribute__((reqd_work_group_size(256,1,1)))void equalizeHist( |
||||
__global uchar * src, |
||||
__global uchar * dst, |
||||
__constant int * hist, |
||||
int srcstep, |
||||
int srcoffset, |
||||
int dststep, |
||||
int dstoffset, |
||||
int width, |
||||
int height, |
||||
float scale, |
||||
int inc_x, |
||||
int inc_y) |
||||
{ |
||||
int gidx = get_global_id(0); |
||||
int lid = get_local_id(0); |
||||
int glb_size = get_global_size(0); |
||||
src+=srcoffset; |
||||
dst+=dstoffset; |
||||
__local int sumhist[HISTOGRAM256_BIN_COUNT]; |
||||
__local uchar lut[HISTOGRAM256_BIN_COUNT+1]; |
||||
|
||||
sumhist[lid]=hist[lid]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(lid==0) |
||||
{ |
||||
int sum = 0; |
||||
for(int i=0;i<HISTOGRAM256_BIN_COUNT;i++) |
||||
{ |
||||
sum+=sumhist[i]; |
||||
sumhist[i]=sum; |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
lut[lid]= convert_uchar_sat(convert_float(sumhist[lid])*scale); |
||||
lut[0]=0; |
||||
int pos_y = gidx / width; |
||||
int pos_x = gidx - mul24(pos_y, width); |
||||
|
||||
for(int pos = gidx; pos < mul24(width,height); pos += glb_size) |
||||
{ |
||||
int inaddr = mad24(pos_y,srcstep,pos_x); |
||||
int outaddr = mad24(pos_y,dststep,pos_x); |
||||
dst[outaddr] = lut[src[inaddr]]; |
||||
pos_x +=inc_x; |
||||
int off = (pos_x >= width ? -1 : 0); |
||||
pos_x = mad24(off,width,pos_x); |
||||
pos_y += inc_y - off; |
||||
} |
||||
} |
||||
*/ |
@ -0,0 +1,280 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. |
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors "as is" and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable |
||||
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable |
||||
|
||||
//////////////////////////////////////////////////////////////////////// |
||||
// buildPointList |
||||
|
||||
#define PIXELS_PER_THREAD 16 |
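// buildPointList: each work-item scans PIXELS_PER_THREAD pixels of its row, packs every
// non-zero (edge) pixel as (y << 16) | x into a per-row local queue, and the queues are
// copied into the global list after one work-item reserves space with atomic_add.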
||||
|
||||
// TODO: add offset to support ROI |
||||
__kernel void buildPointList(__global const uchar* src, |
||||
int cols, |
||||
int rows, |
||||
int step, |
||||
__global unsigned int* list, |
||||
__global int* counter) |
||||
{ |
||||
__local unsigned int s_queues[4][32 * PIXELS_PER_THREAD]; |
||||
__local int s_qsize[4]; |
||||
__local int s_globStart[4]; |
||||
|
||||
const int x = get_group_id(0) * get_local_size(0) * PIXELS_PER_THREAD + get_local_id(0); |
||||
const int y = get_global_id(1); |
||||
|
||||
if (get_local_id(0) == 0) |
||||
s_qsize[get_local_id(1)] = 0; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
if (y < rows) |
||||
{ |
||||
// fill the queue |
||||
__global const uchar* srcRow = &src[y * step]; |
||||
for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < cols; ++i, xx += get_local_size(0)) |
||||
{ |
||||
if (srcRow[xx]) |
||||
{ |
||||
const unsigned int val = (y << 16) | xx; |
||||
const int qidx = atomic_add(&s_qsize[get_local_id(1)], 1); |
||||
s_queues[get_local_id(1)][qidx] = val; |
||||
} |
||||
} |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
// let one work-item reserve the space required in the global list |
||||
if (get_local_id(0) == 0 && get_local_id(1) == 0) |
||||
{ |
||||
// find how many items are stored in each list |
||||
int totalSize = 0; |
||||
for (int i = 0; i < get_local_size(1); ++i) |
||||
{ |
||||
s_globStart[i] = totalSize; |
||||
totalSize += s_qsize[i]; |
||||
} |
||||
|
||||
// calculate the offset in the global list |
||||
const int globalOffset = atomic_add(counter, totalSize); |
||||
for (int i = 0; i < get_local_size(1); ++i) |
||||
s_globStart[i] += globalOffset; |
||||
} |
||||
|
||||
barrier(CLK_GLOBAL_MEM_FENCE); |
||||
|
||||
// copy local queues to global queue |
||||
const int qsize = s_qsize[get_local_id(1)]; |
||||
int gidx = s_globStart[get_local_id(1)] + get_local_id(0); |
||||
for(int i = get_local_id(0); i < qsize; i += get_local_size(0), gidx += get_local_size(0)) |
||||
list[gidx] = s_queues[get_local_id(1)][i]; |
||||
} |
||||
|
||||
//////////////////////////////////////////////////////////////////////// |
||||
// circlesAccumCenters |
||||
|
||||
// TODO: add offset to support ROI |
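// circlesAccumCenters: for every edge point, step along its gradient direction (both ways)
// from minRadius to maxRadius in fixed-point arithmetic (SHIFT fractional bits) and vote
// for each candidate centre in the accumulator, as in the Hough gradient method.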
||||
__kernel void circlesAccumCenters(__global const unsigned int* list, |
||||
const int count, |
||||
__global const int* dx, |
||||
const int dxStep, |
||||
__global const int* dy, |
||||
const int dyStep, |
||||
__global int* accum, |
||||
const int accumStep, |
||||
const int width, |
||||
const int height, |
||||
const int minRadius, |
||||
const int maxRadius, |
||||
const float idp) |
||||
{ |
||||
const int dxStepInPixel = dxStep / sizeof(int); |
||||
const int dyStepInPixel = dyStep / sizeof(int); |
||||
const int accumStepInPixel = accumStep / sizeof(int); |
||||
|
||||
const int SHIFT = 10; |
||||
const int ONE = 1 << SHIFT; |
||||
|
||||
// const int tid = blockIdx.x * blockDim.x + threadIdx.x; |
||||
const int wid = get_global_id(0); |
||||
|
||||
if (wid >= count) |
||||
return; |
||||
|
||||
const unsigned int val = list[wid]; |
||||
|
||||
const int x = (val & 0xFFFF); |
||||
const int y = (val >> 16) & 0xFFFF; |
||||
|
||||
const int vx = dx[mad24(y, dxStepInPixel, x)]; |
||||
const int vy = dy[mad24(y, dyStepInPixel, x)]; |
||||
|
||||
if (vx == 0 && vy == 0) |
||||
return; |
||||
|
||||
const float mag = sqrt(convert_float(vx * vx + vy * vy)); |
||||
|
||||
const int x0 = convert_int_rte((x * idp) * ONE); |
||||
const int y0 = convert_int_rte((y * idp) * ONE); |
||||
|
||||
int sx = convert_int_rte((vx * idp) * ONE / mag); |
||||
int sy = convert_int_rte((vy * idp) * ONE / mag); |
||||
|
||||
// Step from minRadius to maxRadius in both directions of the gradient |
||||
for (int k1 = 0; k1 < 2; ++k1) |
||||
{ |
||||
int x1 = x0 + minRadius * sx; |
||||
int y1 = y0 + minRadius * sy; |
||||
|
||||
for (int r = minRadius; r <= maxRadius; x1 += sx, y1 += sy, ++r) |
||||
{ |
||||
const int x2 = x1 >> SHIFT; |
||||
const int y2 = y1 >> SHIFT; |
||||
|
||||
if (x2 < 0 || x2 >= width || y2 < 0 || y2 >= height) |
||||
break; |
||||
|
||||
atomic_add(&accum[mad24(y2+1, accumStepInPixel, x2+1)], 1); |
||||
} |
||||
|
||||
sx = -sx; |
||||
sy = -sy; |
||||
} |
||||
} |
||||
|
||||
// //////////////////////////////////////////////////////////////////////// |
||||
// // buildCentersList |
||||
|
||||
// TODO: add offset to support ROI |
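// buildCentersList: keep accumulator cells that exceed 'threshold' and are a local maximum
// of their 4-neighbourhood; survivors are packed as (y << 16) | x into 'centers'.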
||||
__kernel void buildCentersList(__global const int* accum, |
||||
const int accumCols, |
||||
const int accumRows, |
||||
const int accumStep, |
||||
__global unsigned int* centers, |
||||
const int threshold, |
||||
__global int* counter) |
||||
{ |
||||
const int accumStepInPixel = accumStep/sizeof(int); |
||||
|
||||
const int x = get_global_id(0); |
||||
const int y = get_global_id(1); |
||||
|
||||
if (x < accumCols - 2 && y < accumRows - 2) |
||||
{ |
||||
const int top = accum[mad24(y, accumStepInPixel, x + 1)]; |
||||
|
||||
const int left = accum[mad24(y + 1, accumStepInPixel, x)]; |
||||
const int cur = accum[mad24(y + 1, accumStepInPixel, x + 1)]; |
||||
const int right = accum[mad24(y + 1, accumStepInPixel, x + 2)]; |
||||
|
||||
const int bottom = accum[mad24(y + 2, accumStepInPixel, x + 1)]; |
||||
|
||||
if (cur > threshold && cur > top && cur >= bottom && cur > left && cur >= right) |
||||
{ |
||||
const unsigned int val = (y << 16) | x; |
||||
const int idx = atomic_add(counter, 1); |
||||
centers[idx] = val; |
||||
} |
||||
} |
||||
} |
||||
|
||||
|
||||
// //////////////////////////////////////////////////////////////////////// |
||||
// // circlesAccumRadius |
||||
|
||||
// TODO: add offset to support ROI |
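// circlesAccumRadius: one workgroup per candidate centre builds a radius histogram in local
// memory from the distances to all edge points, then emits a circle for every histogram bin
// that passes 'threshold' and is a local maximum.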
||||
__kernel void circlesAccumRadius(__global const unsigned int* centers, |
||||
__global const unsigned int* list, const int count, |
||||
__global float4* circles, const int maxCircles, |
||||
const float dp, |
||||
const int minRadius, const int maxRadius, |
||||
const int histSize, |
||||
const int threshold, |
||||
__local int* smem, |
||||
__global int* counter) |
||||
{ |
||||
for (int i = get_local_id(0); i < histSize + 2; i += get_local_size(0)) |
||||
smem[i] = 0; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
unsigned int val = centers[get_group_id(0)]; |
||||
|
||||
float cx = convert_float(val & 0xFFFF); |
||||
float cy = convert_float((val >> 16) & 0xFFFF); |
||||
|
||||
cx = (cx + 0.5f) * dp; |
||||
cy = (cy + 0.5f) * dp; |
||||
|
||||
for (int i = get_local_id(0); i < count; i += get_local_size(0)) |
||||
{ |
||||
val = list[i]; |
||||
|
||||
const int x = (val & 0xFFFF); |
||||
const int y = (val >> 16) & 0xFFFF; |
||||
|
||||
const float rad = sqrt((cx - x) * (cx - x) + (cy - y) * (cy - y)); |
||||
if (rad >= minRadius && rad <= maxRadius) |
||||
{ |
||||
const int r = convert_int_rte(rad - minRadius); |
||||
|
||||
atomic_add(&smem[r + 1], 1); |
||||
} |
||||
} |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
for (int i = get_local_id(0); i < histSize; i += get_local_size(0)) |
||||
{ |
||||
const int curVotes = smem[i + 1]; |
||||
|
||||
if (curVotes >= threshold && curVotes > smem[i] && curVotes >= smem[i + 2]) |
||||
|
||||
{ |
||||
const int ind = atomic_add(counter, 1); |
||||
if (ind < maxCircles) |
||||
{ |
||||
circles[ind] = (float4)(cx, cy, convert_float(i + minRadius), 0.0f); |
||||
} |
||||
} |
||||
} |
||||
} |
@ -0,0 +1,493 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Shengen Yan,yanshengen@gmail.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if defined (DOUBLE_SUPPORT) |
||||
#ifdef cl_khr_fp64 |
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable |
||||
#elif defined (cl_amd_fp64) |
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable |
||||
#endif |
||||
#endif |
||||
#define LSIZE 256 |
||||
#define LSIZE_1 255 |
||||
#define LSIZE_2 254 |
||||
#define HF_LSIZE 128 |
||||
#define LOG_LSIZE 8 |
||||
#define LOG_NUM_BANKS 5 |
||||
#define NUM_BANKS 32 |
||||
#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS) |
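// The integral kernels below perform a work-efficient up-sweep / down-sweep (Blelloch) scan
// over 256 partial sums held in local memory; GET_CONFLICT_OFFSET pads indices by
// lid / NUM_BANKS to avoid local-memory bank conflicts. Columns are scanned first
// (vertical prefix sums), then rows, for both the sum and the squared-sum images.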
||||
|
||||
|
||||
kernel void integral_cols_D4(__global uchar4 *src,__global int *sum ,__global float *sqsum, |
||||
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) |
||||
{ |
||||
unsigned int lid = get_local_id(0); |
||||
unsigned int gid = get_group_id(0); |
||||
int4 src_t[2], sum_t[2]; |
||||
float4 sqsum_t[2]; |
||||
__local int4 lm_sum[2][LSIZE + LOG_LSIZE]; |
||||
__local float4 lm_sqsum[2][LSIZE + LOG_LSIZE]; |
||||
__local int* sum_p; |
||||
__local float* sqsum_p; |
||||
src_step = src_step >> 2; |
||||
gid = gid << 1; |
||||
for(int i = 0; i < rows; i =i + LSIZE_1) |
||||
{ |
||||
src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : 0); |
||||
src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : 0); |
||||
|
||||
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); |
||||
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); |
||||
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); |
||||
sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int bf_loc = lid + GET_CONFLICT_OFFSET(lid); |
||||
lm_sum[0][bf_loc] = src_t[0]; |
||||
lm_sqsum[0][bf_loc] = convert_float4(src_t[0] * src_t[0]); |
||||
|
||||
lm_sum[1][bf_loc] = src_t[1]; |
||||
lm_sqsum[1][bf_loc] = convert_float4(src_t[1] * src_t[1]); |
||||
|
||||
int offset = 1; |
||||
for(int d = LSIZE >> 1 ; d > 0; d>>=1) |
||||
{ |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; |
||||
ai += GET_CONFLICT_OFFSET(ai); |
||||
bi += GET_CONFLICT_OFFSET(bi); |
||||
|
||||
if((lid & 127) < d) |
||||
{ |
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; |
||||
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; |
||||
} |
||||
offset <<= 1; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(lid < 2) |
||||
{ |
||||
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; |
||||
lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0; |
||||
} |
||||
for(int d = 1; d < LSIZE; d <<= 1) |
||||
{ |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
offset >>= 1; |
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; |
||||
ai += GET_CONFLICT_OFFSET(ai); |
||||
bi += GET_CONFLICT_OFFSET(bi); |
||||
|
||||
if((lid & 127) < d) |
||||
{ |
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; |
||||
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; |
||||
|
||||
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; |
||||
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai]; |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; |
||||
if(lid > 0 && (i+lid) <= rows) |
||||
{ |
||||
lm_sum[0][bf_loc] += sum_t[0]; |
||||
lm_sum[1][bf_loc] += sum_t[1]; |
||||
lm_sqsum[0][bf_loc] += sqsum_t[0]; |
||||
lm_sqsum[1][bf_loc] += sqsum_t[1]; |
||||
sum_p = (__local int*)(&(lm_sum[0][bf_loc])); |
||||
sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc])); |
||||
for(int k = 0; k < 4; k++) |
||||
{ |
||||
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue; |
||||
sum[loc_s0 + k * dst_step / 4] = sum_p[k]; |
||||
sqsum[loc_s0 + k * dst_step / 4] = sqsum_p[k]; |
||||
} |
||||
sum_p = (__local int*)(&(lm_sum[1][bf_loc])); |
||||
sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc])); |
||||
for(int k = 0; k < 4; k++) |
||||
{ |
||||
if(gid * 4 + k + 4 >= cols + pre_invalid) break; |
||||
sum[loc_s1 + k * dst_step / 4] = sum_p[k]; |
||||
sqsum[loc_s1 + k * dst_step / 4] = sqsum_p[k]; |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
} |
||||
|
||||
|
||||
kernel void integral_rows_D4(__global int4 *srcsum,__global float4 * srcsqsum,__global int *sum , |
||||
__global float *sqsum,int rows,int cols,int src_step,int sum_step, |
||||
int sqsum_step,int sum_offset,int sqsum_offset) |
||||
{ |
||||
unsigned int lid = get_local_id(0); |
||||
unsigned int gid = get_group_id(0); |
||||
int4 src_t[2], sum_t[2]; |
||||
float4 sqsrc_t[2],sqsum_t[2]; |
||||
__local int4 lm_sum[2][LSIZE + LOG_LSIZE]; |
||||
__local float4 lm_sqsum[2][LSIZE + LOG_LSIZE]; |
||||
__local int *sum_p; |
||||
__local float *sqsum_p; |
||||
src_step = src_step >> 4; |
||||
for(int i = 0; i < rows; i =i + LSIZE_1) |
||||
{ |
||||
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (int4)0; |
||||
sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0; |
||||
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (int4)0; |
||||
sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0; |
||||
|
||||
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); |
||||
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); |
||||
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); |
||||
sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int bf_loc = lid + GET_CONFLICT_OFFSET(lid); |
||||
lm_sum[0][bf_loc] = src_t[0]; |
||||
lm_sqsum[0][bf_loc] = sqsrc_t[0]; |
||||
|
||||
lm_sum[1][bf_loc] = src_t[1]; |
||||
lm_sqsum[1][bf_loc] = sqsrc_t[1]; |
||||
|
||||
int offset = 1; |
||||
for(int d = LSIZE >> 1 ; d > 0; d>>=1) |
||||
{ |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; |
||||
ai += GET_CONFLICT_OFFSET(ai); |
||||
bi += GET_CONFLICT_OFFSET(bi); |
||||
|
||||
if((lid & 127) < d) |
||||
{ |
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; |
||||
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; |
||||
} |
||||
offset <<= 1; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(lid < 2) |
||||
{ |
||||
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; |
||||
lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0; |
||||
} |
||||
for(int d = 1; d < LSIZE; d <<= 1) |
||||
{ |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
offset >>= 1; |
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; |
||||
ai += GET_CONFLICT_OFFSET(ai); |
||||
bi += GET_CONFLICT_OFFSET(bi); |
||||
|
||||
if((lid & 127) < d) |
||||
{ |
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; |
||||
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; |
||||
|
||||
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; |
||||
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai]; |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(gid == 0 && (i + lid) <= rows) |
||||
{ |
||||
sum[sum_offset + i + lid] = 0; |
||||
sqsum[sqsum_offset + i + lid] = 0; |
||||
} |
||||
if(i + lid == 0) |
||||
{ |
||||
int loc0 = gid * 2 * sum_step; |
||||
int loc1 = gid * 2 * sqsum_step; |
||||
for(int k = 1; k <= 8; k++) |
||||
{ |
||||
if(gid * 8 + k > cols) break; |
||||
sum[sum_offset + loc0 + k * sum_step / 4] = 0; |
||||
sqsum[sqsum_offset + loc1 + k * sqsum_step / 4] = 0; |
||||
} |
||||
} |
||||
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ; |
||||
int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ; |
||||
if(lid > 0 && (i+lid) <= rows) |
||||
{ |
||||
lm_sum[0][bf_loc] += sum_t[0]; |
||||
lm_sum[1][bf_loc] += sum_t[1]; |
||||
lm_sqsum[0][bf_loc] += sqsum_t[0]; |
||||
lm_sqsum[1][bf_loc] += sqsum_t[1]; |
||||
sum_p = (__local int*)(&(lm_sum[0][bf_loc])); |
||||
sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc])); |
||||
for(int k = 0; k < 4; k++) |
||||
{ |
||||
if(gid * 8 + k >= cols) break; |
||||
sum[loc_s0 + k * sum_step / 4] = sum_p[k]; |
||||
sqsum[loc_sq0 + k * sqsum_step / 4] = sqsum_p[k]; |
||||
} |
||||
sum_p = (__local int*)(&(lm_sum[1][bf_loc])); |
||||
sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc])); |
||||
for(int k = 0; k < 4; k++) |
||||
{ |
||||
if(gid * 8 + 4 + k >= cols) break; |
||||
sum[loc_s1 + k * sum_step / 4] = sum_p[k]; |
||||
sqsum[loc_sq1 + k * sqsum_step / 4] = sqsum_p[k]; |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
} |
||||
|
||||
kernel void integral_cols_D5(__global uchar4 *src,__global float *sum ,__global float *sqsum, |
||||
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) |
||||
{ |
||||
unsigned int lid = get_local_id(0); |
||||
unsigned int gid = get_group_id(0); |
||||
float4 src_t[2], sum_t[2]; |
||||
float4 sqsum_t[2]; |
||||
__local float4 lm_sum[2][LSIZE + LOG_LSIZE]; |
||||
__local float4 lm_sqsum[2][LSIZE + LOG_LSIZE]; |
||||
__local float* sum_p; |
||||
__local float* sqsum_p; |
||||
src_step = src_step >> 2; |
||||
gid = gid << 1; |
||||
for(int i = 0; i < rows; i =i + LSIZE_1) |
||||
{ |
||||
src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid, (uint)cols - 1)]) : (float4)0); |
||||
src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + min(gid + 1, (uint)cols - 1)]) : (float4)0); |
||||
|
||||
sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); |
||||
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); |
||||
sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); |
||||
sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int bf_loc = lid + GET_CONFLICT_OFFSET(lid); |
||||
lm_sum[0][bf_loc] = src_t[0]; |
||||
lm_sqsum[0][bf_loc] = convert_float4(src_t[0] * src_t[0]); |
||||
|
||||
lm_sum[1][bf_loc] = src_t[1]; |
||||
lm_sqsum[1][bf_loc] = convert_float4(src_t[1] * src_t[1]); |
||||
|
||||
int offset = 1; |
||||
for(int d = LSIZE >> 1 ; d > 0; d>>=1) |
||||
{ |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; |
||||
ai += GET_CONFLICT_OFFSET(ai); |
||||
bi += GET_CONFLICT_OFFSET(bi); |
||||
|
||||
if((lid & 127) < d) |
||||
{ |
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; |
||||
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; |
||||
} |
||||
offset <<= 1; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(lid < 2) |
||||
{ |
||||
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; |
||||
lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0; |
||||
} |
||||
for(int d = 1; d < LSIZE; d <<= 1) |
||||
{ |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
offset >>= 1; |
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; |
||||
ai += GET_CONFLICT_OFFSET(ai); |
||||
bi += GET_CONFLICT_OFFSET(bi); |
||||
|
||||
if((lid & 127) < d) |
||||
{ |
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; |
||||
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; |
||||
|
||||
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; |
||||
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai]; |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; |
||||
if(lid > 0 && (i+lid) <= rows) |
||||
{ |
||||
lm_sum[0][bf_loc] += sum_t[0]; |
||||
lm_sum[1][bf_loc] += sum_t[1]; |
||||
lm_sqsum[0][bf_loc] += sqsum_t[0]; |
||||
lm_sqsum[1][bf_loc] += sqsum_t[1]; |
||||
sum_p = (__local float*)(&(lm_sum[0][bf_loc])); |
||||
sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc])); |
||||
for(int k = 0; k < 4; k++) |
||||
{ |
||||
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue; |
||||
sum[loc_s0 + k * dst_step / 4] = sum_p[k]; |
||||
sqsum[loc_s0 + k * dst_step / 4] = sqsum_p[k]; |
||||
} |
||||
sum_p = (__local float*)(&(lm_sum[1][bf_loc])); |
||||
sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc])); |
||||
for(int k = 0; k < 4; k++) |
||||
{ |
||||
if(gid * 4 + k + 4 >= cols + pre_invalid) break; |
||||
sum[loc_s1 + k * dst_step / 4] = sum_p[k]; |
||||
sqsum[loc_s1 + k * dst_step / 4] = sqsum_p[k]; |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
} |
||||
|
||||
|
||||
kernel void integral_rows_D5(__global float4 *srcsum,__global float4 * srcsqsum,__global float *sum , |
||||
__global float *sqsum,int rows,int cols,int src_step,int sum_step, |
||||
int sqsum_step,int sum_offset,int sqsum_offset) |
||||
{ |
||||
unsigned int lid = get_local_id(0); |
||||
unsigned int gid = get_group_id(0); |
||||
float4 src_t[2], sum_t[2]; |
||||
float4 sqsrc_t[2],sqsum_t[2]; |
||||
__local float4 lm_sum[2][LSIZE + LOG_LSIZE]; |
||||
__local float4 lm_sqsum[2][LSIZE + LOG_LSIZE]; |
||||
__local float *sum_p; |
||||
__local float *sqsum_p; |
||||
src_step = src_step >> 4; |
||||
for(int i = 0; i < rows; i =i + LSIZE_1) |
||||
{ |
||||
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0; |
||||
sqsrc_t[0] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2] : (float4)0; |
||||
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0; |
||||
sqsrc_t[1] = i + lid < rows ? srcsqsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0; |
||||
|
||||
sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); |
||||
sqsum_t[0] = (i == 0 ? (float4)0 : lm_sqsum[0][LSIZE_2 + LOG_LSIZE]); |
||||
sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); |
||||
sqsum_t[1] = (i == 0 ? (float4)0 : lm_sqsum[1][LSIZE_2 + LOG_LSIZE]); |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int bf_loc = lid + GET_CONFLICT_OFFSET(lid); |
||||
lm_sum[0][bf_loc] = src_t[0]; |
||||
lm_sqsum[0][bf_loc] = sqsrc_t[0]; |
||||
|
||||
lm_sum[1][bf_loc] = src_t[1]; |
||||
lm_sqsum[1][bf_loc] = sqsrc_t[1]; |
||||
|
||||
int offset = 1; |
||||
for(int d = LSIZE >> 1 ; d > 0; d>>=1) |
||||
{ |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; |
||||
ai += GET_CONFLICT_OFFSET(ai); |
||||
bi += GET_CONFLICT_OFFSET(bi); |
||||
|
||||
if((lid & 127) < d) |
||||
{ |
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; |
||||
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; |
||||
} |
||||
offset <<= 1; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(lid < 2) |
||||
{ |
||||
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; |
||||
lm_sqsum[lid][LSIZE_2 + LOG_LSIZE] = 0; |
||||
} |
||||
for(int d = 1; d < LSIZE; d <<= 1) |
||||
{ |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
offset >>= 1; |
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; |
||||
ai += GET_CONFLICT_OFFSET(ai); |
||||
bi += GET_CONFLICT_OFFSET(bi); |
||||
|
||||
if((lid & 127) < d) |
||||
{ |
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; |
||||
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; |
||||
|
||||
lm_sqsum[lid >> 7][bi] += lm_sqsum[lid >> 7][ai]; |
||||
lm_sqsum[lid >> 7][ai] = lm_sqsum[lid >> 7][bi] - lm_sqsum[lid >> 7][ai]; |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(gid == 0 && (i + lid) <= rows) |
||||
{ |
||||
sum[sum_offset + i + lid] = 0; |
||||
sqsum[sqsum_offset + i + lid] = 0; |
||||
} |
||||
if(i + lid == 0) |
||||
{ |
||||
int loc0 = gid * 2 * sum_step; |
||||
int loc1 = gid * 2 * sqsum_step; |
||||
for(int k = 1; k <= 8; k++) |
||||
{ |
||||
if(gid * 8 + k > cols) break; |
||||
sum[sum_offset + loc0 + k * sum_step / 4] = 0; |
||||
sqsum[sqsum_offset + loc1 + k * sqsum_step / 4] = 0; |
||||
} |
||||
} |
||||
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ; |
||||
int loc_sq0 = sqsum_offset + gid * 2 * sqsum_step + sqsum_step / 4 + i + lid, loc_sq1 = loc_sq0 + sqsum_step ; |
||||
if(lid > 0 && (i+lid) <= rows) |
||||
{ |
||||
lm_sum[0][bf_loc] += sum_t[0]; |
||||
lm_sum[1][bf_loc] += sum_t[1]; |
||||
lm_sqsum[0][bf_loc] += sqsum_t[0]; |
||||
lm_sqsum[1][bf_loc] += sqsum_t[1]; |
||||
sum_p = (__local float*)(&(lm_sum[0][bf_loc])); |
||||
sqsum_p = (__local float*)(&(lm_sqsum[0][bf_loc])); |
||||
for(int k = 0; k < 4; k++) |
||||
{ |
||||
if(gid * 8 + k >= cols) break; |
||||
sum[loc_s0 + k * sum_step / 4] = sum_p[k]; |
||||
sqsum[loc_sq0 + k * sqsum_step / 4] = sqsum_p[k]; |
||||
} |
||||
sum_p = (__local float*)(&(lm_sum[1][bf_loc])); |
||||
sqsum_p = (__local float*)(&(lm_sqsum[1][bf_loc])); |
||||
for(int k = 0; k < 4; k++) |
||||
{ |
||||
if(gid * 8 + 4 + k >= cols) break; |
||||
sum[loc_s1 + k * sum_step / 4] = sum_p[k]; |
||||
sqsum[loc_sq1 + k * sqsum_step / 4] = sqsum_p[k]; |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
} |
@ -0,0 +1,412 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Shengen Yan,yanshengen@gmail.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if defined (DOUBLE_SUPPORT) |
||||
#ifdef cl_khr_fp64 |
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable |
||||
#elif defined (cl_amd_fp64) |
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable |
||||
#endif |
||||
#endif |
||||
|
||||
#define LSIZE 256 |
||||
#define LSIZE_1 255 |
||||
#define LSIZE_2 254 |
||||
#define HF_LSIZE 128 |
||||
#define LOG_LSIZE 8 |
||||
#define LOG_NUM_BANKS 5 |
||||
#define NUM_BANKS 32 |
||||
#define GET_CONFLICT_OFFSET(lid) ((lid) >> LOG_NUM_BANKS) |
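// Same scan scheme as the integral kernels above, specialised to the sum image only
// (no squared sums), used when only the plain integral image is required.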
||||
|
||||
|
||||
kernel void integral_sum_cols_D4(__global uchar4 *src,__global int *sum , |
||||
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) |
||||
{ |
||||
unsigned int lid = get_local_id(0); |
||||
unsigned int gid = get_group_id(0); |
||||
int4 src_t[2], sum_t[2]; |
||||
__local int4 lm_sum[2][LSIZE + LOG_LSIZE]; |
||||
__local int* sum_p; |
||||
src_step = src_step >> 2; |
||||
gid = gid << 1; |
||||
for(int i = 0; i < rows; i =i + LSIZE_1) |
||||
{ |
||||
src_t[0] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid]) : 0); |
||||
src_t[1] = (i + lid < rows ? convert_int4(src[src_offset + (lid+i) * src_step + gid + 1]) : 0); |
||||
|
||||
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); |
||||
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int bf_loc = lid + GET_CONFLICT_OFFSET(lid); |
||||
lm_sum[0][bf_loc] = src_t[0]; |
||||
|
||||
lm_sum[1][bf_loc] = src_t[1]; |
||||
|
||||
int offset = 1; |
||||
for(int d = LSIZE >> 1 ; d > 0; d>>=1) |
||||
{ |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; |
||||
ai += GET_CONFLICT_OFFSET(ai); |
||||
bi += GET_CONFLICT_OFFSET(bi); |
||||
|
||||
if((lid & 127) < d) |
||||
{ |
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; |
||||
} |
||||
offset <<= 1; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(lid < 2) |
||||
{ |
||||
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; |
||||
} |
||||
for(int d = 1; d < LSIZE; d <<= 1) |
||||
{ |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
offset >>= 1; |
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; |
||||
ai += GET_CONFLICT_OFFSET(ai); |
||||
bi += GET_CONFLICT_OFFSET(bi); |
||||
|
||||
if((lid & 127) < d) |
||||
{ |
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; |
||||
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(lid > 0 && (i+lid) <= rows) |
||||
{ |
||||
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; |
||||
lm_sum[0][bf_loc] += sum_t[0]; |
||||
lm_sum[1][bf_loc] += sum_t[1]; |
||||
sum_p = (__local int*)(&(lm_sum[0][bf_loc])); |
||||
for(int k = 0; k < 4; k++) |
||||
{ |
||||
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue; |
||||
sum[loc_s0 + k * dst_step / 4] = sum_p[k]; |
||||
} |
||||
sum_p = (__local int*)(&(lm_sum[1][bf_loc])); |
||||
for(int k = 0; k < 4; k++) |
||||
{ |
||||
if(gid * 4 + k + 4 >= cols + pre_invalid) break; |
||||
sum[loc_s1 + k * dst_step / 4] = sum_p[k]; |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
} |
||||
|
||||
|
||||
kernel void integral_sum_rows_D4(__global int4 *srcsum,__global int *sum , |
||||
int rows,int cols,int src_step,int sum_step, |
||||
int sum_offset) |
||||
{ |
||||
unsigned int lid = get_local_id(0); |
||||
unsigned int gid = get_group_id(0); |
||||
int4 src_t[2], sum_t[2]; |
||||
__local int4 lm_sum[2][LSIZE + LOG_LSIZE]; |
||||
__local int *sum_p; |
||||
src_step = src_step >> 4; |
||||
for(int i = 0; i < rows; i =i + LSIZE_1) |
||||
{ |
||||
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : 0; |
||||
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : 0; |
||||
|
||||
sum_t[0] = (i == 0 ? 0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); |
||||
sum_t[1] = (i == 0 ? 0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int bf_loc = lid + GET_CONFLICT_OFFSET(lid); |
||||
lm_sum[0][bf_loc] = src_t[0]; |
||||
|
||||
lm_sum[1][bf_loc] = src_t[1]; |
||||
|
||||
int offset = 1; |
||||
for(int d = LSIZE >> 1 ; d > 0; d>>=1) |
||||
{ |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; |
||||
ai += GET_CONFLICT_OFFSET(ai); |
||||
bi += GET_CONFLICT_OFFSET(bi); |
||||
|
||||
if((lid & 127) < d) |
||||
{ |
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; |
||||
} |
||||
offset <<= 1; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(lid < 2) |
||||
{ |
||||
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; |
||||
} |
||||
for(int d = 1; d < LSIZE; d <<= 1) |
||||
{ |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
offset >>= 1; |
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; |
||||
ai += GET_CONFLICT_OFFSET(ai); |
||||
bi += GET_CONFLICT_OFFSET(bi); |
||||
|
||||
if((lid & 127) < d) |
||||
{ |
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; |
||||
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(gid == 0 && (i + lid) <= rows) |
||||
{ |
||||
sum[sum_offset + i + lid] = 0; |
||||
} |
||||
if(i + lid == 0) |
||||
{ |
||||
int loc0 = gid * 2 * sum_step; |
||||
for(int k = 1; k <= 8; k++) |
||||
{ |
||||
if(gid * 8 + k > cols) break; |
||||
sum[sum_offset + loc0 + k * sum_step / 4] = 0; |
||||
} |
||||
} |
||||
|
||||
if(lid > 0 && (i+lid) <= rows) |
||||
{ |
||||
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ; |
||||
lm_sum[0][bf_loc] += sum_t[0]; |
||||
lm_sum[1][bf_loc] += sum_t[1]; |
||||
sum_p = (__local int*)(&(lm_sum[0][bf_loc])); |
||||
for(int k = 0; k < 4; k++) |
||||
{ |
||||
if(gid * 8 + k >= cols) break; |
||||
sum[loc_s0 + k * sum_step / 4] = sum_p[k]; |
||||
} |
||||
sum_p = (__local int*)(&(lm_sum[1][bf_loc])); |
||||
for(int k = 0; k < 4; k++) |
||||
{ |
||||
if(gid * 8 + 4 + k >= cols) break; |
||||
sum[loc_s1 + k * sum_step / 4] = sum_p[k]; |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
} |
||||
|
||||
kernel void integral_sum_cols_D5(__global uchar4 *src,__global float *sum , |
||||
int src_offset,int pre_invalid,int rows,int cols,int src_step,int dst_step) |
||||
{ |
||||
unsigned int lid = get_local_id(0); |
||||
unsigned int gid = get_group_id(0); |
||||
float4 src_t[2], sum_t[2]; |
||||
__local float4 lm_sum[2][LSIZE + LOG_LSIZE]; |
||||
__local float* sum_p; |
||||
src_step = src_step >> 2; |
||||
gid = gid << 1; |
||||
for(int i = 0; i < rows; i =i + LSIZE_1) |
||||
{ |
||||
src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid]) : (float4)0); |
||||
src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid+i) * src_step + gid + 1]) : (float4)0); |
||||
|
||||
sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); |
||||
sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int bf_loc = lid + GET_CONFLICT_OFFSET(lid); |
||||
lm_sum[0][bf_loc] = src_t[0]; |
||||
|
||||
lm_sum[1][bf_loc] = src_t[1]; |
||||
|
||||
int offset = 1; |
||||
for(int d = LSIZE >> 1 ; d > 0; d>>=1) |
||||
{ |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; |
||||
ai += GET_CONFLICT_OFFSET(ai); |
||||
bi += GET_CONFLICT_OFFSET(bi); |
||||
|
||||
if((lid & 127) < d) |
||||
{ |
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; |
||||
} |
||||
offset <<= 1; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(lid < 2) |
||||
{ |
||||
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; |
||||
} |
||||
for(int d = 1; d < LSIZE; d <<= 1) |
||||
{ |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
offset >>= 1; |
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; |
||||
ai += GET_CONFLICT_OFFSET(ai); |
||||
bi += GET_CONFLICT_OFFSET(bi); |
||||
|
||||
if((lid & 127) < d) |
||||
{ |
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; |
||||
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(lid > 0 && (i+lid) <= rows) |
||||
{ |
||||
int loc_s0 = gid * dst_step + i + lid - 1 - pre_invalid * dst_step / 4, loc_s1 = loc_s0 + dst_step ; |
||||
lm_sum[0][bf_loc] += sum_t[0]; |
||||
lm_sum[1][bf_loc] += sum_t[1]; |
||||
sum_p = (__local float*)(&(lm_sum[0][bf_loc])); |
||||
for(int k = 0; k < 4; k++) |
||||
{ |
||||
if(gid * 4 + k >= cols + pre_invalid || gid * 4 + k < pre_invalid) continue; |
||||
sum[loc_s0 + k * dst_step / 4] = sum_p[k]; |
||||
} |
||||
sum_p = (__local float*)(&(lm_sum[1][bf_loc])); |
||||
for(int k = 0; k < 4; k++) |
||||
{ |
||||
if(gid * 4 + k + 4 >= cols + pre_invalid) break; |
||||
sum[loc_s1 + k * dst_step / 4] = sum_p[k]; |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
} |
||||
|
||||
|
||||
kernel void integral_sum_rows_D5(__global float4 *srcsum,__global float *sum , |
||||
int rows,int cols,int src_step,int sum_step, |
||||
int sum_offset) |
||||
{ |
||||
unsigned int lid = get_local_id(0); |
||||
unsigned int gid = get_group_id(0); |
||||
float4 src_t[2], sum_t[2]; |
||||
__local float4 lm_sum[2][LSIZE + LOG_LSIZE]; |
||||
__local float *sum_p; |
||||
src_step = src_step >> 4; |
||||
for(int i = 0; i < rows; i =i + LSIZE_1) |
||||
{ |
||||
src_t[0] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2] : (float4)0; |
||||
src_t[1] = i + lid < rows ? srcsum[(lid+i) * src_step + gid * 2 + 1] : (float4)0; |
||||
|
||||
sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]); |
||||
sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]); |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int bf_loc = lid + GET_CONFLICT_OFFSET(lid); |
||||
lm_sum[0][bf_loc] = src_t[0]; |
||||
|
||||
lm_sum[1][bf_loc] = src_t[1]; |
||||
|
||||
int offset = 1; |
||||
for(int d = LSIZE >> 1 ; d > 0; d>>=1) |
||||
{ |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; |
||||
ai += GET_CONFLICT_OFFSET(ai); |
||||
bi += GET_CONFLICT_OFFSET(bi); |
||||
|
||||
if((lid & 127) < d) |
||||
{ |
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; |
||||
} |
||||
offset <<= 1; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(lid < 2) |
||||
{ |
||||
lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0; |
||||
} |
||||
for(int d = 1; d < LSIZE; d <<= 1) |
||||
{ |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
offset >>= 1; |
||||
int ai = offset * (((lid & 127)<<1) +1) - 1,bi = ai + offset; |
||||
ai += GET_CONFLICT_OFFSET(ai); |
||||
bi += GET_CONFLICT_OFFSET(bi); |
||||
|
||||
if((lid & 127) < d) |
||||
{ |
||||
lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai]; |
||||
lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi] - lm_sum[lid >> 7][ai]; |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(gid == 0 && (i + lid) <= rows) |
||||
{ |
||||
sum[sum_offset + i + lid] = 0; |
||||
} |
||||
if(i + lid == 0) |
||||
{ |
||||
int loc0 = gid * 2 * sum_step; |
||||
for(int k = 1; k <= 8; k++) |
||||
{ |
||||
if(gid * 8 + k > cols) break; |
||||
sum[sum_offset + loc0 + k * sum_step / 4] = 0; |
||||
} |
||||
} |
||||
|
||||
if(lid > 0 && (i+lid) <= rows) |
||||
{ |
||||
int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid, loc_s1 = loc_s0 + sum_step ; |
||||
lm_sum[0][bf_loc] += sum_t[0]; |
||||
lm_sum[1][bf_loc] += sum_t[1]; |
||||
sum_p = (__local float*)(&(lm_sum[0][bf_loc])); |
||||
for(int k = 0; k < 4; k++) |
||||
{ |
||||
if(gid * 8 + k >= cols) break; |
||||
sum[loc_s0 + k * sum_step / 4] = sum_p[k]; |
||||
} |
||||
sum_p = (__local float*)(&(lm_sum[1][bf_loc])); |
||||
for(int k = 0; k < 4; k++) |
||||
{ |
||||
if(gid * 8 + 4 + k >= cols) break; |
||||
sum[loc_s1 + k * sum_step / 4] = sum_p[k]; |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
} |
@ -0,0 +1,381 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Pang Erping, erping@multicorewareinc.com |
||||
// Jia Haipeng, jiahaipeng95@gmail.com |
||||
// Peng Xiao, pengxiao@outlook.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
/////////////////////////////////Macro for border type//////////////////////////////////////////// |
||||
///////////////////////////////////////////////////////////////////////////////////////////////// |
||||
#ifdef BORDER_REPLICATE |
||||
|
||||
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh |
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) |
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) |
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) : (i)) |
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr)) |
||||
#endif |
||||
|
||||
#ifdef BORDER_REFLECT |
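//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb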
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i)-1 : (i)) |
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) |
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i)-1 : (i)) |
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) |
||||
#endif |
||||
|
||||
#ifdef BORDER_REFLECT_101 |
||||
//BORDER_REFLECT_101: gfedcb|abcdefgh|gfedcba |
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? ((l_edge)<<1)-(i) : (i)) |
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) |
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? ((t_edge)<<1)-(i) : (i)) |
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr)) |
||||
#endif |
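
// Note (added): worked examples of the remapping, assuming a left edge of 0 and a right
// edge of `cols` (the kernels below actually pass edges shifted by the filter radius and
// the ROI offset):
//   BORDER_REPLICATE     ADDR_L(-2, 0, cols) -> 0    ADDR_R(cols, cols, cols) -> cols-1
//   BORDER_REFLECT       ADDR_L(-2, 0, cols) -> 1    ADDR_R(cols, cols, cols) -> cols-1
//   BORDER_REFLECT_101   ADDR_L(-2, 0, cols) -> 2    ADDR_R(cols, cols, cols) -> cols-2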
||||
|
||||
#ifdef IMG_C_1_0 |
||||
#define T_IMG uchar |
||||
#define T_IMGx4 uchar4 |
||||
#define T_IMG_C1 uchar |
||||
#define CONVERT_TYPE convert_uchar_sat |
||||
#define CONVERT_TYPEx4 convert_uchar4_sat |
||||
#endif |
||||
#ifdef IMG_C_4_0 |
||||
#define T_IMG uchar4 |
||||
#define T_IMGx4 uchar16 |
||||
#define T_IMG_C1 uchar |
||||
#define CONVERT_TYPE convert_uchar4_sat |
||||
#define CONVERT_TYPEx4 convert_uchar16_sat |
||||
#endif |
||||
#ifdef IMG_C_1_5 |
||||
#define T_IMG float |
||||
#define T_IMGx4 float4 |
||||
#define T_IMG_C1 float |
||||
#define CONVERT_TYPE convert_float |
||||
#define CONVERT_TYPEx4 convert_float4 |
||||
#endif |
||||
#ifdef IMG_C_4_5 |
||||
#define T_IMG float4 |
||||
#define T_IMGx4 float16 |
||||
#define T_IMG_C1 float |
||||
#define CONVERT_TYPE convert_float4 |
||||
#define CONVERT_TYPEx4 convert_float16 |
||||
#endif |
||||
|
||||
#ifndef CN |
||||
#define CN 1 |
||||
#endif |
||||
|
||||
#if CN == 1 |
||||
#define T_SUM float |
||||
#define T_SUMx4 float4 |
||||
#define CONVERT_TYPE_SUM convert_float |
||||
#define CONVERT_TYPE_SUMx4 convert_float4 |
||||
#define SUM_ZERO (0.0f) |
||||
#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f) |
||||
#define VLOAD4 vload4 |
||||
#define SX x |
||||
#define SY y |
||||
#define SZ z |
||||
#define SW w |
||||
#elif CN == 4 |
||||
#define T_SUM float4 |
||||
#define T_SUMx4 float16 |
||||
#define CONVERT_TYPE_SUM convert_float4 |
||||
#define CONVERT_TYPE_SUMx4 convert_float16 |
||||
#define SUM_ZERO (0.0f, 0.0f, 0.0f, 0.0f) |
||||
#define SUM_ZEROx4 (0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f) |
||||
#define VLOAD4 vload16 |
||||
#define SX s0123 |
||||
#define SY s4567 |
||||
#define SZ s89ab |
||||
#define SW scdef |
||||
#endif |
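
// Note (added): the CN block above lets the same filter body be compiled for 1- or
// 4-channel images: T_SUM/T_SUMx4 select the accumulator width, and SX/SY/SZ/SW name
// the four per-pixel lanes of the x4 accumulator (plain .x/.y/.z/.w of a float4 for
// CN == 1, the .s0123/.s4567/.s89ab/.scdef sub-vectors of a float16 for CN == 4).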
||||
|
||||
#ifndef FILTER_SIZE |
||||
#define FILTER_SIZE 3 |
||||
#endif |
||||
|
||||
#define LOCAL_GROUP_SIZE 16 |
||||
|
||||
#define LOCAL_WIDTH ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE) |
||||
#define LOCAL_HEIGHT ((FILTER_SIZE/2)*2 + LOCAL_GROUP_SIZE) |
||||
|
||||
#define FILTER_RADIUS (FILTER_SIZE >> 1) |
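
// Note (added): with the defaults above (FILTER_SIZE = 3, LOCAL_GROUP_SIZE = 16),
// FILTER_RADIUS is 1 and LOCAL_WIDTH == LOCAL_HEIGHT == 18, i.e. each 16x16 work-group
// stages an 18x18 tile in local memory: the 16x16 output block plus a one-pixel apron
// on every side for the border handling above.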
||||
|
||||
__kernel void filter2D( |
||||
__global T_IMG *src, |
||||
__global T_IMG *dst, |
||||
int src_step, |
||||
int dst_step, |
||||
__constant float *mat_kernel, |
||||
__local T_IMG *local_data, |
||||
int wholerows, |
||||
int wholecols, |
||||
int src_offset_x, |
||||
int src_offset_y, |
||||
int dst_offset_x, |
||||
int dst_offset_y, |
||||
int cols, |
||||
int rows, |
||||
int operate_cols |
||||
) |
||||
{ |
||||
int groupStartCol = get_group_id(0) * get_local_size(0); |
||||
int groupStartRow = get_group_id(1) * get_local_size(1); |
||||
|
||||
int localCol = get_local_id(0); |
||||
int localRow = get_local_id(1); |
||||
int globalCol = groupStartCol + localCol; |
||||
int globalRow = groupStartRow + localRow; |
||||
const int src_offset = mad24(src_offset_y, src_step, src_offset_x); |
||||
const int dst_offset = mad24(dst_offset_y, dst_step, dst_offset_x); |
||||
|
||||
#ifdef BORDER_CONSTANT |
||||
for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1)) |
||||
{ |
||||
int curRow = groupStartRow + i; |
||||
for(int j = localCol; j < LOCAL_WIDTH; j += get_local_size(0)) |
||||
{ |
||||
int curCol = groupStartCol + j; |
||||
if(curRow < FILTER_RADIUS - src_offset_y || (curRow - FILTER_RADIUS) >= wholerows - src_offset_y|| |
||||
curCol < FILTER_RADIUS - src_offset_x || (curCol - FILTER_RADIUS) >= wholecols - src_offset_x) |
||||
{ |
||||
local_data[(i) * LOCAL_WIDTH + j] = 0; |
||||
} |
||||
else |
||||
{ |
||||
local_data[(i) * LOCAL_WIDTH + j] = src[(curRow - FILTER_RADIUS) * src_step + curCol - FILTER_RADIUS + src_offset]; |
||||
} |
||||
} |
||||
} |
||||
#else |
||||
for(int i = localRow; i < LOCAL_HEIGHT; i += get_local_size(1)) |
||||
{ |
||||
int curRow = groupStartRow + i; |
||||
|
||||
curRow = ADDR_H(curRow, FILTER_RADIUS - src_offset_y, wholerows - src_offset_y); |
||||
|
||||
curRow = ADDR_B(curRow - FILTER_RADIUS, wholerows - src_offset_y, curRow - FILTER_RADIUS); |
||||
|
||||
for(int j = localCol; j < LOCAL_WIDTH; j += get_local_size(0)) |
||||
{ |
||||
int curCol = groupStartCol + j; |
||||
curCol = ADDR_L(curCol, FILTER_RADIUS - src_offset_x, wholecols - src_offset_x); |
||||
curCol = ADDR_R(curCol - FILTER_RADIUS, wholecols - src_offset_x, curCol - FILTER_RADIUS); |
||||
if(curRow < wholerows && curCol < wholecols) |
||||
{ |
||||
local_data[(i) * LOCAL_WIDTH + j] = src[(curRow) * src_step + curCol + src_offset]; |
||||
} |
||||
} |
||||
} |
||||
#endif |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(globalRow < rows && globalCol < cols) |
||||
{ |
||||
T_SUM sum = (T_SUM)(SUM_ZERO); |
||||
int filterIdx = 0; |
||||
for(int i = 0; i < FILTER_SIZE; i++) |
||||
{ |
||||
int offset = (i + localRow) * LOCAL_WIDTH; |
||||
|
||||
for(int j = 0; j < FILTER_SIZE; j++) |
||||
{ |
||||
sum += CONVERT_TYPE_SUM(local_data[offset + j + localCol]) * mat_kernel[filterIdx++]; |
||||
} |
||||
} |
||||
dst[(globalRow)*dst_step + (globalCol) + dst_offset] = CONVERT_TYPE(sum); |
||||
} |
||||
} |
||||
|
||||
/// The following is specific to 3x3 kernels
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
/////////////////////////////Macros defining the number of elements per thread/////////////////////////
||||
//////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
|
||||
#define ANX 1 |
||||
#define ANY 1 |
||||
|
||||
#define ROWS_PER_GROUP 4 |
||||
#define ROWS_PER_GROUP_BITS 2 |
||||
#define ROWS_FETCH (ROWS_PER_GROUP + ANY + ANY) //(ROWS_PER_GROUP + anY * 2) |
||||
|
||||
#define THREADS_PER_ROW 64 |
||||
#define THREADS_PER_ROW_BIT 6 |
||||
|
||||
#define ELEMENTS_PER_THREAD 4 |
||||
#define ELEMENTS_PER_THREAD_BIT 2 |
||||
|
||||
#define LOCAL_MEM_STEP 260 //divup((get_local_size(0) + anX * 2), 4) * 4 |
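
// Note (added): 260 matches the formula in the comment above for a work-group width of
// 256 with ANX = 1: divup(256 + 2, 4) * 4 == 260. A different launch width would need
// this constant (and the host-side local memory allocation) adjusted to match.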
||||
|
||||
/////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
/////////////////////////////////////////8uC1//////////////////////////////////////////////////////// |
||||
//////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
|
||||
__kernel void filter2D_3x3( |
||||
__global T_IMG *src, |
||||
__global T_IMG *dst, |
||||
int src_step, |
||||
int dst_step, |
||||
__constant float *mat_kernel, |
||||
__local T_IMG *local_data, |
||||
int wholerows, |
||||
int wholecols, |
||||
int src_offset_x, |
||||
int src_offset_y, |
||||
int dst_offset_x, |
||||
int dst_offset_y, |
||||
int cols, |
||||
int rows, |
||||
int operate_cols |
||||
) |
||||
{ |
||||
int gX = get_global_id(0); |
||||
int gY = get_global_id(1); |
||||
|
||||
int lX = get_local_id(0); |
||||
|
||||
int groupX_size = get_local_size(0); |
||||
int groupX_id = get_group_id(0); |
||||
|
||||
#define dst_align (dst_offset_x & 3) |
||||
int cols_start_index_group = src_offset_x - dst_align + groupX_size * groupX_id - ANX; |
||||
int rows_start_index = src_offset_y + (gY << ROWS_PER_GROUP_BITS) - ANY; |
||||
|
||||
if((gY << 2) < rows) |
||||
{ |
||||
for(int i = 0; i < ROWS_FETCH; ++i) |
||||
{ |
||||
if((rows_start_index - src_offset_y) + i < rows + ANY) |
||||
{ |
||||
#ifdef BORDER_CONSTANT |
||||
int selected_row = rows_start_index + i; |
||||
int selected_cols = cols_start_index_group + lX; |
||||
|
||||
T_IMG data = src[mad24(selected_row, src_step, selected_cols)]; |
||||
int con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols; |
||||
data = con ? data : (T_IMG)(0); |
||||
local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data; |
||||
|
||||
if(lX < (ANX << 1)) |
||||
{ |
||||
selected_cols = cols_start_index_group + lX + groupX_size; |
||||
|
||||
data = src[mad24(selected_row, src_step, selected_cols)]; |
||||
con = selected_row >= 0 && selected_row < wholerows && selected_cols >= 0 && selected_cols < wholecols; |
||||
data = con ? data : (T_IMG)(0); |
||||
local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data; |
||||
} |
||||
#else |
||||
int selected_row = ADDR_H(rows_start_index + i, 0, wholerows); |
||||
selected_row = ADDR_B(rows_start_index + i, wholerows, selected_row); |
||||
|
||||
int selected_cols = ADDR_L(cols_start_index_group + lX, 0, wholecols); |
||||
selected_cols = ADDR_R(cols_start_index_group + lX, wholecols, selected_cols); |
||||
|
||||
T_IMG data = src[mad24(selected_row, src_step, selected_cols)]; |
||||
|
||||
local_data[mad24(i, LOCAL_MEM_STEP, lX)] = data; |
||||
|
||||
if(lX < (ANX << 1)) |
||||
{ |
||||
selected_cols = cols_start_index_group + lX + groupX_size; |
||||
selected_cols = ADDR_R(selected_cols, wholecols, selected_cols); |
||||
|
||||
data = src[mad24(selected_row, src_step, selected_cols)]; |
||||
local_data[mad24(i, LOCAL_MEM_STEP, lX) + groupX_size] = data; |
||||
} |
||||
#endif |
||||
} |
||||
} |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int process_col = groupX_size * groupX_id + ((lX % THREADS_PER_ROW) << 2); |
||||
if(((gY << 2) < rows) && (process_col < operate_cols)) |
||||
{ |
||||
int dst_cols_start = dst_offset_x; |
||||
int dst_cols_end = dst_offset_x + cols; |
||||
int dst_cols_index = (dst_offset_x + process_col) & 0xfffffffc; |
||||
|
||||
int dst_rows_end = dst_offset_y + rows; |
||||
int dst_rows_index = dst_offset_y + (gY << ROWS_PER_GROUP_BITS) + (lX >> THREADS_PER_ROW_BIT); |
||||
dst = dst + mad24(dst_rows_index, dst_step, dst_cols_index); |
||||
|
||||
T_IMGx4 dst_data = *(__global T_IMGx4 *)dst; |
||||
|
||||
T_SUMx4 sum = (T_SUMx4)SUM_ZEROx4; |
||||
T_IMGx4 data; |
||||
|
||||
for(int i = 0; i < FILTER_SIZE; i++) |
||||
{ |
||||
#pragma unroll |
||||
for(int j = 0; j < FILTER_SIZE; j++) |
||||
{ |
||||
if(dst_rows_index < dst_rows_end) |
||||
{ |
||||
int local_row = (lX >> THREADS_PER_ROW_BIT) + i; |
||||
int local_cols = ((lX % THREADS_PER_ROW) << ELEMENTS_PER_THREAD_BIT) + j; |
||||
|
||||
data = VLOAD4(0, (__local T_IMG_C1 *)(local_data + local_row * LOCAL_MEM_STEP + local_cols)); |
||||
sum = sum + (mat_kernel[i * FILTER_SIZE + j] * CONVERT_TYPE_SUMx4(data)); |
||||
} |
||||
} |
||||
} |
||||
|
||||
if(dst_rows_index < dst_rows_end) |
||||
{ |
||||
T_IMGx4 tmp_dst = CONVERT_TYPEx4(sum); |
||||
tmp_dst.SX = ((dst_cols_index + 0 >= dst_cols_start) && (dst_cols_index + 0 < dst_cols_end)) ? |
||||
tmp_dst.SX : dst_data.SX; |
||||
tmp_dst.SY = ((dst_cols_index + 1 >= dst_cols_start) && (dst_cols_index + 1 < dst_cols_end)) ? |
||||
tmp_dst.SY : dst_data.SY; |
||||
tmp_dst.SZ = ((dst_cols_index + 2 >= dst_cols_start) && (dst_cols_index + 2 < dst_cols_end)) ? |
||||
tmp_dst.SZ : dst_data.SZ; |
||||
tmp_dst.SW = ((dst_cols_index + 3 >= dst_cols_start) && (dst_cols_index + 3 < dst_cols_end)) ? |
||||
tmp_dst.SW : dst_data.SW; |
||||
*(__global T_IMGx4 *)dst = tmp_dst; |
||||
} |
||||
} |
||||
} |
@ -0,0 +1,857 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Peng Xiao, pengxiao@multicorewareinc.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#pragma OPENCL EXTENSION cl_amd_printf : enable |
||||
|
||||
#if defined (DOUBLE_SUPPORT) |
||||
|
||||
#ifdef cl_khr_fp64 |
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable |
||||
#elif defined (cl_amd_fp64) |
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable |
||||
#endif |
||||
|
||||
#define TYPE_IMAGE_SQSUM double |
||||
#else |
||||
#define TYPE_IMAGE_SQSUM float |
||||
#endif |
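
// Note (added): when the device lacks fp64, the squared-sum integral image is read as
// float rather than double, so the box-sum differences computed below lose precision on
// large images and may diverge slightly from the CPU result.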
||||
|
||||
#ifndef CN4 |
||||
#define CN4 1 |
||||
#else |
||||
#define CN4 4 |
||||
#endif |
||||
|
||||
////////////////////////////////////////////////// |
||||
// utilities |
||||
#define SQSUMS_PTR(ox, oy) mad24(gidy + oy, img_sqsums_step, (gidx + img_sqsums_offset + ox) * CN4) |
||||
#define SUMS_PTR(ox, oy) mad24(gidy + oy, img_sums_step, gidx + img_sums_offset + ox) |
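// Note (added): SQSUMS_PTR/SUMS_PTR compute linearized offsets into the integral
// (sum / squared-sum) images for the current result pixel (gidx, gidy). The window
// statistics below use the usual integral-image identity: the sum over a w x h window
// anchored at (x, y) is
//   S(x+w, y+h) - S(x+w, y) - S(x, y+h) + S(x, y)
// which is exactly the four-corner expression built from these macros.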
||||
// normAcc* are accurate normalization routines which make GPU matchTemplate |
||||
// consistent with the CPU implementation
||||
float normAcc(float num, float denum) |
||||
{ |
||||
if(fabs(num) < denum) |
||||
{ |
||||
return num / denum; |
||||
} |
||||
if(fabs(num) < denum * 1.125f) |
||||
{ |
||||
return num > 0 ? 1 : -1; |
||||
} |
||||
return 0; |
||||
} |
||||
|
||||
float normAcc_SQDIFF(float num, float denum) |
||||
{ |
||||
if(fabs(num) < denum) |
||||
{ |
||||
return num / denum; |
||||
} |
||||
if(fabs(num) < denum * 1.125f) |
||||
{ |
||||
return num > 0 ? 1 : -1; |
||||
} |
||||
return 1; |
||||
} |
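
// Note (added): small worked examples of the clamping behaviour, assuming the caller
// passes denum = sqrt(image_sqsum * tpl_sqsum):
//   normAcc(0.5f, 1.0f)  -> 0.5f   (normal case)
//   normAcc(1.05f, 1.0f) -> 1.0f   (numerator barely past denum: clamp to +/-1)
//   normAcc(2.0f, 1.0f)  -> 0.0f   (degenerate denominator, e.g. a constant patch)
// normAcc_SQDIFF behaves the same except the degenerate case returns 1 instead of 0.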
||||
////////////////////////////////////////////////////////////////////// |
||||
// normalize |
||||
|
||||
__kernel |
||||
void normalizeKernel_C1_D0 |
||||
( |
||||
__global const float * img_sqsums, |
||||
__global float * res, |
||||
ulong tpl_sqsum, |
||||
int res_rows, |
||||
int res_cols, |
||||
int tpl_rows, |
||||
int tpl_cols, |
||||
int img_sqsums_offset, |
||||
int img_sqsums_step, |
||||
int res_offset, |
||||
int res_step |
||||
) |
||||
{ |
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
|
||||
res_step /= sizeof(*res); |
||||
res_offset /= sizeof(*res); |
||||
img_sqsums_step /= sizeof(*img_sqsums); |
||||
img_sqsums_offset /= sizeof(*img_sqsums); |
||||
int res_idx = mad24(gidy, res_step, res_offset + gidx); |
||||
if(gidx < res_cols && gidy < res_rows) |
||||
{ |
||||
float image_sqsum_ = (float)( |
||||
(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - |
||||
(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); |
||||
res[res_idx] = normAcc(res[res_idx], sqrt(image_sqsum_ * tpl_sqsum)); |
||||
} |
||||
} |
||||
|
||||
__kernel |
||||
void matchTemplate_Prepared_SQDIFF_C1_D0 |
||||
( |
||||
__global const TYPE_IMAGE_SQSUM * img_sqsums, |
||||
__global float * res, |
||||
ulong tpl_sqsum, |
||||
int res_rows, |
||||
int res_cols, |
||||
int tpl_rows, |
||||
int tpl_cols, |
||||
int img_sqsums_offset, |
||||
int img_sqsums_step, |
||||
int res_offset, |
||||
int res_step |
||||
) |
||||
{ |
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
|
||||
res_step /= sizeof(*res); |
||||
res_offset /= sizeof(*res); |
||||
img_sqsums_step /= sizeof(*img_sqsums); |
||||
img_sqsums_offset /= sizeof(*img_sqsums); |
||||
int res_idx = mad24(gidy, res_step, res_offset + gidx); |
||||
if(gidx < res_cols && gidy < res_rows) |
||||
{ |
||||
float image_sqsum_ = (float)( |
||||
(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - |
||||
(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); |
||||
res[res_idx] = image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum; |
||||
} |
||||
} |
||||
|
||||
__kernel |
||||
void matchTemplate_Prepared_SQDIFF_NORMED_C1_D0 |
||||
( |
||||
__global const float * img_sqsums, |
||||
__global float * res, |
||||
ulong tpl_sqsum, |
||||
int res_rows, |
||||
int res_cols, |
||||
int tpl_rows, |
||||
int tpl_cols, |
||||
int img_sqsums_offset, |
||||
int img_sqsums_step, |
||||
int res_offset, |
||||
int res_step |
||||
) |
||||
{ |
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
|
||||
res_step /= sizeof(*res); |
||||
res_offset /= sizeof(*res); |
||||
img_sqsums_step /= sizeof(*img_sqsums); |
||||
img_sqsums_offset /= sizeof(*img_sqsums); |
||||
int res_idx = mad24(gidy, res_step, res_offset + gidx); |
||||
if(gidx < res_cols && gidy < res_rows) |
||||
{ |
||||
float image_sqsum_ = (float)( |
||||
(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - |
||||
(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); |
||||
res[res_idx] = normAcc_SQDIFF(image_sqsum_ - 2.f * res[res_idx] + tpl_sqsum, |
||||
sqrt(image_sqsum_ * tpl_sqsum)); |
||||
} |
||||
} |
||||
|
||||
////////////////////////////////////////////////// |
||||
// SQDIFF |
||||
__kernel |
||||
void matchTemplate_Naive_SQDIFF_C1_D0 |
||||
( |
||||
__global const uchar * img, |
||||
__global const uchar * tpl, |
||||
__global float * res, |
||||
int img_rows, |
||||
int img_cols, |
||||
int tpl_rows, |
||||
int tpl_cols, |
||||
int res_rows, |
||||
int res_cols, |
||||
int img_offset, |
||||
int tpl_offset, |
||||
int res_offset, |
||||
int img_step, |
||||
int tpl_step, |
||||
int res_step |
||||
) |
||||
{ |
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
int i,j; |
||||
int delta; |
||||
int sum = 0; |
||||
res_step /= sizeof(*res); |
||||
res_offset /= sizeof(*res); |
||||
int res_idx = mad24(gidy, res_step, res_offset + gidx); |
||||
|
||||
if(gidx < res_cols && gidy < res_rows) |
||||
{ |
||||
for(i = 0; i < tpl_rows; i ++) |
||||
{ |
||||
// get specific rows of img data |
||||
__global const uchar * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); |
||||
__global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); |
||||
for(j = 0; j < tpl_cols; j ++) |
||||
{ |
||||
delta = img_ptr[j] - tpl_ptr[j]; |
||||
sum = mad24(delta, delta, sum); |
||||
} |
||||
} |
||||
res[res_idx] = sum; |
||||
} |
||||
} |
||||
|
||||
__kernel |
||||
void matchTemplate_Naive_SQDIFF_C1_D5 |
||||
( |
||||
__global const float * img, |
||||
__global const float * tpl, |
||||
__global float * res, |
||||
int img_rows, |
||||
int img_cols, |
||||
int tpl_rows, |
||||
int tpl_cols, |
||||
int res_rows, |
||||
int res_cols, |
||||
int img_offset, |
||||
int tpl_offset, |
||||
int res_offset, |
||||
int img_step, |
||||
int tpl_step, |
||||
int res_step |
||||
) |
||||
{ |
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
int i,j; |
||||
float delta; |
||||
float sum = 0; |
||||
img_step /= sizeof(*img); |
||||
img_offset /= sizeof(*img); |
||||
tpl_step /= sizeof(*tpl); |
||||
tpl_offset /= sizeof(*tpl); |
||||
res_step /= sizeof(*res); |
||||
res_offset /= sizeof(*res); |
||||
|
||||
int res_idx = mad24(gidy, res_step, res_offset + gidx); |
||||
|
||||
if(gidx < res_cols && gidy < res_rows) |
||||
{ |
||||
for(i = 0; i < tpl_rows; i ++) |
||||
{ |
||||
// get specific rows of img data |
||||
__global const float * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); |
||||
__global const float * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); |
||||
for(j = 0; j < tpl_cols; j ++) |
||||
{ |
||||
delta = img_ptr[j] - tpl_ptr[j]; |
||||
sum = mad(delta, delta, sum); |
||||
} |
||||
} |
||||
res[res_idx] = sum; |
||||
} |
||||
} |
||||
|
||||
__kernel |
||||
void matchTemplate_Naive_SQDIFF_C4_D0 |
||||
( |
||||
__global const uchar4 * img, |
||||
__global const uchar4 * tpl, |
||||
__global float * res, |
||||
int img_rows, |
||||
int img_cols, |
||||
int tpl_rows, |
||||
int tpl_cols, |
||||
int res_rows, |
||||
int res_cols, |
||||
int img_offset, |
||||
int tpl_offset, |
||||
int res_offset, |
||||
int img_step, |
||||
int tpl_step, |
||||
int res_step |
||||
) |
||||
{ |
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
int i,j; |
||||
int4 delta; |
||||
int4 sum = (int4)(0, 0, 0, 0); |
||||
img_step /= sizeof(*img); |
||||
img_offset /= sizeof(*img); |
||||
tpl_step /= sizeof(*tpl); |
||||
tpl_offset /= sizeof(*tpl); |
||||
res_step /= sizeof(*res); |
||||
res_offset /= sizeof(*res); |
||||
|
||||
int res_idx = mad24(gidy, res_step, res_offset + gidx); |
||||
|
||||
if(gidx < res_cols && gidy < res_rows) |
||||
{ |
||||
for(i = 0; i < tpl_rows; i ++) |
||||
{ |
||||
// get specific rows of img data |
||||
__global const uchar4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); |
||||
__global const uchar4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); |
||||
for(j = 0; j < tpl_cols; j ++) |
||||
{ |
||||
//delta = convert_int4(img_ptr[j] - tpl_ptr[j]); // this alternative is incorrect |
||||
delta.x = img_ptr[j].x - tpl_ptr[j].x; |
||||
delta.y = img_ptr[j].y - tpl_ptr[j].y; |
||||
delta.z = img_ptr[j].z - tpl_ptr[j].z; |
||||
delta.w = img_ptr[j].w - tpl_ptr[j].w; |
||||
sum = mad24(delta, delta, sum); |
||||
} |
||||
} |
||||
res[res_idx] = sum.x + sum.y + sum.z + sum.w; |
||||
} |
||||
} |
||||
|
||||
__kernel |
||||
void matchTemplate_Naive_SQDIFF_C4_D5 |
||||
( |
||||
__global const float4 * img, |
||||
__global const float4 * tpl, |
||||
__global float * res, |
||||
int img_rows, |
||||
int img_cols, |
||||
int tpl_rows, |
||||
int tpl_cols, |
||||
int res_rows, |
||||
int res_cols, |
||||
int img_offset, |
||||
int tpl_offset, |
||||
int res_offset, |
||||
int img_step, |
||||
int tpl_step, |
||||
int res_step |
||||
) |
||||
{ |
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
int i,j; |
||||
float4 delta; |
||||
float4 sum = (float4)(0, 0, 0, 0); |
||||
img_step /= sizeof(*img); |
||||
img_offset /= sizeof(*img); |
||||
tpl_step /= sizeof(*tpl); |
||||
tpl_offset /= sizeof(*tpl); |
||||
res_step /= sizeof(*res); |
||||
res_offset /= sizeof(*res); |
||||
|
||||
int res_idx = mad24(gidy, res_step, res_offset + gidx); |
||||
|
||||
if(gidx < res_cols && gidy < res_rows) |
||||
{ |
||||
for(i = 0; i < tpl_rows; i ++) |
||||
{ |
||||
// get specific rows of img data |
||||
__global const float4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); |
||||
__global const float4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); |
||||
for(j = 0; j < tpl_cols; j ++) |
||||
{ |
||||
//delta = convert_int4(img_ptr[j] - tpl_ptr[j]); // this alternative is incorrect |
||||
delta.x = img_ptr[j].x - tpl_ptr[j].x; |
||||
delta.y = img_ptr[j].y - tpl_ptr[j].y; |
||||
delta.z = img_ptr[j].z - tpl_ptr[j].z; |
||||
delta.w = img_ptr[j].w - tpl_ptr[j].w; |
||||
sum = mad(delta, delta, sum); |
||||
} |
||||
} |
||||
res[res_idx] = sum.x + sum.y + sum.z + sum.w; |
||||
} |
||||
} |
||||
|
||||
////////////////////////////////////////////////// |
||||
// CCORR |
||||
__kernel |
||||
void matchTemplate_Naive_CCORR_C1_D0 |
||||
( |
||||
__global const uchar * img, |
||||
__global const uchar * tpl, |
||||
__global float * res, |
||||
int img_rows, |
||||
int img_cols, |
||||
int tpl_rows, |
||||
int tpl_cols, |
||||
int res_rows, |
||||
int res_cols, |
||||
int img_offset, |
||||
int tpl_offset, |
||||
int res_offset, |
||||
int img_step, |
||||
int tpl_step, |
||||
int res_step |
||||
) |
||||
{ |
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
int i,j; |
||||
int sum = 0; |
||||
res_step /= sizeof(*res); |
||||
res_offset /= sizeof(*res); |
||||
|
||||
int res_idx = mad24(gidy, res_step, res_offset + gidx); |
||||
|
||||
if(gidx < res_cols && gidy < res_rows) |
||||
{ |
||||
for(i = 0; i < tpl_rows; i ++) |
||||
{ |
||||
// get specific rows of img data |
||||
__global const uchar * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); |
||||
__global const uchar * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); |
||||
for(j = 0; j < tpl_cols; j ++) |
||||
{ |
||||
sum = mad24(convert_int(img_ptr[j]), convert_int(tpl_ptr[j]), sum); |
||||
} |
||||
} |
||||
res[res_idx] = (float)sum; |
||||
} |
||||
} |
||||
|
||||
__kernel |
||||
void matchTemplate_Naive_CCORR_C1_D5 |
||||
( |
||||
__global const float * img, |
||||
__global const float * tpl, |
||||
__global float * res, |
||||
int img_rows, |
||||
int img_cols, |
||||
int tpl_rows, |
||||
int tpl_cols, |
||||
int res_rows, |
||||
int res_cols, |
||||
int img_offset, |
||||
int tpl_offset, |
||||
int res_offset, |
||||
int img_step, |
||||
int tpl_step, |
||||
int res_step |
||||
) |
||||
{ |
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
int i,j; |
||||
float sum = 0; |
||||
img_step /= sizeof(*img); |
||||
img_offset /= sizeof(*img); |
||||
tpl_step /= sizeof(*tpl); |
||||
tpl_offset /= sizeof(*tpl); |
||||
res_step /= sizeof(*res); |
||||
res_offset /= sizeof(*res); |
||||
|
||||
int res_idx = mad24(gidy, res_step, res_offset + gidx); |
||||
|
||||
if(gidx < res_cols && gidy < res_rows) |
||||
{ |
||||
for(i = 0; i < tpl_rows; i ++) |
||||
{ |
||||
// get specific rows of img data |
||||
__global const float * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); |
||||
__global const float * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); |
||||
for(j = 0; j < tpl_cols; j ++) |
||||
{ |
||||
sum = mad(img_ptr[j], tpl_ptr[j], sum); |
||||
} |
||||
} |
||||
res[res_idx] = sum; |
||||
} |
||||
} |
||||
|
||||
__kernel |
||||
void matchTemplate_Naive_CCORR_C4_D0 |
||||
( |
||||
__global const uchar4 * img, |
||||
__global const uchar4 * tpl, |
||||
__global float * res, |
||||
int img_rows, |
||||
int img_cols, |
||||
int tpl_rows, |
||||
int tpl_cols, |
||||
int res_rows, |
||||
int res_cols, |
||||
int img_offset, |
||||
int tpl_offset, |
||||
int res_offset, |
||||
int img_step, |
||||
int tpl_step, |
||||
int res_step |
||||
) |
||||
{ |
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
int i,j; |
||||
int4 sum = (int4)(0, 0, 0, 0); |
||||
img_step /= sizeof(*img); |
||||
img_offset /= sizeof(*img); |
||||
tpl_step /= sizeof(*tpl); |
||||
tpl_offset /= sizeof(*tpl); |
||||
res_step /= sizeof(*res); |
||||
res_offset /= sizeof(*res); |
||||
|
||||
int res_idx = mad24(gidy, res_step, res_offset + gidx); |
||||
|
||||
if(gidx < res_cols && gidy < res_rows) |
||||
{ |
||||
for(i = 0; i < tpl_rows; i ++) |
||||
{ |
||||
// get specific rows of img data |
||||
__global const uchar4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); |
||||
__global const uchar4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); |
||||
for(j = 0; j < tpl_cols; j ++) |
||||
{ |
||||
sum = mad24(convert_int4(img_ptr[j]), convert_int4(tpl_ptr[j]), sum); |
||||
} |
||||
} |
||||
res[res_idx] = (float)(sum.x + sum.y + sum.z + sum.w); |
||||
} |
||||
} |
||||
|
||||
__kernel |
||||
void matchTemplate_Naive_CCORR_C4_D5 |
||||
( |
||||
__global const float4 * img, |
||||
__global const float4 * tpl, |
||||
__global float * res, |
||||
int img_rows, |
||||
int img_cols, |
||||
int tpl_rows, |
||||
int tpl_cols, |
||||
int res_rows, |
||||
int res_cols, |
||||
int img_offset, |
||||
int tpl_offset, |
||||
int res_offset, |
||||
int img_step, |
||||
int tpl_step, |
||||
int res_step |
||||
) |
||||
{ |
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
int i,j; |
||||
float4 sum = (float4)(0, 0, 0, 0); |
||||
img_step /= sizeof(*img); |
||||
img_offset /= sizeof(*img); |
||||
tpl_step /= sizeof(*tpl); |
||||
tpl_offset /= sizeof(*tpl); |
||||
res_step /= sizeof(*res); |
||||
res_offset /= sizeof(*res); |
||||
|
||||
int res_idx = mad24(gidy, res_step, res_offset + gidx); |
||||
|
||||
if(gidx < res_cols && gidy < res_rows) |
||||
{ |
||||
for(i = 0; i < tpl_rows; i ++) |
||||
{ |
||||
// get specific rows of img data |
||||
__global const float4 * img_ptr = img + mad24(gidy + i, img_step, gidx + img_offset); |
||||
__global const float4 * tpl_ptr = tpl + mad24(i, tpl_step, tpl_offset); |
||||
for(j = 0; j < tpl_cols; j ++) |
||||
{ |
||||
sum = mad(convert_float4(img_ptr[j]), convert_float4(tpl_ptr[j]), sum); |
||||
} |
||||
} |
||||
res[res_idx] = sum.x + sum.y + sum.z + sum.w; |
||||
} |
||||
} |
||||
|
||||
////////////////////////////////////////////////// |
||||
// CCOFF |
||||
__kernel |
||||
void matchTemplate_Prepared_CCOFF_C1_D0 |
||||
( |
||||
__global float * res, |
||||
int img_rows, |
||||
int img_cols, |
||||
int tpl_rows, |
||||
int tpl_cols, |
||||
int res_rows, |
||||
int res_cols, |
||||
int res_offset, |
||||
int res_step, |
||||
__global const uint * img_sums, |
||||
int img_sums_offset, |
||||
int img_sums_step, |
||||
float tpl_sum |
||||
) |
||||
{ |
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
|
||||
img_sums_offset /= sizeof(*img_sums); |
||||
img_sums_step /= sizeof(*img_sums); |
||||
res_step /= sizeof(*res); |
||||
res_offset /= sizeof(*res); |
||||
|
||||
int res_idx = mad24(gidy, res_step, res_offset + gidx); |
||||
|
||||
if(gidx < res_cols && gidy < res_rows) |
||||
{ |
||||
float sum = (float)((img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) |
||||
-(img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); |
||||
res[res_idx] -= sum * tpl_sum; |
||||
} |
||||
} |
||||
__kernel |
||||
void matchTemplate_Prepared_CCOFF_C4_D0 |
||||
( |
||||
__global float * res, |
||||
int img_rows, |
||||
int img_cols, |
||||
int tpl_rows, |
||||
int tpl_cols, |
||||
int res_rows, |
||||
int res_cols, |
||||
int res_offset, |
||||
int res_step, |
||||
__global const uint * img_sums_c0, |
||||
__global const uint * img_sums_c1, |
||||
__global const uint * img_sums_c2, |
||||
__global const uint * img_sums_c3, |
||||
int img_sums_offset, |
||||
int img_sums_step, |
||||
float tpl_sum_c0, |
||||
float tpl_sum_c1, |
||||
float tpl_sum_c2, |
||||
float tpl_sum_c3 |
||||
) |
||||
{ |
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
|
||||
img_sums_offset /= sizeof(*img_sums_c0); |
||||
img_sums_step /= sizeof(*img_sums_c0); |
||||
res_step /= sizeof(*res); |
||||
res_offset /= sizeof(*res); |
||||
|
||||
int res_idx = mad24(gidy, res_step, res_offset + gidx); |
||||
|
||||
if(gidx < res_cols && gidy < res_rows) |
||||
{ |
||||
float ccorr = res[res_idx]; |
||||
ccorr -= tpl_sum_c0*(float)( |
||||
(img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)]) |
||||
- (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)])); |
||||
ccorr -= tpl_sum_c1*(float)( |
||||
(img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)]) |
||||
- (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)])); |
||||
ccorr -= tpl_sum_c2*(float)( |
||||
(img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)]) |
||||
- (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)])); |
||||
ccorr -= tpl_sum_c3*(float)( |
||||
(img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)]) |
||||
- (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)])); |
||||
res[res_idx] = ccorr; |
||||
} |
||||
} |
||||
|
||||
__kernel |
||||
void matchTemplate_Prepared_CCOFF_NORMED_C1_D0 |
||||
( |
||||
__global float * res, |
||||
int img_rows, |
||||
int img_cols, |
||||
int tpl_rows, |
||||
int tpl_cols, |
||||
int res_rows, |
||||
int res_cols, |
||||
int res_offset, |
||||
int res_step, |
||||
float weight, |
||||
__global const uint * img_sums, |
||||
int img_sums_offset, |
||||
int img_sums_step, |
||||
__global const float * img_sqsums, |
||||
int img_sqsums_offset, |
||||
int img_sqsums_step, |
||||
float tpl_sum, |
||||
float tpl_sqsum |
||||
) |
||||
{ |
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
|
||||
img_sqsums_step /= sizeof(*img_sqsums); |
||||
img_sqsums_offset /= sizeof(*img_sqsums); |
||||
img_sums_offset /= sizeof(*img_sums); |
||||
img_sums_step /= sizeof(*img_sums); |
||||
res_step /= sizeof(*res); |
||||
res_offset /= sizeof(*res); |
||||
|
||||
|
||||
int res_idx = mad24(gidy, res_step, res_offset + gidx); |
||||
|
||||
if(gidx < res_cols && gidy < res_rows) |
||||
{ |
||||
float image_sum_ = (float)( |
||||
(img_sums[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums[SUMS_PTR(tpl_cols, 0)]) |
||||
- (img_sums[SUMS_PTR(0, tpl_rows)] - img_sums[SUMS_PTR(0, 0)])); |
||||
|
||||
float image_sqsum_ = (float)( |
||||
(img_sqsums[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums[SQSUMS_PTR(tpl_cols, 0)]) - |
||||
(img_sqsums[SQSUMS_PTR(0, tpl_rows)] - img_sqsums[SQSUMS_PTR(0, 0)])); |
||||
res[res_idx] = normAcc(res[res_idx] - image_sum_ * tpl_sum, |
||||
sqrt(tpl_sqsum * (image_sqsum_ - weight * image_sum_ * image_sum_))); |
||||
} |
||||
} |
||||
__kernel |
||||
void matchTemplate_Prepared_CCOFF_NORMED_C4_D0 |
||||
( |
||||
__global float * res, |
||||
int img_rows, |
||||
int img_cols, |
||||
int tpl_rows, |
||||
int tpl_cols, |
||||
int res_rows, |
||||
int res_cols, |
||||
int res_offset, |
||||
int res_step, |
||||
float weight, |
||||
__global const uint * img_sums_c0, |
||||
__global const uint * img_sums_c1, |
||||
__global const uint * img_sums_c2, |
||||
__global const uint * img_sums_c3, |
||||
int img_sums_offset, |
||||
int img_sums_step, |
||||
__global const float * img_sqsums_c0, |
||||
__global const float * img_sqsums_c1, |
||||
__global const float * img_sqsums_c2, |
||||
__global const float * img_sqsums_c3, |
||||
int img_sqsums_offset, |
||||
int img_sqsums_step, |
||||
float tpl_sum_c0, |
||||
float tpl_sum_c1, |
||||
float tpl_sum_c2, |
||||
float tpl_sum_c3, |
||||
float tpl_sqsum |
||||
) |
||||
{ |
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
|
||||
img_sqsums_step /= sizeof(*img_sqsums_c0); |
||||
img_sqsums_offset /= sizeof(*img_sqsums_c0); |
||||
img_sums_offset /= sizeof(*img_sums_c0); |
||||
img_sums_step /= sizeof(*img_sums_c0); |
||||
res_step /= sizeof(*res); |
||||
res_offset /= sizeof(*res); |
||||
|
||||
int res_idx = mad24(gidy, res_step, res_offset + gidx); |
||||
|
||||
if(gidx < res_cols && gidy < res_rows) |
||||
{ |
||||
float image_sum_c0 = (float)( |
||||
(img_sums_c0[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c0[SUMS_PTR(tpl_cols, 0)]) |
||||
- (img_sums_c0[SUMS_PTR(0, tpl_rows)] - img_sums_c0[SUMS_PTR(0, 0)])); |
||||
float image_sum_c1 = (float)( |
||||
(img_sums_c1[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c1[SUMS_PTR(tpl_cols, 0)]) |
||||
- (img_sums_c1[SUMS_PTR(0, tpl_rows)] - img_sums_c1[SUMS_PTR(0, 0)])); |
||||
float image_sum_c2 = (float)( |
||||
(img_sums_c2[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c2[SUMS_PTR(tpl_cols, 0)]) |
||||
- (img_sums_c2[SUMS_PTR(0, tpl_rows)] - img_sums_c2[SUMS_PTR(0, 0)])); |
||||
float image_sum_c3 = (float)( |
||||
(img_sums_c3[SUMS_PTR(tpl_cols, tpl_rows)] - img_sums_c3[SUMS_PTR(tpl_cols, 0)]) |
||||
- (img_sums_c3[SUMS_PTR(0, tpl_rows)] - img_sums_c3[SUMS_PTR(0, 0)])); |
||||
|
||||
float image_sqsum_c0 = (float)( |
||||
(img_sqsums_c0[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(tpl_cols, 0)]) - |
||||
(img_sqsums_c0[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c0[SQSUMS_PTR(0, 0)])); |
||||
float image_sqsum_c1 = (float)( |
||||
(img_sqsums_c1[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(tpl_cols, 0)]) - |
||||
(img_sqsums_c1[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c1[SQSUMS_PTR(0, 0)])); |
||||
float image_sqsum_c2 = (float)( |
||||
(img_sqsums_c2[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(tpl_cols, 0)]) - |
||||
(img_sqsums_c2[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c2[SQSUMS_PTR(0, 0)])); |
||||
float image_sqsum_c3 = (float)( |
||||
(img_sqsums_c3[SQSUMS_PTR(tpl_cols, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(tpl_cols, 0)]) - |
||||
(img_sqsums_c3[SQSUMS_PTR(0, tpl_rows)] - img_sqsums_c3[SQSUMS_PTR(0, 0)])); |
||||
|
||||
float num = res[res_idx] - |
||||
image_sum_c0 * tpl_sum_c0 - |
||||
image_sum_c1 * tpl_sum_c1 - |
||||
image_sum_c2 * tpl_sum_c2 - |
||||
image_sum_c3 * tpl_sum_c3; |
||||
float denum = sqrt( tpl_sqsum * ( |
||||
image_sqsum_c0 - weight * image_sum_c0 * image_sum_c0 + |
||||
image_sqsum_c1 - weight * image_sum_c1 * image_sum_c1 + |
||||
image_sqsum_c2 - weight * image_sum_c2 * image_sum_c2 + |
||||
                          image_sqsum_c3 - weight * image_sum_c3 * image_sum_c3)
||||
); |
||||
res[res_idx] = normAcc(num, denum); |
||||
} |
||||
} |
||||
|
||||
////////////////////////////////////////////////////////////////////// |
||||
// extractFirstChannel |
||||
__kernel |
||||
void extractFirstChannel |
||||
( |
||||
const __global float4* img, |
||||
__global float* res, |
||||
int rows, |
||||
int cols, |
||||
int img_offset, |
||||
int res_offset, |
||||
int img_step, |
||||
int res_step |
||||
) |
||||
{ |
||||
img_step /= sizeof(float4); |
||||
res_step /= sizeof(float); |
||||
img_offset /= sizeof(float4); |
||||
res_offset /= sizeof(float); |
||||
img += img_offset; |
||||
res += res_offset; |
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
if(gidx < cols && gidy < rows) |
||||
{ |
||||
res[gidx + gidy * res_step] = img[gidx + gidy * img_step].x; |
||||
} |
||||
} |
@ -0,0 +1,486 @@ |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Niko Li, newlife20080214@gmail.com |
||||
// Zero Lin, zero.lin@amd.com |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
// |
||||
|
||||
|
||||
/* |
||||
__kernel void medianFilter_C1(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols, |
||||
int rows, int srcStep, int dstStep, int m) |
||||
{ |
||||
int dx = get_global_id(0)-(m>>1); |
||||
int dy = get_global_id(1)-(m>>1); |
||||
|
||||
short histom[256]; |
||||
for(int i=0;i<256;++i) |
||||
histom[i]=0; |
||||
|
||||
|
||||
for(int i=0;i<m;++i) |
||||
{ |
||||
__global uchar * data = src + srcOffset + mul24(srcStep,clamp(dy + (i), 0, rows-1)); |
||||
for(int j=dx;j<dx+m;++j) |
||||
{ |
||||
histom[data[clamp(j, 0, cols-1)]]++; |
||||
} |
||||
} |
||||
|
||||
int now=0; |
||||
int goal=(m*m+1)>>1; |
||||
int v; |
||||
for(int i=0;i<256;++i) |
||||
{ |
||||
v=(now<goal?i:v); |
||||
now+=histom[i]; |
||||
} |
||||
|
||||
if(dy<rows && dx<cols) |
||||
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=v; |
||||
} |
||||
*/ |
||||
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);} |
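// Note (added): op(a, b) is a compare-exchange (afterwards a <= b, element-wise for
// vector types since min/max work per component). The 19 op() calls in each kernel
// below form a fixed sorting network over the 3x3 neighbourhood p0..p8 that leaves the
// median in p4 without fully sorting the nine values.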
||||
__kernel void medianFilter3_C4_D0(__global uchar4 * src, __global uchar4 * dst, int srcOffset, int dstOffset, int cols, |
||||
int rows, int srcStep, int dstStep) |
||||
{ |
||||
|
||||
__local uchar4 data[18][18]; |
||||
__global uchar4* source=src + srcOffset; |
||||
|
||||
int dx = get_global_id(0) - get_local_id(0) -1; |
||||
int dy = get_global_id(1) - get_local_id(1) -1; |
||||
|
||||
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1); |
||||
|
||||
int dr=id/18; |
||||
int dc=id%18; |
||||
int r=clamp(dy+dr, 0, rows-1); |
||||
int c=clamp(dx+dc, 0, cols-1); |
||||
|
||||
data[dr][dc] = source[r*srcStep + c]; |
||||
r=clamp(dy+dr+9, 0, rows-1); |
||||
data[dr+9][dc] = source[r*srcStep + c]; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int x =get_local_id(0); |
||||
int y =get_local_id(1); |
||||
uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2]; |
||||
uchar4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2]; |
||||
uchar4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2]; |
||||
uchar4 mid; |
||||
|
||||
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1); |
||||
op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5); |
||||
op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7); |
||||
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7); |
||||
op(p4, p2); op(p6, p4); op(p4, p2); |
||||
|
||||
if(get_global_id(1)<rows && get_global_id(0)<cols) |
||||
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4; |
||||
} |
||||
#undef op
||||
|
||||
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);} |
||||
__kernel void medianFilter3_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols, |
||||
int rows, int srcStep, int dstStep) |
||||
{ |
||||
|
||||
__local uchar data[18][18]; |
||||
__global uchar* source=src + srcOffset; |
||||
|
||||
int dx = get_global_id(0) - get_local_id(0) -1; |
||||
int dy = get_global_id(1) - get_local_id(1) -1; |
||||
|
||||
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1); |
||||
|
||||
int dr=id/18; |
||||
int dc=id%18; |
||||
int r=clamp(dy+dr, 0, rows-1); |
||||
int c=clamp(dx+dc, 0, cols-1); |
||||
|
||||
data[dr][dc] = source[r*srcStep + c]; |
||||
r=clamp(dy+dr+9, 0, rows-1); |
||||
data[dr+9][dc] = source[r*srcStep + c]; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int x =get_local_id(0); |
||||
int y =get_local_id(1); |
||||
uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2]; |
||||
uchar p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2]; |
||||
uchar p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2]; |
||||
uchar mid; |
||||
|
||||
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1); |
||||
op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5); |
||||
op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7); |
||||
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7); |
||||
op(p4, p2); op(p6, p4); op(p4, p2); |
||||
|
||||
if(get_global_id(1)<rows && get_global_id(0)<cols) |
||||
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4; |
||||
} |
||||
#undef op
||||
|
||||
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);} |
||||
__kernel void medianFilter3_C1_D5(__global float * src, __global float * dst, int srcOffset, int dstOffset, int cols, |
||||
int rows, int srcStep, int dstStep) |
||||
{ |
||||
|
||||
__local float data[18][18]; |
||||
__global float* source=src + srcOffset; |
||||
|
||||
int dx = get_global_id(0) - get_local_id(0) -1; |
||||
int dy = get_global_id(1) - get_local_id(1) -1; |
||||
|
||||
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1); |
||||
|
||||
int dr=id/18; |
||||
int dc=id%18; |
||||
int r=clamp(dy+dr, 0, rows-1); |
||||
int c=clamp(dx+dc, 0, cols-1); |
||||
|
||||
data[dr][dc] = source[r*srcStep + c]; |
||||
r=clamp(dy+dr+9, 0, rows-1); |
||||
data[dr+9][dc] = source[r*srcStep + c]; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int x =get_local_id(0); |
||||
int y =get_local_id(1); |
||||
float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2]; |
||||
float p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2]; |
||||
float p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2]; |
||||
float mid; |
||||
|
||||
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1); |
||||
op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5); |
||||
op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7); |
||||
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7); |
||||
op(p4, p2); op(p6, p4); op(p4, p2); |
||||
|
||||
if(get_global_id(1)<rows && get_global_id(0)<cols) |
||||
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4; |
||||
} |
||||
#undef op |
||||
|
||||
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);} |
||||
__kernel void medianFilter3_C4_D5(__global float4 * src, __global float4 * dst, int srcOffset, int dstOffset, int cols, |
||||
int rows, int srcStep, int dstStep) |
||||
{ |
||||
|
||||
__local float4 data[18][18]; |
||||
__global float4* source=src + srcOffset; |
||||
|
||||
int dx = get_global_id(0) - get_local_id(0) -1; |
||||
int dy = get_global_id(1) - get_local_id(1) -1; |
||||
|
||||
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 9*18-1); |
||||
|
||||
int dr=id/18; |
||||
int dc=id%18; |
||||
int r=clamp(dy+dr, 0, rows-1); |
||||
int c=clamp(dx+dc, 0, cols-1); |
||||
|
||||
data[dr][dc] = source[r*srcStep + c]; |
||||
r=clamp(dy+dr+9, 0, rows-1); |
||||
data[dr+9][dc] = source[r*srcStep + c]; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int x =get_local_id(0); |
||||
int y =get_local_id(1); |
||||
float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2]; |
||||
float4 p3=data[y+1][x], p4=data[y+1][x+1], p5=data[y+1][x+2]; |
||||
float4 p6=data[y+2][x], p7=data[y+2][x+1], p8=data[y+2][x+2]; |
||||
float4 mid; |
||||
|
||||
op(p1, p2); op(p4, p5); op(p7, p8); op(p0, p1); |
||||
op(p3, p4); op(p6, p7); op(p1, p2); op(p4, p5); |
||||
op(p7, p8); op(p0, p3); op(p5, p8); op(p4, p7); |
||||
op(p3, p6); op(p1, p4); op(p2, p5); op(p4, p7); |
||||
op(p4, p2); op(p6, p4); op(p4, p2); |
||||
|
||||
if(get_global_id(1)<rows && get_global_id(0)<cols) |
||||
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p4; |
||||
} |
||||
#undef op |
||||
|
||||
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);} |
||||
__kernel void medianFilter5_C4_D0(__global uchar4 * src, __global uchar4 * dst, int srcOffset, int dstOffset, int cols, |
||||
int rows, int srcStep, int dstStep) |
||||
{ |
||||
|
||||
__local uchar4 data[20][20]; |
||||
__global uchar4* source=src + srcOffset; |
||||
|
||||
int dx = get_global_id(0) - get_local_id(0) -2; |
||||
int dy = get_global_id(1) - get_local_id(1) -2; |
||||
|
||||
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1); |
||||
|
||||
int dr=id/20; |
||||
int dc=id%20; |
||||
int r=clamp(dy+dr, 0, rows-1); |
||||
int c=clamp(dx+dc, 0, cols-1); |
||||
|
||||
data[dr][dc] = source[r*srcStep + c]; |
||||
r=clamp(dy+dr+10, 0, rows-1); |
||||
data[dr+10][dc] = source[r*srcStep + c]; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int x =get_local_id(0); |
||||
int y =get_local_id(1); |
||||
uchar4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4]; |
||||
uchar4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4]; |
||||
uchar4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4]; |
||||
uchar4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4]; |
||||
uchar4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4]; |
||||
uchar4 mid; |
||||
|
||||
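// sorting/exchange network over the 25 window pixels; after the final exchange p12 holds the median of the 5x5 neighbourhood |
||||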
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4); |
||||
op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4); |
||||
op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8); |
||||
op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11); |
||||
op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6); |
||||
op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8); |
||||
op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5); |
||||
op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8); |
||||
op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17); |
||||
op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15); |
||||
op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19); |
||||
op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24); |
||||
op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22); |
||||
op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18); |
||||
op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16); |
||||
op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19); |
||||
op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16); |
||||
op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12); |
||||
op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16); |
||||
op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10); |
||||
op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17); |
||||
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19); |
||||
op(p7, p11); op(p11, p13); op(p11, p12); |
||||
|
||||
if(get_global_id(1)<rows && get_global_id(0)<cols) |
||||
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12; |
||||
} |
||||
#undef op |
||||
|
||||
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);} |
||||
__kernel void medianFilter5_C1_D0(__global uchar * src, __global uchar * dst, int srcOffset, int dstOffset, int cols, |
||||
int rows, int srcStep, int dstStep) |
||||
{ |
||||
|
||||
__local uchar data[20][20]; |
||||
__global uchar* source=src + srcOffset; |
||||
|
||||
int dx = get_global_id(0) - get_local_id(0) -2; |
||||
int dy = get_global_id(1) - get_local_id(1) -2; |
||||
|
||||
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1); |
||||
|
||||
int dr=id/20; |
||||
int dc=id%20; |
||||
int r=clamp(dy+dr, 0, rows-1); |
||||
int c=clamp(dx+dc, 0, cols-1); |
||||
|
||||
data[dr][dc] = source[r*srcStep + c]; |
||||
r=clamp(dy+dr+10, 0, rows-1); |
||||
data[dr+10][dc] = source[r*srcStep + c]; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int x =get_local_id(0); |
||||
int y =get_local_id(1); |
||||
uchar p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4]; |
||||
uchar p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4]; |
||||
uchar p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4]; |
||||
uchar p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4]; |
||||
uchar p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4]; |
||||
uchar mid; |
||||
|
||||
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4); |
||||
op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4); |
||||
op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8); |
||||
op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11); |
||||
op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6); |
||||
op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8); |
||||
op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5); |
||||
op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8); |
||||
op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17); |
||||
op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15); |
||||
op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19); |
||||
op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24); |
||||
op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22); |
||||
op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18); |
||||
op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16); |
||||
op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19); |
||||
op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16); |
||||
op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12); |
||||
op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16); |
||||
op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10); |
||||
op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17); |
||||
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19); |
||||
op(p7, p11); op(p11, p13); op(p11, p12); |
||||
|
||||
if(get_global_id(1)<rows && get_global_id(0)<cols) |
||||
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12; |
||||
} |
||||
#undef op |
||||
|
||||
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);} |
||||
__kernel void medianFilter5_C4_D5(__global float4 * src, __global float4 * dst, int srcOffset, int dstOffset, int cols, |
||||
int rows, int srcStep, int dstStep) |
||||
{ |
||||
|
||||
__local float4 data[20][20]; |
||||
__global float4* source=src + srcOffset; |
||||
|
||||
int dx = get_global_id(0) - get_local_id(0) -2; |
||||
int dy = get_global_id(1) - get_local_id(1) -2; |
||||
|
||||
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1); |
||||
|
||||
int dr=id/20; |
||||
int dc=id%20; |
||||
int r=clamp(dy+dr, 0, rows-1); |
||||
int c=clamp(dx+dc, 0, cols-1); |
||||
|
||||
data[dr][dc] = source[r*srcStep + c]; |
||||
r=clamp(dy+dr+10, 0, rows-1); |
||||
data[dr+10][dc] = source[r*srcStep + c]; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int x =get_local_id(0); |
||||
int y =get_local_id(1); |
||||
float4 p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4]; |
||||
float4 p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4]; |
||||
float4 p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4]; |
||||
float4 p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4]; |
||||
float4 p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4]; |
||||
float4 mid; |
||||
|
||||
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4); |
||||
op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4); |
||||
op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8); |
||||
op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11); |
||||
op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6); |
||||
op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8); |
||||
op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5); |
||||
op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8); |
||||
op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17); |
||||
op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15); |
||||
op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19); |
||||
op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24); |
||||
op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22); |
||||
op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18); |
||||
op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16); |
||||
op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19); |
||||
op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16); |
||||
op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12); |
||||
op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16); |
||||
op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10); |
||||
op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17); |
||||
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19); |
||||
op(p7, p11); op(p11, p13); op(p11, p12); |
||||
|
||||
if(get_global_id(1)<rows && get_global_id(0)<cols) |
||||
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12; |
||||
} |
||||
#undef op |
||||
|
||||
#define op(a,b) {mid=a; a=min(a,b); b=max(mid,b);} |
||||
__kernel void medianFilter5_C1_D5(__global float * src, __global float * dst, int srcOffset, int dstOffset, int cols, |
||||
int rows, int srcStep, int dstStep) |
||||
{ |
||||
|
||||
__local float data[20][20]; |
||||
__global float* source=src + srcOffset; |
||||
|
||||
int dx = get_global_id(0) - get_local_id(0) -2; |
||||
int dy = get_global_id(1) - get_local_id(1) -2; |
||||
|
||||
const int id = min((int)(get_local_id(0)*16+get_local_id(1)), 10*20-1); |
||||
|
||||
int dr=id/20; |
||||
int dc=id%20; |
||||
int r=clamp(dy+dr, 0, rows-1); |
||||
int c=clamp(dx+dc, 0, cols-1); |
||||
|
||||
data[dr][dc] = source[r*srcStep + c]; |
||||
r=clamp(dy+dr+10, 0, rows-1); |
||||
data[dr+10][dc] = source[r*srcStep + c]; |
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int x =get_local_id(0); |
||||
int y =get_local_id(1); |
||||
float p0=data[y][x], p1=data[y][x+1], p2=data[y][x+2], p3=data[y][x+3], p4=data[y][x+4]; |
||||
float p5=data[y+1][x], p6=data[y+1][x+1], p7=data[y+1][x+2], p8=data[y+1][x+3], p9=data[y+1][x+4]; |
||||
float p10=data[y+2][x], p11=data[y+2][x+1], p12=data[y+2][x+2], p13=data[y+2][x+3], p14=data[y+2][x+4]; |
||||
float p15=data[y+3][x], p16=data[y+3][x+1], p17=data[y+3][x+2], p18=data[y+3][x+3], p19=data[y+3][x+4]; |
||||
float p20=data[y+4][x], p21=data[y+4][x+1], p22=data[y+4][x+2], p23=data[y+4][x+3], p24=data[y+4][x+4]; |
||||
float mid; |
||||
|
||||
op(p1, p2); op(p0, p1); op(p1, p2); op(p4, p5); op(p3, p4); |
||||
op(p4, p5); op(p0, p3); op(p2, p5); op(p2, p3); op(p1, p4); |
||||
op(p1, p2); op(p3, p4); op(p7, p8); op(p6, p7); op(p7, p8); |
||||
op(p10, p11); op(p9, p10); op(p10, p11); op(p6, p9); op(p8, p11); |
||||
op(p8, p9); op(p7, p10); op(p7, p8); op(p9, p10); op(p0, p6); |
||||
op(p4, p10); op(p4, p6); op(p2, p8); op(p2, p4); op(p6, p8); |
||||
op(p1, p7); op(p5, p11); op(p5, p7); op(p3, p9); op(p3, p5); |
||||
op(p7, p9); op(p1, p2); op(p3, p4); op(p5, p6); op(p7, p8); |
||||
op(p9, p10); op(p13, p14); op(p12, p13); op(p13, p14); op(p16, p17); |
||||
op(p15, p16); op(p16, p17); op(p12, p15); op(p14, p17); op(p14, p15); |
||||
op(p13, p16); op(p13, p14); op(p15, p16); op(p19, p20); op(p18, p19); |
||||
op(p19, p20); op(p21, p22); op(p23, p24); op(p21, p23); op(p22, p24); |
||||
op(p22, p23); op(p18, p21); op(p20, p23); op(p20, p21); op(p19, p22); |
||||
op(p22, p24); op(p19, p20); op(p21, p22); op(p23, p24); op(p12, p18); |
||||
op(p16, p22); op(p16, p18); op(p14, p20); op(p20, p24); op(p14, p16); |
||||
op(p18, p20); op(p22, p24); op(p13, p19); op(p17, p23); op(p17, p19); |
||||
op(p15, p21); op(p15, p17); op(p19, p21); op(p13, p14); op(p15, p16); |
||||
op(p17, p18); op(p19, p20); op(p21, p22); op(p23, p24); op(p0, p12); |
||||
op(p8, p20); op(p8, p12); op(p4, p16); op(p16, p24); op(p12, p16); |
||||
op(p2, p14); op(p10, p22); op(p10, p14); op(p6, p18); op(p6, p10); |
||||
op(p10, p12); op(p1, p13); op(p9, p21); op(p9, p13); op(p5, p17); |
||||
op(p13, p17); op(p3, p15); op(p11, p23); op(p11, p15); op(p7, p19); |
||||
op(p7, p11); op(p11, p13); op(p11, p12); |
||||
|
||||
if(get_global_id(1)<rows && get_global_id(0)<cols) |
||||
dst[dstOffset + get_global_id(1)*dstStep + get_global_id(0)]=p12; |
||||
} |
||||
#undef op |
@ -0,0 +1,207 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Shengen Yan,yanshengen@gmail.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if defined (DOUBLE_SUPPORT) |
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable |
||||
#endif |
||||
/////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
/////////////////////////////////Macro for border type//////////////////////////////////////////// |
||||
///////////////////////////////////////////////////////////////////////////////////////////////// |
||||
#ifdef BORDER_REPLICATE |
||||
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh |
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i)) |
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr)) |
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i)) |
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr)) |
||||
#endif |
||||
|
||||
#ifdef BORDER_REFLECT |
||||
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb |
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i)) |
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr)) |
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i)) |
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr)) |
||||
#endif |
||||
|
||||
#ifdef BORDER_REFLECT101 |
||||
//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba |
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i)) |
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr)) |
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i)) |
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr)) |
||||
#endif |
||||
|
||||
#ifdef BORDER_WRAP |
||||
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg |
||||
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i)) |
||||
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr)) |
||||
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i)) |
||||
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr)) |
||||
#endif |
||||
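// ADDR_L/ADDR_H remap an index that falls before the left/top edge; the result is then fed through ADDR_R/ADDR_B to also handle right/bottom overflow (see their paired use further below) |
||||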
|
||||
#define THREADS 256 |
||||
#define ELEM(i, l_edge, r_edge, elem1, elem2) ((i) >= (l_edge) && (i) < (r_edge) ? (elem1) : (elem2)) |
||||
/////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
/////////////////////////////////////calcMinEigenVal//////////////////////////////////////////////// |
||||
/////////////////////////////////////////////////////////////////////////////////////////////////// |
||||
__kernel void calcMinEigenVal(__global const float *Dx,__global const float *Dy, __global float *dst, |
||||
int dx_offset, int dx_whole_rows, int dx_whole_cols, int dx_step, |
||||
int dy_offset, int dy_whole_rows, int dy_whole_cols, int dy_step, |
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step, |
||||
float k) |
||||
{ |
||||
int col = get_local_id(0); |
||||
const int gX = get_group_id(0); |
||||
const int gY = get_group_id(1); |
||||
const int glx = get_global_id(0); |
||||
const int gly = get_global_id(1); |
||||
|
||||
int dx_x_off = (dx_offset % dx_step) >> 2; |
||||
int dx_y_off = dx_offset / dx_step; |
||||
int dy_x_off = (dy_offset % dy_step) >> 2; |
||||
int dy_y_off = dy_offset / dy_step; |
||||
int dst_x_off = (dst_offset % dst_step) >> 2; |
||||
int dst_y_off = dst_offset / dst_step; |
||||
|
||||
int dx_startX = gX * (THREADS-ksX+1) - anX + dx_x_off; |
||||
int dx_startY = (gY << 1) - anY + dx_y_off; |
||||
int dy_startX = gX * (THREADS-ksX+1) - anX + dy_x_off; |
||||
int dy_startY = (gY << 1) - anY + dy_y_off; |
||||
int dst_startX = gX * (THREADS-ksX+1) + dst_x_off; |
||||
int dst_startY = (gY << 1) + dst_y_off; |
||||
|
||||
float dx_data[ksY+1],dy_data[ksY+1],data[3][ksY+1]; |
||||
__local float temp[6][THREADS]; |
||||
#ifdef BORDER_CONSTANT |
||||
bool dx_con,dy_con; |
||||
float dx_s,dy_s; |
||||
for(int i=0; i < ksY+1; i++) |
||||
{ |
||||
dx_con = dx_startX+col >= 0 && dx_startX+col < dx_whole_cols && dx_startY+i >= 0 && dx_startY+i < dx_whole_rows; |
||||
dx_s = Dx[(dx_startY+i)*(dx_step>>2)+(dx_startX+col)]; |
||||
dx_data[i] = dx_con ? dx_s : 0.0; |
||||
dy_con = dy_startX+col >= 0 && dy_startX+col < dy_whole_cols && dy_startY+i >= 0 && dy_startY+i < dy_whole_rows; |
||||
dy_s = Dy[(dy_startY+i)*(dy_step>>2)+(dy_startX+col)]; |
||||
dy_data[i] = dy_con ? dy_s : 0.0; |
||||
data[0][i] = dx_data[i] * dx_data[i]; |
||||
data[1][i] = dx_data[i] * dy_data[i]; |
||||
data[2][i] = dy_data[i] * dy_data[i]; |
||||
} |
||||
#else |
||||
int clamped_col = min(dst_cols, col); |
||||
|
||||
for(int i=0; i < ksY+1; i++) |
||||
{ |
||||
int dx_selected_row; |
||||
int dx_selected_col; |
||||
dx_selected_row = ADDR_H(dx_startY+i, 0, dx_whole_rows); |
||||
dx_selected_row = ADDR_B(dx_startY+i, dx_whole_rows, dx_selected_row); |
||||
dx_selected_col = ADDR_L(dx_startX+clamped_col, 0, dx_whole_cols); |
||||
dx_selected_col = ADDR_R(dx_startX+clamped_col, dx_whole_cols, dx_selected_col); |
||||
dx_data[i] = Dx[dx_selected_row * (dx_step>>2) + dx_selected_col]; |
||||
|
||||
int dy_selected_row; |
||||
int dy_selected_col; |
||||
dy_selected_row = ADDR_H(dy_startY+i, 0, dy_whole_rows); |
||||
dy_selected_row = ADDR_B(dy_startY+i, dy_whole_rows, dy_selected_row); |
||||
dy_selected_col = ADDR_L(dy_startX+clamped_col, 0, dy_whole_cols); |
||||
dy_selected_col = ADDR_R(dy_startX+clamped_col, dy_whole_cols, dy_selected_col); |
||||
dy_data[i] = Dy[dy_selected_row * (dy_step>>2) + dy_selected_col]; |
||||
|
||||
data[0][i] = dx_data[i] * dx_data[i]; |
||||
data[1][i] = dx_data[i] * dy_data[i]; |
||||
data[2][i] = dy_data[i] * dy_data[i]; |
||||
} |
||||
#endif |
||||
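// column sums of dx*dx, dx*dy, dy*dy over the vertical extent of the window; the two variants (row 0 vs row ksY added) correspond to the two output rows each work-group writes |
||||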
float sum0 = 0.0, sum1 = 0.0, sum2 = 0.0; |
||||
for(int i=1; i < ksY; i++) |
||||
{ |
||||
sum0 += (data[0][i]); |
||||
sum1 += (data[1][i]); |
||||
sum2 += (data[2][i]); |
||||
} |
||||
float sum01,sum02,sum11,sum12,sum21,sum22; |
||||
sum01 = sum0 + (data[0][0]); |
||||
sum02 = sum0 + (data[0][ksY]); |
||||
temp[0][col] = sum01; |
||||
temp[1][col] = sum02; |
||||
sum11 = sum1 + (data[1][0]); |
||||
sum12 = sum1 + (data[1][ksY]); |
||||
temp[2][col] = sum11; |
||||
temp[3][col] = sum12; |
||||
sum21 = sum2 + (data[2][0]); |
||||
sum22 = sum2 + (data[2][ksY]); |
||||
temp[4][col] = sum21; |
||||
temp[5][col] = sum22; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(col < (THREADS-(ksX-1))) |
||||
{ |
||||
col += anX; |
||||
int posX = dst_startX - dst_x_off + col - anX; |
||||
int posY = (gly << 1); |
||||
int till = (ksX + 1)%2; |
||||
float tmp_sum[6]={ 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0 }; |
||||
for(int k=0; k<6; k++) |
||||
for(int i=-anX; i<=anX - till; i++) |
||||
{ |
||||
tmp_sum[k] += temp[k][col+i]; |
||||
} |
||||
|
||||
if(posX < dst_cols && (posY) < dst_rows) |
||||
{ |
||||
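// smaller eigenvalue of the 2x2 covariance matrix [Sxx Sxy; Sxy Syy]: lambda_min = (a+c) - sqrt((a-c)^2 + b^2), with a = Sxx/2, b = Sxy, c = Syy/2 |
||||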
float a = tmp_sum[0] * 0.5f; |
||||
float b = tmp_sum[2]; |
||||
float c = tmp_sum[4] * 0.5f; |
||||
dst[(dst_startY+0) * (dst_step>>2)+ dst_startX + col - anX] = (float)((a+c) - sqrt((a-c)*(a-c) + b*b)); |
||||
} |
||||
if(posX < dst_cols && (posY + 1) < dst_rows) |
||||
{ |
||||
float a = tmp_sum[1] * 0.5f; |
||||
float b = tmp_sum[3]; |
||||
float c = tmp_sum[5] * 0.5f; |
||||
dst[(dst_startY+1) * (dst_step>>2)+ dst_startX + col - anX] = (float)((a+c) - sqrt((a-c)*(a-c) + b*b)); |
||||
} |
||||
} |
||||
} |
@ -0,0 +1,980 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Sen Liu, swjtuls1987@126.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if defined (DOUBLE_SUPPORT) |
||||
|
||||
#ifdef cl_khr_fp64 |
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable |
||||
#elif defined (cl_amd_fp64) |
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable |
||||
#endif |
||||
typedef double T; |
||||
typedef double F; |
||||
typedef double4 F4; |
||||
#define convert_F4 convert_double4 |
||||
|
||||
#else |
||||
typedef float F; |
||||
typedef float4 F4; |
||||
typedef long T; |
||||
#define convert_F4 convert_float4 |
||||
#endif |
||||
|
||||
#define DST_ROW_00 0 |
||||
#define DST_ROW_10 1 |
||||
#define DST_ROW_01 2 |
||||
#define DST_ROW_20 3 |
||||
#define DST_ROW_11 4 |
||||
#define DST_ROW_02 5 |
||||
#define DST_ROW_30 6 |
||||
#define DST_ROW_21 7 |
||||
#define DST_ROW_12 8 |
||||
#define DST_ROW_03 9 |
||||
|
||||
__kernel void icvContourMoments(int contour_total, |
||||
__global float* reader_oclmat_data, |
||||
__global T* dst_a, |
||||
int dst_step) |
||||
{ |
||||
T xi_1, yi_1, xi_12, yi_12, xi, yi, xi2, yi2, dxy, xii_1, yii_1; |
||||
int idx = get_global_id(0); |
||||
|
||||
if (idx < 0 || idx >= contour_total) |
||||
return; |
||||
|
||||
xi_1 = (T)(*(reader_oclmat_data + (get_global_id(0) << 1))); |
||||
yi_1 = (T)(*(reader_oclmat_data + (get_global_id(0) << 1) + 1)); |
||||
xi_12 = xi_1 * xi_1; |
||||
yi_12 = yi_1 * yi_1; |
||||
|
||||
if(idx == contour_total - 1) |
||||
{ |
||||
xi = (T)(*(reader_oclmat_data)); |
||||
yi = (T)(*(reader_oclmat_data + 1)); |
||||
} |
||||
else |
||||
{ |
||||
xi = (T)(*(reader_oclmat_data + (idx + 1) * 2)); |
||||
yi = (T)(*(reader_oclmat_data + (idx + 1) * 2 + 1)); |
||||
} |
||||
|
||||
xi2 = xi * xi; |
||||
yi2 = yi * yi; |
||||
dxy = xi_1 * yi - xi * yi_1; |
||||
xii_1 = xi_1 + xi; |
||||
yii_1 = yi_1 + yi; |
||||
|
||||
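// write this edge's contribution to the ten raw contour moments (Green's theorem applied to the segment (xi_1,yi_1) -> (xi,yi)) |
||||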
dst_step /= sizeof(T); |
||||
*( dst_a + DST_ROW_00 * dst_step + idx) = dxy; |
||||
*( dst_a + DST_ROW_10 * dst_step + idx) = dxy * xii_1; |
||||
*( dst_a + DST_ROW_01 * dst_step + idx) = dxy * yii_1; |
||||
*( dst_a + DST_ROW_20 * dst_step + idx) = dxy * (xi_1 * xii_1 + xi2); |
||||
*( dst_a + DST_ROW_11 * dst_step + idx) = dxy * (xi_1 * (yii_1 + yi_1) + xi * (yii_1 + yi)); |
||||
*( dst_a + DST_ROW_02 * dst_step + idx) = dxy * (yi_1 * yii_1 + yi2); |
||||
*( dst_a + DST_ROW_30 * dst_step + idx) = dxy * xii_1 * (xi_12 + xi2); |
||||
*( dst_a + DST_ROW_03 * dst_step + idx) = dxy * yii_1 * (yi_12 + yi2); |
||||
*( dst_a + DST_ROW_21 * dst_step + idx) = |
||||
dxy * (xi_12 * (3 * yi_1 + yi) + 2 * xi * xi_1 * yii_1 + |
||||
xi2 * (yi_1 + 3 * yi)); |
||||
*( dst_a + DST_ROW_12 * dst_step + idx) = |
||||
dxy * (yi_12 * (3 * xi_1 + xi) + 2 * yi * yi_1 * xii_1 + |
||||
yi2 * (xi_1 + 3 * xi)); |
||||
} |
||||
|
||||
__kernel void dst_sum(int src_rows, int src_cols, int tile_height, int tile_width, int TILE_SIZE, |
||||
__global F* sum, __global F* dst_m, int dst_step) |
||||
{ |
||||
int gidy = get_global_id(0); |
||||
int gidx = get_global_id(1); |
||||
int block_y = src_rows/tile_height; |
||||
int block_x = src_cols/tile_width; |
||||
int block_num; |
||||
|
||||
if(src_rows > TILE_SIZE && src_rows % TILE_SIZE != 0) |
||||
block_y ++; |
||||
if(src_cols > TILE_SIZE && src_cols % TILE_SIZE != 0) |
||||
block_x ++; |
||||
block_num = block_y * block_x; |
||||
__local F dst_sum[10][128]; |
||||
if(gidy<128-block_num) |
||||
for(int i=0; i<10; i++) |
||||
dst_sum[i][gidy+block_num]=0; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
dst_step /= sizeof(F); |
||||
if(gidy<block_num) |
||||
{ |
||||
dst_sum[0][gidy] = *(dst_m + mad24(DST_ROW_00 * block_y, dst_step, gidy)); |
||||
dst_sum[1][gidy] = *(dst_m + mad24(DST_ROW_10 * block_y, dst_step, gidy)); |
||||
dst_sum[2][gidy] = *(dst_m + mad24(DST_ROW_01 * block_y, dst_step, gidy)); |
||||
dst_sum[3][gidy] = *(dst_m + mad24(DST_ROW_20 * block_y, dst_step, gidy)); |
||||
dst_sum[4][gidy] = *(dst_m + mad24(DST_ROW_11 * block_y, dst_step, gidy)); |
||||
dst_sum[5][gidy] = *(dst_m + mad24(DST_ROW_02 * block_y, dst_step, gidy)); |
||||
dst_sum[6][gidy] = *(dst_m + mad24(DST_ROW_30 * block_y, dst_step, gidy)); |
||||
dst_sum[7][gidy] = *(dst_m + mad24(DST_ROW_21 * block_y, dst_step, gidy)); |
||||
dst_sum[8][gidy] = *(dst_m + mad24(DST_ROW_12 * block_y, dst_step, gidy)); |
||||
dst_sum[9][gidy] = *(dst_m + mad24(DST_ROW_03 * block_y, dst_step, gidy)); |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
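// tree reduction in local memory: the number of active work-items is halved each step until dst_sum[i][0] holds the total over all tiles |
||||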
for(int lsize=64; lsize>0; lsize>>=1) |
||||
{ |
||||
if(gidy<lsize) |
||||
{ |
||||
int lsize2 = gidy + lsize; |
||||
for(int i=0; i<10; i++) |
||||
dst_sum[i][gidy] += dst_sum[i][lsize2]; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
if(gidy==0) |
||||
for(int i=0; i<10; i++) |
||||
sum[i] = dst_sum[i][0]; |
||||
} |
||||
|
||||
__kernel void CvMoments_D0(__global uchar16* src_data, int src_rows, int src_cols, int src_step, |
||||
__global F* dst_m, |
||||
int dst_cols, int dst_step, int blocky, |
||||
int depth, int cn, int coi, int binary, int TILE_SIZE) |
||||
{ |
||||
uchar tmp_coi[16]; // get the coi data |
||||
uchar16 tmp[16]; |
||||
int VLEN_C = 16; // vector length of uchar |
||||
|
||||
int gidy = get_global_id(0); |
||||
int gidx = get_global_id(1); |
||||
int wgidy = get_group_id(0); |
||||
int wgidx = get_group_id(1); |
||||
int lidy = get_local_id(0); |
||||
int lidx = get_local_id(1); |
||||
int y = wgidy*TILE_SIZE; // real Y index of pixel |
||||
int x = wgidx*TILE_SIZE; // real X index of pixel |
||||
int kcn = (cn==2)?2:4; |
||||
int rstep = min(src_step, TILE_SIZE); |
||||
int tileSize_height = min(TILE_SIZE, src_rows - y); |
||||
int tileSize_width = min(TILE_SIZE, src_cols - x); |
||||
|
||||
if ( y+lidy < src_rows ) |
||||
{ |
||||
if( tileSize_width < TILE_SIZE ) |
||||
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) |
||||
*((__global uchar*)src_data+(y+lidy)*src_step+x+i) = 0; |
||||
|
||||
if( coi > 0 ) //channel of interest |
||||
for(int i = 0; i < tileSize_width; i += VLEN_C) |
||||
{ |
||||
for(int j=0; j<VLEN_C; j++) |
||||
tmp_coi[j] = *((__global uchar*)src_data+(y+lidy)*src_step+(x+i+j)*kcn+coi-1); |
||||
tmp[i/VLEN_C] = (uchar16)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7], |
||||
tmp_coi[8],tmp_coi[9],tmp_coi[10],tmp_coi[11],tmp_coi[12],tmp_coi[13],tmp_coi[14],tmp_coi[15]); |
||||
} |
||||
else |
||||
for(int i=0; i < tileSize_width; i+=VLEN_C) |
||||
tmp[i/VLEN_C] = *(src_data+(y+lidy)*src_step/VLEN_C+(x+i)/VLEN_C); |
||||
} |
||||
|
||||
uchar16 zero = (uchar16)(0); |
||||
uchar16 full = (uchar16)(255); |
||||
if( binary ) |
||||
for(int i=0; i < tileSize_width; i+=VLEN_C) |
||||
tmp[i/VLEN_C] = (tmp[i/VLEN_C]!=zero)?full:zero; |
||||
|
||||
F mom[10]; |
||||
__local int m[10][128]; |
||||
if(lidy < 128) |
||||
{ |
||||
for(int i=0; i<10; i++) |
||||
m[i][lidy]=0; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int lm[10] = {0}; |
||||
int16 x0 = (int16)(0); |
||||
int16 x1 = (int16)(0); |
||||
int16 x2 = (int16)(0); |
||||
int16 x3 = (int16)(0); |
||||
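// per-row accumulation over this work-item's tile row: x0 = sum(p), x1 = sum(x*p), x2 = sum(x^2*p), x3 = sum(x^3*p) |
||||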
for( int xt = 0 ; xt < tileSize_width; xt+=(VLEN_C) ) |
||||
{ |
||||
int16 v_xt = (int16)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7, xt+8, xt+9, xt+10, xt+11, xt+12, xt+13, xt+14, xt+15); |
||||
int16 p = convert_int16(tmp[xt/VLEN_C]); |
||||
int16 xp = v_xt * p, xxp = xp *v_xt; |
||||
x0 += p; |
||||
x1 += xp; |
||||
x2 += xxp; |
||||
x3 += xxp * v_xt; |
||||
} |
||||
x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7 + x0.s8 + x0.s9 + x0.sa + x0.sb + x0.sc + x0.sd + x0.se + x0.sf; |
||||
x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7 + x1.s8 + x1.s9 + x1.sa + x1.sb + x1.sc + x1.sd + x1.se + x1.sf; |
||||
x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7 + x2.s8 + x2.s9 + x2.sa + x2.sb + x2.sc + x2.sd + x2.se + x2.sf; |
||||
x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7 + x3.s8 + x3.s9 + x3.sa + x3.sb + x3.sc + x3.sd + x3.se + x3.sf; |
||||
int py = lidy * ((int)x0.s0); |
||||
int sy = lidy*lidy; |
||||
int bheight = min(tileSize_height, TILE_SIZE/2); |
||||
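// rows with lidy >= bheight stash their raw row moments in local memory, rows with lidy < bheight keep them in lm[]; the reduction below folds the two halves together |
||||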
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) |
||||
{ |
||||
m[9][lidy-bheight] = ((int)py) * sy; // m03 |
||||
m[8][lidy-bheight] = ((int)x1.s0) * sy; // m12 |
||||
m[7][lidy-bheight] = ((int)x2.s0) * lidy; // m21 |
||||
m[6][lidy-bheight] = x3.s0; // m30 |
||||
m[5][lidy-bheight] = x0.s0 * sy; // m02 |
||||
m[4][lidy-bheight] = x1.s0 * lidy; // m11 |
||||
m[3][lidy-bheight] = x2.s0; // m20 |
||||
m[2][lidy-bheight] = py; // m01 |
||||
m[1][lidy-bheight] = x1.s0; // m10 |
||||
m[0][lidy-bheight] = x0.s0; // m00 |
||||
} |
||||
else if(lidy < bheight) |
||||
{ |
||||
lm[9] = ((int)py) * sy; // m03 |
||||
lm[8] = ((int)x1.s0) * sy; // m12 |
||||
lm[7] = ((int)x2.s0) * lidy; // m21 |
||||
lm[6] = x3.s0; // m30 |
||||
lm[5] = x0.s0 * sy; // m02 |
||||
lm[4] = x1.s0 * lidy; // m11 |
||||
lm[3] = x2.s0; // m20 |
||||
lm[2] = py; // m01 |
||||
lm[1] = x1.s0; // m10 |
||||
lm[0] = x0.s0; // m00 |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
for( int j = bheight; j >= 1; j = j/2 ) |
||||
{ |
||||
if(lidy < j) |
||||
for( int i = 0; i < 10; i++ ) |
||||
lm[i] = lm[i] + m[i][lidy]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(lidy >= j/2&&lidy < j) |
||||
for( int i = 0; i < 10; i++ ) |
||||
m[i][lidy-j/2] = lm[i]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
|
||||
if(lidy == 0&&lidx == 0) |
||||
{ |
||||
for( int mt = 0; mt < 10; mt++ ) |
||||
mom[mt] = (F)lm[mt]; |
||||
if(binary) |
||||
{ |
||||
F s = 1./255; |
||||
for( int mt = 0; mt < 10; mt++ ) |
||||
mom[mt] *= s; |
||||
} |
||||
F xm = x * mom[0], ym = y * mom[0]; |
||||
|
||||
// accumulate moments computed in each tile |
||||
dst_step /= sizeof(F); |
||||
|
||||
// + m00 ( = m00' ) |
||||
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; |
||||
|
||||
// + m10 ( = m10' + x*m00' ) |
||||
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; |
||||
|
||||
// + m01 ( = m01' + y*m00' ) |
||||
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; |
||||
|
||||
// + m20 ( = m20' + 2*x*m10' + x*x*m00' ) |
||||
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); |
||||
|
||||
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) |
||||
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; |
||||
|
||||
// + m02 ( = m02' + 2*y*m01' + y*y*m00' ) |
||||
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); |
||||
|
||||
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) |
||||
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); |
||||
|
||||
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') |
||||
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; |
||||
|
||||
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') |
||||
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; |
||||
|
||||
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) |
||||
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); |
||||
} |
||||
} |
||||
|
||||
__kernel void CvMoments_D2(__global ushort8* src_data, int src_rows, int src_cols, int src_step, |
||||
__global F* dst_m, |
||||
int dst_cols, int dst_step, int blocky, |
||||
int depth, int cn, int coi, int binary, const int TILE_SIZE) |
||||
{ |
||||
ushort tmp_coi[8]; // get the coi data |
||||
ushort8 tmp[32]; |
||||
int VLEN_US = 8; // vector length of ushort |
||||
int gidy = get_global_id(0); |
||||
int gidx = get_global_id(1); |
||||
int wgidy = get_group_id(0); |
||||
int wgidx = get_group_id(1); |
||||
int lidy = get_local_id(0); |
||||
int lidx = get_local_id(1); |
||||
int y = wgidy*TILE_SIZE; // real Y index of pixel |
||||
int x = wgidx*TILE_SIZE; // real X index of pixel |
||||
int kcn = (cn==2)?2:4; |
||||
int rstep = min(src_step/2, TILE_SIZE); |
||||
int tileSize_height = min(TILE_SIZE, src_rows - y); |
||||
int tileSize_width = min(TILE_SIZE, src_cols -x); |
||||
|
||||
if ( y+lidy < src_rows ) |
||||
{ |
||||
if(src_cols > TILE_SIZE && tileSize_width < TILE_SIZE) |
||||
for(int i=tileSize_width; i < rstep && (x+i) < src_cols; i++ ) |
||||
*((__global ushort*)src_data+(y+lidy)*src_step/2+x+i) = 0; |
||||
if( coi > 0 ) |
||||
for(int i=0; i < tileSize_width; i+=VLEN_US) |
||||
{ |
||||
for(int j=0; j<VLEN_US; j++) |
||||
tmp_coi[j] = *((__global ushort*)src_data+(y+lidy)*(int)src_step/2+(x+i+j)*kcn+coi-1); |
||||
tmp[i/VLEN_US] = (ushort8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]); |
||||
} |
||||
else |
||||
for(int i=0; i < tileSize_width; i+=VLEN_US) |
||||
tmp[i/VLEN_US] = *(src_data+(y+lidy)*src_step/(2*VLEN_US)+(x+i)/VLEN_US); |
||||
} |
||||
|
||||
ushort8 zero = (ushort8)(0); |
||||
ushort8 full = (ushort8)(255); |
||||
if( binary ) |
||||
for(int i=0; i < tileSize_width; i+=VLEN_US) |
||||
tmp[i/VLEN_US] = (tmp[i/VLEN_US]!=zero)?full:zero; |
||||
F mom[10]; |
||||
__local long m[10][128]; |
||||
if(lidy < 128) |
||||
for(int i=0; i<10; i++) |
||||
m[i][lidy]=0; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
long lm[10] = {0}; |
||||
int8 x0 = (int8)(0); |
||||
int8 x1 = (int8)(0); |
||||
int8 x2 = (int8)(0); |
||||
long8 x3 = (long8)(0); |
||||
for( int xt = 0 ; xt < tileSize_width; xt+=(VLEN_US) ) |
||||
{ |
||||
int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7); |
||||
int8 p = convert_int8(tmp[xt/VLEN_US]); |
||||
int8 xp = v_xt * p, xxp = xp * v_xt; |
||||
x0 += p; |
||||
x1 += xp; |
||||
x2 += xxp; |
||||
x3 += convert_long8(xxp) *convert_long8(v_xt); |
||||
} |
||||
x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7; |
||||
x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7; |
||||
x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7; |
||||
x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7; |
||||
|
||||
int py = lidy * x0.s0, sy = lidy*lidy; |
||||
int bheight = min(tileSize_height, TILE_SIZE/2); |
||||
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) |
||||
{ |
||||
m[9][lidy-bheight] = ((long)py) * sy; // m03 |
||||
m[8][lidy-bheight] = ((long)x1.s0) * sy; // m12 |
||||
m[7][lidy-bheight] = ((long)x2.s0) * lidy; // m21 |
||||
m[6][lidy-bheight] = x3.s0; // m30 |
||||
m[5][lidy-bheight] = x0.s0 * sy; // m02 |
||||
m[4][lidy-bheight] = x1.s0 * lidy; // m11 |
||||
m[3][lidy-bheight] = x2.s0; // m20 |
||||
m[2][lidy-bheight] = py; // m01 |
||||
m[1][lidy-bheight] = x1.s0; // m10 |
||||
m[0][lidy-bheight] = x0.s0; // m00 |
||||
} |
||||
else if(lidy < bheight) |
||||
{ |
||||
lm[9] = ((long)py) * sy; // m03 |
||||
lm[8] = ((long)x1.s0) * sy; // m12 |
||||
lm[7] = ((long)x2.s0) * lidy; // m21 |
||||
lm[6] = x3.s0; // m30 |
||||
lm[5] = x0.s0 * sy; // m02 |
||||
lm[4] = x1.s0 * lidy; // m11 |
||||
lm[3] = x2.s0; // m20 |
||||
lm[2] = py; // m01 |
||||
lm[1] = x1.s0; // m10 |
||||
lm[0] = x0.s0; // m00 |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) |
||||
{ |
||||
if(lidy < j) |
||||
for( int i = 0; i < 10; i++ ) |
||||
lm[i] = lm[i] + m[i][lidy]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(lidy >= j/2&&lidy < j) |
||||
for( int i = 0; i < 10; i++ ) |
||||
m[i][lidy-j/2] = lm[i]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
|
||||
if(lidy == 0&&lidx == 0) |
||||
{ |
||||
for(int mt = 0; mt < 10; mt++ ) |
||||
mom[mt] = (F)lm[mt]; |
||||
|
||||
if(binary) |
||||
{ |
||||
F s = 1./255; |
||||
for( int mt = 0; mt < 10; mt++ ) |
||||
mom[mt] *= s; |
||||
} |
||||
|
||||
F xm = x *mom[0], ym = y * mom[0]; |
||||
|
||||
// accumulate moments computed in each tile |
||||
dst_step /= sizeof(F); |
||||
|
||||
// + m00 ( = m00' ) |
||||
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; |
||||
|
||||
// + m10 ( = m10' + x*m00' ) |
||||
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; |
||||
|
||||
// + m01 ( = m01' + y*m00' ) |
||||
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; |
||||
|
||||
// + m20 ( = m20' + 2*x*m10' + x*x*m00' ) |
||||
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); |
||||
|
||||
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) |
||||
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; |
||||
|
||||
// + m02 ( = m02' + 2*y*m01' + y*y*m00' ) |
||||
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); |
||||
|
||||
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) |
||||
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); |
||||
|
||||
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') |
||||
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; |
||||
|
||||
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') |
||||
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; |
||||
|
||||
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) |
||||
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); |
||||
} |
||||
} |
||||
|
||||
__kernel void CvMoments_D3(__global short8* src_data, int src_rows, int src_cols, int src_step, |
||||
__global F* dst_m, |
||||
int dst_cols, int dst_step, int blocky, |
||||
int depth, int cn, int coi, int binary, const int TILE_SIZE) |
||||
{ |
||||
short tmp_coi[8]; // get the coi data |
||||
short8 tmp[32]; |
||||
int VLEN_S = 8; // vector length of short |
||||
int gidy = get_global_id(0); |
||||
int gidx = get_global_id(1); |
||||
int wgidy = get_group_id(0); |
||||
int wgidx = get_group_id(1); |
||||
int lidy = get_local_id(0); |
||||
int lidx = get_local_id(1); |
||||
int y = wgidy*TILE_SIZE; // real Y index of pixel |
||||
int x = wgidx*TILE_SIZE; // real X index of pixel |
||||
int kcn = (cn==2)?2:4; |
||||
int rstep = min(src_step/2, TILE_SIZE); |
||||
int tileSize_height = min(TILE_SIZE, src_rows - y); |
||||
int tileSize_width = min(TILE_SIZE, src_cols -x); |
||||
|
||||
if ( y+lidy < src_rows ) |
||||
{ |
||||
if(tileSize_width < TILE_SIZE) |
||||
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) |
||||
*((__global short*)src_data+(y+lidy)*src_step/2+x+i) = 0; |
||||
if( coi > 0 ) |
||||
for(int i=0; i < tileSize_width; i+=VLEN_S) |
||||
{ |
||||
for(int j=0; j<VLEN_S; j++) |
||||
tmp_coi[j] = *((__global short*)src_data+(y+lidy)*src_step/2+(x+i+j)*kcn+coi-1); |
||||
tmp[i/VLEN_S] = (short8)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3],tmp_coi[4],tmp_coi[5],tmp_coi[6],tmp_coi[7]); |
||||
} |
||||
else |
||||
for(int i=0; i < tileSize_width; i+=VLEN_S) |
||||
tmp[i/VLEN_S] = *(src_data+(y+lidy)*src_step/(2*VLEN_S)+(x+i)/VLEN_S); |
||||
} |
||||
|
||||
short8 zero = (short8)(0); |
||||
short8 full = (short8)(255); |
||||
if( binary ) |
||||
for(int i=0; i < tileSize_width; i+=(VLEN_S)) |
||||
tmp[i/VLEN_S] = (tmp[i/VLEN_S]!=zero)?full:zero; |
||||
|
||||
F mom[10]; |
||||
__local long m[10][128]; |
||||
if(lidy < 128) |
||||
for(int i=0; i<10; i++) |
||||
m[i][lidy]=0; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
long lm[10] = {0}; |
||||
int8 x0 = (int8)(0); |
||||
int8 x1 = (int8)(0); |
||||
int8 x2 = (int8)(0); |
||||
long8 x3 = (long8)(0); |
||||
for( int xt = 0 ; xt < tileSize_width; xt+= (VLEN_S)) |
||||
{ |
||||
int8 v_xt = (int8)(xt, xt+1, xt+2, xt+3, xt+4, xt+5, xt+6, xt+7); |
||||
int8 p = convert_int8(tmp[xt/VLEN_S]); |
||||
int8 xp = v_xt * p, xxp = xp * v_xt; |
||||
x0 += p; |
||||
x1 += xp; |
||||
x2 += xxp; |
||||
x3 += convert_long8(xxp) * convert_long8(v_xt); |
||||
} |
||||
x0.s0 += x0.s1 + x0.s2 + x0.s3 + x0.s4 + x0.s5 + x0.s6 + x0.s7; |
||||
x1.s0 += x1.s1 + x1.s2 + x1.s3 + x1.s4 + x1.s5 + x1.s6 + x1.s7; |
||||
x2.s0 += x2.s1 + x2.s2 + x2.s3 + x2.s4 + x2.s5 + x2.s6 + x2.s7; |
||||
x3.s0 += x3.s1 + x3.s2 + x3.s3 + x3.s4 + x3.s5 + x3.s6 + x3.s7; |
||||
|
||||
int py = lidy * x0.s0, sy = lidy*lidy; |
||||
int bheight = min(tileSize_height, TILE_SIZE/2); |
||||
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) |
||||
{ |
||||
m[9][lidy-bheight] = ((long)py) * sy; // m03 |
||||
m[8][lidy-bheight] = ((long)x1.s0) * sy; // m12 |
||||
m[7][lidy-bheight] = ((long)x2.s0) * lidy; // m21 |
||||
m[6][lidy-bheight] = x3.s0; // m30 |
||||
m[5][lidy-bheight] = x0.s0 * sy; // m02 |
||||
m[4][lidy-bheight] = x1.s0 * lidy; // m11 |
||||
m[3][lidy-bheight] = x2.s0; // m20 |
||||
m[2][lidy-bheight] = py; // m01 |
||||
m[1][lidy-bheight] = x1.s0; // m10 |
||||
m[0][lidy-bheight] = x0.s0; // m00 |
||||
} |
||||
else if(lidy < bheight) |
||||
{ |
||||
lm[9] = ((long)py) * sy; // m03 |
||||
lm[8] = ((long)(x1.s0)) * sy; // m12 |
||||
lm[7] = ((long)(x2.s0)) * lidy; // m21 |
||||
lm[6] = x3.s0; // m30 |
||||
lm[5] = x0.s0 * sy; // m02 |
||||
lm[4] = x1.s0 * lidy; // m11 |
||||
lm[3] = x2.s0; // m20 |
||||
lm[2] = py; // m01 |
||||
lm[1] = x1.s0; // m10 |
||||
lm[0] = x0.s0; // m00 |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
for( int j = TILE_SIZE/2; j >=1; j = j/2 ) |
||||
{ |
||||
if(lidy < j) |
||||
for( int i = 0; i < 10; i++ ) |
||||
lm[i] = lm[i] + m[i][lidy]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(lidy >= j/2&&lidy < j) |
||||
for( int i = 0; i < 10; i++ ) |
||||
m[i][lidy-j/2] = lm[i]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
if(lidy ==0 &&lidx ==0) |
||||
{ |
||||
for(int mt = 0; mt < 10; mt++ ) |
||||
mom[mt] = (F)lm[mt]; |
||||
|
||||
if(binary) |
||||
{ |
||||
F s = 1./255; |
||||
for( int mt = 0; mt < 10; mt++ ) |
||||
mom[mt] *= s; |
||||
} |
||||
|
||||
F xm = x * mom[0], ym = y*mom[0]; |
||||
|
||||
// accumulate moments computed in each tile |
||||
dst_step /= sizeof(F); |
||||
|
||||
// + m00 ( = m00' ) |
||||
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; |
||||
|
||||
// + m10 ( = m10' + x*m00' ) |
||||
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; |
||||
|
||||
// + m01 ( = m01' + y*m00' ) |
||||
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; |
||||
|
||||
// + m20 ( = m20' + 2*x*m10' + x*x*m00' ) |
||||
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); |
||||
|
||||
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) |
||||
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; |
||||
|
||||
// + m02 ( = m02' + 2*y*m01' + y*y*m00' ) |
||||
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); |
||||
|
||||
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) |
||||
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); |
||||
|
||||
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') |
||||
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; |
||||
|
||||
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') |
||||
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; |
||||
|
||||
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) |
||||
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); |
||||
} |
||||
} |
||||
|
||||
__kernel void CvMoments_D5( __global float* src_data, int src_rows, int src_cols, int src_step, |
||||
__global F* dst_m, |
||||
int dst_cols, int dst_step, int blocky, |
||||
int depth, int cn, int coi, int binary, const int TILE_SIZE) |
||||
{ |
||||
float tmp_coi[4]; // get the coi data |
||||
float4 tmp[64]; |
||||
int VLEN_F = 4; // vector length of float |
||||
int gidy = get_global_id(0); |
||||
int gidx = get_global_id(1); |
||||
int wgidy = get_group_id(0); |
||||
int wgidx = get_group_id(1); |
||||
int lidy = get_local_id(0); |
||||
int lidx = get_local_id(1); |
||||
int y = wgidy*TILE_SIZE; // real Y index of pixel |
||||
int x = wgidx*TILE_SIZE; // real X index of pixel |
||||
int kcn = (cn==2)?2:4; |
||||
int rstep = min(src_step/4, TILE_SIZE); |
||||
int tileSize_height = min(TILE_SIZE, src_rows - y); |
||||
int tileSize_width = min(TILE_SIZE, src_cols -x); |
||||
int maxIdx = mul24(src_rows, src_cols); |
||||
int yOff = (y+lidy)*src_step; |
||||
int index; |
||||
|
||||
if ( y+lidy < src_rows ) |
||||
{ |
||||
if(tileSize_width < TILE_SIZE) |
||||
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) |
||||
*((__global float*)src_data+(y+lidy)*src_step/4+x+i) = 0; |
||||
if( coi > 0 ) |
||||
for(int i=0; i < tileSize_width; i+=VLEN_F) |
||||
{ |
||||
for(int j=0; j<4; j++) |
||||
tmp_coi[j] = *(src_data+(y+lidy)*src_step/4+(x+i+j)*kcn+coi-1); |
||||
tmp[i/VLEN_F] = (float4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); |
||||
} |
||||
else |
||||
for(int i=0; i < tileSize_width; i+=VLEN_F) |
||||
tmp[i/VLEN_F] = (float4)(*(src_data+(y+lidy)*src_step/4+x+i),*(src_data+(y+lidy)*src_step/4+x+i+1),*(src_data+(y+lidy)*src_step/4+x+i+2),*(src_data+(y+lidy)*src_step/4+x+i+3)); |
||||
} |
||||
|
||||
float4 zero = (float4)(0); |
||||
float4 full = (float4)(255); |
||||
if( binary ) |
||||
for(int i=0; i < tileSize_width; i+=4) |
||||
tmp[i/VLEN_F] = (tmp[i/VLEN_F]!=zero)?full:zero; |
||||
F mom[10]; |
||||
__local F m[10][128]; |
||||
if(lidy < 128) |
||||
for(int i = 0; i < 10; i ++) |
||||
m[i][lidy] = 0; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
F lm[10] = {0}; |
||||
F4 x0 = (F4)(0); |
||||
F4 x1 = (F4)(0); |
||||
F4 x2 = (F4)(0); |
||||
F4 x3 = (F4)(0); |
||||
for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_F ) |
||||
{ |
||||
F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3); |
||||
F4 p = convert_F4(tmp[xt/VLEN_F]); |
||||
F4 xp = v_xt * p, xxp = xp * v_xt; |
||||
x0 += p; |
||||
x1 += xp; |
||||
x2 += xxp; |
||||
x3 += xxp * v_xt; |
||||
} |
||||
x0.s0 += x0.s1 + x0.s2 + x0.s3; |
||||
x1.s0 += x1.s1 + x1.s2 + x1.s3; |
||||
x2.s0 += x2.s1 + x2.s2 + x2.s3; |
||||
x3.s0 += x3.s1 + x3.s2 + x3.s3; |
||||
|
||||
F py = lidy * x0.s0, sy = lidy*lidy; |
||||
int bheight = min(tileSize_height, TILE_SIZE/2); |
||||
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) |
||||
{ |
||||
m[9][lidy-bheight] = ((F)py) * sy; // m03 |
||||
m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12 |
||||
m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21 |
||||
m[6][lidy-bheight] = x3.s0; // m30 |
||||
m[5][lidy-bheight] = x0.s0 * sy; // m02 |
||||
m[4][lidy-bheight] = x1.s0 * lidy; // m11 |
||||
m[3][lidy-bheight] = x2.s0; // m20 |
||||
m[2][lidy-bheight] = py; // m01 |
||||
m[1][lidy-bheight] = x1.s0; // m10 |
||||
m[0][lidy-bheight] = x0.s0; // m00 |
||||
} |
||||
|
||||
else if(lidy < bheight) |
||||
{ |
||||
lm[9] = ((F)py) * sy; // m03 |
||||
lm[8] = ((F)x1.s0) * sy; // m12 |
||||
lm[7] = ((F)x2.s0) * lidy; // m21 |
||||
lm[6] = x3.s0; // m30 |
||||
lm[5] = x0.s0 * sy; // m02 |
||||
lm[4] = x1.s0 * lidy; // m11 |
||||
lm[3] = x2.s0; // m20 |
||||
lm[2] = py; // m01 |
||||
lm[1] = x1.s0; // m10 |
||||
lm[0] = x0.s0; // m00 |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) |
||||
{ |
||||
if(lidy < j) |
||||
for( int i = 0; i < 10; i++ ) |
||||
lm[i] = lm[i] + m[i][lidy]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(lidy >= j/2&&lidy < j) |
||||
for( int i = 0; i < 10; i++ ) |
||||
m[i][lidy-j/2] = lm[i]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
if(lidy == 0&&lidx == 0) |
||||
{ |
||||
for( int mt = 0; mt < 10; mt++ ) |
||||
mom[mt] = (F)lm[mt]; |
||||
if(binary) |
||||
{ |
||||
F s = 1./255; |
||||
for( int mt = 0; mt < 10; mt++ ) |
||||
mom[mt] *= s; |
||||
} |
||||
|
||||
F xm = x * mom[0], ym = y * mom[0]; |
||||
|
||||
// accumulate moments computed in each tile |
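||||
// Note: each tile computes raw moments m_pq' in its own local coordinates; |
||||
// translating the origin by the tile offset (x, y) expands binomially, e.g. |
||||
// m10 = m10' + x*m00' and m20 = m20' + 2*x*m10' + x*x*m00', which is what the |
||||
// expressions below implement. The per-tile values written to dst_m are then |
||||
// combined outside this kernel (presumably on the host). |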
||||
dst_step /= sizeof(F); |
||||
|
||||
// + m00 ( = m00' ) |
||||
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; |
||||
|
||||
// + m10 ( = m10' + x*m00' ) |
||||
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; |
||||
|
||||
// + m01 ( = m01' + y*m00' ) |
||||
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; |
||||
|
||||
// + m20 ( = m20' + 2*x*m10' + x*x*m00' ) |
||||
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); |
||||
|
||||
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) |
||||
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; |
||||
|
||||
// + m02 ( = m02' + 2*y*m01' + y*y*m00' ) |
||||
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); |
||||
|
||||
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) |
||||
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); |
||||
|
||||
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') |
||||
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; |
||||
|
||||
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') |
||||
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; |
||||
|
||||
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) |
||||
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); |
||||
} |
||||
} |
||||
|
||||
__kernel void CvMoments_D6(__global F* src_data, int src_rows, int src_cols, int src_step, |
||||
__global F* dst_m, |
||||
int dst_cols, int dst_step, int blocky, |
||||
int depth, int cn, int coi, int binary, const int TILE_SIZE) |
||||
{ |
||||
F tmp_coi[4]; // channel-of-interest (coi) data |
||||
F4 tmp[64]; |
||||
int VLEN_D = 4; // vector length |
||||
int gidy = get_global_id(0); |
||||
int gidx = get_global_id(1); |
||||
int wgidy = get_group_id(0); |
||||
int wgidx = get_group_id(1); |
||||
int lidy = get_local_id(0); |
||||
int lidx = get_local_id(1); |
||||
int y = wgidy*TILE_SIZE; // real Y index of pixel |
||||
int x = wgidx*TILE_SIZE; // real X index of pixel |
||||
int kcn = (cn==2)?2:4; |
||||
int rstep = min(src_step/8, TILE_SIZE); |
||||
int tileSize_height = min(TILE_SIZE, src_rows - y); |
||||
int tileSize_width = min(TILE_SIZE, src_cols - x); |
||||
|
||||
if ( y+lidy < src_rows ) |
||||
{ |
||||
if(tileSize_width < TILE_SIZE) |
||||
for(int i = tileSize_width; i < rstep && (x+i) < src_cols; i++ ) |
||||
*((__global F*)src_data+(y+lidy)*src_step/8+x+i) = 0; |
||||
if( coi > 0 ) |
||||
for(int i=0; i < tileSize_width; i+=VLEN_D) |
||||
{ |
||||
for(int j=0; j<4 && ((x+i+j)*kcn+coi-1)<src_cols; j++) |
||||
tmp_coi[j] = *(src_data+(y+lidy)*src_step/8+(x+i+j)*kcn+coi-1); |
||||
tmp[i/VLEN_D] = (F4)(tmp_coi[0],tmp_coi[1],tmp_coi[2],tmp_coi[3]); |
||||
} |
||||
else |
||||
for(int i=0; i < tileSize_width && (x+i+3) < src_cols; i+=VLEN_D) |
||||
tmp[i/VLEN_D] = (F4)(*(src_data+(y+lidy)*src_step/8+x+i),*(src_data+(y+lidy)*src_step/8+x+i+1),*(src_data+(y+lidy)*src_step/8+x+i+2),*(src_data+(y+lidy)*src_step/8+x+i+3)); |
||||
} |
||||
|
||||
F4 zero = (F4)(0); |
||||
F4 full = (F4)(255); |
||||
if( binary ) |
||||
for(int i=0; i < tileSize_width; i+=VLEN_D) |
||||
tmp[i/VLEN_D] = (tmp[i/VLEN_D]!=zero)?full:zero; |
||||
F mom[10]; |
||||
__local F m[10][128]; |
||||
if(lidy < 128) |
||||
for(int i=0; i<10; i++) |
||||
m[i][lidy]=0; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
F lm[10] = {0}; |
||||
F4 x0 = (F4)(0); |
||||
F4 x1 = (F4)(0); |
||||
F4 x2 = (F4)(0); |
||||
F4 x3 = (F4)(0); |
||||
for( int xt = 0 ; xt < tileSize_width; xt+=VLEN_D ) |
||||
{ |
||||
F4 v_xt = (F4)(xt, xt+1, xt+2, xt+3); |
||||
F4 p = tmp[xt/VLEN_D]; |
||||
F4 xp = v_xt * p, xxp = xp * v_xt; |
||||
x0 += p; |
||||
x1 += xp; |
||||
x2 += xxp; |
||||
x3 += xxp *v_xt; |
||||
} |
||||
x0.s0 += x0.s1 + x0.s2 + x0.s3; |
||||
x1.s0 += x1.s1 + x1.s2 + x1.s3; |
||||
x2.s0 += x2.s1 + x2.s2 + x2.s3; |
||||
x3.s0 += x3.s1 + x3.s2 + x3.s3; |
||||
|
||||
F py = lidy * x0.s0, sy = lidy*lidy; |
||||
int bheight = min(tileSize_height, TILE_SIZE/2); |
||||
if(bheight >= TILE_SIZE/2&&lidy > bheight-1&&lidy < tileSize_height) |
||||
{ |
||||
m[9][lidy-bheight] = ((F)py) * sy; // m03 |
||||
m[8][lidy-bheight] = ((F)x1.s0) * sy; // m12 |
||||
m[7][lidy-bheight] = ((F)x2.s0) * lidy; // m21 |
||||
m[6][lidy-bheight] = x3.s0; // m30 |
||||
m[5][lidy-bheight] = x0.s0 * sy; // m02 |
||||
m[4][lidy-bheight] = x1.s0 * lidy; // m11 |
||||
m[3][lidy-bheight] = x2.s0; // m20 |
||||
m[2][lidy-bheight] = py; // m01 |
||||
m[1][lidy-bheight] = x1.s0; // m10 |
||||
m[0][lidy-bheight] = x0.s0; // m00 |
||||
} |
||||
else if(lidy < bheight) |
||||
{ |
||||
lm[9] = ((F)py) * sy; // m03 |
||||
lm[8] = ((F)x1.s0) * sy; // m12 |
||||
lm[7] = ((F)x2.s0) * lidy; // m21 |
||||
lm[6] = x3.s0; // m30 |
||||
lm[5] = x0.s0 * sy; // m02 |
||||
lm[4] = x1.s0 * lidy; // m11 |
||||
lm[3] = x2.s0; // m20 |
||||
lm[2] = py; // m01 |
||||
lm[1] = x1.s0; // m10 |
||||
lm[0] = x0.s0; // m00 |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
for( int j = TILE_SIZE/2; j >= 1; j = j/2 ) |
||||
{ |
||||
if(lidy < j) |
||||
for( int i = 0; i < 10; i++ ) |
||||
lm[i] = lm[i] + m[i][lidy]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(lidy >= j/2&&lidy < j) |
||||
for( int i = 0; i < 10; i++ ) |
||||
m[i][lidy-j/2] = lm[i]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
} |
||||
if(lidy == 0&&lidx == 0) |
||||
{ |
||||
for( int mt = 0; mt < 10; mt++ ) |
||||
mom[mt] = (F)lm[mt]; |
||||
if(binary) |
||||
{ |
||||
F s = 1./255; |
||||
for( int mt = 0; mt < 10; mt++ ) |
||||
mom[mt] *= s; |
||||
} |
||||
|
||||
F xm = x * mom[0], ym = y * mom[0]; |
||||
|
||||
// accumulate moments computed in each tile |
||||
dst_step /= sizeof(F); |
||||
|
||||
// + m00 ( = m00' ) |
||||
*(dst_m + mad24(DST_ROW_00 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[0]; |
||||
|
||||
// + m10 ( = m10' + x*m00' ) |
||||
*(dst_m + mad24(DST_ROW_10 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[1] + xm; |
||||
|
||||
// + m01 ( = m01' + y*m00' ) |
||||
*(dst_m + mad24(DST_ROW_01 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[2] + ym; |
||||
|
||||
// + m20 ( = m20' + 2*x*m10' + x*x*m00' ) |
||||
*(dst_m + mad24(DST_ROW_20 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[3] + x * (mom[1] * 2 + xm); |
||||
|
||||
// + m11 ( = m11' + x*m01' + y*m10' + x*y*m00' ) |
||||
*(dst_m + mad24(DST_ROW_11 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[4] + x * (mom[2] + ym) + y * mom[1]; |
||||
|
||||
// + m02 ( = m02' + 2*y*m01' + y*y*m00' ) |
||||
*(dst_m + mad24(DST_ROW_02 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[5] + y * (mom[2] * 2 + ym); |
||||
|
||||
// + m30 ( = m30' + 3*x*m20' + 3*x*x*m10' + x*x*x*m00' ) |
||||
*(dst_m + mad24(DST_ROW_30 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[6] + x * (3. * mom[3] + x * (3. * mom[1] + xm)); |
||||
|
||||
// + m21 ( = m21' + x*(2*m11' + 2*y*m10' + x*m01' + x*y*m00') + y*m20') |
||||
*(dst_m + mad24(DST_ROW_21 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[7] + x * (2 * (mom[4] + y * mom[1]) + x * (mom[2] + ym)) + y * mom[3]; |
||||
|
||||
// + m12 ( = m12' + y*(2*m11' + 2*x*m01' + y*m10' + x*y*m00') + x*m02') |
||||
*(dst_m + mad24(DST_ROW_12 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[8] + y * (2 * (mom[4] + x * mom[2]) + y * (mom[1] + xm)) + x * mom[5]; |
||||
|
||||
// + m03 ( = m03' + 3*y*m02' + 3*y*y*m01' + y*y*y*m00' ) |
||||
*(dst_m + mad24(DST_ROW_03 * blocky, dst_step, mad24(wgidy, dst_cols, wgidx))) = mom[9] + y * (3. * mom[5] + y * (3. * mom[2] + ym)); |
||||
} |
||||
} |
@ -0,0 +1,228 @@ |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Niko Li, newlife20080214@gmail.com |
||||
// Zero Lin, zero.lin@amd.com |
||||
// Yao Wang, bitwangyaoyao@gmail.com |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
// |
||||
|
||||
|
||||
#ifdef ERODE |
||||
#define MORPH_OP(A,B) min((A),(B)) |
||||
#endif |
||||
#ifdef DILATE |
||||
#define MORPH_OP(A,B) max((A),(B)) |
||||
#endif |
||||
//BORDER_CONSTANT: iiiiii|abcdefgh|iiiiiii |
||||
#define ELEM(i,l_edge,r_edge,elem1,elem2) (i)<(l_edge) | (i) >= (r_edge) ? (elem1) : (elem2) |
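||||
// ELEM implements the BORDER_CONSTANT rule: when index i falls outside |
||||
// [l_edge, r_edge) it yields elem1 (the border value), otherwise elem2. |
||||
// E.g. ELEM(-1, 0, cols, VAL, p) == VAL, while ELEM(i, 0, cols, VAL, p) == p |
||||
// for any 0 <= i < cols (values here are purely illustrative). |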
||||
#ifndef GENTYPE |
||||
|
||||
__kernel void morph_C1_D0(__global const uchar * restrict src, |
||||
__global uchar *dst, |
||||
int src_offset_x, int src_offset_y, |
||||
int cols, int rows, |
||||
int src_step_in_pixel, int dst_step_in_pixel, |
||||
__constant uchar * mat_kernel, |
||||
int src_whole_cols, int src_whole_rows, |
||||
int dst_offset_in_pixel) |
||||
{ |
||||
int l_x = get_local_id(0); |
||||
int l_y = get_local_id(1); |
||||
int x = get_group_id(0)*4*LSIZE0; |
||||
int y = get_group_id(1)*LSIZE1; |
||||
int start_x = (x+src_offset_x-RADIUSX) & 0xfffffffc; |
||||
int end_x = (x + src_offset_x+LSIZE0*4+RADIUSX) & 0xfffffffc; |
||||
int width = (end_x -start_x+4)>>2; |
||||
int offset = (src_offset_x-RADIUSX) & 3; |
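||||
// This uchar path loads 4 pixels at a time as uchar4, so the source window |
||||
// [start_x, end_x] is aligned down to a multiple of 4 and `offset` records |
||||
// how far the true left edge sits inside the first aligned load. |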
||||
int start_y = y+src_offset_y-RADIUSY; |
||||
int point1 = mad24(l_y,LSIZE0,l_x); |
||||
int point2 = point1 + LSIZE0*LSIZE1; |
||||
int tl_x = (point1 % width)<<2; |
||||
int tl_y = point1 / width; |
||||
int tl_x2 = (point2 % width)<<2; |
||||
int tl_y2 = point2 / width; |
||||
int cur_x = start_x + tl_x; |
||||
int cur_y = start_y + tl_y; |
||||
int cur_x2 = start_x + tl_x2; |
||||
int cur_y2 = start_y + tl_y2; |
||||
int start_addr = mad24(cur_y,src_step_in_pixel,cur_x); |
||||
int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2); |
||||
uchar4 temp0,temp1; |
||||
__local uchar4 LDS_DAT[2*LSIZE1*LSIZE0]; |
||||
|
||||
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols); |
||||
//read pixels from src |
||||
start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0; |
||||
start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0; |
||||
temp0 = *(__global uchar4*)&src[start_addr]; |
||||
temp1 = *(__global uchar4*)&src[start_addr2]; |
||||
// substitute the border value for reads that fall outside the image |
||||
temp0.x= ELEM(cur_x,0,src_whole_cols,VAL,temp0.x); |
||||
temp0.y= ELEM(cur_x+1,0,src_whole_cols,VAL,temp0.y); |
||||
temp0.z= ELEM(cur_x+2,0,src_whole_cols,VAL,temp0.z); |
||||
temp0.w= ELEM(cur_x+3,0,src_whole_cols,VAL,temp0.w); |
||||
temp0= ELEM(cur_y,0,src_whole_rows,(uchar4)VAL,temp0); |
||||
|
||||
temp1.x= ELEM(cur_x2,0,src_whole_cols,VAL,temp1.x); |
||||
temp1.y= ELEM(cur_x2+1,0,src_whole_cols,VAL,temp1.y); |
||||
temp1.z= ELEM(cur_x2+2,0,src_whole_cols,VAL,temp1.z); |
||||
temp1.w= ELEM(cur_x2+3,0,src_whole_cols,VAL,temp1.w); |
||||
temp1= ELEM(cur_y2,0,src_whole_rows,(uchar4)VAL,temp1); |
||||
|
||||
LDS_DAT[point1] = temp0; |
||||
LDS_DAT[point2] = temp1; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
uchar4 res = (uchar4)VAL; |
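||||
// res starts at VAL (a value supplied by the host build options), which is |
||||
// assumed to be the identity of MORPH_OP - 255 for erode/min, 0 for |
||||
// dilate/max - so positions masked out by mat_kernel leave the running |
||||
// result unchanged. |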
||||
|
||||
for(int i=0; i<2*RADIUSY+1; i++) |
||||
for(int j=0; j<2*RADIUSX+1; j++) |
||||
{ |
||||
res = |
||||
#ifndef RECTKERNEL |
||||
mat_kernel[i*(2*RADIUSX+1)+j] ? |
||||
#endif |
||||
MORPH_OP(res,vload4(0,(__local uchar*)&LDS_DAT[mad24((l_y+i),width,l_x)]+offset+j)) |
||||
#ifndef RECTKERNEL |
||||
:res |
||||
#endif |
||||
; |
||||
} |
||||
|
||||
int gidx = get_global_id(0)<<2; |
||||
int gidy = get_global_id(1); |
||||
int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel); |
||||
|
||||
if(gidx+3<cols && gidy<rows && ((dst_offset_in_pixel&3)==0)) |
||||
{ |
||||
*(__global uchar4*)&dst[out_addr] = res; |
||||
} |
||||
else |
||||
{ |
||||
if(gidx+3<cols && gidy<rows) |
||||
{ |
||||
dst[out_addr] = res.x; |
||||
dst[out_addr+1] = res.y; |
||||
dst[out_addr+2] = res.z; |
||||
dst[out_addr+3] = res.w; |
||||
} |
||||
else if(gidx+2<cols && gidy<rows) |
||||
{ |
||||
dst[out_addr] = res.x; |
||||
dst[out_addr+1] = res.y; |
||||
dst[out_addr+2] = res.z; |
||||
} |
||||
else if(gidx+1<cols && gidy<rows) |
||||
{ |
||||
dst[out_addr] = res.x; |
||||
dst[out_addr+1] = res.y; |
||||
} |
||||
else if(gidx<cols && gidy<rows) |
||||
{ |
||||
dst[out_addr] = res.x; |
||||
} |
||||
} |
||||
} |
||||
|
||||
#else |
||||
|
||||
__kernel void morph(__global const GENTYPE * restrict src, |
||||
__global GENTYPE *dst, |
||||
int src_offset_x, int src_offset_y, |
||||
int cols, int rows, |
||||
int src_step_in_pixel, int dst_step_in_pixel, |
||||
__constant uchar * mat_kernel, |
||||
int src_whole_cols, int src_whole_rows, |
||||
int dst_offset_in_pixel) |
||||
{ |
||||
int l_x = get_local_id(0); |
||||
int l_y = get_local_id(1); |
||||
int x = get_group_id(0)*LSIZE0; |
||||
int y = get_group_id(1)*LSIZE1; |
||||
int start_x = x+src_offset_x-RADIUSX; |
||||
int end_x = x + src_offset_x+LSIZE0+RADIUSX; |
||||
int width = end_x -(x+src_offset_x-RADIUSX)+1; |
||||
int start_y = y+src_offset_y-RADIUSY; |
||||
int point1 = mad24(l_y,LSIZE0,l_x); |
||||
int point2 = point1 + LSIZE0*LSIZE1; |
||||
int tl_x = point1 % width; |
||||
int tl_y = point1 / width; |
||||
int tl_x2 = point2 % width; |
||||
int tl_y2 = point2 / width; |
||||
int cur_x = start_x + tl_x; |
||||
int cur_y = start_y + tl_y; |
||||
int cur_x2 = start_x + tl_x2; |
||||
int cur_y2 = start_y + tl_y2; |
||||
int start_addr = mad24(cur_y,src_step_in_pixel,cur_x); |
||||
int start_addr2 = mad24(cur_y2,src_step_in_pixel,cur_x2); |
||||
GENTYPE temp0,temp1; |
||||
__local GENTYPE LDS_DAT[2*LSIZE1*LSIZE0]; |
||||
|
||||
int end_addr = mad24(src_whole_rows - 1,src_step_in_pixel,src_whole_cols); |
||||
//read pixels from src |
||||
start_addr = ((start_addr < end_addr) && (start_addr > 0)) ? start_addr : 0; |
||||
start_addr2 = ((start_addr2 < end_addr) && (start_addr2 > 0)) ? start_addr2 : 0; |
||||
temp0 = src[start_addr]; |
||||
temp1 = src[start_addr2]; |
||||
// substitute the border value for reads that fall outside the image |
||||
temp0= ELEM(cur_x,0,src_whole_cols,(GENTYPE)VAL,temp0); |
||||
temp0= ELEM(cur_y,0,src_whole_rows,(GENTYPE)VAL,temp0); |
||||
|
||||
temp1= ELEM(cur_x2,0,src_whole_cols,(GENTYPE)VAL,temp1); |
||||
temp1= ELEM(cur_y2,0,src_whole_rows,(GENTYPE)VAL,temp1); |
||||
|
||||
LDS_DAT[point1] = temp0; |
||||
LDS_DAT[point2] = temp1; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
GENTYPE res = (GENTYPE)VAL; |
||||
for(int i=0; i<2*RADIUSY+1; i++) |
||||
for(int j=0; j<2*RADIUSX+1; j++) |
||||
{ |
||||
res = |
||||
#ifndef RECTKERNEL |
||||
mat_kernel[i*(2*RADIUSX+1)+j] ? |
||||
#endif |
||||
MORPH_OP(res,LDS_DAT[mad24(l_y+i,width,l_x+j)]) |
||||
#ifndef RECTKERNEL |
||||
:res |
||||
#endif |
||||
; |
||||
} |
||||
int gidx = get_global_id(0); |
||||
int gidy = get_global_id(1); |
||||
int out_addr = mad24(gidy,dst_step_in_pixel,gidx+dst_offset_in_pixel); |
||||
if(gidx<cols && gidy<rows) |
||||
{ |
||||
dst[out_addr] = res; |
||||
} |
||||
} |
||||
|
||||
#endif |
File diff suppressed because it is too large
@ -0,0 +1,323 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Wu Zailong, bullet@yeah.net |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if defined (DOUBLE_SUPPORT) |
||||
#ifdef cl_khr_fp64 |
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable |
||||
#elif defined (cl_amd_fp64) |
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable |
||||
#endif |
||||
#endif |
||||
|
||||
#ifdef INTER_NEAREST |
||||
#define convertToWT |
||||
#endif |
||||
|
||||
#ifdef BORDER_CONSTANT |
||||
#define EXTRAPOLATE(v2, v) v = scalar; |
||||
#elif defined BORDER_REPLICATE |
||||
#define EXTRAPOLATE(v2, v) \ |
||||
{ \ |
||||
v2 = max(min(v2, (int2)(src_cols - 1, src_rows - 1)), zero); \ |
||||
v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \ |
||||
} |
||||
#elif defined BORDER_WRAP |
||||
#define EXTRAPOLATE(v2, v) \ |
||||
{ \ |
||||
if (v2.x < 0) \ |
||||
v2.x -= ((v2.x - src_cols + 1) / src_cols) * src_cols; \ |
||||
if (v2.x >= src_cols) \ |
||||
v2.x %= src_cols; \ |
||||
\ |
||||
if (v2.y < 0) \ |
||||
v2.y -= ((v2.y - src_rows + 1) / src_rows) * src_rows; \ |
||||
if( v2.y >= src_rows ) \ |
||||
v2.y %= src_rows; \ |
||||
v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \ |
||||
} |
||||
#elif defined(BORDER_REFLECT) || defined(BORDER_REFLECT_101) |
||||
#ifdef BORDER_REFLECT |
||||
#define DELTA int delta = 0 |
||||
#else |
||||
#define DELTA int delta = 1 |
||||
#endif |
||||
#define EXTRAPOLATE(v2, v) \ |
||||
{ \ |
||||
DELTA; \ |
||||
if (src_cols == 1) \ |
||||
v2.x = 0; \ |
||||
else \ |
||||
do \ |
||||
{ \ |
||||
if( v2.x < 0 ) \ |
||||
v2.x = -v2.x - 1 + delta; \ |
||||
else \ |
||||
v2.x = src_cols - 1 - (v2.x - src_cols) - delta; \ |
||||
} \ |
||||
while (v2.x >= src_cols || v2.x < 0); \ |
||||
\ |
||||
if (src_rows == 1) \ |
||||
v2.y = 0; \ |
||||
else \ |
||||
do \ |
||||
{ \ |
||||
if( v2.y < 0 ) \ |
||||
v2.y = -v2.y - 1 + delta; \ |
||||
else \ |
||||
v2.y = src_rows - 1 - (v2.y - src_rows) - delta; \ |
||||
} \ |
||||
while (v2.y >= src_rows || v2.y < 0); \ |
||||
v = convertToWT(src[mad24(v2.y, src_step, v2.x + src_offset)]); \ |
||||
} |
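||||
// BORDER_REFLECT     (delta = 0): fedcba|abcdefgh|hgfedcb   (edge pixel repeated) |
||||
// BORDER_REFLECT_101 (delta = 1): gfedcb|abcdefgh|gfedcba   (edge pixel not repeated) |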
||||
#else |
||||
#error No extrapolation method |
||||
#endif |
||||
|
||||
#define NEED_EXTRAPOLATION(gx, gy) (gx >= src_cols || gy >= src_rows || gx < 0 || gy < 0) |
||||
|
||||
#ifdef INTER_NEAREST |
||||
|
||||
__kernel void remap_2_32FC1(__global const T * restrict src, __global T * dst, |
||||
__global float * map1, __global float * map2, |
||||
int src_offset, int dst_offset, int map1_offset, int map2_offset, |
||||
int src_step, int dst_step, int map1_step, int map2_step, |
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < dst_cols && y < dst_rows) |
||||
{ |
||||
int dstIdx = mad24(y, dst_step, x + dst_offset); |
||||
int map1Idx = mad24(y, map1_step, x + map1_offset); |
||||
int map2Idx = mad24(y, map2_step, x + map2_offset); |
||||
|
||||
int gx = convert_int_sat_rte(map1[map1Idx]); |
||||
int gy = convert_int_sat_rte(map2[map2Idx]); |
||||
|
||||
if (NEED_EXTRAPOLATION(gx, gy)) |
||||
{ |
||||
int2 gxy = (int2)(gx, gy), zero = (int2)(0); |
||||
EXTRAPOLATE(gxy, dst[dstIdx]); |
||||
} |
||||
else |
||||
{ |
||||
int srcIdx = mad24(gy, src_step, gx + src_offset); |
||||
dst[dstIdx] = src[srcIdx]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
__kernel void remap_32FC2(__global const T * restrict src, __global T * dst, __global float2 * map1, |
||||
int src_offset, int dst_offset, int map1_offset, |
||||
int src_step, int dst_step, int map1_step, |
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < dst_cols && y < dst_rows) |
||||
{ |
||||
int dstIdx = mad24(y, dst_step, x + dst_offset); |
||||
int map1Idx = mad24(y, map1_step, x + map1_offset); |
||||
|
||||
int2 gxy = convert_int2_sat_rte(map1[map1Idx]); |
||||
int gx = gxy.x, gy = gxy.y; |
||||
|
||||
if (NEED_EXTRAPOLATION(gx, gy)) |
||||
{ |
||||
int2 zero = (int2)(0); |
||||
EXTRAPOLATE(gxy, dst[dstIdx]); |
||||
} |
||||
else |
||||
{ |
||||
int srcIdx = mad24(gy, src_step, gx + src_offset); |
||||
dst[dstIdx] = src[srcIdx]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
__kernel void remap_16SC2(__global const T * restrict src, __global T * dst, __global short2 * map1, |
||||
int src_offset, int dst_offset, int map1_offset, |
||||
int src_step, int dst_step, int map1_step, |
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, T scalar) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < dst_cols && y < dst_rows) |
||||
{ |
||||
int dstIdx = mad24(y, dst_step, x + dst_offset); |
||||
int map1Idx = mad24(y, map1_step, x + map1_offset); |
||||
|
||||
int2 gxy = convert_int2(map1[map1Idx]); |
||||
int gx = gxy.x, gy = gxy.y; |
||||
|
||||
if (NEED_EXTRAPOLATION(gx, gy)) |
||||
{ |
||||
int2 zero = (int2)(0); |
||||
EXTRAPOLATE(gxy, dst[dstIdx]); |
||||
} |
||||
else |
||||
{ |
||||
int srcIdx = mad24(gy, src_step, gx + src_offset); |
||||
dst[dstIdx] = src[srcIdx]; |
||||
} |
||||
} |
||||
} |
||||
|
||||
#elif INTER_LINEAR |
||||
|
||||
__kernel void remap_2_32FC1(__global T const * restrict src, __global T * dst, |
||||
__global float * map1, __global float * map2, |
||||
int src_offset, int dst_offset, int map1_offset, int map2_offset, |
||||
int src_step, int dst_step, int map1_step, int map2_step, |
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, T nVal) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < dst_cols && y < dst_rows) |
||||
{ |
||||
int dstIdx = mad24(y, dst_step, x + dst_offset); |
||||
int map1Idx = mad24(y, map1_step, x + map1_offset); |
||||
int map2Idx = mad24(y, map2_step, x + map2_offset); |
||||
|
||||
float2 map_data = (float2)(map1[map1Idx], map2[map2Idx]); |
||||
|
||||
int2 map_dataA = convert_int2_sat_rtn(map_data); |
||||
int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y); |
||||
int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1); |
||||
int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y +1); |
||||
int2 zero = (int2)(0); |
||||
|
||||
float2 _u = map_data - convert_float2(map_dataA); |
||||
WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)32)) / (WT2)32; |
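||||
// The fractional offsets are snapped to a 1/32 grid (multiply by 32, round |
||||
// to nearest, divide by 32), presumably to match the 32-entry fixed-point |
||||
// interpolation tables used by the CPU remap path. |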
||||
WT scalar = convertToWT(nVal); |
||||
WT a = scalar, b = scalar, c = scalar, d = scalar; |
||||
|
||||
if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y)) |
||||
a = convertToWT(src[mad24(map_dataA.y, src_step, map_dataA.x + src_offset)]); |
||||
else |
||||
EXTRAPOLATE(map_dataA, a); |
||||
|
||||
if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y)) |
||||
b = convertToWT(src[mad24(map_dataB.y, src_step, map_dataB.x + src_offset)]); |
||||
else |
||||
EXTRAPOLATE(map_dataB, b); |
||||
|
||||
if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y)) |
||||
c = convertToWT(src[mad24(map_dataC.y, src_step, map_dataC.x + src_offset)]); |
||||
else |
||||
EXTRAPOLATE(map_dataC, c); |
||||
|
||||
if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y)) |
||||
d = convertToWT(src[mad24(map_dataD.y, src_step, map_dataD.x + src_offset)]); |
||||
else |
||||
EXTRAPOLATE(map_dataD, d); |
||||
|
||||
WT dst_data = a * (WT)(1 - u.x) * (WT)(1 - u.y) + |
||||
b * (WT)(u.x) * (WT)(1 - u.y) + |
||||
c * (WT)(1 - u.x) * (WT)(u.y) + |
||||
d * (WT)(u.x) * (WT)(u.y); |
||||
dst[dstIdx] = convertToT(dst_data); |
||||
} |
||||
} |
||||
|
||||
__kernel void remap_32FC2(__global T const * restrict src, __global T * dst, |
||||
__global float2 * map1, |
||||
int src_offset, int dst_offset, int map1_offset, |
||||
int src_step, int dst_step, int map1_step, |
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, T nVal) |
||||
{ |
||||
int x = get_global_id(0); |
||||
int y = get_global_id(1); |
||||
|
||||
if (x < dst_cols && y < dst_rows) |
||||
{ |
||||
int dstIdx = mad24(y, dst_step, x + dst_offset); |
||||
int map1Idx = mad24(y, map1_step, x + map1_offset); |
||||
|
||||
float2 map_data = map1[map1Idx]; |
||||
int2 map_dataA = convert_int2_sat_rtn(map_data); |
||||
int2 map_dataB = (int2)(map_dataA.x + 1, map_dataA.y); |
||||
int2 map_dataC = (int2)(map_dataA.x, map_dataA.y + 1); |
||||
int2 map_dataD = (int2)(map_dataA.x + 1, map_dataA.y + 1); |
||||
int2 zero = (int2)(0); |
||||
|
||||
float2 _u = map_data - convert_float2(map_dataA); |
||||
WT2 u = convertToWT2(convert_int2_rte(convertToWT2(_u) * (WT2)32)) / (WT2)32; |
||||
WT scalar = convertToWT(nVal); |
||||
WT a = scalar, b = scalar, c = scalar, d = scalar; |
||||
|
||||
if (!NEED_EXTRAPOLATION(map_dataA.x, map_dataA.y)) |
||||
a = convertToWT(src[mad24(map_dataA.y, src_step, map_dataA.x + src_offset)]); |
||||
else |
||||
EXTRAPOLATE(map_dataA, a); |
||||
|
||||
if (!NEED_EXTRAPOLATION(map_dataB.x, map_dataB.y)) |
||||
b = convertToWT(src[mad24(map_dataB.y, src_step, map_dataB.x + src_offset)]); |
||||
else |
||||
EXTRAPOLATE(map_dataB, b); |
||||
|
||||
if (!NEED_EXTRAPOLATION(map_dataC.x, map_dataC.y)) |
||||
c = convertToWT(src[mad24(map_dataC.y, src_step, map_dataC.x + src_offset)]); |
||||
else |
||||
EXTRAPOLATE(map_dataC, c); |
||||
|
||||
if (!NEED_EXTRAPOLATION(map_dataD.x, map_dataD.y)) |
||||
d = convertToWT(src[mad24(map_dataD.y, src_step, map_dataD.x + src_offset)]); |
||||
else |
||||
EXTRAPOLATE(map_dataD, d); |
||||
|
||||
WT dst_data = a * (WT)(1 - u.x) * (WT)(1 - u.y) + |
||||
b * (WT)(u.x) * (WT)(1 - u.y) + |
||||
c * (WT)(1 - u.x) * (WT)(u.y) + |
||||
d * (WT)(u.x) * (WT)(u.y); |
||||
dst[dstIdx] = convertToT(dst_data); |
||||
} |
||||
} |
||||
|
||||
#endif |
@ -0,0 +1,152 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Zhang Ying, zhangying913@gmail.com |
||||
// Niko Li, newlife20080214@gmail.com |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
|
||||
// resize kernel |
||||
// Currently, CV_8UC1, CV_8UC4, CV_32FC1 and CV_32FC4 are supported. |
||||
// We shall support other types later if necessary. |
||||
|
||||
#if defined DOUBLE_SUPPORT |
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable |
||||
#define F double |
||||
#else |
||||
#define F float |
||||
#endif |
||||
|
||||
#define INTER_RESIZE_COEF_BITS 11 |
||||
#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS) |
||||
#define CAST_BITS (INTER_RESIZE_COEF_BITS << 1) |
||||
#define CAST_SCALE (1.0f/(1<<CAST_BITS)) |
||||
#define INC(x,l) min(x+1,l-1) |
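||||
// In the 8-bit (depth == 0) branch below the bilinear weights are kept in |
||||
// Q11 fixed point: u and v are scaled by INTER_RESIZE_COEF_SCALE (2^11), the |
||||
// products of two weights are therefore scaled by 2^CAST_BITS (2^22), and |
||||
// (val + (1 << (CAST_BITS-1))) >> CAST_BITS rounds back to pixel range. |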
||||
|
||||
#define PIXSIZE ((int)sizeof(PIXTYPE)) |
||||
#define noconvert(x) (x) |
||||
|
||||
#if defined INTER_LINEAR |
||||
|
||||
__kernel void resizeLN(__global const uchar* srcptr, int srcstep, int srcoffset, |
||||
int srcrows, int srccols, |
||||
__global uchar* dstptr, int dststep, int dstoffset, |
||||
int dstrows, int dstcols, |
||||
float ifx, float ify) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
float sx = ((dx+0.5f) * ifx - 0.5f), sy = ((dy+0.5f) * ify - 0.5f); |
||||
int x = floor(sx), y = floor(sy); |
||||
|
||||
float u = sx - x, v = sy - y; |
||||
|
||||
if ( x<0 ) x=0,u=0; |
||||
if ( x>=srccols ) x=srccols-1,u=0; |
||||
if ( y<0 ) y=0,v=0; |
||||
if ( y>=srcrows ) y=srcrows-1,v=0; |
||||
|
||||
int y_ = INC(y,srcrows); |
||||
int x_ = INC(x,srccols); |
||||
const PIXTYPE* src = (const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE)); |
||||
|
||||
#if depth == 0 |
||||
u = u * INTER_RESIZE_COEF_SCALE; |
||||
v = v * INTER_RESIZE_COEF_SCALE; |
||||
|
||||
int U = rint(u); |
||||
int V = rint(v); |
||||
int U1 = rint(INTER_RESIZE_COEF_SCALE - u); |
||||
int V1 = rint(INTER_RESIZE_COEF_SCALE - v); |
||||
|
||||
WORKTYPE data0 = convertToWT(*(const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE))); |
||||
WORKTYPE data1 = convertToWT(*(const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE))); |
||||
WORKTYPE data2 = convertToWT(*(const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE))); |
||||
WORKTYPE data3 = convertToWT(*(const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE))); |
||||
WORKTYPE val = mul24((WORKTYPE)mul24(U1, V1), data0) + mul24((WORKTYPE)mul24(U, V1), data1) + |
||||
mul24((WORKTYPE)mul24(U1, V), data2) + mul24((WORKTYPE)mul24(U, V), data3); |
||||
|
||||
PIXTYPE uval = convertToDT((val + (1<<(CAST_BITS-1)))>>CAST_BITS); |
||||
#else |
||||
float u1 = 1.f-u; |
||||
float v1 = 1.f-v; |
||||
WORKTYPE data0 = convertToWT(*(const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x*PIXSIZE))); |
||||
WORKTYPE data1 = convertToWT(*(const PIXTYPE*)(srcptr + mad24(y, srcstep, srcoffset + x_*PIXSIZE))); |
||||
WORKTYPE data2 = convertToWT(*(const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x*PIXSIZE))); |
||||
WORKTYPE data3 = convertToWT(*(const PIXTYPE*)(srcptr + mad24(y_, srcstep, srcoffset + x_*PIXSIZE))); |
||||
PIXTYPE uval = u1 * v1 * data0 + u * v1 * data1 + u1 * v * data2 + u * v * data3; |
||||
#endif |
||||
|
||||
if(dx < dstcols && dy < dstrows) |
||||
{ |
||||
PIXTYPE* dst = (PIXTYPE*)(dstptr + mad24(dy, dststep, dstoffset + dx*PIXSIZE)); |
||||
dst[0] = uval; |
||||
} |
||||
} |
||||
|
||||
#elif defined INTER_NEAREST |
||||
|
||||
__kernel void resizeNN(__global const uchar* srcptr, int srcstep, int srcoffset, |
||||
int srcrows, int srccols, |
||||
__global uchar* dstptr, int dststep, int dstoffset, |
||||
int dstrows, int dstcols, |
||||
float ifx, float ify) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < dstcols && dy < dstrows ) |
||||
{ |
||||
F s1 = dx*ifx; |
||||
F s2 = dy*ify; |
||||
int sx = min(convert_int_rtz(s1), srccols-1); |
||||
int sy = min(convert_int_rtz(s2), srcrows-1); |
||||
PIXTYPE* dst = (PIXTYPE*)(dstptr + |
||||
mad24(dy, dststep, dstoffset + dx*PIXSIZE)); |
||||
const PIXTYPE* src = (const PIXTYPE*)(srcptr + |
||||
mad24(sy, srcstep, srcoffset + sx*PIXSIZE)); |
||||
dst[0] = src[0]; |
||||
} |
||||
} |
||||
|
||||
#endif |
||||
|
@ -0,0 +1,152 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Zhang Ying, zhangying913@gmail.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
#if defined (DOUBLE_SUPPORT) |
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable |
||||
#endif |
||||
|
||||
// threshold type: |
||||
// enum { THRESH_BINARY=0, THRESH_BINARY_INV=1, THRESH_TRUNC=2, THRESH_TOZERO=3, |
||||
// THRESH_TOZERO_INV=4, THRESH_MASK=7, THRESH_OTSU=8 }; |
||||
|
||||
__kernel void threshold_C1_D0(__global const uchar * restrict src, __global uchar *dst, |
||||
int src_offset, int src_step, |
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step, |
||||
uchar thresh, uchar max_val, int thresh_type |
||||
) |
||||
{ |
||||
int gx = get_global_id(0); |
||||
const int gy = get_global_id(1); |
||||
|
||||
int offset = (dst_offset & 15); |
||||
src_offset -= offset; |
||||
|
||||
int dstart = (gx << 4) - offset; |
||||
if(dstart < dst_cols && gy < dst_rows) |
||||
{ |
||||
uchar16 sdata = vload16(gx, src+src_offset+gy*src_step); |
||||
uchar16 ddata; |
||||
uchar16 zero = 0; |
||||
switch (thresh_type) |
||||
{ |
||||
case 0: |
||||
ddata = ((sdata > thresh) ) ? (uchar16)(max_val) : (uchar16)(0); |
||||
break; |
||||
case 1: |
||||
ddata = ((sdata > thresh)) ? zero : (uchar16)(max_val); |
||||
break; |
||||
case 2: |
||||
ddata = ((sdata > thresh)) ? (uchar16)(thresh) : sdata; |
||||
break; |
||||
case 3: |
||||
ddata = ((sdata > thresh)) ? sdata : zero; |
||||
break; |
||||
case 4: |
||||
ddata = ((sdata > thresh)) ? zero : sdata; |
||||
break; |
||||
default: |
||||
ddata = sdata; |
||||
} |
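||||
// Each work-item thresholds 16 consecutive pixels with the store aligned to |
||||
// dst_offset, so the vector can spill outside [0, dst_cols); dpos/con build |
||||
// a per-lane mask that keeps the previously loaded destination values (dVal) |
||||
// in the out-of-range lanes before the uchar16 store. |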
||||
int16 dpos = (int16)(dstart, dstart+1, dstart+2, dstart+3, dstart+4, dstart+5, dstart+6, dstart+7, dstart+8, |
||||
dstart+9, dstart+10, dstart+11, dstart+12, dstart+13, dstart+14, dstart+15); |
||||
uchar16 dVal = *(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart); |
||||
int16 con = dpos >= 0 && dpos < dst_cols; |
||||
ddata = convert_uchar16(con != 0) ? ddata : dVal; |
||||
if(dstart < dst_cols) |
||||
{ |
||||
*(__global uchar16*)(dst+dst_offset+gy*dst_step+dstart) = ddata; |
||||
} |
||||
} |
||||
} |
||||
|
||||
|
||||
__kernel void threshold_C1_D5(__global const float * restrict src, __global float *dst, |
||||
int src_offset, int src_step, |
||||
int dst_offset, int dst_rows, int dst_cols, int dst_step, |
||||
float thresh, float max_val, int thresh_type |
||||
) |
||||
{ |
||||
const int gx = get_global_id(0); |
||||
const int gy = get_global_id(1); |
||||
|
||||
int offset = (dst_offset & 3); |
||||
src_offset -= offset; |
||||
|
||||
int dstart = (gx << 2) - offset; |
||||
if(dstart < dst_cols && gy < dst_rows) |
||||
{ |
||||
float4 sdata = vload4(gx, src+src_offset+gy*src_step); |
||||
float4 ddata; |
||||
float4 zero = 0; |
||||
switch (thresh_type) |
||||
{ |
||||
case 0: |
||||
ddata = sdata > thresh ? (float4)(max_val) : (float4)(0.f); |
||||
break; |
||||
case 1: |
||||
ddata = sdata > thresh ? zero : (float4)max_val; |
||||
break; |
||||
case 2: |
||||
ddata = sdata > thresh ? (float4)thresh : sdata; |
||||
break; |
||||
case 3: |
||||
ddata = sdata > thresh ? sdata : (float4)(0.f); |
||||
break; |
||||
case 4: |
||||
ddata = sdata > thresh ? (float4)(0.f) : sdata; |
||||
break; |
||||
default: |
||||
ddata = sdata; |
||||
} |
||||
int4 dpos = (int4)(dstart, dstart+1, dstart+2, dstart+3); |
||||
float4 dVal = *(__global float4*)(dst+dst_offset+gy*dst_step+dstart); |
||||
int4 con = dpos >= 0 && dpos < dst_cols; |
||||
ddata = convert_float4(con) != (float4)(0) ? ddata : dVal; |
||||
if(dstart < dst_cols) |
||||
{ |
||||
*(__global float4*)(dst+dst_offset+gy*dst_step+dstart) = ddata; |
||||
} |
||||
} |
||||
} |
@ -0,0 +1,761 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Zhang Ying, zhangying913@gmail.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
|
||||
//warpAffine kernel |
||||
//supported data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4; interpolation methods: NN, Linear, Cubic. |
||||
|
||||
#if defined (DOUBLE_SUPPORT) |
||||
#ifdef cl_khr_fp64 |
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable |
||||
#elif defined (cl_amd_fp64) |
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable |
||||
#endif |
||||
typedef double F; |
||||
typedef double4 F4; |
||||
#define convert_F4 convert_double4 |
||||
#else |
||||
typedef float F; |
||||
typedef float4 F4; |
||||
#define convert_F4 convert_float4 |
||||
#endif |
||||
|
||||
#define INTER_BITS 5 |
||||
#define INTER_TAB_SIZE (1 << INTER_BITS) |
||||
#define INTER_SCALE (1.f/INTER_TAB_SIZE) |
||||
#define AB_BITS max(10, (int)INTER_BITS) |
||||
#define AB_SCALE (1 << AB_BITS) |
||||
#define INTER_REMAP_COEF_BITS 15 |
||||
#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS) |
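||||
// Fixed-point layout used below: the affine map is evaluated in Q(AB_BITS) |
||||
// (AB_SCALE = 2^AB_BITS), the fractional sample position keeps INTER_BITS |
||||
// bits (a 32-entry table), and the interpolation weights are scaled by |
||||
// INTER_REMAP_COEF_SCALE (2^15); each round_delta is half of the amount |
||||
// dropped by the following right shift, i.e. round-to-nearest. |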
||||
|
||||
inline void interpolateCubic( float x, float* coeffs ) |
||||
{ |
||||
const float A = -0.75f; |
||||
|
||||
coeffs[0] = ((A*(x + 1.f) - 5.0f*A)*(x + 1.f) + 8.0f*A)*(x + 1.f) - 4.0f*A; |
||||
coeffs[1] = ((A + 2.f)*x - (A + 3.f))*x*x + 1.f; |
||||
coeffs[2] = ((A + 2.f)*(1.f - x) - (A + 3.f))*(1.f - x)*(1.f - x) + 1.f; |
||||
coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; |
||||
} |
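||||
// interpolateCubic evaluates the cubic convolution (Keys) kernel with |
||||
// A = -0.75; coeffs[3] is defined as 1 - coeffs[0] - coeffs[1] - coeffs[2], |
||||
// so the four weights always sum to exactly 1. |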
||||
|
||||
|
||||
/**********************************************8UC1********************************************* |
||||
***********************************************************************************************/ |
||||
__kernel void warpAffineNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows, |
||||
int dst_cols, int dst_rows, int srcStep, int dstStep, |
||||
int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
dx = (dx<<2) - (dst_offset&3); |
||||
|
||||
int round_delta = (AB_SCALE>>1); |
||||
|
||||
int4 X, Y; |
||||
int4 sx, sy; |
||||
int4 DX = (int4)(dx, dx+1, dx+2, dx+3); |
||||
DX = (DX << AB_BITS); |
||||
F4 M0DX, M3DX; |
||||
M0DX = M[0] * convert_F4(DX); |
||||
M3DX = M[3] * convert_F4(DX); |
||||
X = convert_int4(rint(M0DX)); |
||||
Y = convert_int4(rint(M3DX)); |
||||
int tmp1, tmp2; |
||||
tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE); |
||||
tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE); |
||||
|
||||
X += tmp1 + round_delta; |
||||
Y += tmp2 + round_delta; |
||||
|
||||
sx = convert_int4(convert_short4(X >> AB_BITS)); |
||||
sy = convert_int4(convert_short4(Y >> AB_BITS)); |
||||
|
||||
__global uchar4 * d = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx); |
||||
uchar4 dval = *d; |
||||
DX = (int4)(dx, dx+1, dx+2, dx+3); |
||||
int4 dcon = DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows; |
||||
int4 scon = sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows; |
||||
int4 spos = src_offset + sy * srcStep + sx; |
||||
uchar4 sval; |
||||
sval.s0 = scon.s0 ? src[spos.s0] : 0; |
||||
sval.s1 = scon.s1 ? src[spos.s1] : 0; |
||||
sval.s2 = scon.s2 ? src[spos.s2] : 0; |
||||
sval.s3 = scon.s3 ? src[spos.s3] : 0; |
||||
dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval; |
||||
*d = dval; |
||||
} |
||||
} |
||||
|
||||
__kernel void warpAffineLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, int src_cols, int src_rows, |
||||
int dst_cols, int dst_rows, int srcStep, int dstStep, |
||||
int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
dx = (dx<<2) - (dst_offset&3); |
||||
|
||||
int round_delta = ((AB_SCALE >> INTER_BITS) >> 1); |
||||
|
||||
int4 X, Y; |
||||
short4 ax, ay; |
||||
int4 sx, sy; |
||||
int4 DX = (int4)(dx, dx+1, dx+2, dx+3); |
||||
DX = (DX << AB_BITS); |
||||
F4 M0DX, M3DX; |
||||
M0DX = M[0] * convert_F4(DX); |
||||
M3DX = M[3] * convert_F4(DX); |
||||
X = convert_int4(rint(M0DX)); |
||||
Y = convert_int4(rint(M3DX)); |
||||
|
||||
int tmp1, tmp2; |
||||
tmp1 = rint((M[1]*dy + M[2]) * AB_SCALE); |
||||
tmp2 = rint((M[4]*dy + M[5]) * AB_SCALE); |
||||
|
||||
X += tmp1 + round_delta; |
||||
Y += tmp2 + round_delta; |
||||
|
||||
X = X >> (AB_BITS - INTER_BITS); |
||||
Y = Y >> (AB_BITS - INTER_BITS); |
||||
|
||||
sx = convert_int4(convert_short4(X >> INTER_BITS)); |
||||
sy = convert_int4(convert_short4(Y >> INTER_BITS)); |
||||
ax = convert_short4(X & (INTER_TAB_SIZE-1)); |
||||
ay = convert_short4(Y & (INTER_TAB_SIZE-1)); |
||||
|
||||
uchar4 v0, v1, v2,v3; |
||||
int4 scon0, scon1, scon2, scon3; |
||||
int4 spos0, spos1, spos2, spos3; |
||||
|
||||
scon0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows); |
||||
scon1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows); |
||||
scon2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows); |
||||
scon3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows); |
||||
spos0 = src_offset + sy * srcStep + sx; |
||||
spos1 = src_offset + sy * srcStep + sx + 1; |
||||
spos2 = src_offset + (sy+1) * srcStep + sx; |
||||
spos3 = src_offset + (sy+1) * srcStep + sx + 1; |
||||
|
||||
v0.s0 = scon0.s0 ? src[spos0.s0] : 0; |
||||
v1.s0 = scon1.s0 ? src[spos1.s0] : 0; |
||||
v2.s0 = scon2.s0 ? src[spos2.s0] : 0; |
||||
v3.s0 = scon3.s0 ? src[spos3.s0] : 0; |
||||
|
||||
v0.s1 = scon0.s1 ? src[spos0.s1] : 0; |
||||
v1.s1 = scon1.s1 ? src[spos1.s1] : 0; |
||||
v2.s1 = scon2.s1 ? src[spos2.s1] : 0; |
||||
v3.s1 = scon3.s1 ? src[spos3.s1] : 0; |
||||
|
||||
v0.s2 = scon0.s2 ? src[spos0.s2] : 0; |
||||
v1.s2 = scon1.s2 ? src[spos1.s2] : 0; |
||||
v2.s2 = scon2.s2 ? src[spos2.s2] : 0; |
||||
v3.s2 = scon3.s2 ? src[spos3.s2] : 0; |
||||
|
||||
v0.s3 = scon0.s3 ? src[spos0.s3] : 0; |
||||
v1.s3 = scon1.s3 ? src[spos1.s3] : 0; |
||||
v2.s3 = scon2.s3 ? src[spos2.s3] : 0; |
||||
v3.s3 = scon3.s3 ? src[spos3.s3] : 0; |
||||
|
||||
short4 itab0, itab1, itab2, itab3; |
||||
float4 taby, tabx; |
||||
taby = INTER_SCALE * convert_float4(ay); |
||||
tabx = INTER_SCALE * convert_float4(ax); |
||||
|
||||
itab0 = convert_short4_sat(( (1.0f-taby)*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE )); |
||||
itab1 = convert_short4_sat(( (1.0f-taby)*tabx * (float4)INTER_REMAP_COEF_SCALE )); |
||||
itab2 = convert_short4_sat(( taby*(1.0f-tabx) * (float4)INTER_REMAP_COEF_SCALE )); |
||||
itab3 = convert_short4_sat(( taby*tabx * (float4)INTER_REMAP_COEF_SCALE )); |
||||
|
||||
|
||||
int4 val; |
||||
uchar4 tval; |
||||
val = convert_int4(v0) * convert_int4(itab0) + convert_int4(v1) * convert_int4(itab1) |
||||
+ convert_int4(v2) * convert_int4(itab2) + convert_int4(v3) * convert_int4(itab3); |
||||
tval = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; |
||||
|
||||
__global uchar4 * d =(__global uchar4 *)(dst+dst_offset+dy*dstStep+dx); |
||||
uchar4 dval = *d; |
||||
DX = (int4)(dx, dx+1, dx+2, dx+3); |
||||
int4 dcon = DX >= 0 && DX < dst_cols && dy >= 0 && dy < dst_rows; |
||||
dval = convert_uchar4(dcon != 0) ? tval : dval; |
||||
*d = dval; |
||||
} |
||||
} |
||||
|
||||
__kernel void warpAffineCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows, |
||||
int dst_cols, int dst_rows, int srcStep, int dstStep, |
||||
int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
int round_delta = ((AB_SCALE>>INTER_BITS)>>1); |
||||
|
||||
int X0 = rint(M[0] * dx * AB_SCALE); |
||||
int Y0 = rint(M[3] * dx * AB_SCALE); |
||||
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; |
||||
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; |
||||
int X = X0 >> (AB_BITS - INTER_BITS); |
||||
int Y = Y0 >> (AB_BITS - INTER_BITS); |
||||
|
||||
short sx = (short)(X >> INTER_BITS) - 1; |
||||
short sy = (short)(Y >> INTER_BITS) - 1; |
||||
short ay = (short)(Y & (INTER_TAB_SIZE-1)); |
||||
short ax = (short)(X & (INTER_TAB_SIZE-1)); |
||||
|
||||
uchar v[16]; |
||||
int i, j; |
||||
|
||||
#pragma unroll 4 |
||||
for(i=0; i<4; i++) |
||||
for(j=0; j<4; j++) |
||||
{ |
||||
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : 0; |
||||
} |
||||
|
||||
short itab[16]; |
||||
float tab1y[4], tab1x[4]; |
||||
float axx, ayy; |
||||
|
||||
ayy = 1.f/INTER_TAB_SIZE * ay; |
||||
axx = 1.f/INTER_TAB_SIZE * ax; |
||||
interpolateCubic(ayy, tab1y); |
||||
interpolateCubic(axx, tab1x); |
||||
int isum = 0; |
||||
|
||||
#pragma unroll 16 |
||||
for( i=0; i<16; i++ ) |
||||
{ |
||||
F v = tab1y[(i>>2)] * tab1x[(i&3)]; |
||||
isum += itab[i] = convert_short_sat( rint( v * INTER_REMAP_COEF_SCALE ) ); |
||||
} |
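||||
// Rounding each weight independently can make their sum drift from |
||||
// INTER_REMAP_COEF_SCALE; the block below scans only the lower-right 2x2 of |
||||
// the 4x4 table and adds the shortfall to its largest entry (or subtracts |
||||
// the excess from its smallest), so the integer weights stay normalized. |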
||||
|
||||
if( isum != INTER_REMAP_COEF_SCALE ) |
||||
{ |
||||
int k1, k2; |
||||
int diff = isum - INTER_REMAP_COEF_SCALE; |
||||
int Mk1=2, Mk2=2, mk1=2, mk2=2; |
||||
for( k1 = 2; k1 < 4; k1++ ) |
||||
for( k2 = 2; k2 < 4; k2++ ) |
||||
{ |
||||
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) |
||||
mk1 = k1, mk2 = k2; |
||||
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) |
||||
Mk1 = k1, Mk2 = k2; |
||||
} |
||||
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); |
||||
} |
||||
|
||||
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
{ |
||||
int sum=0; |
||||
for ( i =0; i<16; i++ ) |
||||
{ |
||||
sum += v[i] * itab[i] ; |
||||
} |
||||
dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; |
||||
} |
||||
} |
||||
} |
||||
|
||||
/**********************************************8UC4********************************************* |
||||
***********************************************************************************************/ |
||||
|
||||
__kernel void warpAffineNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, |
||||
int dst_cols, int dst_rows, int srcStep, int dstStep, |
||||
int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
int round_delta = (AB_SCALE >> 1); |
||||
|
||||
int X0 = rint(M[0] * dx * AB_SCALE); |
||||
int Y0 = rint(M[3] * dx * AB_SCALE); |
||||
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; |
||||
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; |
||||
|
||||
int sx0 = (short)(X0 >> AB_BITS); |
||||
int sy0 = (short)(Y0 >> AB_BITS); |
||||
|
||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>2)+sy0*(srcStep>>2)+sx0] : (uchar4)0; |
||||
} |
||||
} |
||||
|
||||
__kernel void warpAffineLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, |
||||
int dst_cols, int dst_rows, int srcStep, int dstStep, |
||||
int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
int round_delta = AB_SCALE/INTER_TAB_SIZE/2; |
||||
|
||||
src_offset = (src_offset>>2); |
||||
srcStep = (srcStep>>2); |
||||
|
||||
int tmp = (dx << AB_BITS); |
||||
int X0 = rint(M[0] * tmp); |
||||
int Y0 = rint(M[3] * tmp); |
||||
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; |
||||
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; |
||||
X0 = X0 >> (AB_BITS - INTER_BITS); |
||||
Y0 = Y0 >> (AB_BITS - INTER_BITS); |
||||
|
||||
short sx0 = (short)(X0 >> INTER_BITS); |
||||
short sy0 = (short)(Y0 >> INTER_BITS); |
||||
short ax0 = (short)(X0 & (INTER_TAB_SIZE-1)); |
||||
short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1)); |
||||
|
||||
int4 v0, v1, v2, v3; |
||||
|
||||
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? convert_int4(src[src_offset+sy0 * srcStep + sx0]) : 0; |
||||
v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? convert_int4(src[src_offset+sy0 * srcStep + sx0+1]) : 0; |
||||
v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? convert_int4(src[src_offset+(sy0+1) * srcStep + sx0]) : 0; |
||||
v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? convert_int4(src[src_offset+(sy0+1) * srcStep + sx0+1]) : 0; |
||||
|
||||
int itab0, itab1, itab2, itab3; |
||||
float taby, tabx; |
||||
taby = 1.f/INTER_TAB_SIZE*ay0; |
||||
tabx = 1.f/INTER_TAB_SIZE*ax0; |
||||
|
||||
itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); |
||||
itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE )); |
||||
itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); |
||||
itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE )); |
||||
|
||||
int4 val; |
||||
val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3; |
||||
|
||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; |
||||
} |
||||
} |
||||
|
||||
__kernel void warpAffineCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, int src_cols, int src_rows, |
||||
int dst_cols, int dst_rows, int srcStep, int dstStep, |
||||
int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
int round_delta = ((AB_SCALE>>INTER_BITS)>>1); |
||||
|
||||
src_offset = (src_offset>>2); |
||||
srcStep = (srcStep>>2); |
||||
dst_offset = (dst_offset>>2); |
||||
dstStep = (dstStep>>2); |
||||
|
||||
int tmp = (dx << AB_BITS); |
||||
int X0 = rint(M[0] * tmp); |
||||
int Y0 = rint(M[3] * tmp); |
||||
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; |
||||
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; |
||||
X0 = X0 >> (AB_BITS - INTER_BITS); |
||||
Y0 = Y0 >> (AB_BITS - INTER_BITS); |
||||
|
||||
int sx = (short)(X0 >> INTER_BITS) - 1; |
||||
int sy = (short)(Y0 >> INTER_BITS) - 1; |
||||
int ay = (short)(Y0 & (INTER_TAB_SIZE-1)); |
||||
int ax = (short)(X0 & (INTER_TAB_SIZE-1)); |
||||
|
||||
uchar4 v[16]; |
||||
int i,j; |
||||
#pragma unroll 4 |
||||
for(i=0; i<4; i++) |
||||
for(j=0; j<4; j++) |
||||
{ |
||||
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; |
||||
} |
||||
int itab[16]; |
||||
float tab1y[4], tab1x[4]; |
||||
float axx, ayy; |
||||
|
||||
ayy = INTER_SCALE * ay; |
||||
axx = INTER_SCALE * ax; |
||||
interpolateCubic(ayy, tab1y); |
||||
interpolateCubic(axx, tab1x); |
||||
int isum = 0; |
||||
|
||||
#pragma unroll 16 |
||||
for( i=0; i<16; i++ ) |
||||
{ |
||||
float tmp; |
||||
tmp = tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE; |
||||
itab[i] = rint(tmp); |
||||
isum += itab[i]; |
||||
} |
||||
|
||||
if( isum != INTER_REMAP_COEF_SCALE ) |
||||
{ |
||||
int k1, k2; |
||||
int diff = isum - INTER_REMAP_COEF_SCALE; |
||||
int Mk1=2, Mk2=2, mk1=2, mk2=2; |
||||
|
||||
for( k1 = 2; k1 < 4; k1++ ) |
||||
for( k2 = 2; k2 < 4; k2++ ) |
||||
{ |
||||
|
||||
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) |
||||
mk1 = k1, mk2 = k2; |
||||
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) |
||||
Mk1 = k1, Mk2 = k2; |
||||
} |
||||
|
||||
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); |
||||
} |
||||
|
||||
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
{ |
||||
int4 sum=0; |
||||
for ( i =0; i<16; i++ ) |
||||
{ |
||||
sum += convert_int4(v[i]) * itab[i]; |
||||
} |
||||
dst[dst_offset+dy*dstStep+dx] = convert_uchar4_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; |
||||
} |
||||
} |
||||
} |
||||
|
||||
|
||||
/**********************************************32FC1******************************************** |
||||
***********************************************************************************************/ |
||||
|
||||
__kernel void warpAffineNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, |
||||
int dst_cols, int dst_rows, int srcStep, int dstStep, |
||||
int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
int round_delta = AB_SCALE/2; |
||||
|
||||
int X0 = rint(M[0] * dx * AB_SCALE); |
||||
int Y0 = rint(M[3] * dx * AB_SCALE); |
||||
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; |
||||
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; |
||||
|
||||
short sx0 = (short)(X0 >> AB_BITS); |
||||
short sy0 = (short)(Y0 >> AB_BITS); |
||||
|
||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
dst[(dst_offset>>2)+dy*dstStep+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>2)+sy0*srcStep+sx0] : 0; |
||||
} |
||||
} |
||||
|
||||
__kernel void warpAffineLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, |
||||
int dst_cols, int dst_rows, int srcStep, int dstStep, |
||||
int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
int round_delta = AB_SCALE/INTER_TAB_SIZE/2; |
||||
|
||||
src_offset = (src_offset>>2); |
||||
|
||||
int X0 = rint(M[0] * dx * AB_SCALE); |
||||
int Y0 = rint(M[3] * dx * AB_SCALE); |
||||
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; |
||||
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; |
||||
X0 = X0 >> (AB_BITS - INTER_BITS); |
||||
Y0 = Y0 >> (AB_BITS - INTER_BITS); |
||||
|
||||
short sx0 = (short)(X0 >> INTER_BITS); |
||||
short sy0 = (short)(Y0 >> INTER_BITS); |
||||
short ax0 = (short)(X0 & (INTER_TAB_SIZE-1)); |
||||
short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1)); |
||||
|
||||
float v0, v1, v2, v3; |
||||
|
||||
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : 0; |
||||
v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : 0; |
||||
v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : 0; |
||||
v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : 0; |
||||
|
||||
float tab[4]; |
||||
float taby[2], tabx[2]; |
||||
taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0; |
||||
taby[1] = 1.f/INTER_TAB_SIZE*ay0; |
||||
tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0; |
||||
tabx[1] = 1.f/INTER_TAB_SIZE*ax0; |
||||
|
||||
tab[0] = taby[0] * tabx[0]; |
||||
tab[1] = taby[0] * tabx[1]; |
||||
tab[2] = taby[1] * tabx[0]; |
||||
tab[3] = taby[1] * tabx[1]; |
||||
|
||||
float sum = 0; |
||||
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; |
||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
dst[(dst_offset>>2)+dy*dstStep+dx] = sum; |
||||
} |
||||
} |
||||
|
||||
__kernel void warpAffineCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, |
||||
int dst_cols, int dst_rows, int srcStep, int dstStep, |
||||
int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
int round_delta = AB_SCALE/INTER_TAB_SIZE/2; |
||||
|
||||
src_offset = (src_offset>>2); |
||||
dst_offset = (dst_offset>>2); |
||||
|
||||
int X0 = rint(M[0] * dx * AB_SCALE); |
||||
int Y0 = rint(M[3] * dx * AB_SCALE); |
||||
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; |
||||
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; |
||||
X0 = X0 >> (AB_BITS - INTER_BITS); |
||||
Y0 = Y0 >> (AB_BITS - INTER_BITS); |
||||
|
||||
short sx = (short)(X0 >> INTER_BITS) - 1; |
||||
short sy = (short)(Y0 >> INTER_BITS) - 1; |
||||
short ay = (short)(Y0 & (INTER_TAB_SIZE-1)); |
||||
short ax = (short)(X0 & (INTER_TAB_SIZE-1)); |
||||
|
||||
float v[16]; |
||||
int i; |
||||
|
||||
for(i=0; i<16; i++) |
||||
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : 0; |
||||
|
||||
float tab[16]; |
||||
float tab1y[4], tab1x[4]; |
||||
float axx, ayy; |
||||
|
||||
ayy = 1.f/INTER_TAB_SIZE * ay; |
||||
axx = 1.f/INTER_TAB_SIZE * ax; |
||||
interpolateCubic(ayy, tab1y); |
||||
interpolateCubic(axx, tab1x); |
||||
|
||||
#pragma unroll 4 |
||||
for( i=0; i<16; i++ ) |
||||
{ |
||||
tab[i] = tab1y[(i>>2)] * tab1x[(i&3)]; |
||||
} |
||||
|
||||
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
{ |
||||
float sum = 0; |
||||
#pragma unroll 4 |
||||
for ( i =0; i<16; i++ ) |
||||
{ |
||||
sum += v[i] * tab[i]; |
||||
} |
||||
dst[dst_offset+dy*dstStep+dx] = sum; |
||||
|
||||
} |
||||
} |
||||
} |
||||
|
||||
|
||||
/**********************************************32FC4******************************************** |
||||
***********************************************************************************************/ |
||||
|
||||
__kernel void warpAffineNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, |
||||
int dst_cols, int dst_rows, int srcStep, int dstStep, |
||||
int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
int round_delta = AB_SCALE/2; |
||||
|
||||
int X0 = rint(M[0] * dx * AB_SCALE); |
||||
int Y0 = rint(M[3] * dx * AB_SCALE); |
||||
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; |
||||
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; |
||||
|
||||
short sx0 = (short)(X0 >> AB_BITS); |
||||
short sy0 = (short)(Y0 >> AB_BITS); |
||||
|
||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx0>=0 && sx0<src_cols && sy0>=0 && sy0<src_rows) ? src[(src_offset>>4)+sy0*(srcStep>>2)+sx0] : (float4)0; |
||||
} |
||||
} |
||||
|
||||
__kernel void warpAffineLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, |
||||
int dst_cols, int dst_rows, int srcStep, int dstStep, |
||||
int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
int round_delta = AB_SCALE/INTER_TAB_SIZE/2; |
||||
|
||||
src_offset = (src_offset>>4); |
||||
dst_offset = (dst_offset>>4); |
||||
srcStep = (srcStep>>2); |
||||
dstStep = (dstStep>>2); |
||||
|
||||
int X0 = rint(M[0] * dx * AB_SCALE); |
||||
int Y0 = rint(M[3] * dx * AB_SCALE); |
||||
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; |
||||
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; |
||||
X0 = X0 >> (AB_BITS - INTER_BITS); |
||||
Y0 = Y0 >> (AB_BITS - INTER_BITS); |
||||
|
||||
short sx0 = (short)(X0 >> INTER_BITS); |
||||
short sy0 = (short)(Y0 >> INTER_BITS); |
||||
short ax0 = (short)(X0 & (INTER_TAB_SIZE-1)); |
||||
short ay0 = (short)(Y0 & (INTER_TAB_SIZE-1)); |
||||
|
||||
float4 v0, v1, v2, v3; |
||||
|
||||
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0; |
||||
v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0; |
||||
v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0; |
||||
v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0; |
||||
|
||||
float tab[4]; |
||||
float taby[2], tabx[2]; |
||||
taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0; |
||||
taby[1] = 1.f/INTER_TAB_SIZE*ay0; |
||||
tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0; |
||||
tabx[1] = 1.f/INTER_TAB_SIZE*ax0; |
||||
|
||||
tab[0] = taby[0] * tabx[0]; |
||||
tab[1] = taby[0] * tabx[1]; |
||||
tab[2] = taby[1] * tabx[0]; |
||||
tab[3] = taby[1] * tabx[1]; |
||||
|
||||
float4 sum = 0; |
||||
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; |
||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
dst[dst_offset+dy*dstStep+dx] = sum; |
||||
} |
||||
} |
||||
|
||||
__kernel void warpAffineCubic_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, |
||||
int dst_cols, int dst_rows, int srcStep, int dstStep, |
||||
int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
int round_delta = AB_SCALE/INTER_TAB_SIZE/2; |
||||
|
||||
src_offset = (src_offset>>4); |
||||
dst_offset = (dst_offset>>4); |
||||
srcStep = (srcStep>>2); |
||||
dstStep = (dstStep>>2); |
||||
|
||||
int X0 = rint(M[0] * dx * AB_SCALE); |
||||
int Y0 = rint(M[3] * dx * AB_SCALE); |
||||
X0 += rint((M[1]*dy + M[2]) * AB_SCALE) + round_delta; |
||||
Y0 += rint((M[4]*dy + M[5]) * AB_SCALE) + round_delta; |
||||
X0 = X0 >> (AB_BITS - INTER_BITS); |
||||
Y0 = Y0 >> (AB_BITS - INTER_BITS); |
||||
|
||||
short sx = (short)(X0 >> INTER_BITS) - 1; |
||||
short sy = (short)(Y0 >> INTER_BITS) - 1; |
||||
short ay = (short)(Y0 & (INTER_TAB_SIZE-1)); |
||||
short ax = (short)(X0 & (INTER_TAB_SIZE-1)); |
||||
|
||||
float4 v[16]; |
||||
int i; |
||||
|
||||
for(i=0; i<16; i++) |
||||
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0; |
||||
|
||||
float tab[16]; |
||||
float tab1y[4], tab1x[4]; |
||||
float axx, ayy; |
||||
|
||||
ayy = 1.f/INTER_TAB_SIZE * ay; |
||||
axx = 1.f/INTER_TAB_SIZE * ax; |
||||
interpolateCubic(ayy, tab1y); |
||||
interpolateCubic(axx, tab1x); |
||||
|
||||
#pragma unroll 4 |
||||
for( i=0; i<16; i++ ) |
||||
{ |
||||
tab[i] = tab1y[(i>>2)] * tab1x[(i&3)]; |
||||
} |
||||
|
||||
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
{ |
||||
float4 sum = 0; |
||||
#pragma unroll 4 |
||||
for ( i =0; i<16; i++ ) |
||||
{ |
||||
sum += v[i] * tab[i]; |
||||
} |
||||
dst[dst_offset+dy*dstStep+dx] = sum; |
||||
|
||||
} |
||||
} |
||||
} |
@ -0,0 +1,688 @@ |
||||
/*M/////////////////////////////////////////////////////////////////////////////////////// |
||||
// |
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. |
||||
// |
||||
// By downloading, copying, installing or using the software you agree to this license. |
||||
// If you do not agree to this license, do not download, install, |
||||
// copy or use the software. |
||||
// |
||||
// |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Zhang Ying, zhangying913@gmail.com |
||||
// |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
//M*/ |
||||
|
||||
|
||||
//warpPerspective kernel |
||||
//supported data types: CV_8UC1, CV_8UC4, CV_32FC1, CV_32FC4; supported interpolation methods: NN, Linear, Cubic. |
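||||
//kernel naming: _C1/_C4 is the channel count and _D0/_D5 the OpenCV depth (CV_8U / CV_32F); |
||||
//M points to the 3x3 perspective matrix stored as 9 values of type F, and threadCols bounds |
||||
//the number of destination columns processed by a launch. |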
||||
|
||||
#if defined (DOUBLE_SUPPORT) |
||||
#ifdef cl_khr_fp64 |
||||
#pragma OPENCL EXTENSION cl_khr_fp64:enable |
||||
#elif defined (cl_amd_fp64) |
||||
#pragma OPENCL EXTENSION cl_amd_fp64:enable |
||||
#endif |
||||
typedef double F; |
||||
typedef double4 F4; |
||||
#define convert_F4 convert_double4 |
||||
#else |
||||
typedef float F; |
||||
typedef float4 F4; |
||||
#define convert_F4 convert_float4 |
||||
#endif |
||||
|
||||
|
||||
#define INTER_BITS 5 |
||||
#define INTER_TAB_SIZE (1 << INTER_BITS) |
||||
#define INTER_SCALE (1.f/INTER_TAB_SIZE) |
||||
#define AB_BITS max(10, (int)INTER_BITS) |
||||
#define AB_SCALE (1 << AB_BITS) |
||||
#define INTER_REMAP_COEF_BITS 15 |
||||
#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS) |
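||||
// Fixed-point layout: the projected coordinate is scaled by INTER_TAB_SIZE, so its low |
||||
// INTER_BITS bits select the sub-pixel interpolation table entry and the remaining bits |
||||
// give the integer pixel; for the 8U kernels the interpolation weights are quantized to |
||||
// INTER_REMAP_COEF_SCALE (1<<15) so the weighted sum can be done in integer arithmetic. |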
||||
|
||||
inline void interpolateCubic( float x, float* coeffs ) |
||||
{ |
||||
const float A = -0.75f; |
||||
|
||||
coeffs[0] = ((A*(x + 1.f) - 5.0f*A)*(x + 1.f) + 8.0f*A)*(x + 1.f) - 4.0f*A; |
||||
coeffs[1] = ((A + 2.f)*x - (A + 3.f))*x*x + 1.f; |
||||
coeffs[2] = ((A + 2.f)*(1.f - x) - (A + 3.f))*(1.f - x)*(1.f - x) + 1.f; |
||||
coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; |
||||
} |
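||||
// interpolateCubic() fills the four weights of the cubic convolution kernel (A = -0.75) for a |
||||
// fractional offset x in [0,1); the last weight is chosen so the four coefficients sum to 1. |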
||||
|
||||
|
||||
/**********************************************8UC1********************************************* |
||||
***********************************************************************************************/ |
||||
__kernel void warpPerspectiveNN_C1_D0(__global uchar const * restrict src, __global uchar * dst, int src_cols, int src_rows, |
||||
int dst_cols, int dst_rows, int srcStep, int dstStep, |
||||
int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
dx = (dx<<2) - (dst_offset&3); |
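||||
// each work-item produces four consecutive destination pixels so the result is stored as one |
||||
// aligned uchar4; lanes that fall outside the destination keep the previous value of dval. |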
||||
|
||||
F4 DX = (F4)(dx, dx+1, dx+2, dx+3); |
||||
F4 X0 = M[0]*DX + M[1]*dy + M[2]; |
||||
F4 Y0 = M[3]*DX + M[4]*dy + M[5]; |
||||
F4 W = M[6]*DX + M[7]*dy + M[8],one=1,zero=0; |
||||
W = (W!=zero) ? one/W : zero; |
||||
short4 X = convert_short4(rint(X0*W)); |
||||
short4 Y = convert_short4(rint(Y0*W)); |
||||
int4 sx = convert_int4(X); |
||||
int4 sy = convert_int4(Y); |
||||
|
||||
int4 DXD = (int4)(dx, dx+1, dx+2, dx+3); |
||||
__global uchar4 * d = (__global uchar4 *)(dst+dst_offset+dy*dstStep+dx); |
||||
uchar4 dval = *d; |
||||
int4 dcon = DXD >= 0 && DXD < dst_cols && dy >= 0 && dy < dst_rows; |
||||
int4 scon = sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows; |
||||
int4 spos = src_offset + sy * srcStep + sx; |
||||
uchar4 sval; |
||||
sval.s0 = scon.s0 ? src[spos.s0] : 0; |
||||
sval.s1 = scon.s1 ? src[spos.s1] : 0; |
||||
sval.s2 = scon.s2 ? src[spos.s2] : 0; |
||||
sval.s3 = scon.s3 ? src[spos.s3] : 0; |
||||
dval = convert_uchar4(dcon) != (uchar4)(0,0,0,0) ? sval : dval; |
||||
*d = dval; |
||||
} |
||||
} |
||||
|
||||
__kernel void warpPerspectiveLinear_C1_D0(__global const uchar * restrict src, __global uchar * dst, |
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, |
||||
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
F X0 = M[0]*dx + M[1]*dy + M[2]; |
||||
F Y0 = M[3]*dx + M[4]*dy + M[5]; |
||||
F W = M[6]*dx + M[7]*dy + M[8]; |
||||
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; |
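||||
// the perspective divide is fused with the INTER_TAB_SIZE scaling, so X and Y carry INTER_BITS |
||||
// fractional bits; pixels where W == 0 map to coordinate 0. |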
||||
int X = rint(X0*W); |
||||
int Y = rint(Y0*W); |
||||
|
||||
int sx = (short)(X >> INTER_BITS); |
||||
int sy = (short)(Y >> INTER_BITS); |
||||
int ay = (short)(Y & (INTER_TAB_SIZE-1)); |
||||
int ax = (short)(X & (INTER_TAB_SIZE-1)); |
||||
|
||||
uchar v[4]; |
||||
int i; |
||||
#pragma unroll 4 |
||||
for(i=0; i<4; i++) |
||||
v[i] = (sx+(i&1) >= 0 && sx+(i&1) < src_cols && sy+(i>>1) >= 0 && sy+(i>>1) < src_rows) ? src[src_offset + (sy+(i>>1)) * srcStep + (sx+(i&1))] : (uchar)0; |
||||
|
||||
short itab[4]; |
||||
float tab1y[2], tab1x[2]; |
||||
tab1y[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; |
||||
tab1y[1] = 1.f/INTER_TAB_SIZE*ay; |
||||
tab1x[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; |
||||
tab1x[1] = 1.f/INTER_TAB_SIZE*ax; |
||||
|
||||
#pragma unroll 4 |
||||
for(i=0; i<4; i++) |
||||
{ |
||||
float v = tab1y[(i>>1)] * tab1x[(i&1)]; |
||||
itab[i] = convert_short_sat(rint( v * INTER_REMAP_COEF_SCALE )); |
||||
} |
||||
if(dx >=0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
{ |
||||
int sum = 0; |
||||
for ( i =0; i<4; i++ ) |
||||
{ |
||||
sum += v[i] * itab[i] ; |
||||
} |
||||
dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat ( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; |
||||
} |
||||
} |
||||
} |
||||
|
||||
__kernel void warpPerspectiveCubic_C1_D0(__global uchar * src, __global uchar * dst, int src_cols, int src_rows, |
||||
int dst_cols, int dst_rows, int srcStep, int dstStep, |
||||
int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
F X0 = M[0]*dx + M[1]*dy + M[2]; |
||||
F Y0 = M[3]*dx + M[4]*dy + M[5]; |
||||
F W = M[6]*dx + M[7]*dy + M[8]; |
||||
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; |
||||
int X = rint(X0*W); |
||||
int Y = rint(Y0*W); |
||||
|
||||
short sx = (short)(X >> INTER_BITS) - 1; |
||||
short sy = (short)(Y >> INTER_BITS) - 1; |
||||
short ay = (short)(Y & (INTER_TAB_SIZE-1)); |
||||
short ax = (short)(X & (INTER_TAB_SIZE-1)); |
||||
|
||||
uchar v[16]; |
||||
int i, j; |
||||
|
||||
#pragma unroll 4 |
||||
for(i=0; i<4; i++) |
||||
for(j=0; j<4; j++) |
||||
{ |
||||
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? src[src_offset+(sy+i) * srcStep + (sx+j)] : (uchar)0; |
||||
} |
||||
|
||||
short itab[16]; |
||||
float tab1y[4], tab1x[4]; |
||||
float axx, ayy; |
||||
|
||||
ayy = 1.f/INTER_TAB_SIZE * ay; |
||||
axx = 1.f/INTER_TAB_SIZE * ax; |
||||
interpolateCubic(ayy, tab1y); |
||||
interpolateCubic(axx, tab1x); |
||||
|
||||
int isum = 0; |
||||
#pragma unroll 16 |
||||
for( i=0; i<16; i++ ) |
||||
{ |
||||
F v = tab1y[(i>>2)] * tab1x[(i&3)]; |
||||
isum += itab[i] = convert_short_sat( rint( v * INTER_REMAP_COEF_SCALE ) ); |
||||
} |
||||
if( isum != INTER_REMAP_COEF_SCALE ) |
||||
{ |
||||
int k1, k2; |
||||
int diff = isum - INTER_REMAP_COEF_SCALE; |
||||
int Mk1=2, Mk2=2, mk1=2, mk2=2; |
||||
for( k1 = 2; k1 < 4; k1++ ) |
||||
for( k2 = 2; k2 < 4; k2++ ) |
||||
{ |
||||
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) |
||||
mk1 = k1, mk2 = k2; |
||||
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) |
||||
Mk1 = k1, Mk2 = k2; |
||||
} |
||||
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); |
||||
} |
||||
|
||||
|
||||
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
{ |
||||
int sum=0; |
||||
for ( i =0; i<16; i++ ) |
||||
{ |
||||
sum += v[i] * itab[i] ; |
||||
} |
||||
dst[dst_offset+dy*dstStep+dx] = convert_uchar_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; |
||||
} |
||||
} |
||||
} |
||||
|
||||
/**********************************************8UC4********************************************* |
||||
***********************************************************************************************/ |
||||
|
||||
__kernel void warpPerspectiveNN_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, |
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, |
||||
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
|
||||
F X0 = M[0]*dx + M[1]*dy + M[2]; |
||||
F Y0 = M[3]*dx + M[4]*dy + M[5]; |
||||
F W = M[6]*dx + M[7]*dy + M[8]; |
||||
W = (W != 0.0) ? 1./W : 0.0; |
||||
int X = rint(X0*W); |
||||
int Y = rint(Y0*W); |
||||
short sx = (short)X; |
||||
short sy = (short)Y; |
||||
|
||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*(srcStep>>2)+sx] : (uchar4)0; |
||||
} |
||||
} |
||||
|
||||
__kernel void warpPerspectiveLinear_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, |
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, |
||||
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
src_offset = (src_offset>>2); |
||||
srcStep = (srcStep>>2); |
||||
|
||||
F X0 = M[0]*dx + M[1]*dy + M[2]; |
||||
F Y0 = M[3]*dx + M[4]*dy + M[5]; |
||||
F W = M[6]*dx + M[7]*dy + M[8]; |
||||
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; |
||||
int X = rint(X0*W); |
||||
int Y = rint(Y0*W); |
||||
|
||||
short sx = (short)(X >> INTER_BITS); |
||||
short sy = (short)(Y >> INTER_BITS); |
||||
short ay = (short)(Y & (INTER_TAB_SIZE-1)); |
||||
short ax = (short)(X & (INTER_TAB_SIZE-1)); |
||||
|
||||
|
||||
int4 v0, v1, v2, v3; |
||||
|
||||
v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx]) : (int4)0; |
||||
v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? convert_int4(src[src_offset+sy * srcStep + sx+1]) : (int4)0; |
||||
v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx]) : (int4)0; |
||||
v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? convert_int4(src[src_offset+(sy+1) * srcStep + sx+1]) : (int4)0; |
||||
|
||||
int itab0, itab1, itab2, itab3; |
||||
float taby, tabx; |
||||
taby = 1.f/INTER_TAB_SIZE*ay; |
||||
tabx = 1.f/INTER_TAB_SIZE*ax; |
||||
|
||||
itab0 = convert_short_sat(rint( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); |
||||
itab1 = convert_short_sat(rint( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE )); |
||||
itab2 = convert_short_sat(rint( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE )); |
||||
itab3 = convert_short_sat(rint( taby*tabx * INTER_REMAP_COEF_SCALE )); |
||||
|
||||
int4 val; |
||||
val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3; |
||||
|
||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
dst[(dst_offset>>2)+dy*(dstStep>>2)+dx] = convert_uchar4_sat ( (val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; |
||||
} |
||||
} |
||||
|
||||
__kernel void warpPerspectiveCubic_C4_D0(__global uchar4 const * restrict src, __global uchar4 * dst, |
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, |
||||
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
src_offset = (src_offset>>2); |
||||
srcStep = (srcStep>>2); |
||||
dst_offset = (dst_offset>>2); |
||||
dstStep = (dstStep>>2); |
||||
|
||||
F X0 = M[0]*dx + M[1]*dy + M[2]; |
||||
F Y0 = M[3]*dx + M[4]*dy + M[5]; |
||||
F W = M[6]*dx + M[7]*dy + M[8]; |
||||
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; |
||||
int X = rint(X0*W); |
||||
int Y = rint(Y0*W); |
||||
|
||||
short sx = (short)(X >> INTER_BITS) - 1; |
||||
short sy = (short)(Y >> INTER_BITS) - 1; |
||||
short ay = (short)(Y & (INTER_TAB_SIZE-1)); |
||||
short ax = (short)(X & (INTER_TAB_SIZE-1)); |
||||
|
||||
uchar4 v[16]; |
||||
int i,j; |
||||
#pragma unroll 4 |
||||
for(i=0; i<4; i++) |
||||
for(j=0; j<4; j++) |
||||
{ |
||||
v[i*4+j] = (sx+j >= 0 && sx+j < src_cols && sy+i >= 0 && sy+i < src_rows) ? (src[src_offset+(sy+i) * srcStep + (sx+j)]) : (uchar4)0; |
||||
} |
||||
int itab[16]; |
||||
float tab1y[4], tab1x[4]; |
||||
float axx, ayy; |
||||
|
||||
ayy = INTER_SCALE * ay; |
||||
axx = INTER_SCALE * ax; |
||||
interpolateCubic(ayy, tab1y); |
||||
interpolateCubic(axx, tab1x); |
||||
int isum = 0; |
||||
|
||||
#pragma unroll 16 |
||||
for( i=0; i<16; i++ ) |
||||
{ |
||||
float tmp; |
||||
tmp = tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE; |
||||
itab[i] = rint(tmp); |
||||
isum += itab[i]; |
||||
} |
||||
|
||||
if( isum != INTER_REMAP_COEF_SCALE ) |
||||
{ |
||||
int k1, k2; |
||||
int diff = isum - INTER_REMAP_COEF_SCALE; |
||||
int Mk1=2, Mk2=2, mk1=2, mk2=2; |
||||
|
||||
for( k1 = 2; k1 < 4; k1++ ) |
||||
for( k2 = 2; k2 < 4; k2++ ) |
||||
{ |
||||
|
||||
if( itab[(k1<<2)+k2] < itab[(mk1<<2)+mk2] ) |
||||
mk1 = k1, mk2 = k2; |
||||
else if( itab[(k1<<2)+k2] > itab[(Mk1<<2)+Mk2] ) |
||||
Mk1 = k1, Mk2 = k2; |
||||
} |
||||
|
||||
diff<0 ? (itab[(Mk1<<2)+Mk2]=(short)(itab[(Mk1<<2)+Mk2]-diff)) : (itab[(mk1<<2)+mk2]=(short)(itab[(mk1<<2)+mk2]-diff)); |
||||
} |
||||
|
||||
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
{ |
||||
int4 sum=0; |
||||
for ( i =0; i<16; i++ ) |
||||
{ |
||||
sum += convert_int4(v[i]) * itab[i]; |
||||
} |
||||
dst[dst_offset+dy*dstStep+dx] = convert_uchar4_sat( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ) ; |
||||
} |
||||
} |
||||
} |
||||
|
||||
|
||||
/**********************************************32FC1******************************************** |
||||
***********************************************************************************************/ |
||||
|
||||
__kernel void warpPerspectiveNN_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, |
||||
int dst_cols, int dst_rows, int srcStep, int dstStep, |
||||
int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
F X0 = M[0]*dx + M[1]*dy + M[2]; |
||||
F Y0 = M[3]*dx + M[4]*dy + M[5]; |
||||
F W = M[6]*dx + M[7]*dy + M[8]; |
||||
W = (W != 0.0) ? 1./W : 0.0; |
||||
int X = rint(X0*W); |
||||
int Y = rint(Y0*W); |
||||
short sx = (short)X; |
||||
short sy = (short)Y; |
||||
|
||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
dst[(dst_offset>>2)+dy*dstStep+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>2)+sy*srcStep+sx] : 0; |
||||
} |
||||
} |
||||
|
||||
__kernel void warpPerspectiveLinear_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, |
||||
int dst_cols, int dst_rows, int srcStep, int dstStep, |
||||
int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
src_offset = (src_offset>>2); |
||||
|
||||
F X0 = M[0]*dx + M[1]*dy + M[2]; |
||||
F Y0 = M[3]*dx + M[4]*dy + M[5]; |
||||
F W = M[6]*dx + M[7]*dy + M[8]; |
||||
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; |
||||
int X = rint(X0*W); |
||||
int Y = rint(Y0*W); |
||||
|
||||
short sx = (short)(X >> INTER_BITS); |
||||
short sy = (short)(Y >> INTER_BITS); |
||||
short ay = (short)(Y & (INTER_TAB_SIZE-1)); |
||||
short ax = (short)(X & (INTER_TAB_SIZE-1)); |
||||
|
||||
float v0, v1, v2, v3; |
||||
|
||||
v0 = (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx] : (float)0; |
||||
v1 = (sx+1 >= 0 && sx+1 < src_cols && sy >= 0 && sy < src_rows) ? src[src_offset+sy * srcStep + sx+1] : (float)0; |
||||
v2 = (sx >= 0 && sx < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx] : (float)0; |
||||
v3 = (sx+1 >= 0 && sx+1 < src_cols && sy+1 >= 0 && sy+1 < src_rows) ? src[src_offset+(sy+1) * srcStep + sx+1] : (float)0; |
||||
|
||||
float tab[4]; |
||||
float taby[2], tabx[2]; |
||||
taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay; |
||||
taby[1] = 1.f/INTER_TAB_SIZE*ay; |
||||
tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax; |
||||
tabx[1] = 1.f/INTER_TAB_SIZE*ax; |
||||
|
||||
tab[0] = taby[0] * tabx[0]; |
||||
tab[1] = taby[0] * tabx[1]; |
||||
tab[2] = taby[1] * tabx[0]; |
||||
tab[3] = taby[1] * tabx[1]; |
||||
|
||||
float sum = 0; |
||||
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; |
||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
dst[(dst_offset>>2)+dy*dstStep+dx] = sum; |
||||
} |
||||
} |
||||
|
||||
__kernel void warpPerspectiveCubic_C1_D5(__global float * src, __global float * dst, int src_cols, int src_rows, |
||||
int dst_cols, int dst_rows, int srcStep, int dstStep, |
||||
int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
src_offset = (src_offset>>2); |
||||
dst_offset = (dst_offset>>2); |
||||
|
||||
F X0 = M[0]*dx + M[1]*dy + M[2]; |
||||
F Y0 = M[3]*dx + M[4]*dy + M[5]; |
||||
F W = M[6]*dx + M[7]*dy + M[8]; |
||||
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; |
||||
int X = rint(X0*W); |
||||
int Y = rint(Y0*W); |
||||
|
||||
short sx = (short)(X >> INTER_BITS) - 1; |
||||
short sy = (short)(Y >> INTER_BITS) - 1; |
||||
short ay = (short)(Y & (INTER_TAB_SIZE-1)); |
||||
short ax = (short)(X & (INTER_TAB_SIZE-1)); |
||||
|
||||
float v[16]; |
||||
int i; |
||||
|
||||
for(i=0; i<16; i++) |
||||
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float)0; |
||||
|
||||
float tab[16]; |
||||
float tab1y[4], tab1x[4]; |
||||
float axx, ayy; |
||||
|
||||
ayy = 1.f/INTER_TAB_SIZE * ay; |
||||
axx = 1.f/INTER_TAB_SIZE * ax; |
||||
interpolateCubic(ayy, tab1y); |
||||
interpolateCubic(axx, tab1x); |
||||
|
||||
#pragma unroll 4 |
||||
for( i=0; i<16; i++ ) |
||||
{ |
||||
tab[i] = tab1y[(i>>2)] * tab1x[(i&3)]; |
||||
} |
||||
|
||||
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
{ |
||||
float sum = 0; |
||||
#pragma unroll 4 |
||||
for ( i =0; i<16; i++ ) |
||||
{ |
||||
sum += v[i] * tab[i]; |
||||
} |
||||
dst[dst_offset+dy*dstStep+dx] = sum; |
||||
|
||||
} |
||||
} |
||||
} |
||||
|
||||
|
||||
/**********************************************32FC4******************************************** |
||||
***********************************************************************************************/ |
||||
|
||||
__kernel void warpPerspectiveNN_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, |
||||
int dst_cols, int dst_rows, int srcStep, int dstStep, |
||||
int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
F X0 = M[0]*dx + M[1]*dy + M[2]; |
||||
F Y0 = M[3]*dx + M[4]*dy + M[5]; |
||||
F W = M[6]*dx + M[7]*dy + M[8]; |
||||
W =(W != 0.0)? 1./W : 0.0; |
||||
int X = rint(X0*W); |
||||
int Y = rint(Y0*W); |
||||
short sx = (short)X; |
||||
short sy = (short)Y; |
||||
|
||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
dst[(dst_offset>>4)+dy*(dstStep>>2)+dx]= (sx>=0 && sx<src_cols && sy>=0 && sy<src_rows) ? src[(src_offset>>4)+sy*(srcStep>>2)+sx] : (float4)0; |
||||
} |
||||
} |
||||
|
||||
__kernel void warpPerspectiveLinear_C4_D5(__global float4 * src, __global float4 * dst, int src_cols, int src_rows, |
||||
int dst_cols, int dst_rows, int srcStep, int dstStep, |
||||
int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows) |
||||
{ |
||||
src_offset = (src_offset>>4); |
||||
dst_offset = (dst_offset>>4); |
||||
srcStep = (srcStep>>2); |
||||
dstStep = (dstStep>>2); |
||||
|
||||
F X0 = M[0]*dx + M[1]*dy + M[2]; |
||||
F Y0 = M[3]*dx + M[4]*dy + M[5]; |
||||
F W = M[6]*dx + M[7]*dy + M[8]; |
||||
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; |
||||
int X = rint(X0*W); |
||||
int Y = rint(Y0*W); |
||||
|
||||
short sx0 = (short)(X >> INTER_BITS); |
||||
short sy0 = (short)(Y >> INTER_BITS); |
||||
short ay0 = (short)(Y & (INTER_TAB_SIZE-1)); |
||||
short ax0 = (short)(X & (INTER_TAB_SIZE-1)); |
||||
|
||||
|
||||
float4 v0, v1, v2, v3; |
||||
|
||||
v0 = (sx0 >= 0 && sx0 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0] : (float4)0; |
||||
v1 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0 >= 0 && sy0 < src_rows) ? src[src_offset+sy0 * srcStep + sx0+1] : (float4)0; |
||||
v2 = (sx0 >= 0 && sx0 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0] : (float4)0; |
||||
v3 = (sx0+1 >= 0 && sx0+1 < src_cols && sy0+1 >= 0 && sy0+1 < src_rows) ? src[src_offset+(sy0+1) * srcStep + sx0+1] : (float4)0; |
||||
|
||||
float tab[4]; |
||||
float taby[2], tabx[2]; |
||||
taby[0] = 1.0 - 1.f/INTER_TAB_SIZE*ay0; |
||||
taby[1] = 1.f/INTER_TAB_SIZE*ay0; |
||||
tabx[0] = 1.0 - 1.f/INTER_TAB_SIZE*ax0; |
||||
tabx[1] = 1.f/INTER_TAB_SIZE*ax0; |
||||
|
||||
tab[0] = taby[0] * tabx[0]; |
||||
tab[1] = taby[0] * tabx[1]; |
||||
tab[2] = taby[1] * tabx[0]; |
||||
tab[3] = taby[1] * tabx[1]; |
||||
|
||||
float4 sum = 0; |
||||
sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; |
||||
if(dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
dst[dst_offset+dy*dstStep+dx] = sum; |
||||
} |
||||
} |
||||
|
||||
__kernel void warpPerspectiveCubic_C4_D5(__global float4 * src, __global float4 * dst, |
||||
int src_cols, int src_rows, int dst_cols, int dst_rows, int srcStep, |
||||
int dstStep, int src_offset, int dst_offset, __constant F * M, int threadCols ) |
||||
{ |
||||
int dx = get_global_id(0); |
||||
int dy = get_global_id(1); |
||||
|
||||
if( dx < threadCols && dy < dst_rows ) |
||||
{ |
||||
src_offset = (src_offset>>4); |
||||
dst_offset = (dst_offset>>4); |
||||
srcStep = (srcStep>>2); |
||||
dstStep = (dstStep>>2); |
||||
|
||||
F X0 = M[0]*dx + M[1]*dy + M[2]; |
||||
F Y0 = M[3]*dx + M[4]*dy + M[5]; |
||||
F W = M[6]*dx + M[7]*dy + M[8]; |
||||
W = (W != 0.0) ? INTER_TAB_SIZE/W : 0.0; |
||||
int X = rint(X0*W); |
||||
int Y = rint(Y0*W); |
||||
|
||||
short sx = (short)(X >> INTER_BITS)-1; |
||||
short sy = (short)(Y >> INTER_BITS)-1; |
||||
short ay = (short)(Y & (INTER_TAB_SIZE-1)); |
||||
short ax = (short)(X & (INTER_TAB_SIZE-1)); |
||||
|
||||
|
||||
float4 v[16]; |
||||
int i; |
||||
|
||||
for(i=0; i<16; i++) |
||||
v[i] = (sx+(i&3) >= 0 && sx+(i&3) < src_cols && sy+(i>>2) >= 0 && sy+(i>>2) < src_rows) ? src[src_offset+(sy+(i>>2)) * srcStep + (sx+(i&3))] : (float4)0; |
||||
|
||||
float tab[16]; |
||||
float tab1y[4], tab1x[4]; |
||||
float axx, ayy; |
||||
|
||||
ayy = 1.f/INTER_TAB_SIZE * ay; |
||||
axx = 1.f/INTER_TAB_SIZE * ax; |
||||
interpolateCubic(ayy, tab1y); |
||||
interpolateCubic(axx, tab1x); |
||||
|
||||
#pragma unroll 4 |
||||
for( i=0; i<16; i++ ) |
||||
{ |
||||
tab[i] = tab1y[(i>>2)] * tab1x[(i&3)]; |
||||
} |
||||
|
||||
if( dx >= 0 && dx < dst_cols && dy >= 0 && dy < dst_rows) |
||||
{ |
||||
float4 sum = 0; |
||||
#pragma unroll 4 |
||||
for ( i =0; i<16; i++ ) |
||||
{ |
||||
sum += v[i] * tab[i]; |
||||
} |
||||
dst[dst_offset+dy*dstStep+dx] = sum; |
||||
|
||||
} |
||||
} |
||||
} |
@ -0,0 +1,81 @@ |
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or implied warranties, including, but not limited to, the implied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "test_precomp.hpp" |
||||
#include <string> |
||||
|
||||
using namespace cv; |
||||
using namespace std; |
||||
|
||||
class CV_ImgprocUMatTest : public cvtest::BaseTest |
||||
{ |
||||
public: |
||||
CV_ImgprocUMatTest() {} |
||||
~CV_ImgprocUMatTest() {} |
||||
protected: |
||||
void run(int) |
||||
{ |
||||
string imgpath = string(ts->get_data_path()) + "shared/lena.png"; |
||||
Mat img = imread(imgpath, 1), gray, smallimg, result; |
||||
UMat uimg = img.getUMat(ACCESS_READ), ugray, usmallimg, uresult; |
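||||
// run the same pipeline on the Mat and on the UMat; the UMat calls go through the transparent |
||||
// OpenCL path (T-API) when an OpenCL device is available. |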
||||
|
||||
cvtColor(img, gray, COLOR_BGR2GRAY); |
||||
resize(gray, smallimg, Size(), 0.75, 0.75, INTER_LINEAR); |
||||
equalizeHist(smallimg, result); |
||||
|
||||
cvtColor(uimg, ugray, COLOR_BGR2GRAY); |
||||
resize(ugray, usmallimg, Size(), 0.75, 0.75, INTER_LINEAR); |
||||
equalizeHist(usmallimg, uresult); |
||||
|
||||
imshow("orig", uimg); |
||||
imshow("small", usmallimg); |
||||
imshow("equalized gray", uresult); |
||||
waitKey(); |
||||
destroyWindow("orig"); |
||||
destroyWindow("small"); |
||||
destroyWindow("equalized gray"); |
||||
|
||||
ts->set_failed_test_info(cvtest::TS::OK); |
||||
} |
||||
}; |
||||
|
||||
TEST(Imgproc_UMat, regression) { CV_ImgprocUMatTest test; test.safe_run(); } |
@ -0,0 +1,423 @@ |
||||
// License Agreement |
||||
// For Open Source Computer Vision Library |
||||
// |
||||
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. |
||||
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. |
||||
// Third party copyrights are property of their respective owners. |
||||
// |
||||
// @Authors |
||||
// Niko Li, newlife20080214@gmail.com |
||||
// Wang Weiyan, wangweiyanster@gmail.com |
||||
// Jia Haipeng, jiahaipeng95@gmail.com |
||||
// Nathan, liujun@multicorewareinc.com |
||||
// Peng Xiao, pengxiao@outlook.com |
||||
// Redistribution and use in source and binary forms, with or without modification, |
||||
// are permitted provided that the following conditions are met: |
||||
// |
||||
// * Redistribution's of source code must retain the above copyright notice, |
||||
// this list of conditions and the following disclaimer. |
||||
// |
||||
// * Redistribution's in binary form must reproduce the above copyright notice, |
||||
// this list of conditions and the following disclaimer in the documentation |
||||
// and/or other materials provided with the distribution. |
||||
// |
||||
// * The name of the copyright holders may not be used to endorse or promote products |
||||
// derived from this software without specific prior written permission. |
||||
// |
||||
// This software is provided by the copyright holders and contributors as is and |
||||
// any express or implied warranties, including, but not limited to, the implied |
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed. |
||||
// In no event shall the Intel Corporation or contributors be liable for any direct, |
||||
// indirect, incidental, special, exemplary, or consequential damages |
||||
// (including, but not limited to, procurement of substitute goods or services; |
||||
// loss of use, data, or profits; or business interruption) however caused |
||||
// and on any theory of liability, whether in contract, strict liability, |
||||
// or tort (including negligence or otherwise) arising in any way out of |
||||
// the use of this software, even if advised of the possibility of such damage. |
||||
// |
||||
// |
||||
|
||||
#pragma OPENCL EXTENSION cl_amd_printf : enable |
||||
#define CV_HAAR_FEATURE_MAX 3 |
||||
|
||||
#define calc_sum(rect,offset) (sum[(rect).p0+offset] - sum[(rect).p1+offset] - sum[(rect).p2+offset] + sum[(rect).p3+offset]) |
||||
#define calc_sum1(rect,offset,i) (sum[(rect).p0[i]+offset] - sum[(rect).p1[i]+offset] - sum[(rect).p2[i]+offset] + sum[(rect).p3[i]+offset]) |
||||
|
||||
typedef int sumtype; |
||||
typedef float sqsumtype; |
||||
|
||||
#ifndef STUMP_BASED |
||||
#define STUMP_BASED 1 |
||||
#endif |
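||||
// STUMP_BASED selects single-node (stump) weak classifiers; when it is 0, each weak classifier |
||||
// is a small tree stored as a root node followed by one child node, walked via left/right. |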
||||
|
||||
typedef struct __attribute__((aligned (128) )) GpuHidHaarTreeNode |
||||
{ |
||||
int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned (64))); |
||||
float weight[CV_HAAR_FEATURE_MAX]; |
||||
float threshold; |
||||
float alpha[3] __attribute__((aligned (16))); |
||||
int left __attribute__((aligned (4))); |
||||
int right __attribute__((aligned (4))); |
||||
} |
||||
GpuHidHaarTreeNode; |
||||
|
||||
|
||||
typedef struct __attribute__((aligned (32))) GpuHidHaarClassifier |
||||
{ |
||||
int count __attribute__((aligned (4))); |
||||
GpuHidHaarTreeNode* node __attribute__((aligned (8))); |
||||
float* alpha __attribute__((aligned (8))); |
||||
} |
||||
GpuHidHaarClassifier; |
||||
|
||||
|
||||
typedef struct __attribute__((aligned (64))) GpuHidHaarStageClassifier |
||||
{ |
||||
int count __attribute__((aligned (4))); |
||||
float threshold __attribute__((aligned (4))); |
||||
int two_rects __attribute__((aligned (4))); |
||||
int reserved0 __attribute__((aligned (8))); |
||||
int reserved1 __attribute__((aligned (8))); |
||||
int reserved2 __attribute__((aligned (8))); |
||||
int reserved3 __attribute__((aligned (8))); |
||||
} |
||||
GpuHidHaarStageClassifier; |
||||
|
||||
|
||||
typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade |
||||
{ |
||||
int count __attribute__((aligned (4))); |
||||
int is_stump_based __attribute__((aligned (4))); |
||||
int has_tilted_features __attribute__((aligned (4))); |
||||
int is_tree __attribute__((aligned (4))); |
||||
int pq0 __attribute__((aligned (4))); |
||||
int pq1 __attribute__((aligned (4))); |
||||
int pq2 __attribute__((aligned (4))); |
||||
int pq3 __attribute__((aligned (4))); |
||||
int p0 __attribute__((aligned (4))); |
||||
int p1 __attribute__((aligned (4))); |
||||
int p2 __attribute__((aligned (4))); |
||||
int p3 __attribute__((aligned (4))); |
||||
float inv_window_area __attribute__((aligned (4))); |
||||
} GpuHidHaarClassifierCascade; |
||||
|
||||
__kernel void __attribute__((reqd_work_group_size(8,8,1))) gpuRunHaarClassifierCascade( |
||||
global GpuHidHaarStageClassifier * stagecascadeptr, |
||||
global int4 * info, |
||||
global GpuHidHaarTreeNode * nodeptr, |
||||
global const int * restrict sum1, |
||||
global const float * restrict sqsum1, |
||||
global int4 * candidate, |
||||
const int pixelstep, |
||||
const int loopcount, |
||||
const int start_stage, |
||||
const int split_stage, |
||||
const int end_stage, |
||||
const int startnode, |
||||
const int splitnode, |
||||
const int4 p, |
||||
const int4 pq, |
||||
const float correction) |
||||
{ |
||||
int grpszx = get_local_size(0); |
||||
int grpszy = get_local_size(1); |
||||
int grpnumx = get_num_groups(0); |
||||
int grpidx = get_group_id(0); |
||||
int lclidx = get_local_id(0); |
||||
int lclidy = get_local_id(1); |
||||
|
||||
int lcl_sz = mul24(grpszx,grpszy); |
||||
int lcl_id = mad24(lclidy,grpszx,lclidx); |
||||
|
||||
__local int lclshare[1024]; |
||||
__local int* lcldata = lclshare;//window data cached from the integral image |
||||
__local int* glboutindex = lcldata + 28*28;//global output index |
||||
__local int* lclcount = glboutindex + 1;//number of pixels that passed the first stages |
||||
__local int* lcloutindex = lclcount + 1;//coordinates and variance of the passing pixels |
||||
__local float* partialsum = (__local float*)(lcloutindex + (lcl_sz<<1)); |
||||
glboutindex[0]=0; |
||||
int outputoff = mul24(grpidx,256); |
||||
|
||||
//the detection window is assumed to be 20x20 |
||||
#define WINDOWSIZE (20+1) |
||||
//make sure readwidth is a multiple of 4 |
||||
//ystep = 1, set by the host code |
||||
int readwidth = ((grpszx-1 + WINDOWSIZE+3)>>2)<<2; |
||||
int readheight = grpszy-1+WINDOWSIZE; |
||||
int read_horiz_cnt = readwidth >> 2;//each read fetches one int4 |
||||
int total_read = mul24(read_horiz_cnt,readheight); |
||||
int read_loop = (total_read + lcl_sz - 1) >> 6; |
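||||
// the work-group cooperatively copies a readwidth x readheight tile of the integral image into |
||||
// local memory, one int4 per read, before evaluating the cascade for its windows. |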
||||
candidate[outputoff+(lcl_id<<2)] = (int4)0; |
||||
candidate[outputoff+(lcl_id<<2)+1] = (int4)0; |
||||
candidate[outputoff+(lcl_id<<2)+2] = (int4)0; |
||||
candidate[outputoff+(lcl_id<<2)+3] = (int4)0; |
||||
for(int scalei = 0; scalei <loopcount; scalei++) |
||||
{ |
||||
int4 scaleinfo1= info[scalei]; |
||||
int width = (scaleinfo1.x & 0xffff0000) >> 16; |
||||
int height = scaleinfo1.x & 0xffff; |
||||
int grpnumperline =(scaleinfo1.y & 0xffff0000) >> 16; |
||||
int totalgrp = scaleinfo1.y & 0xffff; |
||||
int imgoff = scaleinfo1.z; |
||||
float factor = as_float(scaleinfo1.w); |
||||
|
||||
__global const int * sum = sum1 + imgoff; |
||||
__global const float * sqsum = sqsum1 + imgoff; |
||||
for(int grploop=grpidx; grploop<totalgrp; grploop+=grpnumx) |
||||
{ |
||||
int grpidy = grploop / grpnumperline; |
||||
int grpidx = grploop - mul24(grpidy, grpnumperline); |
||||
int x = mad24(grpidx,grpszx,lclidx); |
||||
int y = mad24(grpidy,grpszy,lclidy); |
||||
int grpoffx = x-lclidx; |
||||
int grpoffy = y-lclidy; |
||||
|
||||
for(int i=0; i<read_loop; i++) |
||||
{ |
||||
int pos_id = mad24(i,lcl_sz,lcl_id); |
||||
pos_id = pos_id < total_read ? pos_id : 0; |
||||
|
||||
int lcl_y = pos_id / read_horiz_cnt; |
||||
int lcl_x = pos_id - mul24(lcl_y, read_horiz_cnt); |
||||
|
||||
int glb_x = grpoffx + (lcl_x<<2); |
||||
int glb_y = grpoffy + lcl_y; |
||||
|
||||
int glb_off = mad24(min(glb_y, height - 1),pixelstep,glb_x); |
||||
int4 data = *(__global int4*)&sum[glb_off]; |
||||
int lcl_off = mad24(lcl_y, readwidth, lcl_x<<2); |
||||
|
||||
vstore4(data, 0, &lcldata[lcl_off]); |
||||
} |
||||
|
||||
lcloutindex[lcl_id] = 0; |
||||
lclcount[0] = 0; |
||||
int result = 1; |
||||
int nodecounter= startnode; |
||||
float mean, variance_norm_factor; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int lcl_off = mad24(lclidy,readwidth,lclidx); |
||||
int4 cascadeinfo1, cascadeinfo2; |
||||
cascadeinfo1 = p; |
||||
cascadeinfo2 = pq; |
||||
|
||||
cascadeinfo1.x +=lcl_off; |
||||
cascadeinfo1.z +=lcl_off; |
||||
mean = (lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.x)] - lcldata[mad24(cascadeinfo1.y,readwidth,cascadeinfo1.z)] - |
||||
lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.x)] + lcldata[mad24(cascadeinfo1.w,readwidth,cascadeinfo1.z)]) |
||||
*correction; |
||||
|
||||
int p_offset = mad24(y, pixelstep, x); |
||||
|
||||
cascadeinfo2.x +=p_offset; |
||||
cascadeinfo2.z +=p_offset; |
||||
variance_norm_factor =sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.x)] - sqsum[mad24(cascadeinfo2.y, pixelstep, cascadeinfo2.z)] - |
||||
sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.x)] + sqsum[mad24(cascadeinfo2.w, pixelstep, cascadeinfo2.z)]; |
||||
|
||||
variance_norm_factor = variance_norm_factor * correction - mean * mean; |
||||
variance_norm_factor = variance_norm_factor >=0.f ? sqrt(variance_norm_factor) : 1.f; |
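||||
// variance_norm_factor is the standard deviation of the detection window, derived from the |
||||
// integral and squared-integral images; node thresholds are scaled by it so the cascade is |
||||
// insensitive to local contrast. |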
||||
|
||||
for(int stageloop = start_stage; (stageloop < split_stage) && result; stageloop++ ) |
||||
{ |
||||
float stage_sum = 0.f; |
||||
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); |
||||
float stagethreshold = as_float(stageinfo.y); |
||||
for(int nodeloop = 0; nodeloop < stageinfo.x; ) |
||||
{ |
||||
__global GpuHidHaarTreeNode* currentnodeptr = (nodeptr + nodecounter); |
||||
|
||||
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0])); |
||||
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); |
||||
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0])); |
||||
float4 w = *(__global float4*)(&(currentnodeptr->weight[0])); |
||||
float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0])); |
||||
|
||||
float nodethreshold = w.w * variance_norm_factor; |
||||
|
||||
info1.x +=lcl_off; |
||||
info1.z +=lcl_off; |
||||
info2.x +=lcl_off; |
||||
info2.z +=lcl_off; |
||||
|
||||
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] - |
||||
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x; |
||||
|
||||
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] - |
||||
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y; |
||||
|
||||
info3.x +=lcl_off; |
||||
info3.z +=lcl_off; |
||||
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] - |
||||
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z; |
||||
|
||||
bool passThres = classsum >= nodethreshold; |
||||
#if STUMP_BASED |
||||
stage_sum += passThres ? alpha3.y : alpha3.x; |
||||
nodecounter++; |
||||
nodeloop++; |
||||
#else |
||||
bool isRootNode = (nodecounter & 1) == 0; |
||||
if(isRootNode) |
||||
{ |
||||
if( (passThres && currentnodeptr->right) || |
||||
(!passThres && currentnodeptr->left)) |
||||
{ |
||||
nodecounter ++; |
||||
} |
||||
else |
||||
{ |
||||
stage_sum += alpha3.x; |
||||
nodecounter += 2; |
||||
nodeloop ++; |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
stage_sum += passThres ? alpha3.z : alpha3.y; |
||||
nodecounter ++; |
||||
nodeloop ++; |
||||
} |
||||
#endif |
||||
} |
||||
|
||||
result = (stage_sum >= stagethreshold); |
||||
} |
||||
|
||||
if(result && (x < width) && (y < height)) |
||||
{ |
||||
int queueindex = atomic_inc(lclcount); |
||||
lcloutindex[queueindex<<1] = (lclidy << 16) | lclidx; |
||||
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor); |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
int queuecount = lclcount[0]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
nodecounter = splitnode; |
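||||
// second phase: windows that survived the first split_stage stages were queued in local memory; |
||||
// the remaining stages are evaluated cooperatively, with several work-items sharing one queued |
||||
// window and their partial sums reduced below. |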
||||
for(int stageloop = split_stage; stageloop< end_stage && queuecount>0; stageloop++) |
||||
{ |
||||
lclcount[0]=0; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
|
||||
int2 stageinfo = *(global int2*)(stagecascadeptr+stageloop); |
||||
float stagethreshold = as_float(stageinfo.y); |
||||
|
||||
int perfscale = queuecount > 4 ? 3 : 2; |
||||
int queuecount_loop = (queuecount + (1<<perfscale)-1) >> perfscale; |
||||
int lcl_compute_win = lcl_sz >> perfscale; |
||||
int lcl_compute_win_id = (lcl_id >>(6-perfscale)); |
||||
int lcl_loops = (stageinfo.x + lcl_compute_win -1) >> (6-perfscale); |
||||
int lcl_compute_id = lcl_id - (lcl_compute_win_id << (6-perfscale)); |
||||
for(int queueloop=0; queueloop<queuecount_loop; queueloop++) |
||||
{ |
||||
float stage_sum = 0.f; |
||||
int temp_coord = lcloutindex[lcl_compute_win_id<<1]; |
||||
float variance_norm_factor = as_float(lcloutindex[(lcl_compute_win_id<<1)+1]); |
||||
int queue_pixel = mad24(((temp_coord & (int)0xffff0000)>>16),readwidth,temp_coord & 0xffff); |
||||
|
||||
if(lcl_compute_win_id < queuecount) |
||||
{ |
||||
int tempnodecounter = lcl_compute_id; |
||||
float part_sum = 0.f; |
||||
const int stump_factor = STUMP_BASED ? 1 : 2; |
||||
int root_offset = 0; |
||||
for(int lcl_loop=0; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x;) |
||||
{ |
||||
__global GpuHidHaarTreeNode* currentnodeptr = |
||||
nodeptr + (nodecounter + tempnodecounter) * stump_factor + root_offset; |
||||
|
||||
int4 info1 = *(__global int4*)(&(currentnodeptr->p[0][0])); |
||||
int4 info2 = *(__global int4*)(&(currentnodeptr->p[1][0])); |
||||
int4 info3 = *(__global int4*)(&(currentnodeptr->p[2][0])); |
||||
float4 w = *(__global float4*)(&(currentnodeptr->weight[0])); |
||||
float3 alpha3 = *(__global float3*)(&(currentnodeptr->alpha[0])); |
||||
float nodethreshold = w.w * variance_norm_factor; |
||||
|
||||
info1.x +=queue_pixel; |
||||
info1.z +=queue_pixel; |
||||
info2.x +=queue_pixel; |
||||
info2.z +=queue_pixel; |
||||
|
||||
float classsum = (lcldata[mad24(info1.y,readwidth,info1.x)] - lcldata[mad24(info1.y,readwidth,info1.z)] - |
||||
lcldata[mad24(info1.w,readwidth,info1.x)] + lcldata[mad24(info1.w,readwidth,info1.z)]) * w.x; |
||||
|
||||
|
||||
classsum += (lcldata[mad24(info2.y,readwidth,info2.x)] - lcldata[mad24(info2.y,readwidth,info2.z)] - |
||||
lcldata[mad24(info2.w,readwidth,info2.x)] + lcldata[mad24(info2.w,readwidth,info2.z)]) * w.y; |
||||
|
||||
info3.x +=queue_pixel; |
||||
info3.z +=queue_pixel; |
||||
classsum += (lcldata[mad24(info3.y,readwidth,info3.x)] - lcldata[mad24(info3.y,readwidth,info3.z)] - |
||||
lcldata[mad24(info3.w,readwidth,info3.x)] + lcldata[mad24(info3.w,readwidth,info3.z)]) * w.z; |
||||
|
||||
bool passThres = classsum >= nodethreshold; |
||||
#if STUMP_BASED |
||||
part_sum += passThres ? alpha3.y : alpha3.x; |
||||
tempnodecounter += lcl_compute_win; |
||||
lcl_loop++; |
||||
#else |
||||
if(root_offset == 0) |
||||
{ |
||||
if( (passThres && currentnodeptr->right) || |
||||
(!passThres && currentnodeptr->left)) |
||||
{ |
||||
root_offset = 1; |
||||
} |
||||
else |
||||
{ |
||||
part_sum += alpha3.x; |
||||
tempnodecounter += lcl_compute_win; |
||||
lcl_loop++; |
||||
} |
||||
} |
||||
else |
||||
{ |
||||
part_sum += passThres ? alpha3.z : alpha3.y; |
||||
tempnodecounter += lcl_compute_win; |
||||
lcl_loop++; |
||||
root_offset = 0; |
||||
} |
||||
#endif |
||||
}//end for(int lcl_loop=0;lcl_loop<lcl_loops;lcl_loop++) |
||||
partialsum[lcl_id]=part_sum; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
if(lcl_compute_win_id < queuecount) |
||||
{ |
||||
for(int i=0; i<lcl_compute_win && (lcl_compute_id==0); i++) |
||||
{ |
||||
stage_sum += partialsum[lcl_id+i]; |
||||
} |
||||
if(stage_sum >= stagethreshold && (lcl_compute_id==0)) |
||||
{ |
||||
int queueindex = atomic_inc(lclcount); |
||||
lcloutindex[queueindex<<1] = temp_coord; |
||||
lcloutindex[(queueindex<<1)+1] = as_int(variance_norm_factor); |
||||
} |
||||
lcl_compute_win_id +=(1<<perfscale); |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
}//end for(int queueloop=0;queueloop<queuecount_loop;queueloop++) |
||||
|
||||
queuecount = lclcount[0]; |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
nodecounter += stageinfo.x; |
||||
}//end for(int stageloop = splitstage; stageloop< endstage && queuecount>0;stageloop++) |
||||
|
||||
if(lcl_id<queuecount) |
||||
{ |
||||
int temp = lcloutindex[lcl_id<<1]; |
||||
int x = mad24(grpidx,grpszx,temp & 0xffff); |
||||
int y = mad24(grpidy,grpszy,((temp & (int)0xffff0000) >> 16)); |
||||
temp = glboutindex[0]; |
||||
int4 candidate_result; |
||||
candidate_result.zw = (int2)convert_int_rtn(factor*20.f); |
||||
candidate_result.x = convert_int_rtn(x*factor); |
||||
candidate_result.y = convert_int_rtn(y*factor); |
||||
atomic_inc(glboutindex); |
||||
candidate[outputoff+temp+lcl_id] = candidate_result; |
||||
} |
||||
barrier(CLK_LOCAL_MEM_FENCE); |
||||
}//end for(int grploop=grpidx;grploop<totalgrp;grploop+=grpnumx) |
||||
}//end for(int scalei = 0; scalei <loopcount; scalei++) |
||||
} |
@ -0,0 +1,306 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Wu Xinglong, wxl370@126.com
//    Sen Liu, swjtuls1987@126.com
//    Peng Xiao, pengxiao@outlook.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

// Enter your kernel in this window
//#pragma OPENCL EXTENSION cl_amd_printf:enable
#define CV_HAAR_FEATURE_MAX 3
typedef int sumtype;
typedef float sqsumtype;

typedef struct __attribute__((aligned(128))) GpuHidHaarTreeNode
{
int p[CV_HAAR_FEATURE_MAX][4] __attribute__((aligned(64)));
float weight[CV_HAAR_FEATURE_MAX] /*__attribute__((aligned (16)))*/;
float threshold /*__attribute__((aligned (4)))*/;
float alpha[3] __attribute__((aligned(16)));
int left __attribute__((aligned(4)));
int right __attribute__((aligned(4)));
}
GpuHidHaarTreeNode;
typedef struct __attribute__((aligned(32))) GpuHidHaarClassifier
{
int count __attribute__((aligned(4)));
GpuHidHaarTreeNode *node __attribute__((aligned(8)));
float *alpha __attribute__((aligned(8)));
}
GpuHidHaarClassifier;
typedef struct __attribute__((aligned(64))) GpuHidHaarStageClassifier
{
int count __attribute__((aligned(4)));
float threshold __attribute__((aligned(4)));
int two_rects __attribute__((aligned(4)));
int reserved0 __attribute__((aligned(8)));
int reserved1 __attribute__((aligned(8)));
int reserved2 __attribute__((aligned(8)));
int reserved3 __attribute__((aligned(8)));
}
GpuHidHaarStageClassifier;
typedef struct __attribute__((aligned(64))) GpuHidHaarClassifierCascade
{
int count __attribute__((aligned(4)));
int is_stump_based __attribute__((aligned(4)));
int has_tilted_features __attribute__((aligned(4)));
int is_tree __attribute__((aligned(4)));
int pq0 __attribute__((aligned(4)));
int pq1 __attribute__((aligned(4)));
int pq2 __attribute__((aligned(4)));
int pq3 __attribute__((aligned(4)));
int p0 __attribute__((aligned(4)));
int p1 __attribute__((aligned(4)));
int p2 __attribute__((aligned(4)));
int p3 __attribute__((aligned(4)));
float inv_window_area __attribute__((aligned(4)));
} GpuHidHaarClassifierCascade;

__kernel void gpuRunHaarClassifierCascade_scaled2(
global GpuHidHaarStageClassifier *stagecascadeptr,
global int4 *info,
global GpuHidHaarTreeNode *nodeptr,
global const int *restrict sum,
global const float *restrict sqsum,
global int4 *candidate,
const int rows,
const int cols,
const int step,
const int loopcount,
const int start_stage,
const int split_stage,
const int end_stage,
const int startnode,
global int4 *p,
global float *correction,
const int nodecount)
{
int grpszx = get_local_size(0);
int grpszy = get_local_size(1);
int grpnumx = get_num_groups(0);
int grpidx = get_group_id(0);
int lclidx = get_local_id(0);
int lclidy = get_local_id(1);
int lcl_sz = mul24(grpszx, grpszy);
int lcl_id = mad24(lclidy, grpszx, lclidx);
__local int glboutindex[1];
__local int lclcount[1];
__local int lcloutindex[64];
glboutindex[0] = 0;
int outputoff = mul24(grpidx, 256);
candidate[outputoff + (lcl_id << 2)] = (int4)0;
candidate[outputoff + (lcl_id << 2) + 1] = (int4)0;
candidate[outputoff + (lcl_id << 2) + 2] = (int4)0;
candidate[outputoff + (lcl_id << 2) + 3] = (int4)0;
int max_idx = rows * cols - 1;
for (int scalei = 0; scalei < loopcount; scalei++)
{
int4 scaleinfo1;
scaleinfo1 = info[scalei];
int width = (scaleinfo1.x & 0xffff0000) >> 16;
int height = scaleinfo1.x & 0xffff;
int grpnumperline = (scaleinfo1.y & 0xffff0000) >> 16;
int totalgrp = scaleinfo1.y & 0xffff;
float factor = as_float(scaleinfo1.w);
float correction_t = correction[scalei];
int ystep = (int)(max(2.0f, factor) + 0.5f);

for (int grploop = get_group_id(0); grploop < totalgrp; grploop += grpnumx)
{
int4 cascadeinfo = p[scalei];
int grpidy = grploop / grpnumperline;
int grpidx = grploop - mul24(grpidy, grpnumperline);
int ix = mad24(grpidx, grpszx, lclidx);
int iy = mad24(grpidy, grpszy, lclidy);
int x = ix * ystep;
int y = iy * ystep;
lcloutindex[lcl_id] = 0;
lclcount[0] = 0;
int nodecounter;
float mean, variance_norm_factor;
//if((ix < width) && (iy < height))
{
const int p_offset = mad24(y, step, x);
cascadeinfo.x += p_offset;
cascadeinfo.z += p_offset;
mean = (sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
- sum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
+ sum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)])
* correction_t;
variance_norm_factor = sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.x), 0, max_idx)]
- sqsum[clamp(mad24(cascadeinfo.y, step, cascadeinfo.z), 0, max_idx)] -
sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.x), 0, max_idx)]
+ sqsum[clamp(mad24(cascadeinfo.w, step, cascadeinfo.z), 0, max_idx)];
variance_norm_factor = variance_norm_factor * correction_t - mean * mean;
variance_norm_factor = variance_norm_factor >= 0.f ? sqrt(variance_norm_factor) : 1.f;
bool result = true;
nodecounter = startnode + nodecount * scalei;
for (int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++)
{
float stage_sum = 0.f;
int stagecount = stagecascadeptr[stageloop].count;
for (int nodeloop = 0; nodeloop < stagecount;)
{
__global GpuHidHaarTreeNode *currentnodeptr = (nodeptr + nodecounter);
int4 info1 = *(__global int4 *)(&(currentnodeptr->p[0][0]));
int4 info2 = *(__global int4 *)(&(currentnodeptr->p[1][0]));
int4 info3 = *(__global int4 *)(&(currentnodeptr->p[2][0]));
float4 w = *(__global float4 *)(&(currentnodeptr->weight[0]));
float3 alpha3 = *(__global float3 *)(&(currentnodeptr->alpha[0]));
float nodethreshold = w.w * variance_norm_factor;

info1.x += p_offset;
info1.z += p_offset;
info2.x += p_offset;
info2.z += p_offset;
info3.x += p_offset;
info3.z += p_offset;
float classsum = (sum[clamp(mad24(info1.y, step, info1.x), 0, max_idx)]
- sum[clamp(mad24(info1.y, step, info1.z), 0, max_idx)] -
sum[clamp(mad24(info1.w, step, info1.x), 0, max_idx)]
+ sum[clamp(mad24(info1.w, step, info1.z), 0, max_idx)]) * w.x;
classsum += (sum[clamp(mad24(info2.y, step, info2.x), 0, max_idx)]
- sum[clamp(mad24(info2.y, step, info2.z), 0, max_idx)] -
sum[clamp(mad24(info2.w, step, info2.x), 0, max_idx)]
+ sum[clamp(mad24(info2.w, step, info2.z), 0, max_idx)]) * w.y;
classsum += (sum[clamp(mad24(info3.y, step, info3.x), 0, max_idx)]
- sum[clamp(mad24(info3.y, step, info3.z), 0, max_idx)] -
sum[clamp(mad24(info3.w, step, info3.x), 0, max_idx)]
+ sum[clamp(mad24(info3.w, step, info3.z), 0, max_idx)]) * w.z;

bool passThres = classsum >= nodethreshold;

#if STUMP_BASED
stage_sum += passThres ? alpha3.y : alpha3.x;
nodecounter++;
nodeloop++;
#else
bool isRootNode = (nodecounter & 1) == 0;
if(isRootNode)
{
if( (passThres && currentnodeptr->right) ||
(!passThres && currentnodeptr->left))
{
nodecounter ++;
}
else
{
stage_sum += alpha3.x;
nodecounter += 2;
nodeloop ++;
}
}
else
{
stage_sum += (passThres ? alpha3.z : alpha3.y);
nodecounter ++;
nodeloop ++;
}
#endif
}
result = (int)(stage_sum >= stagecascadeptr[stageloop].threshold);
}

barrier(CLK_LOCAL_MEM_FENCE);

if (result && (ix < width) && (iy < height))
{
int queueindex = atomic_inc(lclcount);
lcloutindex[queueindex] = (y << 16) | x;
}
barrier(CLK_LOCAL_MEM_FENCE);
int queuecount = lclcount[0];

if (lcl_id < queuecount)
{
int temp = lcloutindex[lcl_id];
int x = temp & 0xffff;
int y = (temp & (int)0xffff0000) >> 16;
temp = atomic_inc(glboutindex);
int4 candidate_result;
candidate_result.zw = (int2)convert_int_rtn(factor * 20.f);
candidate_result.x = x;
candidate_result.y = y;
candidate[outputoff + temp + lcl_id] = candidate_result;
}

barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
}
__kernel void gpuscaleclassifier(global GpuHidHaarTreeNode *orinode, global GpuHidHaarTreeNode *newnode, float scale, float weight_scale, int nodenum)
{
int counter = get_global_id(0);
int tr_x[3], tr_y[3], tr_h[3], tr_w[3], i = 0;
GpuHidHaarTreeNode t1 = *(orinode + counter);
#pragma unroll

for (i = 0; i < 3; i++)
{
tr_x[i] = (int)(t1.p[i][0] * scale + 0.5f);
tr_y[i] = (int)(t1.p[i][1] * scale + 0.5f);
tr_w[i] = (int)(t1.p[i][2] * scale + 0.5f);
tr_h[i] = (int)(t1.p[i][3] * scale + 0.5f);
}

t1.weight[0] = t1.p[2][0] ? -(t1.weight[1] * tr_h[1] * tr_w[1] + t1.weight[2] * tr_h[2] * tr_w[2]) / (tr_h[0] * tr_w[0]) : -t1.weight[1] * tr_h[1] * tr_w[1] / (tr_h[0] * tr_w[0]);
counter += nodenum;
#pragma unroll

for (i = 0; i < 3; i++)
{
newnode[counter].p[i][0] = tr_x[i];
newnode[counter].p[i][1] = tr_y[i];
newnode[counter].p[i][2] = tr_x[i] + tr_w[i];
newnode[counter].p[i][3] = tr_y[i] + tr_h[i];
newnode[counter].weight[i] = t1.weight[i] * weight_scale;
}

newnode[counter].left = t1.left;
newnode[counter].right = t1.right;
newnode[counter].threshold = t1.threshold;
newnode[counter].alpha[0] = t1.alpha[0];
newnode[counter].alpha[1] = t1.alpha[1];
newnode[counter].alpha[2] = t1.alpha[2];
}
@ -0,0 +1,276 @@
#include "opencv2/objdetect.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
#include "opencv2/core/utility.hpp"
#include "opencv2/core/ocl.hpp"

#include <cctype>
#include <iostream>
#include <iterator>
#include <stdio.h>

using namespace std;
using namespace cv;

static void help()
{
cout << "\nThis program demonstrates the cascade recognizer. Now you can use Haar or LBP features.\n"
"This classifier can recognize many kinds of rigid objects, once the appropriate classifier is trained.\n"
"Its best-known use is for faces.\n"
"Usage:\n"
"./facedetect [--cascade=<cascade_path> this is the primary trained classifier such as frontal face]\n"
"   [--nested-cascade[=nested_cascade_path this is an optional secondary classifier such as eyes]]\n"
"   [--scale=<image scale greater than or equal to 1, try 1.3 for example>]\n"
"   [--try-flip]\n"
"   [filename|camera_index]\n\n"
"see facedetect.cmd for an example call:\n"
"./facedetect --cascade=\"../../data/haarcascades/haarcascade_frontalface_alt.xml\" --nested-cascade=\"../../data/haarcascades/haarcascade_eye.xml\" --scale=1.3\n\n"
"During execution:\n\tHit any key to quit.\n"
"\tUsing OpenCV version " << CV_VERSION << "\n" << endl;
}

void detectAndDraw( UMat& img, Mat& canvas, CascadeClassifier& cascade,
CascadeClassifier& nestedCascade,
double scale, bool tryflip );

string cascadeName = "../../data/haarcascades/haarcascade_frontalface_alt.xml";
string nestedCascadeName = "../../data/haarcascades/haarcascade_eye_tree_eyeglasses.xml";

int main( int argc, const char** argv )
{
VideoCapture capture;
UMat frame, image;
Mat canvas;
const string scaleOpt = "--scale=";
size_t scaleOptLen = scaleOpt.length();
const string cascadeOpt = "--cascade=";
size_t cascadeOptLen = cascadeOpt.length();
const string nestedCascadeOpt = "--nested-cascade";
size_t nestedCascadeOptLen = nestedCascadeOpt.length();
const string tryFlipOpt = "--try-flip";
size_t tryFlipOptLen = tryFlipOpt.length();
String inputName;
bool tryflip = false;

help();

CascadeClassifier cascade, nestedCascade;
double scale = 1;

for( int i = 1; i < argc; i++ )
{
cout << "Processing " << i << " " << argv[i] << endl;
if( cascadeOpt.compare( 0, cascadeOptLen, argv[i], cascadeOptLen ) == 0 )
{
cascadeName.assign( argv[i] + cascadeOptLen );
cout << "  from which we have cascadeName= " << cascadeName << endl;
}
else if( nestedCascadeOpt.compare( 0, nestedCascadeOptLen, argv[i], nestedCascadeOptLen ) == 0 )
{
if( argv[i][nestedCascadeOpt.length()] == '=' )
nestedCascadeName.assign( argv[i] + nestedCascadeOpt.length() + 1 );
if( !nestedCascade.load( nestedCascadeName ) )
cerr << "WARNING: Could not load classifier cascade for nested objects" << endl;
}
else if( scaleOpt.compare( 0, scaleOptLen, argv[i], scaleOptLen ) == 0 )
{
if( !sscanf( argv[i] + scaleOpt.length(), "%lf", &scale ) || scale > 1 )
scale = 1;
cout << " from which we read scale = " << scale << endl;
}
else if( tryFlipOpt.compare( 0, tryFlipOptLen, argv[i], tryFlipOptLen ) == 0 )
{
tryflip = true;
cout << " will try to flip image horizontally to detect asymmetric objects\n";
}
else if( argv[i][0] == '-' )
{
cerr << "WARNING: Unknown option " << argv[i] << endl;
}
else
inputName = argv[i];
}

if( !cascade.load( cascadeName ) )
{
cerr << "ERROR: Could not load classifier cascade" << endl;
help();
return -1;
}

if( inputName.empty() || (isdigit(inputName.c_str()[0]) && inputName.c_str()[1] == '\0') )
{
int c = inputName.empty() ? 0 : inputName.c_str()[0] - '0';
if(!capture.open(c))
cout << "Capture from camera #" << c << " didn't work" << endl;
}
else
{
if( inputName.empty() )
inputName = "lena.jpg";
image = imread( inputName, 1 ).getUMat(ACCESS_READ);
if( image.empty() )
{
if(!capture.open( inputName ))
cout << "Could not read " << inputName << endl;
}
}

namedWindow( "result", 1 );

if( capture.isOpened() )
{
cout << "Video capturing has been started ..." << endl;
for(;;)
{
capture >> frame;
if( frame.empty() )
break;

detectAndDraw( frame, canvas, cascade, nestedCascade, scale, tryflip );

if( waitKey( 10 ) >= 0 )
break;
}
}
else
{
cout << "Detecting face(s) in " << inputName << endl;
if( !image.empty() )
{
detectAndDraw( image, canvas, cascade, nestedCascade, scale, tryflip );
waitKey(0);
}
else if( !inputName.empty() )
{
/* assume it is a text file containing the
list of the image filenames to be processed - one per line */
FILE* f = fopen( inputName.c_str(), "rt" );
if( f )
{
char buf[1000+1];
while( fgets( buf, 1000, f ) )
{
int len = (int)strlen(buf), c;
while( len > 0 && isspace(buf[len-1]) )
len--;
buf[len] = '\0';
cout << "file " << buf << endl;
image = imread( buf, 1 ).getUMat(ACCESS_READ);
if( !image.empty() )
{
detectAndDraw( image, canvas, cascade, nestedCascade, scale, tryflip );
c = waitKey(0);
if( c == 27 || c == 'q' || c == 'Q' )
break;
}
else
{
cerr << "Aw snap, couldn't read image " << buf << endl;
}
}
fclose(f);
}
}
}

return 0;
}

void detectAndDraw( UMat& img, Mat& canvas, CascadeClassifier& cascade,
CascadeClassifier& nestedCascade,
double scale0, bool tryflip )
{
int i = 0;
double t = 0, scale=1;
vector<Rect> faces, faces2;
const static Scalar colors[] =
{
Scalar(0,0,255),
Scalar(0,128,255),
Scalar(0,255,255),
Scalar(0,255,0),
Scalar(255,128,0),
Scalar(255,255,0),
Scalar(255,0,0),
Scalar(255,0,255)
};
static UMat gray, smallImg;

t = (double)getTickCount();

cvtColor( img, gray, COLOR_BGR2GRAY );
resize( gray, smallImg, Size(), scale0, scale0, INTER_LINEAR );
cvtColor(smallImg, canvas, COLOR_GRAY2BGR);
equalizeHist( smallImg, smallImg );

cascade.detectMultiScale( smallImg, faces,
1.1, 2, 0
//|CASCADE_FIND_BIGGEST_OBJECT
//|CASCADE_DO_ROUGH_SEARCH
|CASCADE_SCALE_IMAGE
,
Size(30, 30) );
if( tryflip )
{
flip(smallImg, smallImg, 1);
cascade.detectMultiScale( smallImg, faces2,
1.1, 2, 0
//|CASCADE_FIND_BIGGEST_OBJECT
//|CASCADE_DO_ROUGH_SEARCH
|CASCADE_SCALE_IMAGE
,
Size(30, 30) );
for( vector<Rect>::const_iterator r = faces2.begin(); r != faces2.end(); r++ )
{
faces.push_back(Rect(smallImg.cols - r->x - r->width, r->y, r->width, r->height));
}
}
t = (double)getTickCount() - t;
cvtColor(smallImg, canvas, COLOR_GRAY2BGR);

double fps = getTickFrequency()/t;

putText(canvas, format("OpenCL: %s, fps: %.1f", ocl::useOpenCL() ? "ON" : "OFF", fps), Point(250, 50),
FONT_HERSHEY_SIMPLEX, 1, Scalar(0,255,0), 3);

for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
{
vector<Rect> nestedObjects;
Point center;
Scalar color = colors[i%8];
int radius;

double aspect_ratio = (double)r->width/r->height;
if( 0.75 < aspect_ratio && aspect_ratio < 1.3 )
{
center.x = cvRound((r->x + r->width*0.5)*scale);
center.y = cvRound((r->y + r->height*0.5)*scale);
radius = cvRound((r->width + r->height)*0.25*scale);
circle( canvas, center, radius, color, 3, 8, 0 );
}
else
rectangle( canvas, Point(cvRound(r->x*scale), cvRound(r->y*scale)),
Point(cvRound((r->x + r->width-1)*scale), cvRound((r->y + r->height-1)*scale)),
color, 3, 8, 0);
if( nestedCascade.empty() )
continue;
UMat smallImgROI = smallImg(*r);
nestedCascade.detectMultiScale( smallImgROI, nestedObjects,
1.1, 2, 0
//|CASCADE_FIND_BIGGEST_OBJECT
//|CASCADE_DO_ROUGH_SEARCH
//|CASCADE_DO_CANNY_PRUNING
|CASCADE_SCALE_IMAGE
,
Size(30, 30) );
for( vector<Rect>::const_iterator nr = nestedObjects.begin(); nr != nestedObjects.end(); nr++ )
{
center.x = cvRound((r->x + nr->x + nr->width*0.5)*scale);
center.y = cvRound((r->y + nr->y + nr->height*0.5)*scale);
radius = cvRound((nr->width + nr->height)*0.25*scale);
circle( canvas, center, radius, color, 3, 8, 0 );
}
}
imshow( "result", canvas );
}