mirror of https://github.com/opencv/opencv.git
Merge pull request #23098 from savuor:nanMask
finiteMask() and doubles for patchNaNs() #23098

Related to #22826

Connected PR in extra: [#1037@extra](https://github.com/opencv/opencv_extra/pull/1037)

### TODOs:
- [ ] Vectorize `finiteMask()` for 64FC3 and 64FC4

### Changes
This PR:
* adds a new function `finiteMask()`
* extends `patchNaNs()` with CV_64F support
* moves `patchNaNs()` and `finiteMask()` to a separate file

**NOTE:** the function is now called `finiteMask()`, as discussed with the OpenCV core team

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There are accuracy tests, performance tests and test data in the opencv_extra repository, if applicable. The patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
parent 34f34f6227, commit 53aad98a1a — 15 changed files with 1190 additions and 138 deletions
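For context, a minimal usage sketch of the API touched by this PR (illustrative only, not part of the diff; the sample matrix and values are made up):

```cpp
#include <opencv2/core.hpp>
#include <limits>

int main()
{
    // Hypothetical input: a small CV_64F matrix with a NaN and an Inf in it
    cv::Mat img(4, 4, CV_64FC1, cv::Scalar(1.0));
    img.at<double>(0, 0) = std::numeric_limits<double>::quiet_NaN();
    img.at<double>(1, 1) = std::numeric_limits<double>::infinity();

    // New function: per-pixel finiteness mask, 255 where all channels are finite, 0 otherwise
    cv::Mat mask;
    cv::finiteMask(img, mask);   // mask is CV_8UC1, zero at (0,0) and (1,1)

    // Extended function: patchNaNs() now also accepts CV_64F input;
    // NaNs are replaced in place with the given value, Inf values are left as-is
    cv::patchNaNs(img, 0.0);

    return 0;
}
```

`finiteMask()` writes a CV_8UC1 mask that is 255 only where every channel of a pixel is finite, while `patchNaNs()` replaces NaN elements in place and leaves ±Inf untouched, as the bit tests in the diff below show.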
@@ -0,0 +1,152 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "precomp.hpp"
#include "opencl_kernels_core.hpp"

#include "nan_mask.simd.hpp"
#include "nan_mask.simd_declarations.hpp" // defines CV_CPU_DISPATCH_MODES_ALL=AVX2,...,BASELINE based on CMakeLists.txt content

namespace cv {

#ifdef HAVE_OPENCL

static bool ocl_patchNaNs( InputOutputArray _a, double value )
{
    int ftype = _a.depth();

    const ocl::Device d = ocl::Device::getDefault();
    bool doubleSupport = d.doubleFPConfig() > 0;
    if (!doubleSupport && ftype == CV_64F)
    {
        return false;
    }

    int rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
    ocl::Kernel k("KF", ocl::core::arithm_oclsrc,
                  format("-D UNARY_OP -D OP_PATCH_NANS -D dstT=%s -D DEPTH_dst=%d -D rowsPerWI=%d %s",
                         ftype == CV_64F ? "double" : "float", ftype, rowsPerWI,
                         doubleSupport ? "-D DOUBLE_SUPPORT" : ""));
    if (k.empty())
        return false;

    UMat a = _a.getUMat();
    int cn = a.channels();

    // to pass float or double to args
    if (ftype == CV_32F)
    {
        k.args(ocl::KernelArg::ReadOnlyNoSize(a), ocl::KernelArg::WriteOnly(a, cn), (float)value);
    }
    else // CV_64F
    {
        k.args(ocl::KernelArg::ReadOnlyNoSize(a), ocl::KernelArg::WriteOnly(a, cn), value);
    }

    size_t globalsize[2] = { (size_t)a.cols * cn, ((size_t)a.rows + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}

#endif

static PatchNanFunc getPatchNanFunc(bool isDouble)
{
    CV_INSTRUMENT_REGION();
    CV_CPU_DISPATCH(getPatchNanFunc, (isDouble), CV_CPU_DISPATCH_MODES_ALL);
}

void patchNaNs( InputOutputArray _a, double _val )
{
    CV_INSTRUMENT_REGION();
    CV_Assert( _a.depth() == CV_32F || _a.depth() == CV_64F);

    CV_OCL_RUN(_a.isUMat() && _a.dims() <= 2,
               ocl_patchNaNs(_a, _val))

    Mat a = _a.getMat();
    const Mat* arrays[] = {&a, 0};
    uchar* ptrs[1] = {};
    NAryMatIterator it(arrays, ptrs);
    size_t len = it.size*a.channels();

    PatchNanFunc func = getPatchNanFunc(_a.depth() == CV_64F);

    for (size_t i = 0; i < it.nplanes; i++, ++it)
    {
        func(ptrs[0], len, _val);
    }
}


#ifdef HAVE_OPENCL

static bool ocl_finiteMask(const UMat img, UMat mask)
{
    int channels = img.channels();
    int depth = img.depth();

    const ocl::Device d = ocl::Device::getDefault();
    bool doubleSupport = d.doubleFPConfig() > 0;
    if (!doubleSupport && depth == CV_64F)
    {
        return false;
    }

    int rowsPerWI = ocl::Device::getDefault().isIntel() ? 4 : 1;
    ocl::Kernel k("finiteMask", ocl::core::finitemask_oclsrc,
                  format("-D srcT=%s -D cn=%d -D rowsPerWI=%d %s",
                         depth == CV_32F ? "float" : "double", channels, rowsPerWI,
                         doubleSupport ? "-D DOUBLE_SUPPORT" : ""));
    if (k.empty())
        return false;

    k.args(ocl::KernelArg::ReadOnlyNoSize(img), ocl::KernelArg::WriteOnly(mask));

    size_t globalsize[2] = { (size_t)img.cols, ((size_t)img.rows + rowsPerWI - 1) / rowsPerWI };
    return k.run(2, globalsize, NULL, false);
}

#endif

static FiniteMaskFunc getFiniteMaskFunc(bool isDouble, int cn)
{
    CV_INSTRUMENT_REGION();
    CV_CPU_DISPATCH(getFiniteMaskFunc, (isDouble, cn), CV_CPU_DISPATCH_MODES_ALL);
}

void finiteMask(InputArray _src, OutputArray _mask)
{
    CV_INSTRUMENT_REGION();

    int channels = _src.channels();
    int depth = _src.depth();
    CV_Assert( channels > 0 && channels <= 4);
    CV_Assert( depth == CV_32F || depth == CV_64F );
    std::vector<int> vsz(_src.dims());
    _src.sizend(vsz.data());
    _mask.create(_src.dims(), vsz.data(), CV_8UC1);

    CV_OCL_RUN(_src.isUMat() && _mask.isUMat() && _src.dims() <= 2,
               ocl_finiteMask(_src.getUMat(), _mask.getUMat()));

    Mat src = _src.getMat();
    Mat mask = _mask.getMat();

    const Mat *arrays[]={&src, &mask, 0};
    Mat planes[2];
    NAryMatIterator it(arrays, planes);
    size_t total = planes[0].total();
    size_t i, nplanes = it.nplanes;

    FiniteMaskFunc func = getFiniteMaskFunc((depth == CV_64F), channels);

    for( i = 0; i < nplanes; i++, ++it )
    {
        const uchar* sptr = planes[0].ptr();
        uchar* dptr = planes[1].ptr();

        func(sptr, dptr, total);
    }
}
} //namespace cv
@@ -0,0 +1,440 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html


#include "precomp.hpp"

namespace cv {

typedef void (*PatchNanFunc)(uchar* tptr, size_t len, double newVal);
typedef void (*FiniteMaskFunc)(const uchar *src, uchar *dst, size_t total);

CV_CPU_OPTIMIZATION_NAMESPACE_BEGIN

PatchNanFunc getPatchNanFunc(bool isDouble);
FiniteMaskFunc getFiniteMaskFunc(bool isDouble, int cn);

#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY

static void patchNaNs_32f(uchar* ptr, size_t ulen, double newVal)
{
    CV_INSTRUMENT_REGION();

    int32_t* tptr = (int32_t*)ptr;
    Cv32suf val;
    val.f = (float)newVal;

    int j = 0;
    int len = (int)ulen;

#if (CV_SIMD || CV_SIMD_SCALABLE)
    v_int32 v_pos_mask = vx_setall_s32(0x7fffffff), v_exp_mask = vx_setall_s32(0x7f800000);
    v_int32 v_val = vx_setall_s32(val.i);

    int cWidth = VTraits<v_int32>::vlanes();
    for (; j < len - cWidth*2 + 1; j += cWidth*2)
    {
        v_int32 v_src0 = vx_load(tptr + j);
        v_int32 v_src1 = vx_load(tptr + j + cWidth);

        v_int32 v_cmp_mask0 = v_lt(v_exp_mask, v_and(v_src0, v_pos_mask));
        v_int32 v_cmp_mask1 = v_lt(v_exp_mask, v_and(v_src1, v_pos_mask));

        if (v_check_any(v_or(v_cmp_mask0, v_cmp_mask1)))
        {
            v_int32 v_dst0 = v_select(v_cmp_mask0, v_val, v_src0);
            v_int32 v_dst1 = v_select(v_cmp_mask1, v_val, v_src1);

            v_store(tptr + j, v_dst0);
            v_store(tptr + j + cWidth, v_dst1);
        }
    }
#endif

    for (; j < len; j++)
    {
        if ((tptr[j] & 0x7fffffff) > 0x7f800000)
        {
            tptr[j] = val.i;
        }
    }
}


static void patchNaNs_64f(uchar* ptr, size_t ulen, double newVal)
{
    CV_INSTRUMENT_REGION();

    int64_t* tptr = (int64_t*)ptr;
    Cv64suf val;
    val.f = newVal;

    int j = 0;
    int len = (int)ulen;

#if (CV_SIMD || CV_SIMD_SCALABLE)
    v_int64 v_exp_mask = vx_setall_s64(0x7FF0000000000000);
    v_int64 v_pos_mask = vx_setall_s64(0x7FFFFFFFFFFFFFFF);
    v_int64 v_val = vx_setall_s64(val.i);

    int cWidth = VTraits<v_int64>::vlanes();
    for (; j < len - cWidth * 2 + 1; j += cWidth*2)
    {
        v_int64 v_src0 = vx_load(tptr + j);
        v_int64 v_src1 = vx_load(tptr + j + cWidth);

        v_int64 v_cmp_mask0 = v_lt(v_exp_mask, v_and(v_src0, v_pos_mask));
        v_int64 v_cmp_mask1 = v_lt(v_exp_mask, v_and(v_src1, v_pos_mask));

        if (v_check_any(v_cmp_mask0) || v_check_any(v_cmp_mask1))
        {
            // v_select is not available for v_int64, emulating it
            v_int32 val32 = v_reinterpret_as_s32(v_val);
            v_int64 v_dst0 = v_reinterpret_as_s64(v_select(v_reinterpret_as_s32(v_cmp_mask0), val32, v_reinterpret_as_s32(v_src0)));
            v_int64 v_dst1 = v_reinterpret_as_s64(v_select(v_reinterpret_as_s32(v_cmp_mask1), val32, v_reinterpret_as_s32(v_src1)));

            v_store(tptr + j, v_dst0);
            v_store(tptr + j + cWidth, v_dst1);
        }
    }
#endif

    for (; j < len; j++)
        if ((tptr[j] & 0x7FFFFFFFFFFFFFFF) > 0x7FF0000000000000)
            tptr[j] = val.i;
}

PatchNanFunc getPatchNanFunc(bool isDouble)
{
    return isDouble ? (PatchNanFunc)GET_OPTIMIZED(patchNaNs_64f)
                    : (PatchNanFunc)GET_OPTIMIZED(patchNaNs_32f);
}

////// finiteMask //////

#if (CV_SIMD || CV_SIMD_SCALABLE)

template <typename _Tp, int cn>
int finiteMaskSIMD_(const _Tp *src, uchar *dst, size_t total);

template <>
int finiteMaskSIMD_<float, 1>(const float *fsrc, uchar *dst, size_t utotal)
{
    const uint32_t* src = (const uint32_t*)fsrc;
    const int osize = VTraits<v_uint8>::vlanes();
    v_uint32 vmaskExp = vx_setall_u32(0x7f800000);

    int i = 0;
    int total = (int)utotal;
    for(; i < total - osize + 1; i += osize )
    {
        v_uint32 vv0, vv1, vv2, vv3;
        vv0 = v_ne(v_and(vx_load(src + i              ), vmaskExp), vmaskExp);
        vv1 = v_ne(v_and(vx_load(src + i +   (osize/4)), vmaskExp), vmaskExp);
        vv2 = v_ne(v_and(vx_load(src + i + 2*(osize/4)), vmaskExp), vmaskExp);
        vv3 = v_ne(v_and(vx_load(src + i + 3*(osize/4)), vmaskExp), vmaskExp);

        v_store(dst + i, v_pack_b(vv0, vv1, vv2, vv3));
    }

    return i;
}


template <>
int finiteMaskSIMD_<float, 2>(const float *fsrc, uchar *dst, size_t utotal)
{
    const uint32_t* src = (const uint32_t*)fsrc;
    const int size8 = VTraits<v_uint8>::vlanes();
    v_uint32 vmaskExp = vx_setall_u32(0x7f800000);
    v_uint16 vmaskBoth = vx_setall_u16(0xffff);

    int i = 0;
    int total = (int)utotal;
    for(; i < total - (size8 / 2) + 1; i += (size8 / 2) )
    {
        v_uint32 vv0, vv1, vv2, vv3;
        vv0 = v_ne(v_and(vx_load(src + i*2                ), vmaskExp), vmaskExp);
        vv1 = v_ne(v_and(vx_load(src + i*2 +   (size8 / 4)), vmaskExp), vmaskExp);
        vv2 = v_ne(v_and(vx_load(src + i*2 + 2*(size8 / 4)), vmaskExp), vmaskExp);
        vv3 = v_ne(v_and(vx_load(src + i*2 + 3*(size8 / 4)), vmaskExp), vmaskExp);
        v_uint8 velems = v_pack_b(vv0, vv1, vv2, vv3);
        v_uint16 vfinite = v_eq(v_reinterpret_as_u16(velems), vmaskBoth);

        // 2nd argument in vfinite is useless
        v_store_low(dst + i, v_pack(vfinite, vfinite));
    }

    return i;
}


template <>
int finiteMaskSIMD_<float, 3>(const float *fsrc, uchar *dst, size_t utotal)
{
    const uint32_t* src = (const uint32_t*)fsrc;
    const int npixels = VTraits<v_float32>::vlanes();
    v_uint32 vmaskExp = vx_setall_u32(0x7f800000);
    v_uint32 z = vx_setzero_u32();

    int i = 0;
    int total = (int)utotal;
    for (; i < total - npixels + 1; i += npixels)
    {
        v_uint32 vv0, vv1, vv2;
        vv0 = v_ne(v_and(vx_load(src + i*3            ), vmaskExp), vmaskExp);
        vv1 = v_ne(v_and(vx_load(src + i*3 +   npixels), vmaskExp), vmaskExp);
        vv2 = v_ne(v_and(vx_load(src + i*3 + 2*npixels), vmaskExp), vmaskExp);

        v_uint8 velems = v_pack_b(vv0, vv1, vv2, z);

        // 2nd arg is useless
        v_uint8 vsh1 = v_extract<1>(velems, velems);
        v_uint8 vsh2 = v_extract<2>(velems, velems);

        v_uint8 vres3 = v_and(v_and(velems, vsh1), vsh2);
        for (int j = 0; j < npixels; j++)
        {
            dst[i + j] = v_get0(vres3);
            // 2nd arg is useless
            vres3 = v_extract<3>(vres3, vres3);
        }
    }

    return i;
}


template <>
int finiteMaskSIMD_<float, 4>(const float *fsrc, uchar *dst, size_t utotal)
{
    const uint32_t* src = (const uint32_t*)fsrc;
    const int npixels = VTraits<v_uint8>::vlanes() / 2;
    const int nfloats = VTraits<v_uint32>::vlanes();
    const v_uint32 vMaskExp = vx_setall_u32(0x7f800000);
    v_uint32 vmaskAll4 = vx_setall_u32(0xFFFFFFFF);

    int i = 0;
    int total = (int)utotal;
    for(; i < total - npixels + 1; i += npixels )
    {
        v_uint32 v0 = vx_load(src + i * 4 + 0*nfloats);
        v_uint32 v1 = vx_load(src + i * 4 + 1*nfloats);
        v_uint32 v2 = vx_load(src + i * 4 + 2*nfloats);
        v_uint32 v3 = vx_load(src + i * 4 + 3*nfloats);
        v_uint32 v4 = vx_load(src + i * 4 + 4*nfloats);
        v_uint32 v5 = vx_load(src + i * 4 + 5*nfloats);
        v_uint32 v6 = vx_load(src + i * 4 + 6*nfloats);
        v_uint32 v7 = vx_load(src + i * 4 + 7*nfloats);

        v_uint32 vmask0 = v_ne(v_and(v0, vMaskExp), vMaskExp);
        v_uint32 vmask1 = v_ne(v_and(v1, vMaskExp), vMaskExp);
        v_uint32 vmask2 = v_ne(v_and(v2, vMaskExp), vMaskExp);
        v_uint32 vmask3 = v_ne(v_and(v3, vMaskExp), vMaskExp);
        v_uint32 vmask4 = v_ne(v_and(v4, vMaskExp), vMaskExp);
        v_uint32 vmask5 = v_ne(v_and(v5, vMaskExp), vMaskExp);
        v_uint32 vmask6 = v_ne(v_and(v6, vMaskExp), vMaskExp);
        v_uint32 vmask7 = v_ne(v_and(v7, vMaskExp), vMaskExp);

        v_uint8 velems0 = v_pack_b(vmask0, vmask1, vmask2, vmask3);
        v_uint8 velems1 = v_pack_b(vmask4, vmask5, vmask6, vmask7);

        v_uint32 vresWide0 = v_eq(v_reinterpret_as_u32(velems0), vmaskAll4);
        v_uint32 vresWide1 = v_eq(v_reinterpret_as_u32(velems1), vmaskAll4);

        // last 2 args are useless
        v_uint8 vres = v_pack_b(vresWide0, vresWide1, vresWide0, vresWide1);

        v_store_low(dst + i, vres);
    }

    return i;
}


template <>
int finiteMaskSIMD_<double, 1>(const double *dsrc, uchar *dst, size_t utotal)
{
    const uint64_t* src = (const uint64_t*)dsrc;
    const int size8 = VTraits<v_uint8>::vlanes();
    int i = 0;
    int total = (int)utotal;

    v_uint64 vmaskExp = vx_setall_u64(0x7ff0000000000000);
    v_uint64 z = vx_setzero_u64();

    for(; i < total - (size8 / 2) + 1; i += (size8 / 2) )
    {
        v_uint64 vv0, vv1, vv2, vv3;
        vv0 = v_ne(v_and(vx_load(src + i                ), vmaskExp), vmaskExp);
        vv1 = v_ne(v_and(vx_load(src + i +   (size8 / 8)), vmaskExp), vmaskExp);
        vv2 = v_ne(v_and(vx_load(src + i + 2*(size8 / 8)), vmaskExp), vmaskExp);
        vv3 = v_ne(v_and(vx_load(src + i + 3*(size8 / 8)), vmaskExp), vmaskExp);

        v_uint8 v = v_pack_b(vv0, vv1, vv2, vv3, z, z, z, z);

        v_store_low(dst + i, v);
    }

    return i;
}

template <>
int finiteMaskSIMD_<double, 2>(const double *dsrc, uchar *dst, size_t utotal)
{
    const uint64_t* src = (const uint64_t*)dsrc;
    const int npixels = VTraits<v_uint8>::vlanes() / 2;
    const int ndoubles = VTraits<v_uint64>::vlanes();
    v_uint64 vmaskExp = vx_setall_u64(0x7ff0000000000000);
    v_uint16 vmaskBoth = vx_setall_u16(0xffff);

    int i = 0;
    int total = (int)utotal;
    for(; i < total - npixels + 1; i += npixels )
    {
        v_uint64 vv0 = v_ne(v_and(vx_load(src + i*2 + 0*ndoubles), vmaskExp), vmaskExp);
        v_uint64 vv1 = v_ne(v_and(vx_load(src + i*2 + 1*ndoubles), vmaskExp), vmaskExp);
        v_uint64 vv2 = v_ne(v_and(vx_load(src + i*2 + 2*ndoubles), vmaskExp), vmaskExp);
        v_uint64 vv3 = v_ne(v_and(vx_load(src + i*2 + 3*ndoubles), vmaskExp), vmaskExp);
        v_uint64 vv4 = v_ne(v_and(vx_load(src + i*2 + 4*ndoubles), vmaskExp), vmaskExp);
        v_uint64 vv5 = v_ne(v_and(vx_load(src + i*2 + 5*ndoubles), vmaskExp), vmaskExp);
        v_uint64 vv6 = v_ne(v_and(vx_load(src + i*2 + 6*ndoubles), vmaskExp), vmaskExp);
        v_uint64 vv7 = v_ne(v_and(vx_load(src + i*2 + 7*ndoubles), vmaskExp), vmaskExp);

        v_uint8 velems0 = v_pack_b(vv0, vv1, vv2, vv3, vv4, vv5, vv6, vv7);

        v_uint16 vfinite0 = v_eq(v_reinterpret_as_u16(velems0), vmaskBoth);

        // 2nd arg is useless
        v_uint8 vres = v_pack(vfinite0, vfinite0);
        v_store_low(dst + i, vres);
    }

    return i;
}


template <>
int finiteMaskSIMD_<double, 3>(const double *dsrc, uchar *dst, size_t utotal)
{
    //TODO: vectorize it properly

    const uint64_t* src = (const uint64_t*)dsrc;
    const int npixels = VTraits<v_uint8>::vlanes() / 2;
    uint64_t maskExp = 0x7ff0000000000000;

    int i = 0;
    int total = (int)utotal;
    for(; i < total - npixels + 1; i += npixels )
    {
        for (int j = 0; j < npixels; j++)
        {
            uint64_t val0 = src[i * 3 + j * 3 + 0];
            uint64_t val1 = src[i * 3 + j * 3 + 1];
            uint64_t val2 = src[i * 3 + j * 3 + 2];

            bool finite = ((val0 & maskExp) != maskExp) &&
                          ((val1 & maskExp) != maskExp) &&
                          ((val2 & maskExp) != maskExp);

            dst[i + j] = finite ? 255 : 0;
        }
    }

    return i;
}

template <>
int finiteMaskSIMD_<double, 4>(const double *dsrc, uchar *dst, size_t utotal)
{
    //TODO: vectorize it properly

    uint64_t* src = (uint64_t*)dsrc;
    const int npixels = VTraits<v_uint8>::vlanes() / 2;
    const int ndoubles = VTraits<v_uint64>::vlanes();
    v_uint16 vmaskExp16 = vx_setall_u16(0x7ff0);
    v_uint32 z = vx_setzero_u32();

    int i = 0;
    int total = (int)utotal;
    for(; i < total - npixels + 1; i += npixels )
    {
        v_uint16 vexpb0, vexpb1, vexpb2, vexpb3, vexpb4, vexpb5, vexpb6, vexpb7;
        v_uint16 dummy;
        v_load_deinterleave((uint16_t*)(src + 0*4*ndoubles), dummy, dummy, dummy, vexpb0);
        v_load_deinterleave((uint16_t*)(src + 1*4*ndoubles), dummy, dummy, dummy, vexpb1);
        v_load_deinterleave((uint16_t*)(src + 2*4*ndoubles), dummy, dummy, dummy, vexpb2);
        v_load_deinterleave((uint16_t*)(src + 3*4*ndoubles), dummy, dummy, dummy, vexpb3);

        v_uint16 vcmp0 = v_eq(v_and(vexpb0, vmaskExp16), vmaskExp16);
        v_uint16 vcmp1 = v_eq(v_and(vexpb1, vmaskExp16), vmaskExp16);
        v_uint16 vcmp2 = v_eq(v_and(vexpb2, vmaskExp16), vmaskExp16);
        v_uint16 vcmp3 = v_eq(v_and(vexpb3, vmaskExp16), vmaskExp16);

        v_uint8 velems0 = v_pack(vcmp0, vcmp1);
        v_uint8 velems1 = v_pack(vcmp2, vcmp3);

        v_uint32 vResWide0 = v_eq(v_reinterpret_as_u32(velems0), z);
        v_uint32 vResWide1 = v_eq(v_reinterpret_as_u32(velems1), z);

        v_uint16 vp16 = v_pack(vResWide0, vResWide1);

        // 2nd arg is useless
        v_uint8 vres = v_pack(vp16, vp16);
        v_store_low(dst, vres);

        src += npixels * 4;
        dst += npixels;
    }

    return i;
}

#endif


template <typename _Tp, int cn>
void finiteMask_(const uchar *src, uchar *dst, size_t total)
{
    CV_INSTRUMENT_REGION();
    size_t i = 0;
    const _Tp* tsrc = (const _Tp*) src;

#if (CV_SIMD || CV_SIMD_SCALABLE)
    i = finiteMaskSIMD_<_Tp, cn>(tsrc, dst, total);
#endif

    for(; i < total; i++ )
    {
        bool finite = true;
        for (int c = 0; c < cn; c++)
        {
            _Tp val = tsrc[i * cn + c];
            finite = finite && !cvIsNaN(val) && !cvIsInf(val);
        }
        dst[i] = finite ? 255 : 0;
    }
}

FiniteMaskFunc getFiniteMaskFunc(bool isDouble, int cn)
{
    static FiniteMaskFunc tab[] =
    {
        (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_<float, 1>)),
        (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_<float, 2>)),
        (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_<float, 3>)),
        (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_<float, 4>)),
        (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_<double, 1>)),
        (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_<double, 2>)),
        (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_<double, 3>)),
        (FiniteMaskFunc)GET_OPTIMIZED((finiteMask_<double, 4>)),
    };

    int idx = (isDouble ? 4 : 0) + cn - 1;
    return tab[idx];
}

#endif
CV_CPU_OPTIMIZATION_NAMESPACE_END
} // namespace cv
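Both the SIMD and scalar paths above rely on the same IEEE-754 bit trick: a value is non-finite (NaN or ±Inf) exactly when all of its exponent bits are set, and it is NaN when, in addition, the mantissa is non-zero. A standalone scalar sketch of the two tests, in plain C++ without the OpenCV SIMD wrappers (function names here are illustrative only):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>
#include <limits>

// Non-finite (NaN or +/-Inf): the exponent field is all ones.
// This is the per-channel test the finiteMask() kernels perform with the 0x7f800000 mask.
static bool isNonFinite32(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));   // safe type pun
    return (bits & 0x7f800000u) == 0x7f800000u;
}

// NaN only: exponent all ones AND mantissa non-zero. patchNaNs_32f expresses this as
// (bits & 0x7fffffff) > 0x7f800000 -- Inf compares equal to the mask, not greater.
static bool isNaN32(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    return (bits & 0x7fffffffu) > 0x7f800000u;
}

int main()
{
    const float nanv = std::numeric_limits<float>::quiet_NaN();
    const float inf  = std::numeric_limits<float>::infinity();

    assert( isNonFinite32(nanv) &&  isNaN32(nanv));  // NaN: caught by both tests
    assert( isNonFinite32(inf)  && !isNaN32(inf));   // Inf: non-finite but not NaN
    assert(!isNonFinite32(1.0f) && !isNaN32(1.0f));  // ordinary value: finite
    return 0;
}
```

This is why `patchNaNs()` uses a strict greater-than against the exponent mask (so only NaNs are replaced, ±Inf is kept), while `finiteMask()` only needs the equality test per channel; the 64-bit variants use the analogous masks 0x7ff0000000000000 and 0x7fffffffffffffff.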
@@ -0,0 +1,44 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html

// This kernel is compiled with the following possible defines:
// - srcT, cn: source type and number of channels per pixel
// - rowsPerWI: Intel GPU optimization
// - DOUBLE_SUPPORT: enable double support if available

#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif

__kernel void finiteMask(__global const uchar * srcptr, int srcstep, int srcoffset,
                         __global uchar * dstptr, int dststep, int dstoffset,
                         int rows, int cols )
{
    int x = get_global_id(0);
    int y0 = get_global_id(1) * rowsPerWI;

    if (x < cols)
    {
        int src_index = mad24(y0, srcstep, mad24(x, (int)sizeof(srcT) * cn, srcoffset));
        int dst_index = mad24(y0, dststep, x + dstoffset);

        for (int y = y0, y1 = min(rows, y0 + rowsPerWI); y < y1; ++y, src_index += srcstep, dst_index += dststep)
        {
            bool vfinite = true;

            for (int c = 0; c < cn; c++)
            {
                srcT val = *(__global srcT *)(srcptr + src_index + c * (int)sizeof(srcT));

                vfinite = vfinite && !isnan(val) & !isinf(val);
            }

            *(dstptr + dst_index) = vfinite ? 255 : 0;
        }
    }
}