From bf0173bf38a4bafee9a00cb99d4ce6696ed8ac67 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin <alexander.alekhin@intel.com> Date: Fri, 21 Jul 2017 16:42:28 +0300 Subject: [PATCH 1/9] ts: update valgrind suppressions --- cmake/OpenCVUtils.cmake | 6 + modules/calib3d/src/stereobm.cpp | 2 +- modules/core/src/stat.cpp | 2 +- modules/ts/misc/run.py | 2 +- modules/ts/misc/run_suite.py | 11 +- platforms/scripts/valgrind.supp | 198 ++++++++++++++++++++++- platforms/scripts/valgrind_3rdparty.supp | 113 +++++++++++++ 7 files changed, 324 insertions(+), 10 deletions(-) create mode 100644 platforms/scripts/valgrind_3rdparty.supp diff --git a/cmake/OpenCVUtils.cmake b/cmake/OpenCVUtils.cmake index d91d1a62d6..8ef7205009 100644 --- a/cmake/OpenCVUtils.cmake +++ b/cmake/OpenCVUtils.cmake @@ -533,6 +533,12 @@ macro(ocv_finalize_status) execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different "${OPENCV_BUILD_INFO_FILE}" "${OPENCV_MODULE_opencv_core_BINARY_DIR}/version_string.inc" OUTPUT_QUIET) endif() endif() + + if(UNIX) + install(FILES "${OpenCV_SOURCE_DIR}/platforms/scripts/valgrind.supp" + "${OpenCV_SOURCE_DIR}/platforms/scripts/valgrind_3rdparty.supp" + DESTINATION "${OPENCV_OTHER_INSTALL_PATH}" COMPONENT "dev") + endif() endmacro() diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp index 5ed45a4884..f6d8213545 100644 --- a/modules/calib3d/src/stereobm.cpp +++ b/modules/calib3d/src/stereobm.cpp @@ -197,7 +197,7 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero ) { int x, y; const int OFS = 256*4, TABSZ = OFS*2 + 256; - uchar tab[TABSZ]; + uchar tab[TABSZ] = { 0 }; Size size = src.size(); for( x = 0; x < TABSZ; x++ ) diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index e324f85dac..13d8c215cf 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -1547,7 +1547,7 @@ static bool ocl_meanStdDev( InputArray _src, OutputArray _mean, OutputArray _sdv bool haveMask = _mask.kind() != _InputArray::NONE; int 
nz = haveMask ? -1 : (int)_src.total(); - Scalar mean, stddev; + Scalar mean(0), stddev(0); const int cn = _src.channels(); if (cn > 4) return false; diff --git a/modules/ts/misc/run.py b/modules/ts/misc/run.py index 0f2116e6af..0befa79ea9 100755 --- a/modules/ts/misc/run.py +++ b/modules/ts/misc/run.py @@ -37,7 +37,7 @@ if __name__ == "__main__": # Valgrind parser.add_argument("--valgrind", action="store_true", default=False, help="Run C++ tests in valgrind") - parser.add_argument("--valgrind_supp", metavar="FILE", help="Path to valgrind suppression file (example: --valgrind_supp opencv/platforms/scripts/valgrind.supp)") + parser.add_argument("--valgrind_supp", metavar="FILE", action='append', help="Path to valgrind suppression file (example: --valgrind_supp opencv/platforms/scripts/valgrind.supp)") parser.add_argument("--valgrind_opt", metavar="OPT", action="append", default=[], help="Add command line option to valgrind (example: --valgrind_opt=--leak-check=full)") # Android diff --git a/modules/ts/misc/run_suite.py b/modules/ts/misc/run_suite.py index 311f415240..8242c3a1db 100644 --- a/modules/ts/misc/run_suite.py +++ b/modules/ts/misc/run_suite.py @@ -103,10 +103,15 @@ class TestSuite(object): def wrapInValgrind(self, cmd = []): if self.options.valgrind: res = ['valgrind'] - if self.options.valgrind_supp: - res.append("--suppressions=%s" % self.options.valgrind_supp) + supp = self.options.valgrind_supp or [] + for f in supp: + if os.path.isfile(f): + res.append("--suppressions=%s" % f) + else: + print("WARNING: Valgrind suppression file is missing, SKIP: %s" % f) res.extend(self.options.valgrind_opt) - return res + cmd + [longTestFilter(LONG_TESTS_DEBUG_VALGRIND)] + has_gtest_filter = next((True for x in cmd if x.startswith('--gtest_filter=')), False) + return res + cmd + ([longTestFilter(LONG_TESTS_DEBUG_VALGRIND)] if not has_gtest_filter else []) return cmd def tryCommand(self, cmd): diff --git a/platforms/scripts/valgrind.supp 
b/platforms/scripts/valgrind.supp index 54833e08bd..e78d047c58 100644 --- a/platforms/scripts/valgrind.supp +++ b/platforms/scripts/valgrind.supp @@ -1,13 +1,203 @@ { - IPP static init + OpenCV-IPP static init Memcheck:Cond fun:ippicvGetCpuFeatures fun:ippicvStaticInit } { - TBB - allocate_via_handler_v3 issue + OpenCV-getInitializationMutex Memcheck:Leak - fun:malloc - fun:_ZN3tbb8internal23allocate_via_handler_v3Em + ... + fun:_ZN2cv22getInitializationMutexEv +} + +{ + OpenCV-getStdAllocator + Memcheck:Leak + ... + fun:_ZN2cv3Mat15getStdAllocatorEv +} + +{ + OpenCV-getOpenCLAllocator + Memcheck:Leak + ... + fun:_ZN2cv3ocl18getOpenCLAllocatorEv +} + +{ + OpenCV-getCoreTlsData + Memcheck:Leak + fun:_Znwm + fun:_ZN2cv14getCoreTlsDataEv +} + +{ + OpenCV-TLS-getTlsStorage + Memcheck:Leak + ... + fun:_ZN2cvL13getTlsStorageEv +} + +{ + OpenCV-TLS-getData() + Memcheck:Leak + ... + fun:*setData* + fun:_ZNK2cv16TLSDataContainer7getDataEv +} + +{ + OpenCV-parallel_for + Memcheck:Leak + ... + fun:_ZN2cv13ThreadManager8initPoolEv* +} + +{ + OpenCV-parallel_for + Memcheck:Leak + fun:_Znwm + fun:*instance* + fun:_ZN2cv21parallel_for_pthreadsERKNS_5RangeERKNS_16ParallelLoopBodyEd + fun:_ZN2cv13parallel_for_ERKNS_5RangeERKNS_16ParallelLoopBodyEd +} + +{ + OpenCV-parallel_for-ThreadManager::TLS + Memcheck:Leak + fun:_Znwm + fun:_ZNK2cv7TLSDataINS_13ThreadManager13work_thread_tEE18createDataInstanceEv +} + +{ + OpenCV-parallel_for-setNumThreads() + Memcheck:Leak + fun:_Znwm + fun:_ZN2cv13ThreadManager8instanceEv + fun:_ZN2cv33parallel_pthreads_set_threads_numEi + fun:_ZN2cv13setNumThreadsEi +} + +{ + OpenCV-parallel_for-getNumThreads() + Memcheck:Leak + ... + fun:_ZN2cv13getNumThreadsEv +} + +{ + OpenCV-getIPPSingelton + Memcheck:Leak + ... + fun:_ZN2cv3ippL15getIPPSingeltonEv +} + +{ + OpenCV-getGlobalMatOpInitializer + Memcheck:Leak + fun:_Znwm + fun:_ZN2cvL25getGlobalMatOpInitializerEv +} + +{ + OpenCV-CoreTLSData + Memcheck:Leak + ... 
+ fun:_ZNK2cv7TLSDataINS_11CoreTLSDataEE3getEv +} + +{ + OpenCV-ThreadID + Memcheck:Leak + fun:_Znwm + fun:_ZNK2cv7TLSDataINS_12_GLOBAL__N_18ThreadIDEE18createDataInstanceEv +} + +{ + OpenCV-ThreadID-TLS + Memcheck:Leak + fun:_Znwm + fun:getThreadIDTLS +} + +{ + OpenCV-CoreTLS + Memcheck:Leak + fun:_Znwm + fun:_ZNK2cv7TLSDataINS_11CoreTLSDataEE18createDataInstanceEv +} + +{ + OpenCV-haveOpenCL + Memcheck:Leak + ... + fun:_ZN2cv3ocl10haveOpenCLEv +} + +{ + OpenCV-DNN-getLayerFactoryMutex + Memcheck:Leak + ... + fun:_ZN2cv3dnn*L20getLayerFactoryMutexEv +} + +{ + OpenCV-ocl::Context + Memcheck:Leak + ... + fun:_ZN2cv3ocl7Context10getDefaultEb +} + +{ + OpenCV-ocl::Device + Memcheck:Leak + ... + fun:_ZN2cv3ocl6Device10getDefaultEv +} + +{ + OpenCV-ocl::Queue + Memcheck:Leak + ... + fun:_ZN2cv3ocl5Queue6createERKNS0_7ContextERKNS0_6DeviceE +} + +{ + OpenCV-ocl::Program + Memcheck:Leak + ... + fun:_ZN2cv3ocl6Kernel6createEPKcRKNS0_7ProgramE +} + +{ + OpenCV-ocl::ProgramEntry + Memcheck:Leak + ... + fun:_ZNK2cv3ocl8internal12ProgramEntrycvRNS0_13ProgramSourceEEv +} + +{ + OpenCV-ocl::Context::getProg + Memcheck:Leak + ... + fun:_ZN2cv3ocl7Context7getProgERKNS0_13ProgramSourceERKNS_6StringERS5_ +} + + +{ + OpenCV-ITT + Memcheck:Leak + ... + fun:__itt_*create* +} + +{ + OpenCV-FFmpeg-swsscale + Memcheck:Addr16 + ... 
+ fun:sws_scale + fun:_ZN20CvVideoWriter_FFMPEG10writeFrameEPKhiiiii + fun:cvWriteFrame_FFMPEG } diff --git a/platforms/scripts/valgrind_3rdparty.supp b/platforms/scripts/valgrind_3rdparty.supp new file mode 100644 index 0000000000..7b6472d827 --- /dev/null +++ b/platforms/scripts/valgrind_3rdparty.supp @@ -0,0 +1,113 @@ +{ + IPP static init + Memcheck:Cond + fun:ippicvGetCpuFeatures + fun:ippicvStaticInit +} + +{ + TBB - allocate_via_handler_v3 issue + Memcheck:Leak + fun:malloc + fun:_ZN3tbb8internal23allocate_via_handler_v3Em +} + +{ + GTest + Memcheck:Cond + fun:_ZN7testing8internal11CmpHelperLEIddEENS_15AssertionResultEPKcS4_RKT_RKT0_ +} + +{ + OpenCL + Memcheck:Cond + ... + obj:**/libOpenCL.so* +} + +{ + OpenCL-Intel + Memcheck:Cond + ... + obj:**/libigdrcl.so +} + +{ + OpenCL-Intel + Memcheck:Leak + ... + obj:*/libigdrcl.so* +} + +{ + OpenCL + Memcheck:Param + ioctl(generic) + ... + fun:clGetPlatformIDs +} + +{ + OpenCL-Init + Memcheck:Leak + ... + fun:clGetPlatformIDs +} + +{ + glib + Memcheck:Leak + fun:*alloc + obj:*/libglib* +} + +{ + gcrypt + Memcheck:Leak + ... + obj:*/libgcrypt* +} + +{ + p11-kit + Memcheck:Leak + fun:*alloc + obj:*/libp11-kit* +} + +{ + gobject + Memcheck:Leak + fun:*alloc + ... + obj:*/libgobject* +} + +{ + tasn + Memcheck:Leak + fun:*alloc + obj:*/libtasn*.so* +} + +{ + dl_init + Memcheck:Leak + ... + fun:_dl_init +} + +{ + dl_open + Memcheck:Leak + ... + fun:_dl_open +} + +{ + GDAL + Memcheck:Leak + fun:*alloc + ... 
+ obj:/usr/lib/libgdal.so.1.17.1 +} From 1516103a15c629ee011e146405cf40946bb0b3dd Mon Sep 17 00:00:00 2001 From: Alexander Alekhin <alexander.alekhin@intel.com> Date: Mon, 24 Jul 2017 11:36:21 +0300 Subject: [PATCH 2/9] cmake: fix compiler flags --- cmake/OpenCVCompilerOptions.cmake | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index 2f2e3f741f..2fcc37345a 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -203,7 +203,10 @@ if(CMAKE_COMPILER_IS_GNUCXX) endif() set(OPENCV_EXTRA_FLAGS_RELEASE "${OPENCV_EXTRA_FLAGS_RELEASE} -DNDEBUG") - set(OPENCV_EXTRA_FLAGS_DEBUG "${OPENCV_EXTRA_FLAGS_DEBUG} -O0 -DDEBUG -D_DEBUG") + if(NOT " ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_DEBUG} " MATCHES "-O") + set(OPENCV_EXTRA_FLAGS_DEBUG "${OPENCV_EXTRA_FLAGS_DEBUG} -O0") + endif() + set(OPENCV_EXTRA_FLAGS_DEBUG "${OPENCV_EXTRA_FLAGS_DEBUG} -DDEBUG -D_DEBUG") endif() if(MSVC) From d17b099994ef86d7f223a091536482737ef5fd5a Mon Sep 17 00:00:00 2001 From: Alexander Alekhin <alexander.alekhin@intel.com> Date: Mon, 24 Jul 2017 19:24:46 +0300 Subject: [PATCH 3/9] ts: don't run DNN tests with large models (valgrind) --- modules/ts/misc/run_long.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/modules/ts/misc/run_long.py b/modules/ts/misc/run_long.py index 6ae76aec0f..d2ef41fe30 100644 --- a/modules/ts/misc/run_long.py +++ b/modules/ts/misc/run_long.py @@ -7,6 +7,7 @@ from pprint import PrettyPrinter as PP LONG_TESTS_DEBUG_VALGRIND = [ ('calib3d', 'Calib3d_InitUndistortRectifyMap.accuracy', 2017.22), + ('dnn', 'Reproducibility*', 1000), # large DNN models ('features2d', 'Features2d_Feature2d.no_crash', 1235.68), ('ml', 'ML_RTrees.regression', 1423.47), ('optflow', 'DenseOpticalFlow_DeepFlow.ReferenceAccuracy', 1360.95), @@ -43,10 +44,8 @@ LONG_TESTS_DEBUG_VALGRIND = [ ] -def longTestFilter(data): - res = ['*', '-'] - for _, v, _ in data: - 
res.append(v) +def longTestFilter(data, module = None): + res = ['*', '-'] + [v for _, v, m in data if module is None or m == module] return '--gtest_filter={}'.format(':'.join(res)) From 1650c664bc6693e52e36a3ec53fc96cd8a762e34 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin <alexander.alekhin@intel.com> Date: Mon, 24 Jul 2017 23:27:14 +0300 Subject: [PATCH 4/9] ts: don't run imgcodecs tests on large images (valgrind) --- modules/ts/misc/run_long.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/ts/misc/run_long.py b/modules/ts/misc/run_long.py index d2ef41fe30..356244595f 100644 --- a/modules/ts/misc/run_long.py +++ b/modules/ts/misc/run_long.py @@ -9,6 +9,8 @@ LONG_TESTS_DEBUG_VALGRIND = [ ('calib3d', 'Calib3d_InitUndistortRectifyMap.accuracy', 2017.22), ('dnn', 'Reproducibility*', 1000), # large DNN models ('features2d', 'Features2d_Feature2d.no_crash', 1235.68), + ('imgcodecs', 'Imgcodecs_Png.write_big', 1000), # memory limit + ('imgcodecs', 'Imgcodecs_Tiff.decode_tile16384x16384', 1000), # memory limit ('ml', 'ML_RTrees.regression', 1423.47), ('optflow', 'DenseOpticalFlow_DeepFlow.ReferenceAccuracy', 1360.95), ('optflow', 'DenseOpticalFlow_DeepFlow_perf.perf/0', 1881.59), From aad6d28e133934220e800a5553369edca55f1fee Mon Sep 17 00:00:00 2001 From: Alexander Alekhin <alexander.alekhin@intel.com> Date: Tue, 25 Jul 2017 20:16:32 +0300 Subject: [PATCH 5/9] ts: don't run large videoio test (valgrind) --- modules/ts/misc/run_long.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/ts/misc/run_long.py b/modules/ts/misc/run_long.py index 356244595f..d820f9780a 100644 --- a/modules/ts/misc/run_long.py +++ b/modules/ts/misc/run_long.py @@ -26,6 +26,7 @@ LONG_TESTS_DEBUG_VALGRIND = [ ('shape', 'Shape_SCD.regression', 3311.46), ('tracking', 'AUKF.br_mean_squared_error', 10764.6), ('tracking', 'UKF.br_mean_squared_error', 5228.27), + ('videoio', 'Videoio_Video.ffmpeg_writebig', 1000), ('xfeatures2d', 
'Features2d_RotationInvariance_Descriptor_BoostDesc_LBGM.regression', 1124.51), ('xfeatures2d', 'Features2d_RotationInvariance_Descriptor_VGG120.regression', 2198.1), ('xfeatures2d', 'Features2d_RotationInvariance_Descriptor_VGG48.regression', 1958.52), From 3f102e5d3a0b36a46a43024d6787c25f730f414d Mon Sep 17 00:00:00 2001 From: Alexander Alekhin <alexander.alekhin@intel.com> Date: Tue, 25 Jul 2017 16:13:56 +0300 Subject: [PATCH 6/9] dnn: protobuf shutdown --- modules/dnn/src/init.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modules/dnn/src/init.cpp b/modules/dnn/src/init.cpp index 9222d5ecb1..97ea169f81 100644 --- a/modules/dnn/src/init.cpp +++ b/modules/dnn/src/init.cpp @@ -42,6 +42,8 @@ #include "precomp.hpp" #include <opencv2/dnn/layer.details.hpp> +#include <google/protobuf/stubs/common.h> + namespace cv { namespace dnn { CV__DNN_EXPERIMENTAL_NS_BEGIN @@ -56,11 +58,26 @@ Mutex& getInitializationMutex() // force initialization (single-threaded environment) Mutex* __initialization_mutex_initializer = &getInitializationMutex(); +namespace { +using namespace google::protobuf; +class ProtobufShutdown { +public: + bool initialized; + ProtobufShutdown() : initialized(true) {} + ~ProtobufShutdown() + { + initialized = false; + google::protobuf::ShutdownProtobufLibrary(); + } +}; +} // namespace void initializeLayerFactory() { CV_TRACE_FUNCTION(); + static ProtobufShutdown protobufShutdown; (void)protobufShutdown; + CV_DNN_REGISTER_LAYER_CLASS(Slice, SliceLayer); CV_DNN_REGISTER_LAYER_CLASS(Split, SplitLayer); CV_DNN_REGISTER_LAYER_CLASS(Concat, ConcatLayer); From 2e17251160761d48809c80ef9ea256143b7d06a9 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin <alexander.alekhin@intel.com> Date: Tue, 25 Jul 2017 19:58:00 +0300 Subject: [PATCH 7/9] calib3d: fix invalid memory access --- modules/calib3d/src/stereobm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp 
index f6d8213545..f6708c47d9 100644 --- a/modules/calib3d/src/stereobm.cpp +++ b/modules/calib3d/src/stereobm.cpp @@ -227,7 +227,7 @@ prefilterXSobel( const Mat& src, Mat& dst, int ftzero ) v_int16x8 ftz2 = v_setall_s16((short)(ftzero*2)); v_int16x8 z = v_setzero_s16(); - for(; x <= size.width-8; x += 8 ) + for(; x <= (size.width - 1) - 8; x += 8 ) { v_int16x8 s00 = v_reinterpret_as_s16(v_load_expand(srow0 + x + 1)); v_int16x8 s01 = v_reinterpret_as_s16(v_load_expand(srow0 + x - 1)); From caa5e3b4c5c82dac993ba2acacf0413458a7bffd Mon Sep 17 00:00:00 2001 From: Alexander Alekhin <alexander.alekhin@intel.com> Date: Wed, 26 Jul 2017 13:11:31 +0300 Subject: [PATCH 8/9] imgproc: fix vectorized code of accumulate --- modules/imgproc/src/accum.simd.hpp | 197 +++++++++++------------------ 1 file changed, 73 insertions(+), 124 deletions(-) diff --git a/modules/imgproc/src/accum.simd.hpp b/modules/imgproc/src/accum.simd.hpp index e2be2c952e..7a29447497 100644 --- a/modules/imgproc/src/accum.simd.hpp +++ b/modules/imgproc/src/accum.simd.hpp @@ -425,9 +425,7 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn v_uint16x8 v_0 = v_setall_u16(0); for ( ; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint8x16 _v_mask = v_load(mask + x); - v_uint16x8 v_mask, dummy; - v_expand(_v_mask, v_mask, dummy); + v_uint16x8 v_mask = v_load_expand(mask + x); v_mask = ~(v_mask == v_0); v_uint16x8 v_src = v_load(src + x); v_src = v_src & v_mask; @@ -443,9 +441,7 @@ void acc_simd_(const ushort* src, float* dst, const uchar* mask, int len, int cn v_uint16x8 v_0 = v_setall_u16(0); for ( ; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint8x16 _v_mask = v_load(mask + x); - v_uint16x8 v_mask, dummy; - v_expand(_v_mask, v_mask, dummy); + v_uint16x8 v_mask = v_load_expand(mask + x); v_mask = ~(v_mask == v_0); v_uint16x8 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); @@ -491,8 +487,7 @@ void acc_simd_(const float* src, float* dst, 
const uchar* mask, int len, int cn) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint16x8 v_masku16, dummy0; - v_expand(v_load(mask + x), v_masku16, dummy0); + v_uint16x8 v_masku16 = v_load_expand(mask + x); v_uint32x4 v_masku320, v_masku321; v_expand(v_masku16, v_masku320, v_masku321); v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0))); @@ -506,8 +501,7 @@ void acc_simd_(const float* src, float* dst, const uchar* mask, int len, int cn) { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint16x8 v_masku16, dummy0; - v_expand(v_load(mask + x), v_masku16, dummy0); + v_uint16x8 v_masku16 = v_load_expand(mask + x); v_uint32x4 v_masku320, v_masku321; v_expand(v_masku16, v_masku320, v_masku321); v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_masku320 == v_reinterpret_as_u32(v_0))); @@ -770,8 +764,7 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c { for ( ; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask, dummy; - v_expand(v_load(mask + x), v_mask, dummy); + v_uint16x8 v_mask = v_load_expand(mask + x); v_mask = ~(v_mask == v_0); v_uint16x8 v_src = v_load(src + x); v_src = v_src & v_mask; @@ -803,8 +796,7 @@ void acc_simd_(const ushort* src, double* dst, const uchar* mask, int len, int c { for ( ; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_mask, dummy; - v_expand(v_load(mask + x), v_mask, dummy); + v_uint16x8 v_mask = v_load_expand(mask + x); v_mask = ~(v_mask == v_0); v_uint16x8 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); @@ -871,10 +863,7 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint16x8 v_masku16, dummy0; - v_expand(v_load(mask + x), v_masku16, dummy0); - v_uint32x4 v_masku32, dummy1; - v_expand(v_masku16, v_masku32, dummy1); + v_uint32x4 v_masku32 = v_load_expand_q(mask + x); v_uint64x2 v_masku640, 
v_masku641; v_expand(v_masku32, v_masku640, v_masku641); v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); @@ -892,10 +881,7 @@ void acc_simd_(const float* src, double* dst, const uchar* mask, int len, int cn { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint16x8 v_masku16, dummy0; - v_expand(v_load(mask + x), v_masku16, dummy0); - v_uint32x4 v_masku32, dummy1; - v_expand(v_masku16, v_masku32, dummy1); + v_uint32x4 v_masku32 = v_load_expand_q(mask + x); v_uint64x2 v_masku640, v_masku641; v_expand(v_masku32, v_masku640, v_masku641); v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); @@ -947,10 +933,7 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint16x8 v_masku16, dummy0; - v_expand(v_load(mask + x), v_masku16, dummy0); - v_uint32x4 v_masku32, dummy1; - v_expand(v_masku16, v_masku32, dummy1); + v_uint32x4 v_masku32 = v_load_expand_q(mask + x); v_uint64x2 v_masku640, v_masku641; v_expand(v_masku32, v_masku640, v_masku641); v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); @@ -967,10 +950,7 @@ void acc_simd_(const double* src, double* dst, const uchar* mask, int len, int c { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint16x8 v_masku16, dummy0; - v_expand(v_load(mask + x), v_masku16, dummy0); - v_uint32x4 v_masku32, dummy1; - v_expand(v_masku16, v_masku32, dummy1); + v_uint32x4 v_masku32 = v_load_expand_q(mask + x); v_uint64x2 v_masku640, v_masku641; v_expand(v_masku32, v_masku640, v_masku641); v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); @@ -1157,9 +1137,9 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint16x8 stub = v_load_expand(mask + x); + v_uint16x8 v_mask16 = v_load_expand(mask + x); v_uint32x4 v_mask0, v_mask1; - v_expand(stub, v_mask0, v_mask1); + v_expand(v_mask16, 
v_mask0, v_mask1); v_mask0 = ~(v_mask0 == v_0); v_mask1 = ~(v_mask1 == v_0); v_uint16x8 v_src = v_load(src + x); @@ -1182,9 +1162,9 @@ void accSqr_simd_(const ushort* src, float* dst, const uchar* mask, int len, int { for ( ; x <= len - cVectorWidth ; x += cVectorWidth) { - v_uint16x8 stub = v_load_expand(mask + x); + v_uint16x8 v_mask16 = v_load_expand(mask + x); v_uint32x4 v_mask0, v_mask1; - v_expand(stub, v_mask0, v_mask1); + v_expand(v_mask16, v_mask0, v_mask1); v_mask0 = ~(v_mask0 == v_0); v_mask1 = ~(v_mask1 == v_0); @@ -1254,11 +1234,11 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_stub = v_load_expand(mask + x); - v_uint32x4 v_stub0, v_stub1; - v_expand(v_stub, v_stub0, v_stub1); - v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_stub0 == v_0)); - v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_stub1 == v_0)); + v_uint16x8 v_mask16 = v_load_expand(mask + x); + v_uint32x4 v_mask_0, v_mask_1; + v_expand(v_mask16, v_mask_0, v_mask_1); + v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0)); + v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0)); v_float32x4 v_src0 = v_load(src + x); v_float32x4 v_src1 = v_load(src + x + 4); v_src0 = v_src0 & v_mask0; @@ -1274,11 +1254,11 @@ void accSqr_simd_(const float* src, float* dst, const uchar* mask, int len, int { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 v_stub = v_load_expand(mask + x); - v_uint32x4 v_stub0, v_stub1; - v_expand(v_stub, v_stub0, v_stub1); - v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_stub0 == v_0)); - v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_stub1 == v_0)); + v_uint16x8 v_mask16 = v_load_expand(mask + x); + v_uint32x4 v_mask_0, v_mask_1; + v_expand(v_mask16, v_mask_0, v_mask_1); + v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask_0 == v_0)); + v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask_1 == v_0)); v_float32x4 v_src00, v_src10, v_src20, 
v_src01, v_src11, v_src21; v_load_deinterleave(src + x * cn, v_src00, v_src10, v_src20); @@ -1319,9 +1299,7 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_src = v_load(src + x); - v_uint16x8 v_int, dummy; - v_expand(v_src, v_int, dummy); + v_uint16x8 v_int = v_load_expand(src + x); v_uint32x4 v_int0, v_int1; v_expand(v_int, v_int0, v_int1); @@ -1353,17 +1331,15 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int } else { - v_uint8x16 v_0 = v_setzero_u8(); + v_uint16x8 v_0 = v_setzero_u16(); if (cn == 1) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_mask = v_load(mask + x); + v_uint16x8 v_mask = v_load_expand(mask + x); v_mask = ~(v_mask == v_0); - v_uint8x16 v_src = v_load(src + x); - v_src = v_src & v_mask; - v_uint16x8 v_int, dummy; - v_expand(v_src, v_int, dummy); + v_uint16x8 v_src = v_load_expand(src + x); + v_uint16x8 v_int = v_src & v_mask; v_uint32x4 v_int0, v_int1; v_expand(v_int, v_int0, v_int1); @@ -1395,19 +1371,19 @@ void accSqr_simd_(const uchar* src, double* dst, const uchar* mask, int len, int } else if (cn == 3) { - for (; x <= len - cVectorWidth; x += cVectorWidth) + for (; x <= len - /*cVectorWidth*/16; x += cVectorWidth) { - v_uint8x16 v_mask = v_load(mask + x); - v_mask = ~(v_mask == v_0); v_uint8x16 v_src0, v_src1, v_src2; v_load_deinterleave(src + x * cn, v_src0, v_src1, v_src2); - v_src0 = v_src0 & v_mask; - v_src1 = v_src1 & v_mask; - v_src2 = v_src2 & v_mask; v_uint16x8 v_int0, v_int1, v_int2, dummy; v_expand(v_src0, v_int0, dummy); v_expand(v_src1, v_int1, dummy); v_expand(v_src2, v_int2, dummy); + v_uint16x8 v_mask = v_load_expand(mask + x); + v_mask = ~(v_mask == v_0); + v_int0 = v_int0 & v_mask; + v_int1 = v_int1 & v_mask; + v_int2 = v_int2 & v_mask; v_uint32x4 v_int00, v_int01, v_int10, v_int11, v_int20, v_int21; v_expand(v_int0, v_int00, v_int01); @@ 
-1627,9 +1603,7 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 stub = v_load_expand(mask + x); - v_uint32x4 v_mask, dummy; - v_expand(stub, v_mask, dummy); + v_uint32x4 v_mask = v_load_expand_q(mask + x);; v_mask = ~(v_mask == v_0); v_float32x4 v_src = v_load(src + x); v_src = v_src & v_reinterpret_as_f32(v_mask); @@ -1646,9 +1620,7 @@ void accSqr_simd_(const float* src, double* dst, const uchar* mask, int len, int { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 stub = v_load_expand(mask + x); - v_uint32x4 v_mask, dummy; - v_expand(stub, v_mask, dummy); + v_uint32x4 v_mask = v_load_expand_q(mask + x); v_mask = ~(v_mask == v_0); v_float32x4 v_src0, v_src1, v_src2; @@ -1709,11 +1681,9 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, in { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 stub = v_load_expand(mask + x); - v_uint32x4 stub0, stub1; - v_expand(stub, stub0, stub1); + v_uint32x4 v_mask32 = v_load_expand_q(mask + x); v_uint64x2 v_masku640, v_masku641; - v_expand(stub0, v_masku640, v_masku641); + v_expand(v_mask32, v_masku640, v_masku641); v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); v_float64x2 v_src0 = v_load(src + x); @@ -1731,11 +1701,9 @@ void accSqr_simd_(const double* src, double* dst, const uchar* mask, int len, in { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 stub = v_load_expand(mask + x); - v_uint32x4 stub0, stub1; - v_expand(stub, stub0, stub1); + v_uint32x4 v_mask32 = v_load_expand_q(mask + x); v_uint64x2 v_masku640, v_masku641; - v_expand(stub0, v_masku640, v_masku641); + v_expand(v_mask32, v_masku640, v_masku641); v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); @@ -2059,11 +2027,10 @@ 
void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 stub = v_load_expand(mask + x); - v_uint32x4 stub0, stub1; - v_expand(stub, stub0, stub1); - v_float32x4 v_mask0 = v_reinterpret_as_f32(~(stub0 == v_0)); - v_float32x4 v_mask1 = v_reinterpret_as_f32(~(stub1 == v_0)); + v_uint32x4 v_mask32_0 = v_load_expand_q(mask + x); + v_uint32x4 v_mask32_1 = v_load_expand_q(mask + x + 4); + v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0)); + v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0)); v_store(dst + x, v_load(dst + x) + ((v_load(src1 + x) * v_load(src2 + x)) & v_mask0)); v_store(dst + x + 4, v_load(dst + x + 4) + ((v_load(src1 + x + 4) * v_load(src2 + x + 4)) & v_mask1)); @@ -2073,11 +2040,10 @@ void accProd_simd_(const float* src1, const float* src2, float* dst, const uchar { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 stub = v_load_expand(mask + x); - v_uint32x4 stub0, stub1; - v_expand(stub, stub0, stub1); - v_float32x4 v_mask0 = v_reinterpret_as_f32(~(stub0 == v_0)); - v_float32x4 v_mask1 = v_reinterpret_as_f32(~(stub1 == v_0)); + v_uint32x4 v_mask32_0 = v_load_expand_q(mask + x); + v_uint32x4 v_mask32_1 = v_load_expand_q(mask + x + 4); + v_float32x4 v_mask0 = v_reinterpret_as_f32(~(v_mask32_0 == v_0)); + v_float32x4 v_mask1 = v_reinterpret_as_f32(~(v_mask32_1 == v_0)); v_float32x4 v_1src00, v_1src01, v_1src10, v_1src11, v_1src20, v_1src21; v_float32x4 v_2src00, v_2src01, v_2src10, v_2src11, v_2src20, v_2src21; @@ -2109,12 +2075,8 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_1src = v_load(src1 + x); - v_uint8x16 v_2src = v_load(src2 + x); - - v_uint16x8 v_1int, v_2int, dummy; - v_expand(v_1src, v_1int, dummy); - v_expand(v_2src, v_2int, dummy); + v_uint16x8 v_1int = v_load_expand(src1 + x); + 
v_uint16x8 v_2int = v_load_expand(src2 + x); v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1; v_expand(v_1int, v_1int_0, v_1int_1); @@ -2148,19 +2110,15 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha } else { - v_uint8x16 v_0 = v_setzero_u8(); + v_uint16x8 v_0 = v_setzero_u16(); if (cn == 1) { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_mask = v_load(mask + x); + v_uint16x8 v_mask = v_load_expand(mask + x); v_mask = ~(v_mask == v_0); - v_uint8x16 v_1src = v_load(src1 + x) & v_mask; - v_uint8x16 v_2src = v_load(src2 + x) & v_mask; - - v_uint16x8 v_1int, v_2int, dummy; - v_expand(v_1src, v_1int, dummy); - v_expand(v_2src, v_2int, dummy); + v_uint16x8 v_1int = v_load_expand(src1 + x) & v_mask; + v_uint16x8 v_2int = v_load_expand(src2 + x) & v_mask; v_uint32x4 v_1int_0, v_1int_1, v_2int_0, v_2int_1; v_expand(v_1int, v_1int_0, v_1int_1); @@ -2194,19 +2152,11 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha } else if (cn == 3) { - for (; x <= len - cVectorWidth; x += cVectorWidth) + for (; x <= len - /*cVectorWidth*/16; x += cVectorWidth) { - v_uint8x16 v_mask = v_load(mask + x); - v_mask = ~(v_mask == v_0); v_uint8x16 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); v_load_deinterleave(src2 + x * cn, v_2src0, v_2src1, v_2src2); - v_1src0 = v_1src0 & v_mask; - v_1src1 = v_1src1 & v_mask; - v_1src2 = v_1src2 & v_mask; - v_2src0 = v_2src0 & v_mask; - v_2src1 = v_2src1 & v_mask; - v_2src2 = v_2src2 & v_mask; v_uint16x8 v_1int0, v_1int1, v_1int2, v_2int0, v_2int1, v_2int2, dummy; v_expand(v_1src0, v_1int0, dummy); @@ -2216,6 +2166,15 @@ void accProd_simd_(const uchar* src1, const uchar* src2, double* dst, const ucha v_expand(v_2src1, v_2int1, dummy); v_expand(v_2src2, v_2int2, dummy); + v_uint16x8 v_mask = v_load_expand(mask + x); + v_mask = ~(v_mask == v_0); + v_1int0 = v_1int0 & v_mask; + v_1int1 = v_1int1 & 
v_mask; + v_1int2 = v_1int2 & v_mask; + v_2int0 = v_2int0 & v_mask; + v_2int1 = v_2int1 & v_mask; + v_2int2 = v_2int2 & v_mask; + v_uint32x4 v_1int00, v_1int01, v_1int10, v_1int11, v_1int20, v_1int21; v_uint32x4 v_2int00, v_2int01, v_2int10, v_2int11, v_2int20, v_2int21; v_expand(v_1int0, v_1int00, v_1int01); @@ -2440,9 +2399,7 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 stub = v_load_expand(mask + x); - v_uint32x4 v_mask, dummy; - v_expand(stub, v_mask, dummy); + v_uint32x4 v_mask = v_load_expand_q(mask + x); v_mask = ~(v_mask == v_0); v_float32x4 v_1src = v_load(src1 + x); v_float32x4 v_2src = v_load(src2 + x); @@ -2462,9 +2419,7 @@ void accProd_simd_(const float* src1, const float* src2, double* dst, const ucha { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 stub = v_load_expand(mask + x); - v_uint32x4 v_mask, dummy; - v_expand(stub, v_mask, dummy); + v_uint32x4 v_mask = v_load_expand_q(mask + x); v_mask = ~(v_mask == v_0); v_float32x4 v_1src0, v_1src1, v_1src2, v_2src0, v_2src1, v_2src2; v_load_deinterleave(src1 + x * cn, v_1src0, v_1src1, v_1src2); @@ -2522,11 +2477,9 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 stub = v_load_expand(mask + x); - v_uint32x4 stub0, stub1; - v_expand(stub, stub0, stub1); + v_uint32x4 v_mask32 = v_load_expand_q(mask + x); v_uint64x2 v_masku640, v_masku641; - v_expand(stub0, v_masku640, v_masku641); + v_expand(v_mask32, v_masku640, v_masku641); v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); @@ -2543,11 +2496,9 @@ void accProd_simd_(const double* src1, const double* src2, double* dst, const uc { for (; x <= len - cVectorWidth; x += cVectorWidth) { - v_uint16x8 stub = v_load_expand(mask + x); - v_uint32x4 stub0, stub1; 
- v_expand(stub, stub0, stub1); + v_uint32x4 v_mask32 = v_load_expand_q(mask + x); v_uint64x2 v_masku640, v_masku641; - v_expand(stub0, v_masku640, v_masku641); + v_expand(v_mask32, v_masku640, v_masku641); v_float64x2 v_mask0 = v_reinterpret_as_f64(~(v_masku640 == v_0)); v_float64x2 v_mask1 = v_reinterpret_as_f64(~(v_masku641 == v_0)); @@ -2704,12 +2655,10 @@ void accW_simd_(const uchar* src, double* dst, const uchar* mask, int len, int c int size = len * cn; for (; x <= size - cVectorWidth; x += cVectorWidth) { - v_uint8x16 v_src = v_load(src + x); - v_uint16x8 v_int, dummy; - v_expand(v_src, v_int, dummy); + v_uint16x8 v_src16 = v_load_expand(src + x); v_uint32x4 v_int_0, v_int_1; - v_expand(v_int, v_int_0, v_int_1); + v_expand(v_src16, v_int_0, v_int_1); v_int32x4 v_int0 = v_reinterpret_as_s32(v_int_0); v_int32x4 v_int1 = v_reinterpret_as_s32(v_int_1); From 12213f998565c7e34756b9e9f2f4f43e56985181 Mon Sep 17 00:00:00 2001 From: Alexander Alekhin <alexander.alekhin@intel.com> Date: Wed, 26 Jul 2017 18:16:39 +0300 Subject: [PATCH 9/9] flann: fix out of buffer access --- .../flann/include/opencv2/flann/lsh_table.h | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/modules/flann/include/opencv2/flann/lsh_table.h b/modules/flann/include/opencv2/flann/lsh_table.h index 0907dc9fa1..fa7cc71486 100644 --- a/modules/flann/include/opencv2/flann/lsh_table.h +++ b/modules/flann/include/opencv2/flann/lsh_table.h @@ -147,6 +147,7 @@ public: LshTable() { key_size_ = 0; + feature_size_ = 0; speed_level_ = kArray; } @@ -157,7 +158,7 @@ public: */ LshTable(unsigned int feature_size, unsigned int key_size) { - (void)feature_size; + feature_size_ = feature_size; (void)key_size; std::cerr << "LSH is not implemented for that type" << std::endl; assert(0); @@ -332,6 +333,8 @@ private: */ unsigned int key_size_; + unsigned int feature_size_; + // Members only used for the unsigned char specialization /** The mask to apply to a feature to get the hash 
key * Only used in the unsigned char case @@ -345,9 +348,10 @@ private: template<> inline LshTable<unsigned char>::LshTable(unsigned int feature_size, unsigned int subsignature_size) { + feature_size_ = feature_size; initialize(subsignature_size); // Allocate the mask - mask_ = std::vector<size_t>((size_t)ceil((float)(feature_size * sizeof(char)) / (float)sizeof(size_t)), 0); + mask_ = std::vector<size_t>((feature_size * sizeof(char) + sizeof(size_t) - 1) / sizeof(size_t), 0); // A bit brutal but fast to code std::vector<size_t> indices(feature_size * CHAR_BIT); @@ -392,6 +396,7 @@ inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) cons { // no need to check if T is dividable by sizeof(size_t) like in the Hamming // distance computation as we have a mask + // FIXIT: This is a bad assumption, because we are reading tail bytes past the end of the allocated feature buffer const size_t* feature_block_ptr = reinterpret_cast<const size_t*> ((const void*)feature); // Figure out the subsignature of the feature @@ -400,10 +405,20 @@ inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) cons size_t subsignature = 0; size_t bit_index = 1; - for (std::vector<size_t>::const_iterator pmask_block = mask_.begin(); pmask_block != mask_.end(); ++pmask_block) { + for (unsigned i = 0; i < feature_size_; i += sizeof(size_t)) { // get the mask and signature blocks - size_t feature_block = *feature_block_ptr; - size_t mask_block = *pmask_block; + size_t feature_block; + if (i + sizeof(size_t) <= feature_size_) // full word available; written as a sum so it cannot wrap when feature_size_ < sizeof(size_t) + { + feature_block = *feature_block_ptr; + } + else + { + size_t tmp = 0; + memcpy(&tmp, feature_block_ptr, feature_size_ - i); // preserve bytes order + feature_block = tmp; + } + size_t mask_block = mask_[i / sizeof(size_t)]; while (mask_block) { // Get the lowest set bit in the mask block size_t lowest_bit = mask_block & (-(ptrdiff_t)mask_block);