From af746a9269d93c2a61a11955baf877b1edc92098 Mon Sep 17 00:00:00 2001 From: chacha21 Date: Fri, 10 Feb 2017 16:26:24 +0100 Subject: [PATCH 01/12] optimize ICV_HLINE ICV_HLINE is split into several specific cases, according to pixel_size, to optimize memory copies of the same color components along the line. --- modules/imgproc/src/drawing.cpp | 165 +++++++++++++++++++++++++++++++- 1 file changed, 163 insertions(+), 2 deletions(-) diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp index 9099d7378a..1dd436a514 100644 --- a/modules/imgproc/src/drawing.cpp +++ b/modules/imgproc/src/drawing.cpp @@ -40,6 +40,8 @@ //M*/ #include "precomp.hpp" +#include + namespace cv { @@ -1069,8 +1071,95 @@ EllipseEx( Mat& img, Point2l center, Size2l axes, * Polygons filling * \****************************************************************************************/ +//Endian macros stolen from SQLITE +#if (defined(i386) || defined(__i386__) || defined(_M_IX86) || \ + defined(__x86_64) || defined(__x86_64__) || defined(_M_X64) || \ + defined(_M_AMD64) || defined(_M_ARM) || defined(__x86) || \ + defined(__arm__)) +# define OPENCV_BYTEORDER 1234 +# define OPENCV_BIGENDIAN 0 +# define OPENCV_LITTLEENDIAN 1 +#elif (defined(sparc) || defined(__ppc__)) +# define OPENCV_BYTEORDER 4321 +# define OPENCV_BIGENDIAN 1 +# define OPENCV_LITTLEENDIAN 0 +#endif + +#if !defined(OPENCV_BYTEORDER) +static const int opencvOne = 1; +# define OPENCV_BIGENDIAN (*(char *)(&opencvOne)==0) +# define OPENCV_LITTLEENDIAN (*(char *)(&opencvOne)==1) +#endif + +# if defined(_MSC_VER) && _MSC_VER>=1400 +# if !defined(_WIN32_WCE) +# include +# pragma intrinsic(_byteswap_ushort) +# pragma intrinsic(_byteswap_ulong) +# pragma intrinsic(_ReadWriteBarrier) +# else +# include +# endif +# endif + +static inline uint32_t opencvBigToHost32(const uchar* p){ +#if OPENCV_BYTEORDER==4321 + uint32_t x; + memcpy(&x,p,4); + return x; +#elif OPENCV_BYTEORDER==1234 && defined(__GNUC__) && GCC_VERSION>=4003000 + uint32_t x; + memcpy(&x,p,4); + return __builtin_bswap32(x); +#elif OPENCV_BYTEORDER==1234 && defined(_MSC_VER) && _MSC_VER>=1300 + uint32_t x; + memcpy(&x,p,4); + return _byteswap_ulong(x); +#else + return ((unsigned)p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3]; +#endif +} + +static inline uint32_t opencvBigToHost32(uint32_t x){ +#if OPENCV_BYTEORDER==4321 + return x; +#else + return opencvBigToHost32((uchar*)&x); +#endif +} + +static inline uint32_t opencvLittleToHost32(const uchar* p){ +#if OPENCV_BYTEORDER==1234 + uint32_t x; + memcpy(&x,p,4); + return x; +#elif OPENCV_BYTEORDER==4321 && defined(__GNUC__) && GCC_VERSION>=4003000 + uint32_t x; + memcpy(&x,p,4); + return __builtin_bswap32(x); +#elif OPENCV_BYTEORDER==4321 && defined(_MSC_VER) && _MSC_VER>=1300 + uint32_t x; + memcpy(&x,p,4); + return _byteswap_ulong(x); +#else + return ((unsigned)p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3]; +#endif +} + +static inline uint32_t opencvLittleToHost32(uint32_t x){ +#if OPENCV_BYTEORDER==1234 + return x; +#else + return opencvLittleToHost32((uchar*)&x); +#endif +} + + + /* helper macros: filling horizontal row */ -#define ICV_HLINE( ptr, xl, xr, color, pix_size ) \ +#define is_aligned(POINTER, BYTE_COUNT) (((uintptr_t)(const void *)(POINTER)) % (BYTE_COUNT) == 0) + +/*#define ICV_HLINE( ptr, xl, xr, color, pix_size ) \ { \ uchar* hline_ptr = (uchar*)(ptr) + (xl)*(pix_size); \ uchar* hline_max_ptr = (uchar*)(ptr) + (xr)*(pix_size); \ @@ -1083,9 +1172,81 @@ EllipseEx( Mat& img, Point2l center, Size2l axes, hline_ptr[hline_j] = ((uchar*)color)[hline_j]; \ } \ } \ +}*/ + +#define ICV_HLINE( ptr, xl, xr, color, pix_size ) \ +if((pix_size) == 1) \ +{ \ + uchar* hline_ptr = (uchar*)(ptr) + (xl); \ + uchar* hline_max_ptr = (uchar*)(ptr) + (xr); \ + uchar hline_c = *(const uchar*)(color); \ + \ + memset(hline_ptr, hline_c, (hline_max_ptr - hline_ptr) + 1); \ +} \ +else if((pix_size) == 3) \ +{ \ + uchar* hline_ptr = (uchar*)(ptr) + (xl)*3; \ + uchar* hline_end = (uchar*)(ptr) + (xr+1)*3; \ + uchar* hbody12_start = std::min(hline_end, (uchar*)(12*(((uintptr_t)(hline_ptr)+11)/12))); \ + uchar* hbody12_end = std::min(hline_end, (uchar*)(12*(((uintptr_t)(hline_end))/12))); \ + if ((hbody12_start < hbody12_end)) \ + { \ + int offset = ((uintptr_t)(hbody12_start-hline_ptr))%3; \ + uint32_t c4[3]; \ + uchar* ptrC4 = reinterpret_cast(&c4); \ + ptrC4[0] = ((uchar*)(color))[(offset++)%3]; \ + ptrC4[1] = ((uchar*)(color))[(offset++)%3]; \ + ptrC4[2] = ((uchar*)(color))[(offset++)%3]; \ + memcpy(&ptrC4[3], &ptrC4[0], 3); \ + memcpy(&ptrC4[6], &ptrC4[0], 6); \ + c4[0] = opencvLittleToHost32(c4[0]); \ + c4[1] = opencvLittleToHost32(c4[1]); \ + c4[2] = opencvLittleToHost32(c4[2]); \ + for(offset = 0 ; hline_ptr < hbody12_start; offset = (offset+1)%3)\ + *hline_ptr++ = ((uchar*)(color))[offset]; \ + for(uint32_t* ptr32 = reinterpret_cast(hbody12_start), *ptr32End = reinterpret_cast(hbody12_end) ; ptr32 Date: Fri, 10 Feb 2017 21:58:02 +0100 Subject: [PATCH 02/12] do not use GCC_VERSION --- modules/imgproc/src/drawing.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp index 1dd436a514..9bef7ba85d 100644 --- a/modules/imgproc/src/drawing.cpp +++ b/modules/imgproc/src/drawing.cpp @@ -1107,7 +1107,7 @@ static inline uint32_t opencvBigToHost32(const uchar* p){ uint32_t x; memcpy(&x,p,4); return x; -#elif OPENCV_BYTEORDER==1234 && defined(__GNUC__) && GCC_VERSION>=4003000 +#elif OPENCV_BYTEORDER==1234 && defined(__GNUC__) uint32_t x; memcpy(&x,p,4); return __builtin_bswap32(x); @@ -1133,7 +1133,7 @@ static inline uint32_t opencvLittleToHost32(const uchar* p){ uint32_t x; memcpy(&x,p,4); return x; -#elif OPENCV_BYTEORDER==4321 && defined(__GNUC__) && GCC_VERSION>=4003000 +#elif OPENCV_BYTEORDER==4321 && defined(__GNUC__) uint32_t x; memcpy(&x,p,4); return __builtin_bswap32(x); From 7521bcc32c9750b7c73800fc7c9d32e7c5db9646 Mon Sep 17 00:00:00 2001 From: chacha21 Date: Fri, 10 Feb 2017 22:34:44 +0100 Subject: [PATCH 03/12] comment unused function On MacOS and iOS, the unused opencvBigToHost32 is a warning for buildbot --- modules/imgproc/src/drawing.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp index 9bef7ba85d..a3002b0c6a 100644 --- a/modules/imgproc/src/drawing.cpp +++ b/modules/imgproc/src/drawing.cpp @@ -1102,6 +1102,7 @@ static const int opencvOne = 1; # endif # endif +/* static inline uint32_t opencvBigToHost32(const uchar* p){ #if OPENCV_BYTEORDER==4321 uint32_t x; @@ -1127,6 +1128,7 @@ static inline uint32_t opencvBigToHost32(uint32_t x){ return opencvBigToHost32((uchar*)&x); #endif } +*/ static inline uint32_t opencvLittleToHost32(const uchar* p){ #if OPENCV_BYTEORDER==1234 From e19000a56f5b03dee1f412d8b25711307ea0be00 Mon Sep 17 00:00:00 2001 From: chacha21 Date: Sat, 11 Feb 2017 11:07:00 +0100 Subject: [PATCH 04/12] adaptation for iOS buildbot --- modules/imgproc/src/drawing.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp index a3002b0c6a..42e4a551ff 100644 --- a/modules/imgproc/src/drawing.cpp +++ b/modules/imgproc/src/drawing.cpp @@ -1086,6 +1086,7 @@ EllipseEx( Mat& img, Point2l center, Size2l axes, #endif #if !defined(OPENCV_BYTEORDER) +# define OPENCV_BYTEORDER 0 static const int opencvOne = 1; # define OPENCV_BIGENDIAN (*(char *)(&opencvOne)==0) # define OPENCV_LITTLEENDIAN (*(char *)(&opencvOne)==1) @@ -1143,13 +1144,15 @@ static inline uint32_t opencvLittleToHost32(const uchar* p){ uint32_t x; memcpy(&x,p,4); return _byteswap_ulong(x); +#elif OPENCV_LITTLEENDIAN + return x; #else return ((unsigned)p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3]; #endif } static inline uint32_t opencvLittleToHost32(uint32_t x){ -#if OPENCV_BYTEORDER==1234 +#if OPENCV_LITTLEENDIAN return x; #else return opencvLittleToHost32((uchar*)&x); From 16a9407fbf5f9a7c30a7fb7322aaa6eb0001664b Mon Sep 17 00:00:00 2001 From: chacha21 Date: Sat, 11 Feb 2017 11:26:55 +0100 Subject: [PATCH 05/12] new try to adapt to iOS build bot --- modules/imgproc/src/drawing.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp index 42e4a551ff..9632c77746 100644 --- a/modules/imgproc/src/drawing.cpp +++ b/modules/imgproc/src/drawing.cpp @@ -1075,7 +1075,7 @@ EllipseEx( Mat& img, Point2l center, Size2l axes, #if (defined(i386) || defined(__i386__) || defined(_M_IX86) || \ defined(__x86_64) || defined(__x86_64__) || defined(_M_X64) || \ defined(_M_AMD64) || defined(_M_ARM) || defined(__x86) || \ - defined(__arm__)) + defined(__arm__) || defined(__aarch64__)) # define OPENCV_BYTEORDER 1234 # define OPENCV_BIGENDIAN 0 # define OPENCV_LITTLEENDIAN 1 From 91a027043208ccf692b5d30c695f0aea76460c53 Mon Sep 17 00:00:00 2001 From: chacha21 Date: Tue, 21 Feb 2017 12:02:23 +0100 Subject: [PATCH 06/12] try to fix Android compilation --- modules/imgproc/src/drawing.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp index 1dd436a514..dde11a7faa 100644 --- a/modules/imgproc/src/drawing.cpp +++ b/modules/imgproc/src/drawing.cpp @@ -1075,11 +1075,11 @@ EllipseEx( Mat& img, Point2l center, Size2l axes, #if (defined(i386) || defined(__i386__) || defined(_M_IX86) || \ defined(__x86_64) || defined(__x86_64__) || defined(_M_X64) || \ defined(_M_AMD64) || defined(_M_ARM) || defined(__x86) || \ - defined(__arm__)) + defined(__arm__) || defined(_LITTLE_ENDIAN) || defined(LITTLE_ENDIAN)) # define OPENCV_BYTEORDER 1234 # define OPENCV_BIGENDIAN 0 # define OPENCV_LITTLEENDIAN 1 -#elif (defined(sparc) || defined(__ppc__)) +#elif (defined(sparc) || defined(__ppc__) || defined(_BIG_ENDIAN) || defined(BIG_ENDIAN)) # define OPENCV_BYTEORDER 4321 # define OPENCV_BIGENDIAN 1 # define OPENCV_LITTLEENDIAN 0 @@ -1087,8 +1087,8 @@ EllipseEx( Mat& img, Point2l center, Size2l axes, #if !defined(OPENCV_BYTEORDER) static const int opencvOne = 1; -# define OPENCV_BIGENDIAN (*(char *)(&opencvOne)==0) -# define OPENCV_LITTLEENDIAN (*(char *)(&opencvOne)==1) +# define OPENCV_BIGENDIAN (*((const char *)(&opencvOne))==0) +# define OPENCV_LITTLEENDIAN (*((const char *)(&opencvOne))==1) #endif # if defined(_MSC_VER) && _MSC_VER>=1400 From 92a3dbe18fd1228230b249e422d52708b649ab9c Mon Sep 17 00:00:00 2001 From: chacha21 Date: Thu, 2 Mar 2017 09:44:12 +0100 Subject: [PATCH 07/12] more ICV_HLINE optimization added 64b optimization for 3 channels case not added 64b optimization for 4 channels case since timings did not show any improvement split ICV_HLINE cases into inline functions instead of macro for code size reduction, without significand speed drawback at first sight --- modules/imgproc/src/drawing.cpp | 235 ++++++++++++++++++++++++++++++-- 1 file changed, 226 insertions(+), 9 deletions(-) diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp index 898198f163..3307b77f6e 100644 --- a/modules/imgproc/src/drawing.cpp +++ b/modules/imgproc/src/drawing.cpp @@ -1097,6 +1097,7 @@ static const int opencvOne = 1; # include # pragma intrinsic(_byteswap_ushort) # pragma intrinsic(_byteswap_ulong) +# pragma intrinsic(_byteswap_uint64) # pragma intrinsic(_ReadWriteBarrier) # else # include @@ -1134,20 +1135,20 @@ static inline uint32_t opencvBigToHost32(uint32_t x){ static inline uint32_t opencvLittleToHost32(const uchar* p){ #if OPENCV_BYTEORDER==1234 uint32_t x; - memcpy(&x,p,4); + memcpy(&x,p,sizeof(x)); return x; #elif OPENCV_BYTEORDER==4321 && defined(__GNUC__) uint32_t x; - memcpy(&x,p,4); + memcpy(&x,p,sizeof(x)); return __builtin_bswap32(x); #elif OPENCV_BYTEORDER==4321 && defined(_MSC_VER) && _MSC_VER>=1300 uint32_t x; - memcpy(&x,p,4); + memcpy(&x,p,sizeof(x)); return _byteswap_ulong(x); #elif OPENCV_LITTLEENDIAN return x; #else - return ((unsigned)p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3]; + return (p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3]; #endif } @@ -1159,7 +1160,33 @@ static inline uint32_t opencvLittleToHost32(uint32_t x){ #endif } +static inline uint64_t opencvLittleToHost64(const uchar* p){ +#if OPENCV_BYTEORDER==1234 + uint64_t x; + memcpy(&x,p,sizeof(x)); + return x; +#elif OPENCV_BYTEORDER==4321 && defined(__GNUC__) + uint64_t x; + memcpy(&x,p,sizeof(x)); + return __builtin_bswap64(x); +#elif OPENCV_BYTEORDER==4321 && defined(_MSC_VER) && _MSC_VER>=1300 + uint64_t x; + memcpy(&x,p,sizeof(x)); + return _byteswap_uint64(x); +#elif OPENCV_LITTLEENDIAN + return x; +#else + return (p[0]<<56) | (p[1]<<40) | (p[2]<<24) | (p[3]<<8) | (p[4]>>8) | (p[5]>>24) | (p[6]>>40) | (p[7]>>56); +#endif +} +static inline uint64_t opencvLittleToHost64(uint64_t x){ +#if OPENCV_LITTLEENDIAN + return x; +#else + return opencvLittleToHost64((uchar*)&x); +#endif +} /* helper macros: filling horizontal row */ #define is_aligned(POINTER, BYTE_COUNT) (((uintptr_t)(const void *)(POINTER)) % (BYTE_COUNT) == 0) @@ -1179,6 +1206,38 @@ static inline uint32_t opencvLittleToHost32(uint32_t x){ } \ }*/ +/* +template +static inline void icv_hline_impl(uchar* ptr, size_t xl, size_t xr, const uchar* color, unsigned pix_size_) +{ + const unsigned pix_size = pix_size_forced ? pix_size_forced : pix_size_; + + uchar* hline_ptr = ptr + xl*pix_size; + uchar* hline_max_ptr = ptr + xr*pix_size; + + for ( ; hline_ptr <= hline_max_ptr; hline_ptr += pix_size) + { + for (unsigned c = 0; c < pix_size; c++) + { + hline_ptr[c] = color[c]; + } + } +} + +#define ICV_HLINE( ptr, xl, xr, color, pix_size ) \ +{ \ + if (pix_size == 1) \ + icv_hline_impl<1>((uchar*)ptr, (xl), (xr), (const uchar*)color,pix_size); \ + else if (pix_size == 3) \ + icv_hline_impl<3>((uchar*)ptr, (xl), (xr), (const uchar*)color, pix_size); \ + else if (pix_size == 4) \ + icv_hline_impl<4>((uchar*)ptr, (xl), (xr), (const uchar*)color, pix_size); \ + else \ + icv_hline_impl<0>((uchar*)ptr, (xl), (xr), (const uchar*)color, pix_size); \ +} +*/ + +/* #define ICV_HLINE( ptr, xl, xr, color, pix_size ) \ if((pix_size) == 1) \ { \ @@ -1192,9 +1251,36 @@ else if((pix_size) == 3) \ { \ uchar* hline_ptr = (uchar*)(ptr) + (xl)*3; \ uchar* hline_end = (uchar*)(ptr) + (xr+1)*3; \ + uchar* hbody24_start = std::min(hline_end, (uchar*)(24*(((uintptr_t)(hline_ptr)+23)/24))); \ + uchar* hbody24_end = std::min(hline_end, (uchar*)(24*(((uintptr_t)(hline_end))/24))); \ uchar* hbody12_start = std::min(hline_end, (uchar*)(12*(((uintptr_t)(hline_ptr)+11)/12))); \ - uchar* hbody12_end = std::min(hline_end, (uchar*)(12*(((uintptr_t)(hline_end))/12))); \ - if ((hbody12_start < hbody12_end)) \ + uchar* hbody12_end = std::min(hline_end, (uchar*)(12*(((uintptr_t)(hline_end))/12))); \ + if (hbody24_start < hbody24_end) \ + { \ + int offset = ((uintptr_t)(hbody24_start-hline_ptr))%3; \ + uint64_t c4[3]; \ + uchar* ptrC4 = reinterpret_cast(&c4); \ + ptrC4[0] = ((uchar*)(color))[(offset++)%3]; \ + ptrC4[1] = ((uchar*)(color))[(offset++)%3]; \ + ptrC4[2] = ((uchar*)(color))[(offset++)%3]; \ + memcpy(&ptrC4[3], &ptrC4[0], 3); \ + memcpy(&ptrC4[6], &ptrC4[0], 6); \ + memcpy(&ptrC4[12], &ptrC4[0], 12); \ + c4[0] = opencvLittleToHost64(c4[0]); \ + c4[1] = opencvLittleToHost64(c4[1]); \ + c4[2] = opencvLittleToHost64(c4[2]); \ + for(offset = 0 ; hline_ptr < hbody24_start; offset = (offset+1)%3)\ + *hline_ptr++ = ((uchar*)(color))[offset]; \ + for(uint64_t* ptr64 = reinterpret_cast(hbody24_start), *ptr64End = reinterpret_cast(hbody24_end) ; ptr64(&c4); + ptrC4[0] = ((uchar*)(color))[(offset++)%3]; + ptrC4[1] = ((uchar*)(color))[(offset++)%3]; + ptrC4[2] = ((uchar*)(color))[(offset++)%3]; + memcpy(&ptrC4[3], &ptrC4[0], 3); + memcpy(&ptrC4[6], &ptrC4[0], 6); + memcpy(&ptrC4[12], &ptrC4[0], 12); + c4[0] = opencvLittleToHost64(c4[0]); + c4[1] = opencvLittleToHost64(c4[1]); + c4[2] = opencvLittleToHost64(c4[2]); + for(offset = 0 ; hline_ptr < hbody24_start; offset = (offset+1)%3) + *hline_ptr++ = ((uchar*)(color))[offset]; + for(uint64_t* ptr64 = reinterpret_cast(hbody24_start), *ptr64End = reinterpret_cast(hbody24_end) ; ptr64(&c4); + ptrC4[0] = ((uchar*)(color))[(offset++)%3]; + ptrC4[1] = ((uchar*)(color))[(offset++)%3]; + ptrC4[2] = ((uchar*)(color))[(offset++)%3]; + memcpy(&ptrC4[3], &ptrC4[0], 3); + memcpy(&ptrC4[6], &ptrC4[0], 6); + c4[0] = opencvLittleToHost32(c4[0]); + c4[1] = opencvLittleToHost32(c4[1]); + c4[2] = opencvLittleToHost32(c4[2]); + for(offset = 0 ; hline_ptr < hbody12_start; offset = (offset+1)%3) + *hline_ptr++ = ((uchar*)(color))[offset]; + for(uint32_t* ptr32 = reinterpret_cast(hbody12_start), *ptr32End = reinterpret_cast(hbody12_end) ; ptr32(color)); + else if (pix_size == 3) + ICV_HLINE_3(ptr, xl, xr, reinterpret_cast(color)); + else if (pix_size == 4) + ICV_HLINE_4(ptr, xl, xr, reinterpret_cast(color)); + else + ICV_HLINE_0(ptr, xl, xr, reinterpret_cast(color), pix_size); +} +//end ICV_HLINE() /* filling convex polygon. v - array of vertices, ntps - number of points */ static void From 27cfe31b64306ef1344813b1359cfa0fa556f34d Mon Sep 17 00:00:00 2001 From: chacha21 Date: Fri, 3 Mar 2017 11:47:46 +0100 Subject: [PATCH 08/12] more ICV_HLINE specific cases added ICV_HLINE custom implementations for element sizes up to 32 but timings show that it is not very relevant for sizes >= 12 --- modules/imgproc/src/drawing.cpp | 363 ++++++++++++++++++++++++++++---- 1 file changed, 327 insertions(+), 36 deletions(-) diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp index 3307b77f6e..9345eb35cc 100644 --- a/modules/imgproc/src/drawing.cpp +++ b/modules/imgproc/src/drawing.cpp @@ -1104,33 +1104,33 @@ static const int opencvOne = 1; # endif # endif -/* -static inline uint32_t opencvBigToHost32(const uchar* p){ -#if OPENCV_BYTEORDER==4321 - uint32_t x; - memcpy(&x,p,4); +static inline uint16_t opencvLittleToHost16(const uchar* p){ +#if OPENCV_BYTEORDER==1234 + uint16_t x; + memcpy(&x,p,sizeof(x)); + return x; +#elif OPENCV_BYTEORDER==4321 && defined(__GNUC__) + uint16_t x; + memcpy(&x,p,sizeof(x)); + return (p[0]<<8) | (p[1]>>8); +#elif OPENCV_BYTEORDER==4321 && defined(_MSC_VER) && _MSC_VER>=1300 + uint16_t x; + memcpy(&x,p,sizeof(x)); + return _byteswap_ushort(x); +#elif OPENCV_LITTLEENDIAN return x; -#elif OPENCV_BYTEORDER==1234 && defined(__GNUC__) - uint32_t x; - memcpy(&x,p,4); - return __builtin_bswap32(x); -#elif OPENCV_BYTEORDER==1234 && defined(_MSC_VER) && _MSC_VER>=1300 - uint32_t x; - memcpy(&x,p,4); - return _byteswap_ulong(x); #else - return ((unsigned)p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3]; + return (p[0]<<8) | (p[1]>>8); #endif } -static inline uint32_t opencvBigToHost32(uint32_t x){ -#if OPENCV_BYTEORDER==4321 +static inline uint16_t opencvLittleToHost16(uint16_t x){ +#if OPENCV_LITTLEENDIAN return x; #else - return opencvBigToHost32((uchar*)&x); + return opencvLittleToHost16((uchar*)&x); #endif } -*/ static inline uint32_t opencvLittleToHost32(const uchar* p){ #if OPENCV_BYTEORDER==1234 @@ -1338,6 +1338,26 @@ else \ } */ +static inline void ICV_HLINE_X(uchar* ptr, int xl, int xr, const uchar* color, int pix_size) +{ + uchar* hline_min_ptr = (uchar*)(ptr) + (xl)*(pix_size); + uchar* hline_end_ptr = (uchar*)(ptr) + (xr+1)*(pix_size); + uchar* hline_ptr = hline_min_ptr; + if (hline_min_ptr < hline_end_ptr) + { + memcpy(hline_ptr, color, pix_size); + hline_ptr += pix_size; + }//end if (hline_min_ptr < hline_end_ptr) + size_t sizeToCopy = pix_size; + while(hline_ptr < hline_end_ptr) + { + memcpy(hline_ptr, hline_min_ptr, sizeToCopy); + hline_ptr += sizeToCopy; + sizeToCopy = std::min(2*sizeToCopy, static_cast(hline_end_ptr-hline_ptr)); + }//end while(hline_ptr < hline_end_ptr) +} +//end ICV_HLINE_X() + static inline void ICV_HLINE_0(uchar* ptr, int xl, int xr, const uchar* color, int pix_size) { uchar* hline_ptr = (uchar*)(ptr) + (xl)*(pix_size); @@ -1345,7 +1365,7 @@ static inline void ICV_HLINE_0(uchar* ptr, int xl, int xr, const uchar* color, i for( ; hline_ptr <= hline_max_ptr; hline_ptr += (pix_size)) { int hline_j; - for( hline_j = 0; hline_j < (4); hline_j++ ) + for( hline_j = 0; hline_j < (pix_size); hline_j++ ) { hline_ptr[hline_j] = ((uchar*)color)[hline_j]; } @@ -1360,6 +1380,24 @@ static inline void ICV_HLINE_1(uchar* ptr, int xl, int xr, const uchar* color) uchar hline_c = *(const uchar*)(color); memset(hline_ptr, hline_c, (hline_max_ptr - hline_ptr) + 1); } +//end ICV_HLINE_1() + +static inline void ICV_HLINE_2(uchar* ptr, int xl, int xr, const uchar* color) +{ + if (is_aligned(((uchar*)(ptr) + (xl)*2), 0x2)) + { + uint16_t c = opencvLittleToHost16((uchar*)(color)); + uint16_t* hline_ptr = (uint16_t*)(ptr) + xl; + uint16_t* hline_max_ptr = (uint16_t*)(ptr) + xr; + for( ; hline_ptr <= hline_max_ptr; ) + *hline_ptr++ = c; + } + else + { + ICV_HLINE_X(ptr, xl, xr, color, 2); + } +} +//end ICV_HLINE_2() static inline void ICV_HLINE_3(uchar* ptr, int xl, int xr, const uchar* color) { @@ -1420,12 +1458,7 @@ static inline void ICV_HLINE_3(uchar* ptr, int xl, int xr, const uchar* color) } else { - for( ; hline_ptr < hline_end ; ) - { - *hline_ptr++ = ((uchar*)(color))[0]; - *hline_ptr++ = ((uchar*)(color))[1]; - *hline_ptr++ = ((uchar*)(color))[2]; - } + ICV_HLINE_X(ptr, xl, xr, color, 3); } } //end ICV_HLINE_3() @@ -1442,30 +1475,288 @@ static inline void ICV_HLINE_4(uchar* ptr, int xl, int xr, const uchar* color) } else { - uchar* hline_ptr = (uchar*)(ptr) + (xl)*(4); - uchar* hline_max_ptr = (uchar*)(ptr) + (xr)*(4); - for( ; hline_ptr <= hline_max_ptr; hline_ptr += (4)) + ICV_HLINE_X(ptr, xl, xr, color, 4); + } +} +//end ICV_HLINE_4() + +static inline void ICV_HLINE_6(uchar* ptr, int xl, int xr, const uchar* color) +{ + uchar* hline_ptr = (uchar*)(ptr) + (xl)*6; + uchar* hline_end = (uchar*)(ptr) + (xr+1)*6; + uchar* hbody24_start = std::min(hline_end, (uchar*)(24*(((uintptr_t)(hline_ptr)+23)/24))); + uchar* hbody24_end = std::min(hline_end, (uchar*)(24*(((uintptr_t)(hline_end))/24))); + uchar* hbody12_start = std::min(hline_end, (uchar*)(12*(((uintptr_t)(hline_ptr)+11)/12))); + uchar* hbody12_end = std::min(hline_end, (uchar*)(12*(((uintptr_t)(hline_end))/12))); + if (hbody24_start < hbody24_end) + { + int offset = ((uintptr_t)(hbody24_start-hline_ptr))%6; + uint64_t c4[3]; + uchar* ptrC4 = reinterpret_cast(&c4); + ptrC4[0] = ((uchar*)(color))[(offset++)%6]; + ptrC4[1] = ((uchar*)(color))[(offset++)%6]; + ptrC4[2] = ((uchar*)(color))[(offset++)%6]; + ptrC4[3] = ((uchar*)(color))[(offset++)%6]; + ptrC4[4] = ((uchar*)(color))[(offset++)%6]; + ptrC4[5] = ((uchar*)(color))[(offset++)%6]; + memcpy(&ptrC4[6], &ptrC4[0], 6); + memcpy(&ptrC4[12], &ptrC4[0], 12); + c4[0] = opencvLittleToHost64(c4[0]); + c4[1] = opencvLittleToHost64(c4[1]); + c4[2] = opencvLittleToHost64(c4[2]); + for(offset = 0 ; hline_ptr < hbody24_start; offset = (offset+1)%6) + *hline_ptr++ = ((uchar*)(color))[offset]; + for(uint64_t* ptr64 = reinterpret_cast(hbody24_start), *ptr64End = reinterpret_cast(hbody24_end) ; ptr64(&c4); + ptrC4[0] = ((uchar*)(color))[(offset++)%6]; + ptrC4[1] = ((uchar*)(color))[(offset++)%6]; + ptrC4[2] = ((uchar*)(color))[(offset++)%6]; + ptrC4[3] = ((uchar*)(color))[(offset++)%6]; + ptrC4[4] = ((uchar*)(color))[(offset++)%6]; + ptrC4[5] = ((uchar*)(color))[(offset++)%6]; + memcpy(&ptrC4[6], &ptrC4[0], 6); + c4[0] = opencvLittleToHost32(c4[0]); + c4[1] = opencvLittleToHost32(c4[1]); + c4[2] = opencvLittleToHost32(c4[2]); + for(offset = 0 ; hline_ptr < hbody12_start; offset = (offset+1)%6) + *hline_ptr++ = ((uchar*)(color))[offset]; + for(uint32_t* ptr32 = reinterpret_cast(hbody12_start), *ptr32End = reinterpret_cast(hbody12_end) ; ptr32(color), pix_size); + else if (pix_size == 1) ICV_HLINE_1(ptr, xl, xr, reinterpret_cast(color)); + else if (pix_size == 2) + ICV_HLINE_2(ptr, xl, xr, reinterpret_cast(color)); else if (pix_size == 3) ICV_HLINE_3(ptr, xl, xr, reinterpret_cast(color)); else if (pix_size == 4) ICV_HLINE_4(ptr, xl, xr, reinterpret_cast(color)); + else if (pix_size == 6) + ICV_HLINE_6(ptr, xl, xr, reinterpret_cast(color)); + else if (pix_size == 8) + ICV_HLINE_8(ptr, xl, xr, reinterpret_cast(color)); + //timings do not show relevant improvement when element_size >= 12 + /*else if (pix_size == 12) + ICV_HLINE_12(ptr, xl, xr, reinterpret_cast(color)); + else if (pix_size == 16) + ICV_HLINE_16(ptr, xl, xr, reinterpret_cast(color)); + else if (pix_size == 24) + ICV_HLINE_24(ptr, xl, xr, reinterpret_cast(color)); + else if (pix_size == 32) + ICV_HLINE_32(ptr, xl, xr, reinterpret_cast(color));*/ else - ICV_HLINE_0(ptr, xl, xr, reinterpret_cast(color), pix_size); + ICV_HLINE_X(ptr, xl, xr, reinterpret_cast(color), pix_size); } //end ICV_HLINE() From 94c58e7347178d18868422ebf264106549918edc Mon Sep 17 00:00:00 2001 From: chacha21 Date: Thu, 9 Mar 2017 17:28:52 +0100 Subject: [PATCH 09/12] minor changes to fix -Wunused-function warning on Apple platforms --- modules/imgproc/src/drawing.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp index 9345eb35cc..e998042abe 100644 --- a/modules/imgproc/src/drawing.cpp +++ b/modules/imgproc/src/drawing.cpp @@ -1184,7 +1184,7 @@ static inline uint64_t opencvLittleToHost64(uint64_t x){ #if OPENCV_LITTLEENDIAN return x; #else - return opencvLittleToHost64((uchar*)&x); + return opencvLittleToHost64((const uchar*)&x); #endif } @@ -1552,7 +1552,7 @@ static inline void ICV_HLINE_8(uchar* ptr, int xl, int xr, const uchar* color) { if (is_aligned(((uchar*)(ptr) + (xl)*8), 0x8)) { - uint64_t c = opencvLittleToHost64((uchar*)(color)); + uint64_t c = opencvLittleToHost64(color); uint64_t* hline_ptr = (uint64_t*)((uchar*)(ptr) + (xl)*(8)); uint64_t* hline_max_ptr = (uint64_t*)((uchar*)(ptr) + (xr)*(8)); for( ; hline_ptr <= hline_max_ptr; ) @@ -1604,8 +1604,8 @@ static inline void ICV_HLINE_16(uchar* ptr, int xl, int xr, const uchar* color) { if (is_aligned(((uchar*)(ptr) + (xl)*16), 0x8)) { - uint64_t c[2] = {opencvLittleToHost64((uchar*)(color)+0x00), - opencvLittleToHost64((uchar*)(color)+0x08)}; + uint64_t c[2] = {opencvLittleToHost64(color+0x00), + opencvLittleToHost64(color+0x08)}; uint64_t* hline_ptr = (uint64_t*)((uchar*)(ptr) + (xl)*(16)); uint64_t* hline_max_ptr = (uint64_t*)((uchar*)(ptr) + (xr)*(16)); for( ; hline_ptr <= hline_max_ptr; ) @@ -1641,9 +1641,9 @@ static inline void ICV_HLINE_24(uchar* ptr, int xl, int xr, const uchar* color) { if (is_aligned(((uchar*)(ptr) + (xl)*24), 0x8)) { - uint64_t c[3] = {opencvLittleToHost64((uchar*)(color)+0x00), - opencvLittleToHost64((uchar*)(color)+0x08), - opencvLittleToHost64((uchar*)(color)+0x10)}; + uint64_t c[3] = {opencvLittleToHost64(color+0x00), + opencvLittleToHost64(color+0x08), + opencvLittleToHost64(color+0x10)}; uint64_t* hline_ptr = (uint64_t*)((uchar*)(ptr) + (xl)*(24)); uint64_t* hline_max_ptr = (uint64_t*)((uchar*)(ptr) + (xr)*(24)); for( ; hline_ptr <= hline_max_ptr; ) @@ -1684,10 +1684,10 @@ static inline void ICV_HLINE_32(uchar* ptr, int xl, int xr, const uchar* color) { if (is_aligned(((uchar*)(ptr) + (xl)*32), 0x8)) { - uint64_t c[4] = {opencvLittleToHost64((uchar*)(color)+0x00), - opencvLittleToHost64((uchar*)(color)+0x08), - opencvLittleToHost64((uchar*)(color)+0x10), - opencvLittleToHost64((uchar*)(color)+0x18)}; + uint64_t c[4] = {opencvLittleToHost64(color+0x00), + opencvLittleToHost64(color+0x08), + opencvLittleToHost64(color+0x10), + opencvLittleToHost64(color+0x18)}; uint64_t* hline_ptr = (uint64_t*)((uchar*)(ptr) + (xl)*(32)); uint64_t* hline_max_ptr = (uint64_t*)((uchar*)(ptr) + (xr)*(32)); for( ; hline_ptr <= hline_max_ptr; ) From 8c7d29e52642cf8e256ae657c5b98befb0c748e2 Mon Sep 17 00:00:00 2001 From: chacha21 Date: Thu, 9 Mar 2017 18:08:34 +0100 Subject: [PATCH 10/12] more minor changes to fix -Wunused-function warning on Apple platforms --- modules/imgproc/src/drawing.cpp | 58 ++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 27 deletions(-) diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp index e998042abe..e5075f62c5 100644 --- a/modules/imgproc/src/drawing.cpp +++ b/modules/imgproc/src/drawing.cpp @@ -1124,13 +1124,15 @@ static inline uint16_t opencvLittleToHost16(const uchar* p){ #endif } +/* static inline uint16_t opencvLittleToHost16(uint16_t x){ #if OPENCV_LITTLEENDIAN return x; #else - return opencvLittleToHost16((uchar*)&x); + return opencvLittleToHost16((const uchar*)&x); #endif } +*/ static inline uint32_t opencvLittleToHost32(const uchar* p){ #if OPENCV_BYTEORDER==1234 @@ -1156,7 +1158,7 @@ static inline uint32_t opencvLittleToHost32(uint32_t x){ #if OPENCV_LITTLEENDIAN return x; #else - return opencvLittleToHost32((uchar*)&x); + return opencvLittleToHost32((const uchar*)&x); #endif } @@ -1386,7 +1388,7 @@ static inline void ICV_HLINE_2(uchar* ptr, int xl, int xr, const uchar* color) { if (is_aligned(((uchar*)(ptr) + (xl)*2), 0x2)) { - uint16_t c = opencvLittleToHost16((uchar*)(color)); + uint16_t c = opencvLittleToHost16(color); uint16_t* hline_ptr = (uint16_t*)(ptr) + xl; uint16_t* hline_max_ptr = (uint16_t*)(ptr) + xr; for( ; hline_ptr <= hline_max_ptr; ) @@ -1467,7 +1469,7 @@ static inline void ICV_HLINE_4(uchar* ptr, int xl, int xr, const uchar* color) { if (is_aligned(((uchar*)(ptr) + (xl)*4), 0x4)) { - uint32_t c = opencvLittleToHost32((uchar*)(color)); + uint32_t c = opencvLittleToHost32(color); uint32_t* hline_ptr = (uint32_t*)(ptr) + xl; uint32_t* hline_max_ptr = (uint32_t*)(ptr) + xr; for( ; hline_ptr <= hline_max_ptr; ) @@ -1560,8 +1562,8 @@ static inline void ICV_HLINE_8(uchar* ptr, int xl, int xr, const uchar* color) } else if (is_aligned(((uchar*)(ptr) + (xl)*8), 0x4)) { - uint32_t c[2] = {opencvLittleToHost32((uchar*)(color)+0x00), - opencvLittleToHost32((uchar*)(color)+0x04)}; + uint32_t c[2] = {opencvLittleToHost32(color+0x00), + opencvLittleToHost32(color+0x04)}; uint32_t* hline_ptr = (uint32_t*)((uchar*)(ptr) + (xl)*(8)); uint32_t* hline_max_ptr = (uint32_t*)((uchar*)(ptr) + (xr)*(8)); for( ; hline_ptr <= hline_max_ptr; ) @@ -1577,13 +1579,14 @@ static inline void ICV_HLINE_8(uchar* ptr, int xl, int xr, const uchar* color) } //end ICV_HLINE_8() +/* static inline void ICV_HLINE_12(uchar* ptr, int xl, int xr, const uchar* color) { if (is_aligned(((uchar*)(ptr) + (xl)*12), 0x4)) { - uint32_t c[3] = {opencvLittleToHost32((uchar*)(color)+0x00), - opencvLittleToHost32((uchar*)(color)+0x04), - opencvLittleToHost32((uchar*)(color)+0x08)}; + uint32_t c[3] = {opencvLittleToHost32(color+0x00), + opencvLittleToHost32(color+0x04), + opencvLittleToHost32(color+0x08)}; uint32_t* hline_ptr = (uint32_t*)((uchar*)(ptr) + (xl)*(12)); uint32_t* hline_max_ptr = (uint32_t*)((uchar*)(ptr) + (xr)*(12)); for( ; hline_ptr <= hline_max_ptr; ) @@ -1616,10 +1619,10 @@ static inline void ICV_HLINE_16(uchar* ptr, int xl, int xr, const uchar* color) } else if (is_aligned(((uchar*)(ptr) + (xl)*16), 0x4)) { - uint32_t c[4] = {opencvLittleToHost32((uchar*)(color)+0x00), - opencvLittleToHost32((uchar*)(color)+0x04), - opencvLittleToHost32((uchar*)(color)+0x08), - opencvLittleToHost32((uchar*)(color)+0x0C)}; + uint32_t c[4] = {opencvLittleToHost32(color+0x00), + opencvLittleToHost32(color+0x04), + opencvLittleToHost32(color+0x08), + opencvLittleToHost32(color+0x0C)}; uint32_t* hline_ptr = (uint32_t*)((uchar*)(ptr) + (xl)*(16)); uint32_t* hline_max_ptr = (uint32_t*)((uchar*)(ptr) + (xr)*(16)); for( ; hline_ptr <= hline_max_ptr; ) @@ -1655,12 +1658,12 @@ static inline void ICV_HLINE_24(uchar* ptr, int xl, int xr, const uchar* color) } else if (is_aligned(((uchar*)(ptr) + (xl)*24), 0x4)) { - uint32_t c[6] = {opencvLittleToHost32((uchar*)(color)+0x00), - opencvLittleToHost32((uchar*)(color)+0x04), - opencvLittleToHost32((uchar*)(color)+0x08), - opencvLittleToHost32((uchar*)(color)+0x0C), - opencvLittleToHost32((uchar*)(color)+0x10), - opencvLittleToHost32((uchar*)(color)+0x14)}; + uint32_t c[6] = {opencvLittleToHost32(color+0x00), + opencvLittleToHost32(color+0x04), + opencvLittleToHost32(color+0x08), + opencvLittleToHost32(color+0x0C), + opencvLittleToHost32(color+0x10), + opencvLittleToHost32(color+0x14)}; uint32_t* hline_ptr = (uint32_t*)((uchar*)(ptr) + (xl)*(24)); uint32_t* hline_max_ptr = (uint32_t*)((uchar*)(ptr) + (xr)*(24)); for( ; hline_ptr <= hline_max_ptr; ) @@ -1700,14 +1703,14 @@ static inline void ICV_HLINE_32(uchar* ptr, int xl, int xr, const uchar* color) } else if (is_aligned(((uchar*)(ptr) + (xl)*2324), 0x4)) { - uint32_t c[8] = {opencvLittleToHost32((uchar*)(color)+0x00), - opencvLittleToHost32((uchar*)(color)+0x04), - opencvLittleToHost32((uchar*)(color)+0x08), - opencvLittleToHost32((uchar*)(color)+0x0C), - opencvLittleToHost32((uchar*)(color)+0x10), - opencvLittleToHost32((uchar*)(color)+0x14), - opencvLittleToHost32((uchar*)(color)+0x18), - opencvLittleToHost32((uchar*)(color)+0x1C)}; + uint32_t c[8] = {opencvLittleToHost32(color+0x00), + opencvLittleToHost32(color+0x04), + opencvLittleToHost32(color+0x08), + opencvLittleToHost32(color+0x0C), + opencvLittleToHost32(color+0x10), + opencvLittleToHost32(color+0x14), + opencvLittleToHost32(color+0x18), + opencvLittleToHost32(color+0x1C)}; uint32_t* hline_ptr = (uint32_t*)((uchar*)(ptr) + (xl)*(32)); uint32_t* hline_max_ptr = (uint32_t*)((uchar*)(ptr) + (xr)*(32)); for( ; hline_ptr <= hline_max_ptr; ) @@ -1728,6 +1731,7 @@ static inline void ICV_HLINE_32(uchar* ptr, int xl, int xr, const uchar* color) } } //end ICV_HLINE_32() +*/ static const bool ICV_HLINE_OPTIMIZATION = true; static inline void ICV_HLINE(uchar* ptr, int xl, int xr, const void* color, int pix_size) From fa4fd480725dd11f41fc007b14592fbac5f70a81 Mon Sep 17 00:00:00 2001 From: chacha21 Date: Wed, 10 May 2017 13:55:39 +0200 Subject: [PATCH 11/12] Drop best optimizations to reduce code size Only keep the ICV_HLINE_X optimization to reduce code size. --- modules/imgproc/src/drawing.cpp | 669 +------------------------------- 1 file changed, 1 insertion(+), 668 deletions(-) diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp index e5075f62c5..da34be2b9b 100644 --- a/modules/imgproc/src/drawing.cpp +++ b/modules/imgproc/src/drawing.cpp @@ -1071,275 +1071,6 @@ EllipseEx( Mat& img, Point2l center, Size2l axes, * Polygons filling * \****************************************************************************************/ -//Endian macros stolen from SQLITE -#if (defined(i386) || defined(__i386__) || defined(_M_IX86) || \ - defined(__x86_64) || defined(__x86_64__) || defined(_M_X64) || \ - defined(_M_AMD64) || defined(_M_ARM) || defined(__x86) || \ - defined(__arm__) || defined(__aarch64__) || defined(_LITTLE_ENDIAN) || defined(LITTLE_ENDIAN)) -# define OPENCV_BYTEORDER 1234 -# define OPENCV_BIGENDIAN 0 -# define OPENCV_LITTLEENDIAN 1 -#elif (defined(sparc) || defined(__ppc__) || defined(_BIG_ENDIAN) || defined(BIG_ENDIAN)) -# define OPENCV_BYTEORDER 4321 -# define OPENCV_BIGENDIAN 1 -# define OPENCV_LITTLEENDIAN 0 -#endif - -#if !defined(OPENCV_BYTEORDER) -# define OPENCV_BYTEORDER 0 -static const int opencvOne = 1; -# define OPENCV_BIGENDIAN (*((const char *)(&opencvOne))==0) -# define OPENCV_LITTLEENDIAN (*((const char *)(&opencvOne))==1) -#endif - -# if defined(_MSC_VER) && _MSC_VER>=1400 -# if !defined(_WIN32_WCE) -# include -# pragma intrinsic(_byteswap_ushort) -# pragma intrinsic(_byteswap_ulong) -# pragma intrinsic(_byteswap_uint64) -# pragma intrinsic(_ReadWriteBarrier) -# else -# include -# endif -# endif - -static inline uint16_t opencvLittleToHost16(const uchar* p){ -#if OPENCV_BYTEORDER==1234 - uint16_t x; - memcpy(&x,p,sizeof(x)); - return x; -#elif OPENCV_BYTEORDER==4321 && defined(__GNUC__) - uint16_t x; - memcpy(&x,p,sizeof(x)); - return (p[0]<<8) | (p[1]>>8); -#elif OPENCV_BYTEORDER==4321 && defined(_MSC_VER) && _MSC_VER>=1300 - uint16_t x; - memcpy(&x,p,sizeof(x)); - return _byteswap_ushort(x); -#elif OPENCV_LITTLEENDIAN - return x; -#else - return (p[0]<<8) | (p[1]>>8); -#endif -} - -/* -static inline uint16_t opencvLittleToHost16(uint16_t x){ -#if OPENCV_LITTLEENDIAN - return x; -#else - return opencvLittleToHost16((const uchar*)&x); -#endif -} -*/ - -static inline uint32_t opencvLittleToHost32(const uchar* p){ -#if OPENCV_BYTEORDER==1234 - uint32_t x; - memcpy(&x,p,sizeof(x)); - return x; -#elif OPENCV_BYTEORDER==4321 && defined(__GNUC__) - uint32_t x; - memcpy(&x,p,sizeof(x)); - return __builtin_bswap32(x); -#elif OPENCV_BYTEORDER==4321 && defined(_MSC_VER) && _MSC_VER>=1300 - uint32_t x; - memcpy(&x,p,sizeof(x)); - return _byteswap_ulong(x); -#elif OPENCV_LITTLEENDIAN - return x; -#else - return (p[0]<<24) | (p[1]<<16) | (p[2]<<8) | p[3]; -#endif -} - -static inline uint32_t opencvLittleToHost32(uint32_t x){ -#if OPENCV_LITTLEENDIAN - return x; -#else - return opencvLittleToHost32((const uchar*)&x); -#endif -} - -static inline uint64_t opencvLittleToHost64(const uchar* p){ -#if OPENCV_BYTEORDER==1234 - uint64_t x; - memcpy(&x,p,sizeof(x)); - return x; -#elif OPENCV_BYTEORDER==4321 && defined(__GNUC__) - uint64_t x; - memcpy(&x,p,sizeof(x)); - return __builtin_bswap64(x); -#elif OPENCV_BYTEORDER==4321 && defined(_MSC_VER) && _MSC_VER>=1300 - uint64_t x; - memcpy(&x,p,sizeof(x)); - return _byteswap_uint64(x); -#elif OPENCV_LITTLEENDIAN - return x; -#else - return (p[0]<<56) | (p[1]<<40) | (p[2]<<24) | (p[3]<<8) | (p[4]>>8) | (p[5]>>24) | (p[6]>>40) | (p[7]>>56); -#endif -} - -static inline uint64_t opencvLittleToHost64(uint64_t x){ -#if OPENCV_LITTLEENDIAN - return x; -#else - return opencvLittleToHost64((const uchar*)&x); -#endif -} - -/* helper macros: filling horizontal row */ -#define is_aligned(POINTER, BYTE_COUNT) (((uintptr_t)(const void *)(POINTER)) % (BYTE_COUNT) == 0) - -/*#define ICV_HLINE( ptr, xl, xr, color, pix_size ) \ -{ \ - uchar* hline_ptr = (uchar*)(ptr) + (xl)*(pix_size); \ - uchar* hline_max_ptr = (uchar*)(ptr) + (xr)*(pix_size); \ - \ - for( ; hline_ptr <= hline_max_ptr; hline_ptr += (pix_size))\ - { \ - int hline_j; \ - for( hline_j = 0; hline_j < (pix_size); hline_j++ ) \ - { \ - hline_ptr[hline_j] = ((uchar*)color)[hline_j]; \ - } \ - } \ -}*/ - -/* -template -static inline void icv_hline_impl(uchar* ptr, size_t xl, size_t xr, const uchar* color, unsigned pix_size_) -{ - const unsigned pix_size = pix_size_forced ? pix_size_forced : pix_size_; - - uchar* hline_ptr = ptr + xl*pix_size; - uchar* hline_max_ptr = ptr + xr*pix_size; - - for ( ; hline_ptr <= hline_max_ptr; hline_ptr += pix_size) - { - for (unsigned c = 0; c < pix_size; c++) - { - hline_ptr[c] = color[c]; - } - } -} - -#define ICV_HLINE( ptr, xl, xr, color, pix_size ) \ -{ \ - if (pix_size == 1) \ - icv_hline_impl<1>((uchar*)ptr, (xl), (xr), (const uchar*)color,pix_size); \ - else if (pix_size == 3) \ - icv_hline_impl<3>((uchar*)ptr, (xl), (xr), (const uchar*)color, pix_size); \ - else if (pix_size == 4) \ - icv_hline_impl<4>((uchar*)ptr, (xl), (xr), (const uchar*)color, pix_size); \ - else \ - icv_hline_impl<0>((uchar*)ptr, (xl), (xr), (const uchar*)color, pix_size); \ -} -*/ - -/* -#define ICV_HLINE( ptr, xl, xr, color, pix_size ) \ -if((pix_size) == 1) \ -{ \ - uchar* hline_ptr = (uchar*)(ptr) + (xl); \ - uchar* hline_max_ptr = (uchar*)(ptr) + (xr); \ - uchar hline_c = *(const uchar*)(color); \ - \ - memset(hline_ptr, hline_c, (hline_max_ptr - hline_ptr) + 1); \ -} \ -else if((pix_size) == 3) \ -{ \ - uchar* hline_ptr = (uchar*)(ptr) + (xl)*3; \ - uchar* hline_end = (uchar*)(ptr) + (xr+1)*3; \ - uchar* hbody24_start = std::min(hline_end, (uchar*)(24*(((uintptr_t)(hline_ptr)+23)/24))); \ - uchar* hbody24_end = std::min(hline_end, (uchar*)(24*(((uintptr_t)(hline_end))/24))); \ - uchar* hbody12_start = std::min(hline_end, (uchar*)(12*(((uintptr_t)(hline_ptr)+11)/12))); \ - uchar* hbody12_end = std::min(hline_end, (uchar*)(12*(((uintptr_t)(hline_end))/12))); \ - if (hbody24_start < hbody24_end) \ - { \ - int offset = ((uintptr_t)(hbody24_start-hline_ptr))%3; \ - uint64_t c4[3]; \ - uchar* ptrC4 = reinterpret_cast(&c4); \ - ptrC4[0] = ((uchar*)(color))[(offset++)%3]; \ - ptrC4[1] = ((uchar*)(color))[(offset++)%3]; \ - ptrC4[2] = ((uchar*)(color))[(offset++)%3]; \ - memcpy(&ptrC4[3], &ptrC4[0], 3); \ - memcpy(&ptrC4[6], &ptrC4[0], 6); \ - memcpy(&ptrC4[12], &ptrC4[0], 12); \ - c4[0] = opencvLittleToHost64(c4[0]); \ - c4[1] = opencvLittleToHost64(c4[1]); \ - c4[2] = opencvLittleToHost64(c4[2]); \ - for(offset = 0 ; hline_ptr < hbody24_start; offset = (offset+1)%3)\ - *hline_ptr++ = ((uchar*)(color))[offset]; \ - for(uint64_t* ptr64 = reinterpret_cast(hbody24_start), *ptr64End = reinterpret_cast(hbody24_end) ; ptr64(&c4); \ - ptrC4[0] = ((uchar*)(color))[(offset++)%3]; \ - ptrC4[1] = ((uchar*)(color))[(offset++)%3]; \ - ptrC4[2] = ((uchar*)(color))[(offset++)%3]; \ - memcpy(&ptrC4[3], &ptrC4[0], 3); \ - memcpy(&ptrC4[6], &ptrC4[0], 6); \ - c4[0] = opencvLittleToHost32(c4[0]); \ - c4[1] = opencvLittleToHost32(c4[1]); \ - c4[2] = opencvLittleToHost32(c4[2]); \ - for(offset = 0 ; hline_ptr < hbody12_start; offset = (offset+1)%3)\ - *hline_ptr++ = ((uchar*)(color))[offset]; \ - for(uint32_t* ptr32 = reinterpret_cast(hbody12_start), *ptr32End = reinterpret_cast(hbody12_end) ; ptr32(&c4); - ptrC4[0] = ((uchar*)(color))[(offset++)%3]; - ptrC4[1] = ((uchar*)(color))[(offset++)%3]; - ptrC4[2] = ((uchar*)(color))[(offset++)%3]; - memcpy(&ptrC4[3], &ptrC4[0], 3); - memcpy(&ptrC4[6], &ptrC4[0], 6); - memcpy(&ptrC4[12], &ptrC4[0], 12); - c4[0] = opencvLittleToHost64(c4[0]); - c4[1] = opencvLittleToHost64(c4[1]); - c4[2] = opencvLittleToHost64(c4[2]); - for(offset = 0 ; hline_ptr < hbody24_start; offset = (offset+1)%3) - *hline_ptr++ = ((uchar*)(color))[offset]; - for(uint64_t* ptr64 = reinterpret_cast(hbody24_start), *ptr64End = reinterpret_cast(hbody24_end) ; ptr64(&c4); - ptrC4[0] = ((uchar*)(color))[(offset++)%3]; - ptrC4[1] = ((uchar*)(color))[(offset++)%3]; - ptrC4[2] = ((uchar*)(color))[(offset++)%3]; - memcpy(&ptrC4[3], &ptrC4[0], 3); - memcpy(&ptrC4[6], &ptrC4[0], 6); - c4[0] = opencvLittleToHost32(c4[0]); - c4[1] = opencvLittleToHost32(c4[1]); - c4[2] = opencvLittleToHost32(c4[2]); - for(offset = 0 ; hline_ptr < hbody12_start; offset = (offset+1)%3) - *hline_ptr++ = ((uchar*)(color))[offset]; - for(uint32_t* ptr32 = reinterpret_cast(hbody12_start), *ptr32End = reinterpret_cast(hbody12_end) ; ptr32(&c4); - ptrC4[0] = ((uchar*)(color))[(offset++)%6]; - ptrC4[1] = ((uchar*)(color))[(offset++)%6]; - ptrC4[2] = ((uchar*)(color))[(offset++)%6]; - ptrC4[3] = ((uchar*)(color))[(offset++)%6]; - ptrC4[4] = ((uchar*)(color))[(offset++)%6]; - ptrC4[5] = ((uchar*)(color))[(offset++)%6]; - memcpy(&ptrC4[6], &ptrC4[0], 6); - memcpy(&ptrC4[12], &ptrC4[0], 12); - c4[0] = opencvLittleToHost64(c4[0]); - c4[1] = opencvLittleToHost64(c4[1]); - c4[2] = opencvLittleToHost64(c4[2]); - for(offset = 0 ; hline_ptr < hbody24_start; offset = (offset+1)%6) - *hline_ptr++ = ((uchar*)(color))[offset]; - for(uint64_t* ptr64 = reinterpret_cast(hbody24_start), *ptr64End = reinterpret_cast(hbody24_end) ; ptr64(&c4); - ptrC4[0] = ((uchar*)(color))[(offset++)%6]; - ptrC4[1] = ((uchar*)(color))[(offset++)%6]; - ptrC4[2] = ((uchar*)(color))[(offset++)%6]; - ptrC4[3] = ((uchar*)(color))[(offset++)%6]; - ptrC4[4] = ((uchar*)(color))[(offset++)%6]; - ptrC4[5] = ((uchar*)(color))[(offset++)%6]; - memcpy(&ptrC4[6], &ptrC4[0], 6); - c4[0] = opencvLittleToHost32(c4[0]); - c4[1] = opencvLittleToHost32(c4[1]); - c4[2] = opencvLittleToHost32(c4[2]); - for(offset = 0 ; hline_ptr < hbody12_start; offset = (offset+1)%6) - *hline_ptr++ = ((uchar*)(color))[offset]; - for(uint32_t* ptr32 = reinterpret_cast(hbody12_start), *ptr32End = reinterpret_cast(hbody12_end) ; ptr32(color), pix_size); - else if (pix_size == 1) - ICV_HLINE_1(ptr, xl, xr, reinterpret_cast(color)); - else if (pix_size == 2) - ICV_HLINE_2(ptr, xl, xr, reinterpret_cast(color)); - else if (pix_size == 3) - ICV_HLINE_3(ptr, xl, xr, reinterpret_cast(color)); - else if (pix_size == 4) - ICV_HLINE_4(ptr, xl, xr, reinterpret_cast(color)); - else if (pix_size == 6) - ICV_HLINE_6(ptr, xl, xr, reinterpret_cast(color)); - else if (pix_size == 8) - ICV_HLINE_8(ptr, xl, xr, reinterpret_cast(color)); - //timings do not show relevant improvement when element_size >= 12 - /*else if (pix_size == 12) - ICV_HLINE_12(ptr, xl, xr, reinterpret_cast(color)); - else if (pix_size == 16) - ICV_HLINE_16(ptr, xl, xr, reinterpret_cast(color)); - else if (pix_size == 24) - ICV_HLINE_24(ptr, xl, xr, reinterpret_cast(color)); - else if (pix_size == 32) - ICV_HLINE_32(ptr, xl, xr, reinterpret_cast(color));*/ - else - ICV_HLINE_X(ptr, xl, xr, reinterpret_cast(color), pix_size); + ICV_HLINE_X(ptr, xl, xr, reinterpret_cast(color), pix_size); } //end ICV_HLINE() From 7763a86634ebfbd7206b389bca2a03393f9ea4af Mon Sep 17 00:00:00 2001 From: chacha21 Date: Fri, 19 May 2017 16:05:00 +0200 Subject: [PATCH 12/12] restored memset optimization when dropping optimizations in the last commit, I forgot to keep the simplest case where a single memset can be called --- modules/imgproc/src/drawing.cpp | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/modules/imgproc/src/drawing.cpp b/modules/imgproc/src/drawing.cpp index da34be2b9b..ba2ec9d2f2 100644 --- a/modules/imgproc/src/drawing.cpp +++ b/modules/imgproc/src/drawing.cpp @@ -1076,18 +1076,23 @@ static inline void ICV_HLINE_X(uchar* ptr, int xl, int xr, const uchar* color, i uchar* hline_min_ptr = (uchar*)(ptr) + (xl)*(pix_size); uchar* hline_end_ptr = (uchar*)(ptr) + (xr+1)*(pix_size); uchar* hline_ptr = hline_min_ptr; - if (hline_min_ptr < hline_end_ptr) + if (pix_size == 1) + memset(hline_min_ptr, *color, hline_end_ptr-hline_min_ptr); + else//if (pix_size != 1) { - memcpy(hline_ptr, color, pix_size); - hline_ptr += pix_size; - }//end if (hline_min_ptr < hline_end_ptr) - size_t sizeToCopy = pix_size; - while(hline_ptr < hline_end_ptr) - { - memcpy(hline_ptr, hline_min_ptr, sizeToCopy); - hline_ptr += sizeToCopy; - sizeToCopy = std::min(2*sizeToCopy, static_cast(hline_end_ptr-hline_ptr)); - }//end while(hline_ptr < hline_end_ptr) + if (hline_min_ptr < hline_end_ptr) + { + memcpy(hline_ptr, color, pix_size); + hline_ptr += pix_size; + }//end if (hline_min_ptr < hline_end_ptr) + size_t sizeToCopy = pix_size; + while(hline_ptr < hline_end_ptr) + { + memcpy(hline_ptr, hline_min_ptr, sizeToCopy); + hline_ptr += sizeToCopy; + sizeToCopy = std::min(2*sizeToCopy, static_cast(hline_end_ptr-hline_ptr)); + }//end while(hline_ptr < hline_end_ptr) + }//end if (pix_size != 1) } //end ICV_HLINE_X()