diff --git a/modules/core/src/softfloat.cpp b/modules/core/src/softfloat.cpp index 7d9d62717e..ac39ee8288 100644 --- a/modules/core/src/softfloat.cpp +++ b/modules/core/src/softfloat.cpp @@ -54,7 +54,7 @@ enum { tininess_afterRounding = 1 }; //fixed to make softfloat code stateless -const uint_fast8_t globalDetectTininess = tininess_afterRounding; +static const uint_fast8_t globalDetectTininess = tininess_afterRounding; /*---------------------------------------------------------------------------- | Software floating-point exception flags. @@ -69,7 +69,7 @@ enum { // Disabled to make softfloat code stateless // This function may be changed in the future for better error handling -inline void raiseFlags( uint_fast8_t /* flags */) +static inline void raiseFlags( uint_fast8_t /* flags */) { //exceptionFlags |= flags; } @@ -87,7 +87,7 @@ enum { }; //fixed to make softfloat code stateless -const uint_fast8_t globalRoundingMode = round_near_even; +static const uint_fast8_t globalRoundingMode = round_near_even; /*---------------------------------------------------------------------------- *----------------------------------------------------------------------------*/ @@ -123,85 +123,65 @@ typedef softdouble float64_t; /*---------------------------------------------------------------------------- | Integer-to-floating-point conversion routines. 
*----------------------------------------------------------------------------*/ -float32_t ui32_to_f32( uint32_t ); -float64_t ui32_to_f64( uint32_t ); -float32_t ui64_to_f32( uint64_t ); -float64_t ui64_to_f64( uint64_t ); -float32_t i32_to_f32( int32_t ); -float64_t i32_to_f64( int32_t ); -float32_t i64_to_f32( int64_t ); -float64_t i64_to_f64( int64_t ); +static float32_t ui32_to_f32( uint32_t ); +static float64_t ui32_to_f64( uint32_t ); +static float32_t ui64_to_f32( uint64_t ); +static float64_t ui64_to_f64( uint64_t ); +static float32_t i32_to_f32( int32_t ); +static float64_t i32_to_f64( int32_t ); +static float32_t i64_to_f32( int64_t ); +static float64_t i64_to_f64( int64_t ); /*---------------------------------------------------------------------------- | 32-bit (single-precision) floating-point operations. *----------------------------------------------------------------------------*/ -uint_fast32_t f32_to_ui32( float32_t, uint_fast8_t, bool ); -uint_fast64_t f32_to_ui64( float32_t, uint_fast8_t, bool ); -int_fast32_t f32_to_i32( float32_t, uint_fast8_t, bool ); -int_fast64_t f32_to_i64( float32_t, uint_fast8_t, bool ); -uint_fast32_t f32_to_ui32_r_minMag( float32_t, bool ); -uint_fast64_t f32_to_ui64_r_minMag( float32_t, bool ); -int_fast32_t f32_to_i32_r_minMag( float32_t, bool ); -int_fast64_t f32_to_i64_r_minMag( float32_t, bool ); -float64_t f32_to_f64( float32_t ); -float32_t f32_roundToInt( float32_t, uint_fast8_t, bool ); -float32_t f32_add( float32_t, float32_t ); -float32_t f32_sub( float32_t, float32_t ); -float32_t f32_mul( float32_t, float32_t ); -float32_t f32_mulAdd( float32_t, float32_t, float32_t ); -float32_t f32_div( float32_t, float32_t ); -float32_t f32_rem( float32_t, float32_t ); -float32_t f32_sqrt( float32_t ); -bool f32_eq( float32_t, float32_t ); -bool f32_le( float32_t, float32_t ); -bool f32_lt( float32_t, float32_t ); -bool f32_eq_signaling( float32_t, float32_t ); -bool f32_le_quiet( float32_t, float32_t ); -bool 
f32_lt_quiet( float32_t, float32_t ); -bool f32_isSignalingNaN( float32_t ); +static int_fast32_t f32_to_i32( float32_t, uint_fast8_t, bool ); +static int_fast32_t f32_to_i32_r_minMag( float32_t, bool ); +static float64_t f32_to_f64( float32_t ); +static float32_t f32_roundToInt( float32_t, uint_fast8_t, bool ); +static float32_t f32_add( float32_t, float32_t ); +static float32_t f32_sub( float32_t, float32_t ); +static float32_t f32_mul( float32_t, float32_t ); +static float32_t f32_mulAdd( float32_t, float32_t, float32_t ); +static float32_t f32_div( float32_t, float32_t ); +static float32_t f32_rem( float32_t, float32_t ); +static float32_t f32_sqrt( float32_t ); +static bool f32_eq( float32_t, float32_t ); +static bool f32_le( float32_t, float32_t ); +static bool f32_lt( float32_t, float32_t ); /*---------------------------------------------------------------------------- | 64-bit (double-precision) floating-point operations. *----------------------------------------------------------------------------*/ -uint_fast32_t f64_to_ui32( float64_t, uint_fast8_t, bool ); -uint_fast64_t f64_to_ui64( float64_t, uint_fast8_t, bool ); -int_fast32_t f64_to_i32( float64_t, uint_fast8_t, bool ); -int_fast64_t f64_to_i64( float64_t, uint_fast8_t, bool ); -uint_fast32_t f64_to_ui32_r_minMag( float64_t, bool ); -uint_fast64_t f64_to_ui64_r_minMag( float64_t, bool ); -int_fast32_t f64_to_i32_r_minMag( float64_t, bool ); -int_fast64_t f64_to_i64_r_minMag( float64_t, bool ); -float32_t f64_to_f32( float64_t ); -float64_t f64_roundToInt( float64_t, uint_fast8_t, bool ); -float64_t f64_add( float64_t, float64_t ); -float64_t f64_sub( float64_t, float64_t ); -float64_t f64_mul( float64_t, float64_t ); -float64_t f64_mulAdd( float64_t, float64_t, float64_t ); -float64_t f64_div( float64_t, float64_t ); -float64_t f64_rem( float64_t, float64_t ); -float64_t f64_sqrt( float64_t ); -bool f64_eq( float64_t, float64_t ); -bool f64_le( float64_t, float64_t ); -bool f64_lt( float64_t, 
float64_t ); -bool f64_eq_signaling( float64_t, float64_t ); -bool f64_le_quiet( float64_t, float64_t ); -bool f64_lt_quiet( float64_t, float64_t ); -bool f64_isSignalingNaN( float64_t ); +static int_fast32_t f64_to_i32( float64_t, uint_fast8_t, bool ); +static int_fast32_t f64_to_i32_r_minMag( float64_t, bool ); +static float32_t f64_to_f32( float64_t ); +static float64_t f64_roundToInt( float64_t, uint_fast8_t, bool ); +static float64_t f64_add( float64_t, float64_t ); +static float64_t f64_sub( float64_t, float64_t ); +static float64_t f64_mul( float64_t, float64_t ); +static float64_t f64_mulAdd( float64_t, float64_t, float64_t ); +static float64_t f64_div( float64_t, float64_t ); +static float64_t f64_rem( float64_t, float64_t ); +static float64_t f64_sqrt( float64_t ); +static bool f64_eq( float64_t, float64_t ); +static bool f64_le( float64_t, float64_t ); +static bool f64_lt( float64_t, float64_t ); /*---------------------------------------------------------------------------- | Ported from OpenCV and added for usability *----------------------------------------------------------------------------*/ -float32_t f32_powi( float32_t x, int y); -float64_t f64_powi( float64_t x, int y); +static float32_t f32_powi( float32_t x, int y); +static float64_t f64_powi( float64_t x, int y); -float32_t f32_exp( float32_t x); -float64_t f64_exp(float64_t x); -float32_t f32_log(float32_t x); -float64_t f64_log(float64_t x); -float32_t f32_cbrt(float32_t x); -float32_t f32_pow( float32_t x, float32_t y); -float64_t f64_pow( float64_t x, float64_t y); +static float32_t f32_exp( float32_t x); +static float64_t f64_exp(float64_t x); +static float32_t f32_log(float32_t x); +static float64_t f64_log(float64_t x); +static float32_t f32_cbrt(float32_t x); +static float32_t f32_pow( float32_t x, float32_t y); +static float64_t f64_pow( float64_t x, float64_t y); /*---------------------------------------------------------------------------- | softfloat and softdouble methods and 
members @@ -437,13 +417,7 @@ enum { /*---------------------------------------------------------------------------- *----------------------------------------------------------------------------*/ -static uint_fast32_t softfloat_roundToUI32( bool, uint_fast64_t, uint_fast8_t, bool ); -static uint_fast64_t softfloat_roundToUI64( bool, uint_fast64_t, uint_fast64_t, uint_fast8_t, bool ); static int_fast32_t softfloat_roundToI32( bool, uint_fast64_t, uint_fast8_t, bool ); -static int_fast64_t softfloat_roundToI64( bool, uint_fast64_t, uint_fast64_t, uint_fast8_t, bool ); - -/*---------------------------------------------------------------------------- -*----------------------------------------------------------------------------*/ struct exp16_sig32 { int_fast16_t exp; uint_fast32_t sig; }; static struct exp16_sig32 softfloat_normSubnormalF32Sig( uint_fast32_t ); @@ -478,7 +452,7 @@ static float64_t softfloat_mulAddF64( uint_fast64_t, uint_fast64_t, uint_fast64_ | significant bit to 1. This shifted-and-jammed value is returned. *----------------------------------------------------------------------------*/ -inline uint64_t softfloat_shortShiftRightJam64( uint64_t a, uint_fast8_t dist ) +static inline uint64_t softfloat_shortShiftRightJam64( uint64_t a, uint_fast8_t dist ) { return a>>dist | ((a & (((uint_fast64_t) 1<>dist | ((uint32_t) (a<<((~dist + 1) & 31)) != 0) : (a != 0); @@ -506,7 +480,7 @@ inline uint32_t softfloat_shiftRightJam32( uint32_t a, uint_fast16_t dist ) | greater than 64, the result will be either 0 or 1, depending on whether 'a' | is zero or nonzero. *----------------------------------------------------------------------------*/ -inline uint64_t softfloat_shiftRightJam64( uint64_t a, uint_fast32_t dist ) +static inline uint64_t softfloat_shiftRightJam64( uint64_t a, uint_fast32_t dist ) { //fixed unsigned unary minus: -x == ~x + 1 return (dist < 63) ? 
a>>dist | ((uint64_t) (a<<((~dist + 1) & 63)) != 0) : (a != 0); @@ -540,7 +514,7 @@ static const uint_least8_t softfloat_countLeadingZeros8[256] = { | Returns the number of leading 0 bits before the most-significant 1 bit of | 'a'. If 'a' is zero, 32 is returned. *----------------------------------------------------------------------------*/ -inline uint_fast8_t softfloat_countLeadingZeros32( uint32_t a ) +static inline uint_fast8_t softfloat_countLeadingZeros32( uint32_t a ) { uint_fast8_t count = 0; if ( a < 0x10000 ) { @@ -607,7 +581,7 @@ static const uint16_t softfloat_approxRecipSqrt_1k1s[16] = { | Shifts the 128 bits formed by concatenating 'a64' and 'a0' left by the | number of bits given in 'dist', which must be in the range 1 to 63. *----------------------------------------------------------------------------*/ -inline struct uint128 softfloat_shortShiftLeft128( uint64_t a64, uint64_t a0, uint_fast8_t dist ) +static inline struct uint128 softfloat_shortShiftLeft128( uint64_t a64, uint64_t a0, uint_fast8_t dist ) { struct uint128 z; z.v64 = a64<>(-dist & 63); @@ -622,7 +596,7 @@ inline struct uint128 softfloat_shortShiftLeft128( uint64_t a64, uint64_t a0, ui | bit of the shifted value by setting the least-significant bit to 1. This | shifted-and-jammed value is returned. *----------------------------------------------------------------------------*/ -inline struct uint128 softfloat_shortShiftRightJam128(uint64_t a64, uint64_t a0, uint_fast8_t dist ) +static inline struct uint128 softfloat_shortShiftRightJam128(uint64_t a64, uint64_t a0, uint_fast8_t dist ) { uint_fast8_t negDist = -dist; struct uint128 z; @@ -633,37 +607,6 @@ inline struct uint128 softfloat_shortShiftRightJam128(uint64_t a64, uint64_t a0, return z; } -/*---------------------------------------------------------------------------- -| Shifts the 128 bits formed by concatenating 'a' and 'extra' right by 64 -| _plus_ the number of bits given in 'dist', which must not be zero. 
This -| shifted value is at most 64 nonzero bits and is returned in the 'v' field -| of the 'struct uint64_extra' result. The 64-bit 'extra' field of the result -| contains a value formed as follows from the bits that were shifted off: The -| _last_ bit shifted off is the most-significant bit of the 'extra' field, and -| the other 63 bits of the 'extra' field are all zero if and only if _all_but_ -| _the_last_ bits shifted off were all zero. -| (This function makes more sense if 'a' and 'extra' are considered to form -| an unsigned fixed-point number with binary point between 'a' and 'extra'. -| This fixed-point value is shifted right by the number of bits given in -| 'dist', and the integer part of this shifted value is returned in the 'v' -| field of the result. The fractional part of the shifted value is modified -| as described above and returned in the 'extra' field of the result.) -*----------------------------------------------------------------------------*/ -inline struct uint64_extra softfloat_shiftRightJam64Extra(uint64_t a, uint64_t extra, uint_fast32_t dist ) -{ - struct uint64_extra z; - if ( dist < 64 ) { - z.v = a>>dist; - //fixed unsigned unary minus: -x == ~x + 1 - z.extra = a<<((~dist + 1) & 63); - } else { - z.v = 0; - z.extra = (dist == 64) ? a : (a != 0); - } - z.extra |= (extra != 0); - return z; -} - /*---------------------------------------------------------------------------- | Shifts the 128 bits formed by concatenating 'a64' and 'a0' right by the | number of bits given in 'dist', which must not be zero. If any nonzero bits @@ -681,7 +624,7 @@ static struct uint128 softfloat_shiftRightJam128( uint64_t a64, uint64_t a0, uin | 'a0' and the 128-bit integer formed by concatenating 'b64' and 'b0'. The | addition is modulo 2^128, so any carry out is lost. 
*----------------------------------------------------------------------------*/ -inline struct uint128 softfloat_add128( uint64_t a64, uint64_t a0, uint64_t b64, uint64_t b0 ) +static inline struct uint128 softfloat_add128( uint64_t a64, uint64_t a0, uint64_t b64, uint64_t b0 ) { struct uint128 z; z.v0 = a0 + b0; @@ -694,7 +637,7 @@ inline struct uint128 softfloat_add128( uint64_t a64, uint64_t a0, uint64_t b64, | and 'a0' and the 128-bit integer formed by concatenating 'b64' and 'b0'. | The subtraction is modulo 2^128, so any borrow out (carry out) is lost. *----------------------------------------------------------------------------*/ -inline struct uint128 softfloat_sub128( uint64_t a64, uint64_t a0, uint64_t b64, uint64_t b0 ) +static inline struct uint128 softfloat_sub128( uint64_t a64, uint64_t a0, uint64_t b64, uint64_t b0 ) { struct uint128 z; z.v0 = a0 - b0; @@ -713,7 +656,7 @@ static struct uint128 softfloat_mul64To128( uint64_t a, uint64_t b ); /*---------------------------------------------------------------------------- *----------------------------------------------------------------------------*/ -float32_t f32_add( float32_t a, float32_t b ) +static float32_t f32_add( float32_t a, float32_t b ) { uint_fast32_t uiA = a.v; uint_fast32_t uiB = b.v; @@ -725,7 +668,7 @@ float32_t f32_add( float32_t a, float32_t b ) } } -float32_t f32_div( float32_t a, float32_t b ) +static float32_t f32_div( float32_t a, float32_t b ) { uint_fast32_t uiA; bool signA; @@ -823,7 +766,7 @@ float32_t f32_div( float32_t a, float32_t b ) return float32_t::fromRaw(uiZ); } -bool f32_eq( float32_t a, float32_t b ) +static bool f32_eq( float32_t a, float32_t b ) { uint_fast32_t uiA; uint_fast32_t uiB; @@ -841,26 +784,7 @@ bool f32_eq( float32_t a, float32_t b ) return (uiA == uiB) || ! 
(uint32_t) ((uiA | uiB)<<1); } -bool f32_eq_signaling( float32_t a, float32_t b ) -{ - uint_fast32_t uiA; - uint_fast32_t uiB; - - uiA = a.v; - uiB = b.v; - if ( isNaNF32UI( uiA ) || isNaNF32UI( uiB ) ) { - raiseFlags( flag_invalid ); - return false; - } - return (uiA == uiB) || ! (uint32_t) ((uiA | uiB)<<1); -} - -bool f32_isSignalingNaN( float32_t a ) -{ - return softfloat_isSigNaNF32UI( a.v ); -} - -bool f32_le( float32_t a, float32_t b ) +static bool f32_le( float32_t a, float32_t b ) { uint_fast32_t uiA; uint_fast32_t uiB; @@ -879,30 +803,7 @@ bool f32_le( float32_t a, float32_t b ) : (uiA == uiB) || (signA ^ (uiA < uiB)); } -bool f32_le_quiet( float32_t a, float32_t b ) -{ - uint_fast32_t uiA; - uint_fast32_t uiB; - bool signA, signB; - - uiA = a.v; - uiB = b.v; - if ( isNaNF32UI( uiA ) || isNaNF32UI( uiB ) ) { - if ( - softfloat_isSigNaNF32UI( uiA ) || softfloat_isSigNaNF32UI( uiB ) - ) { - raiseFlags( flag_invalid ); - } - return false; - } - signA = signF32UI( uiA ); - signB = signF32UI( uiB ); - return - (signA != signB) ? signA || ! (uint32_t) ((uiA | uiB)<<1) - : (uiA == uiB) || (signA ^ (uiA < uiB)); -} - -bool f32_lt( float32_t a, float32_t b ) +static bool f32_lt( float32_t a, float32_t b ) { uint_fast32_t uiA; uint_fast32_t uiB; @@ -921,30 +822,7 @@ bool f32_lt( float32_t a, float32_t b ) : (uiA != uiB) && (signA ^ (uiA < uiB)); } -bool f32_lt_quiet( float32_t a, float32_t b ) -{ - uint_fast32_t uiA; - uint_fast32_t uiB; - bool signA, signB; - - uiA = a.v; - uiB = b.v; - if ( isNaNF32UI( uiA ) || isNaNF32UI( uiB ) ) { - if ( - softfloat_isSigNaNF32UI( uiA ) || softfloat_isSigNaNF32UI( uiB ) - ) { - raiseFlags( flag_invalid ); - } - return false; - } - signA = signF32UI( uiA ); - signB = signF32UI( uiB ); - return - (signA != signB) ? 
signA && ((uint32_t) ((uiA | uiB)<<1) != 0) - : (uiA != uiB) && (signA ^ (uiA < uiB)); -} - -float32_t f32_mulAdd( float32_t a, float32_t b, float32_t c ) +static float32_t f32_mulAdd( float32_t a, float32_t b, float32_t c ) { uint_fast32_t uiA; uint_fast32_t uiB; @@ -956,7 +834,7 @@ float32_t f32_mulAdd( float32_t a, float32_t b, float32_t c ) return softfloat_mulAddF32( uiA, uiB, uiC, 0 ); } -float32_t f32_mul( float32_t a, float32_t b ) +static float32_t f32_mul( float32_t a, float32_t b ) { uint_fast32_t uiA; bool signA; @@ -1043,7 +921,7 @@ float32_t f32_mul( float32_t a, float32_t b ) return float32_t::fromRaw(uiZ); } -float32_t f32_rem( float32_t a, float32_t b ) +static float32_t f32_rem( float32_t a, float32_t b ) { uint_fast32_t uiA; bool signA; @@ -1163,7 +1041,7 @@ float32_t f32_rem( float32_t a, float32_t b ) return float32_t::fromRaw(uiZ); } -float32_t f32_roundToInt( float32_t a, uint_fast8_t roundingMode, bool exact ) +static float32_t f32_roundToInt( float32_t a, uint_fast8_t roundingMode, bool exact ) { uint_fast32_t uiA; int_fast16_t exp; @@ -1227,7 +1105,7 @@ float32_t f32_roundToInt( float32_t a, uint_fast8_t roundingMode, bool exact ) return float32_t::fromRaw(uiZ); } -float32_t f32_sqrt( float32_t a ) +static float32_t f32_sqrt( float32_t a ) { uint_fast32_t uiA; bool signA; @@ -1300,7 +1178,7 @@ float32_t f32_sqrt( float32_t a ) return float32_t::fromRaw(uiZ); } -float32_t f32_sub( float32_t a, float32_t b ) +static float32_t f32_sub( float32_t a, float32_t b ) { uint_fast32_t uiA; uint_fast32_t uiB; @@ -1314,7 +1192,7 @@ float32_t f32_sub( float32_t a, float32_t b ) } } -float64_t f32_to_f64( float32_t a ) +static float64_t f32_to_f64( float32_t a ) { uint_fast32_t uiA; bool sign; @@ -1359,7 +1237,7 @@ float64_t f32_to_f64( float32_t a ) return float64_t::fromRaw(uiZ); } -int_fast32_t f32_to_i32( float32_t a, uint_fast8_t roundingMode, bool exact ) +static int_fast32_t f32_to_i32( float32_t a, uint_fast8_t roundingMode, bool exact ) { 
uint_fast32_t uiA; bool sign; @@ -1397,7 +1275,7 @@ int_fast32_t f32_to_i32( float32_t a, uint_fast8_t roundingMode, bool exact ) return softfloat_roundToI32( sign, sig64, roundingMode, exact ); } -int_fast32_t f32_to_i32_r_minMag( float32_t a, bool exact ) +static int_fast32_t f32_to_i32_r_minMag( float32_t a, bool exact ) { uint_fast32_t uiA; int_fast16_t exp; @@ -1440,254 +1318,7 @@ int_fast32_t f32_to_i32_r_minMag( float32_t a, bool exact ) return sign ? -absZ : absZ; } -int_fast64_t f32_to_i64( float32_t a, uint_fast8_t roundingMode, bool exact ) -{ - uint_fast32_t uiA; - bool sign; - int_fast16_t exp; - uint_fast32_t sig; - int_fast16_t shiftDist; - uint_fast64_t sig64, extra; - struct uint64_extra sig64Extra; - - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - uiA = a.v; - sign = signF32UI( uiA ); - exp = expF32UI( uiA ); - sig = fracF32UI( uiA ); - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - shiftDist = 0xBE - exp; - if ( shiftDist < 0 ) { - raiseFlags( flag_invalid ); - return - (exp == 0xFF) && sig ? i64_fromNaN - : sign ? 
i64_fromNegOverflow : i64_fromPosOverflow; - } - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - if ( exp ) sig |= 0x00800000; - sig64 = (uint_fast64_t) sig<<40; - extra = 0; - if ( shiftDist ) { - sig64Extra = softfloat_shiftRightJam64Extra( sig64, 0, shiftDist ); - sig64 = sig64Extra.v; - extra = sig64Extra.extra; - } - return softfloat_roundToI64( sign, sig64, extra, roundingMode, exact ); -} - -int_fast64_t f32_to_i64_r_minMag( float32_t a, bool exact ) -{ - uint_fast32_t uiA; - int_fast16_t exp; - uint_fast32_t sig; - int_fast16_t shiftDist; - bool sign; - uint_fast64_t sig64; - int_fast64_t absZ; - - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - uiA = a.v; - exp = expF32UI( uiA ); - sig = fracF32UI( uiA ); - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - shiftDist = 0xBE - exp; - if ( 64 <= shiftDist ) { - if ( exact && (exp | sig) ) { - raiseFlags(flag_inexact); - } - return 0; - } - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - sign = signF32UI( uiA ); - if ( shiftDist <= 0 ) { - if ( uiA == packToF32UI( 1, 0xBE, 0 ) ) { - return -INT64_C( 0x7FFFFFFFFFFFFFFF ) - 1; - } - raiseFlags( flag_invalid ); - return - (exp == 0xFF) && sig ? i64_fromNaN - : sign ? 
i64_fromNegOverflow : i64_fromPosOverflow; - } - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - sig |= 0x00800000; - sig64 = (uint_fast64_t) sig<<40; - absZ = sig64>>shiftDist; - shiftDist = 40 - shiftDist; - if ( exact && (shiftDist < 0) && (uint32_t) (sig<<(shiftDist & 31)) ) { - raiseFlags(flag_inexact); - } - return sign ? -absZ : absZ; -} - -uint_fast32_t f32_to_ui32( float32_t a, uint_fast8_t roundingMode, bool exact ) -{ - uint_fast32_t uiA; - bool sign; - int_fast16_t exp; - uint_fast32_t sig; - uint_fast64_t sig64; - int_fast16_t shiftDist; - - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - uiA = a.v; - sign = signF32UI( uiA ); - exp = expF32UI( uiA ); - sig = fracF32UI( uiA ); - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ -#if (ui32_fromNaN != ui32_fromPosOverflow) || (ui32_fromNaN != ui32_fromNegOverflow) - if ( (exp == 0xFF) && sig ) { -#if (ui32_fromNaN == ui32_fromPosOverflow) - sign = 0; -#elif (ui32_fromNaN == ui32_fromNegOverflow) - sign = 1; -#else - raiseFlags( flag_invalid ); - return ui32_fromNaN; -#endif - } -#endif - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - if ( exp ) sig |= 0x00800000; - sig64 = (uint_fast64_t) sig<<32; - shiftDist = 0xAA - exp; - if ( 0 < shiftDist ) sig64 = softfloat_shiftRightJam64( sig64, shiftDist ); - return softfloat_roundToUI32( sign, sig64, roundingMode, exact ); -} - -uint_fast32_t f32_to_ui32_r_minMag( float32_t a, bool exact ) -{ - uint_fast32_t uiA; - int_fast16_t exp; - uint_fast32_t sig; - int_fast16_t shiftDist; - bool sign; - uint_fast32_t z; - - 
/*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - uiA = a.v; - exp = expF32UI( uiA ); - sig = fracF32UI( uiA ); - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - shiftDist = 0x9E - exp; - if ( 32 <= shiftDist ) { - if ( exact && (exp | sig) ) { - raiseFlags(flag_inexact); - } - return 0; - } - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - sign = signF32UI( uiA ); - if ( sign || (shiftDist < 0) ) { - raiseFlags( flag_invalid ); - return - (exp == 0xFF) && sig ? ui32_fromNaN - : sign ? ui32_fromNegOverflow : ui32_fromPosOverflow; - } - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - sig = (sig | 0x00800000)<<8; - z = sig>>shiftDist; - if ( exact && (z<>shiftDist; - shiftDist = 40 - shiftDist; - if ( exact && (shiftDist < 0) && (uint32_t) (sig<<(shiftDist & 31)) ) { - raiseFlags(flag_inexact); - } - return z; -} - -float64_t f64_add( float64_t a, float64_t b ) +static float64_t f64_add( float64_t a, float64_t b ) { uint_fast64_t uiA; bool signA; @@ -1705,7 +1336,7 @@ float64_t f64_add( float64_t a, float64_t b ) } } -float64_t f64_div( float64_t a, float64_t b ) +static float64_t f64_div( float64_t a, float64_t b ) { uint_fast64_t uiA; bool signA; @@ -1827,7 +1458,7 @@ float64_t f64_div( float64_t a, float64_t b ) return float64_t::fromRaw(uiZ); } -bool f64_eq( float64_t a, float64_t b ) +static bool f64_eq( float64_t a, float64_t b ) { uint_fast64_t uiA; uint_fast64_t uiB; @@ -1845,26 +1476,7 @@ bool f64_eq( float64_t a, float64_t b ) return (uiA == uiB) || ! 
((uiA | uiB) & UINT64_C( 0x7FFFFFFFFFFFFFFF )); } -bool f64_eq_signaling( float64_t a, float64_t b ) -{ - uint_fast64_t uiA; - uint_fast64_t uiB; - - uiA = a.v; - uiB = b.v; - if ( isNaNF64UI( uiA ) || isNaNF64UI( uiB ) ) { - raiseFlags( flag_invalid ); - return false; - } - return (uiA == uiB) || ! ((uiA | uiB) & UINT64_C( 0x7FFFFFFFFFFFFFFF )); -} - -bool f64_isSignalingNaN( float64_t a ) -{ - return softfloat_isSigNaNF64UI( a.v ); -} - -bool f64_le( float64_t a, float64_t b ) +static bool f64_le( float64_t a, float64_t b ) { uint_fast64_t uiA; uint_fast64_t uiB; @@ -1884,31 +1496,7 @@ bool f64_le( float64_t a, float64_t b ) : (uiA == uiB) || (signA ^ (uiA < uiB)); } -bool f64_le_quiet( float64_t a, float64_t b ) -{ - uint_fast64_t uiA; - uint_fast64_t uiB; - bool signA, signB; - - uiA = a.v; - uiB = b.v; - if ( isNaNF64UI( uiA ) || isNaNF64UI( uiB ) ) { - if ( - softfloat_isSigNaNF64UI( uiA ) || softfloat_isSigNaNF64UI( uiB ) - ) { - raiseFlags( flag_invalid ); - } - return false; - } - signA = signF64UI( uiA ); - signB = signF64UI( uiB ); - return - (signA != signB) - ? signA || ! ((uiA | uiB) & UINT64_C( 0x7FFFFFFFFFFFFFFF )) - : (uiA == uiB) || (signA ^ (uiA < uiB)); -} - -bool f64_lt( float64_t a, float64_t b ) +static bool f64_lt( float64_t a, float64_t b ) { uint_fast64_t uiA; uint_fast64_t uiB; @@ -1928,31 +1516,7 @@ bool f64_lt( float64_t a, float64_t b ) : (uiA != uiB) && (signA ^ (uiA < uiB)); } -bool f64_lt_quiet( float64_t a, float64_t b ) -{ - uint_fast64_t uiA; - uint_fast64_t uiB; - bool signA, signB; - - uiA = a.v; - uiB = b.v; - if ( isNaNF64UI( uiA ) || isNaNF64UI( uiB ) ) { - if ( - softfloat_isSigNaNF64UI( uiA ) || softfloat_isSigNaNF64UI( uiB ) - ) { - raiseFlags( flag_invalid ); - } - return false; - } - signA = signF64UI( uiA ); - signB = signF64UI( uiB ); - return - (signA != signB) - ? 
signA && ((uiA | uiB) & UINT64_C( 0x7FFFFFFFFFFFFFFF )) - : (uiA != uiB) && (signA ^ (uiA < uiB)); -} - -float64_t f64_mulAdd( float64_t a, float64_t b, float64_t c ) +static float64_t f64_mulAdd( float64_t a, float64_t b, float64_t c ) { uint_fast64_t uiA; uint_fast64_t uiB; @@ -1964,7 +1528,7 @@ float64_t f64_mulAdd( float64_t a, float64_t b, float64_t c ) return softfloat_mulAddF64( uiA, uiB, uiC, 0 ); } -float64_t f64_mul( float64_t a, float64_t b ) +static float64_t f64_mul( float64_t a, float64_t b ) { uint_fast64_t uiA; bool signA; @@ -2054,7 +1618,7 @@ float64_t f64_mul( float64_t a, float64_t b ) return float64_t::fromRaw(uiZ); } -float64_t f64_rem( float64_t a, float64_t b ) +static float64_t f64_rem( float64_t a, float64_t b ) { uint_fast64_t uiA; bool signA; @@ -2190,7 +1754,7 @@ float64_t f64_rem( float64_t a, float64_t b ) return float64_t::fromRaw(uiZ); } -float64_t f64_roundToInt( float64_t a, uint_fast8_t roundingMode, bool exact ) +static float64_t f64_roundToInt( float64_t a, uint_fast8_t roundingMode, bool exact ) { uint_fast64_t uiA; int_fast16_t exp; @@ -2254,7 +1818,7 @@ float64_t f64_roundToInt( float64_t a, uint_fast8_t roundingMode, bool exact ) return float64_t::fromRaw(uiZ); } -float64_t f64_sqrt( float64_t a ) +static float64_t f64_sqrt( float64_t a ) { uint_fast64_t uiA; bool signA; @@ -2311,273 +1875,94 @@ float64_t f64_sqrt( float64_t a ) if ( expA ) { sigA <<= 8; sig32Z >>= 1; - } else { - sigA <<= 9; - } - rem = sigA - (uint_fast64_t) sig32Z * sig32Z; - q = ((uint32_t) (rem>>2) * (uint_fast64_t) recipSqrt32)>>32; - sigZ = ((uint_fast64_t) sig32Z<<32 | 1<<5) + ((uint_fast64_t) q<<3); - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - if ( (sigZ & 0x1FF) < 1<<5 ) { - sigZ &= ~(uint_fast64_t) 0x3F; - shiftedSigZ = sigZ>>6; - rem = (sigA<<52) - shiftedSigZ * shiftedSigZ; - if ( rem & UINT64_C( 0x8000000000000000 ) ) { - --sigZ; - } 
else { - if ( rem ) sigZ |= 1; - } - } - return softfloat_roundPackToF64( 0, expZ, sigZ ); - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - invalid: - raiseFlags( flag_invalid ); - uiZ = defaultNaNF64UI; - uiZ: - return float64_t::fromRaw(uiZ); -} - -float64_t f64_sub( float64_t a, float64_t b ) -{ - uint_fast64_t uiA; - bool signA; - uint_fast64_t uiB; - bool signB; - - uiA = a.v; - signA = signF64UI( uiA ); - uiB = b.v; - signB = signF64UI( uiB ); - - if ( signA == signB ) { - return softfloat_subMagsF64( uiA, uiB, signA ); - } else { - return softfloat_addMagsF64( uiA, uiB, signA ); - } -} - -float32_t f64_to_f32( float64_t a ) -{ - uint_fast64_t uiA; - bool sign; - int_fast16_t exp; - uint_fast64_t frac; - struct commonNaN commonNaN; - uint_fast32_t uiZ, frac32; - - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - uiA = a.v; - sign = signF64UI( uiA ); - exp = expF64UI( uiA ); - frac = fracF64UI( uiA ); - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - if ( exp == 0x7FF ) { - if ( frac ) { - softfloat_f64UIToCommonNaN( uiA, &commonNaN ); - uiZ = softfloat_commonNaNToF32UI( &commonNaN ); - } else { - uiZ = packToF32UI( sign, 0xFF, 0 ); - } - goto uiZ; - } - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - frac32 = (uint_fast32_t)softfloat_shortShiftRightJam64( frac, 22 ); //fixed warning on type cast - if ( ! 
(exp | frac32) ) { - uiZ = packToF32UI( sign, 0, 0 ); - goto uiZ; - } - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - return softfloat_roundPackToF32( sign, exp - 0x381, frac32 | 0x40000000 ); - uiZ: - return float32_t::fromRaw(uiZ); -} - -int_fast32_t f64_to_i32( float64_t a, uint_fast8_t roundingMode, bool exact ) -{ - uint_fast64_t uiA; - bool sign; - int_fast16_t exp; - uint_fast64_t sig; - int_fast16_t shiftDist; - - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - uiA = a.v; - sign = signF64UI( uiA ); - exp = expF64UI( uiA ); - sig = fracF64UI( uiA ); - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ -#if (i32_fromNaN != i32_fromPosOverflow) || (i32_fromNaN != i32_fromNegOverflow) - if ( (exp == 0x7FF) && sig ) { -#if (i32_fromNaN == i32_fromPosOverflow) - sign = 0; -#elif (i32_fromNaN == i32_fromNegOverflow) - sign = 1; -#else - raiseFlags( flag_invalid ); - return i32_fromNaN; -#endif - } -#endif - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - if ( exp ) sig |= UINT64_C( 0x0010000000000000 ); - shiftDist = 0x427 - exp; - if ( 0 < shiftDist ) sig = softfloat_shiftRightJam64( sig, shiftDist ); - return softfloat_roundToI32( sign, sig, roundingMode, exact ); -} - -int_fast32_t f64_to_i32_r_minMag( float64_t a, bool exact ) -{ - uint_fast64_t uiA; - int_fast16_t exp; - uint_fast64_t sig; - int_fast16_t shiftDist; - bool sign; - int_fast32_t absZ; - - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - uiA = a.v; - exp = expF64UI( uiA ); - 
sig = fracF64UI( uiA ); - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - shiftDist = 0x433 - exp; - if ( 53 <= shiftDist ) { - if ( exact && (exp | sig) ) { - raiseFlags(flag_inexact); - } - return 0; - } - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - sign = signF64UI( uiA ); - if ( shiftDist < 22 ) { - if ( - sign && (exp == 0x41E) && (sig < UINT64_C( 0x0000000000200000 )) - ) { - if ( exact && sig ) { - raiseFlags(flag_inexact); - } - return -0x7FFFFFFF - 1; - } - raiseFlags( flag_invalid ); - return - (exp == 0x7FF) && sig ? i32_fromNaN - : sign ? i32_fromNegOverflow : i32_fromPosOverflow; + } else { + sigA <<= 9; } + rem = sigA - (uint_fast64_t) sig32Z * sig32Z; + q = ((uint32_t) (rem>>2) * (uint_fast64_t) recipSqrt32)>>32; + sigZ = ((uint_fast64_t) sig32Z<<32 | 1<<5) + ((uint_fast64_t) q<<3); /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ - sig |= UINT64_C( 0x0010000000000000 ); - absZ = (int_fast32_t)(sig>>shiftDist); //fixed warning on type cast - if ( exact && ((uint_fast64_t) (uint_fast32_t) absZ<>6; + rem = (sigA<<52) - shiftedSigZ * shiftedSigZ; + if ( rem & UINT64_C( 0x8000000000000000 ) ) { + --sigZ; + } else { + if ( rem ) sigZ |= 1; + } } - return sign ? 
-absZ : absZ; + return softfloat_roundPackToF64( 0, expZ, sigZ ); + /*------------------------------------------------------------------------ + *------------------------------------------------------------------------*/ + invalid: + raiseFlags( flag_invalid ); + uiZ = defaultNaNF64UI; + uiZ: + return float64_t::fromRaw(uiZ); } -int_fast64_t f64_to_i64( float64_t a, uint_fast8_t roundingMode, bool exact ) +static float64_t f64_sub( float64_t a, float64_t b ) { uint_fast64_t uiA; - bool sign; - int_fast16_t exp; - uint_fast64_t sig; - int_fast16_t shiftDist; - struct uint64_extra sigExtra; + bool signA; + uint_fast64_t uiB; + bool signB; - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ uiA = a.v; - sign = signF64UI( uiA ); - exp = expF64UI( uiA ); - sig = fracF64UI( uiA ); - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - if ( exp ) sig |= UINT64_C( 0x0010000000000000 ); - shiftDist = 0x433 - exp; - if ( shiftDist <= 0 ) { - if ( shiftDist < -11 ) goto invalid; - sigExtra.v = sig<<-shiftDist; - sigExtra.extra = 0; + signA = signF64UI( uiA ); + uiB = b.v; + signB = signF64UI( uiB ); + + if ( signA == signB ) { + return softfloat_subMagsF64( uiA, uiB, signA ); } else { - sigExtra = softfloat_shiftRightJam64Extra( sig, 0, shiftDist ); + return softfloat_addMagsF64( uiA, uiB, signA ); } - return - softfloat_roundToI64( - sign, sigExtra.v, sigExtra.extra, roundingMode, exact ); - - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - invalid: - raiseFlags( flag_invalid ); - return - (exp == 0x7FF) && fracF64UI( uiA ) ? i64_fromNaN - : sign ? 
i64_fromNegOverflow : i64_fromPosOverflow; } -int_fast64_t f64_to_i64_r_minMag( float64_t a, bool exact ) +static float32_t f64_to_f32( float64_t a ) { uint_fast64_t uiA; bool sign; int_fast16_t exp; - uint_fast64_t sig; - int_fast16_t shiftDist; - int_fast64_t absZ; + uint_fast64_t frac; + struct commonNaN commonNaN; + uint_fast32_t uiZ, frac32; /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ uiA = a.v; sign = signF64UI( uiA ); exp = expF64UI( uiA ); - sig = fracF64UI( uiA ); + frac = fracF64UI( uiA ); /*------------------------------------------------------------------------ *------------------------------------------------------------------------*/ - shiftDist = 0x433 - exp; - if ( shiftDist <= 0 ) { - /*-------------------------------------------------------------------- - *--------------------------------------------------------------------*/ - if ( shiftDist < -10 ) { - if ( uiA == packToF64UI( 1, 0x43E, 0 ) ) { - return -INT64_C( 0x7FFFFFFFFFFFFFFF ) - 1; - } - raiseFlags( flag_invalid ); - return - (exp == 0x7FF) && sig ? i64_fromNaN - : sign ? 
i64_fromNegOverflow : i64_fromPosOverflow; - } - /*-------------------------------------------------------------------- - *--------------------------------------------------------------------*/ - sig |= UINT64_C( 0x0010000000000000 ); - absZ = sig<<-shiftDist; - } else { - /*-------------------------------------------------------------------- - *--------------------------------------------------------------------*/ - if ( 53 <= shiftDist ) { - if ( exact && (exp | sig) ) { - raiseFlags(flag_inexact); - } - return 0; - } - /*-------------------------------------------------------------------- - *--------------------------------------------------------------------*/ - sig |= UINT64_C( 0x0010000000000000 ); - absZ = sig>>shiftDist; - if ( exact && (absZ<>shiftDist); //fixed warning on type cast - if ( exact && ((uint_fast64_t) z<>shiftDist); //fixed warning on type cast + if ( exact && ((uint_fast64_t) (uint_fast32_t) absZ<>shiftDist; - if ( exact && (uint64_t) (sig<<(-shiftDist & 63)) ) { - raiseFlags(flag_inexact); - } - } - return z; - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - invalid: - raiseFlags( flag_invalid ); - return - (exp == 0x7FF) && sig ? ui64_fromNaN - : sign ? ui64_fromNegOverflow : ui64_fromPosOverflow; + return sign ? 
-absZ : absZ; } -float32_t i32_to_f32( int32_t a ) +static float32_t i32_to_f32( int32_t a ) { bool sign; uint_fast32_t absA; @@ -2754,7 +2062,7 @@ float32_t i32_to_f32( int32_t a ) return softfloat_normRoundPackToF32( sign, 0x9C, absA ); } -float64_t i32_to_f64( int32_t a ) +static float64_t i32_to_f64( int32_t a ) { uint_fast64_t uiZ; bool sign; @@ -2775,7 +2083,7 @@ float64_t i32_to_f64( int32_t a ) return float64_t::fromRaw(uiZ); } -float32_t i64_to_f32( int64_t a ) +static float32_t i64_to_f32( int64_t a ) { bool sign; uint_fast64_t absA; @@ -2798,7 +2106,7 @@ float32_t i64_to_f32( int64_t a ) } } -float64_t i64_to_f64( int64_t a ) +static float64_t i64_to_f64( int64_t a ) { bool sign; uint_fast64_t absA; @@ -2812,7 +2120,7 @@ float64_t i64_to_f64( int64_t a ) return softfloat_normRoundPackToF64( sign, 0x43C, absA ); } -float32_t softfloat_addMagsF32( uint_fast32_t uiA, uint_fast32_t uiB ) +static float32_t softfloat_addMagsF32( uint_fast32_t uiA, uint_fast32_t uiB ) { int_fast16_t expA; uint_fast32_t sigA; @@ -2893,7 +2201,7 @@ float32_t softfloat_addMagsF32( uint_fast32_t uiA, uint_fast32_t uiB ) return float32_t::fromRaw(uiZ); } -float64_t +static float64_t softfloat_addMagsF64( uint_fast64_t uiA, uint_fast64_t uiB, bool signZ ) { int_fast16_t expA; @@ -2976,7 +2284,7 @@ float64_t return float64_t::fromRaw(uiZ); } -uint32_t softfloat_approxRecipSqrt32_1( unsigned int oddExpA, uint32_t a ) +static uint32_t softfloat_approxRecipSqrt32_1( unsigned int oddExpA, uint32_t a ) { int index; uint16_t eps, r0; @@ -3006,7 +2314,7 @@ uint32_t softfloat_approxRecipSqrt32_1( unsigned int oddExpA, uint32_t a ) | Converts the common NaN pointed to by `aPtr' into a 32-bit floating-point | NaN, and returns the bit pattern of this value as an unsigned integer. 
*----------------------------------------------------------------------------*/ -uint_fast32_t softfloat_commonNaNToF32UI( const struct commonNaN *aPtr ) +static uint_fast32_t softfloat_commonNaNToF32UI( const struct commonNaN *aPtr ) { return (uint_fast32_t) aPtr->sign<<31 | 0x7FC00000 | aPtr->v64>>41; } @@ -3015,14 +2323,14 @@ uint_fast32_t softfloat_commonNaNToF32UI( const struct commonNaN *aPtr ) | Converts the common NaN pointed to by `aPtr' into a 64-bit floating-point | NaN, and returns the bit pattern of this value as an unsigned integer. *----------------------------------------------------------------------------*/ -uint_fast64_t softfloat_commonNaNToF64UI( const struct commonNaN *aPtr ) +static uint_fast64_t softfloat_commonNaNToF64UI( const struct commonNaN *aPtr ) { return (uint_fast64_t) aPtr->sign<<63 | UINT64_C( 0x7FF8000000000000 ) | aPtr->v64>>12; } -uint_fast8_t softfloat_countLeadingZeros64( uint64_t a ) +static uint_fast8_t softfloat_countLeadingZeros64( uint64_t a ) { uint_fast8_t count; uint32_t a32; @@ -3054,7 +2362,7 @@ uint_fast8_t softfloat_countLeadingZeros64( uint64_t a ) | location pointed to by `zPtr'. If the NaN is a signaling NaN, the invalid | exception is raised. *----------------------------------------------------------------------------*/ -void softfloat_f32UIToCommonNaN( uint_fast32_t uiA, struct commonNaN *zPtr ) +static void softfloat_f32UIToCommonNaN( uint_fast32_t uiA, struct commonNaN *zPtr ) { if ( softfloat_isSigNaNF32UI( uiA ) ) { raiseFlags( flag_invalid ); @@ -3070,7 +2378,7 @@ void softfloat_f32UIToCommonNaN( uint_fast32_t uiA, struct commonNaN *zPtr ) | location pointed to by `zPtr'. If the NaN is a signaling NaN, the invalid | exception is raised. 
*----------------------------------------------------------------------------*/ -void softfloat_f64UIToCommonNaN( uint_fast64_t uiA, struct commonNaN *zPtr ) +static void softfloat_f64UIToCommonNaN( uint_fast64_t uiA, struct commonNaN *zPtr ) { if ( softfloat_isSigNaNF64UI( uiA ) ) { raiseFlags( flag_invalid ); @@ -3080,7 +2388,7 @@ void softfloat_f64UIToCommonNaN( uint_fast64_t uiA, struct commonNaN *zPtr ) zPtr->v0 = 0; } -struct uint128 softfloat_mul64To128( uint64_t a, uint64_t b ) +static struct uint128 softfloat_mul64To128( uint64_t a, uint64_t b ) { uint32_t a32, a0, b32, b0; struct uint128 z; @@ -3101,7 +2409,7 @@ struct uint128 softfloat_mul64To128( uint64_t a, uint64_t b ) return z; } -float32_t +static float32_t softfloat_mulAddF32( uint_fast32_t uiA, uint_fast32_t uiB, uint_fast32_t uiC, uint_fast8_t op ) { @@ -3279,7 +2587,7 @@ float32_t return float32_t::fromRaw(uiZ); } -float64_t +static float64_t softfloat_mulAddF64( uint_fast64_t uiA, uint_fast64_t uiB, uint_fast64_t uiC, uint_fast8_t op ) { @@ -3475,7 +2783,7 @@ float64_t return float64_t::fromRaw(uiZ); } -float32_t +static float32_t softfloat_normRoundPackToF32( bool sign, int_fast16_t exp, uint_fast32_t sig ) { int_fast8_t shiftDist; @@ -3489,7 +2797,7 @@ float32_t } } -float64_t +static float64_t softfloat_normRoundPackToF64( bool sign, int_fast16_t exp, uint_fast64_t sig ) { int_fast8_t shiftDist; @@ -3503,7 +2811,7 @@ float64_t } } -struct exp16_sig32 softfloat_normSubnormalF32Sig( uint_fast32_t sig ) +static struct exp16_sig32 softfloat_normSubnormalF32Sig( uint_fast32_t sig ) { int_fast8_t shiftDist; struct exp16_sig32 z; @@ -3514,7 +2822,7 @@ struct exp16_sig32 softfloat_normSubnormalF32Sig( uint_fast32_t sig ) return z; } -struct exp16_sig64 softfloat_normSubnormalF64Sig( uint_fast64_t sig ) +static struct exp16_sig64 softfloat_normSubnormalF64Sig( uint_fast64_t sig ) { int_fast8_t shiftDist; struct exp16_sig64 z; @@ -3531,7 +2839,7 @@ struct exp16_sig64 softfloat_normSubnormalF64Sig( 
uint_fast64_t sig ) | the combined NaN result. If either `uiA' or `uiB' has the pattern of a | signaling NaN, the invalid exception is raised. *----------------------------------------------------------------------------*/ -uint_fast32_t +static uint_fast32_t softfloat_propagateNaNF32UI( uint_fast32_t uiA, uint_fast32_t uiB ) { bool isSigNaNA; @@ -3550,7 +2858,7 @@ uint_fast32_t | the combined NaN result. If either `uiA' or `uiB' has the pattern of a | signaling NaN, the invalid exception is raised. *----------------------------------------------------------------------------*/ -uint_fast64_t +static uint_fast64_t softfloat_propagateNaNF64UI( uint_fast64_t uiA, uint_fast64_t uiB ) { bool isSigNaNA; @@ -3563,7 +2871,7 @@ uint_fast64_t return (isNaNF64UI( uiA ) ? uiA : uiB) | UINT64_C( 0x0008000000000000 ); } -float32_t +static float32_t softfloat_roundPackToF32( bool sign, int_fast16_t exp, uint_fast32_t sig ) { uint_fast8_t roundingMode; @@ -3629,7 +2937,7 @@ float32_t return float32_t::fromRaw(uiZ); } -float64_t +static float64_t softfloat_roundPackToF64( bool sign, int_fast16_t exp, uint_fast64_t sig ) { uint_fast8_t roundingMode; @@ -3699,7 +3007,7 @@ float64_t return float64_t::fromRaw(uiZ); } -int_fast32_t +static int_fast32_t softfloat_roundToI32( bool sign, uint_fast64_t sig, uint_fast8_t roundingMode, bool exact ) { @@ -3740,130 +3048,7 @@ int_fast32_t return sign ? i32_fromNegOverflow : i32_fromPosOverflow; } -int_fast64_t - softfloat_roundToI64( - bool sign, - uint_fast64_t sig, - uint_fast64_t sigExtra, - uint_fast8_t roundingMode, - bool exact - ) -{ - bool roundNearEven, doIncrement; - union { uint64_t ui; int64_t i; } uZ; - int_fast64_t z; - - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - roundNearEven = (roundingMode == round_near_even); - doIncrement = (UINT64_C( 0x8000000000000000 ) <= sigExtra); - if ( ! 
roundNearEven && (roundingMode != round_near_maxMag) ) { - doIncrement = - (roundingMode - == (sign ? round_min : round_max)) - && sigExtra; - } - if ( doIncrement ) { - ++sig; - if ( ! sig ) goto invalid; - sig &= - ~(uint_fast64_t) - (! (sigExtra & UINT64_C( 0x7FFFFFFFFFFFFFFF )) - & roundNearEven); - } - //fixed unsigned unary minus: -x == ~x + 1 - uZ.ui = sign ? (~sig + 1) : sig; - z = uZ.i; - if ( z && ((z < 0) ^ sign) ) goto invalid; - if ( exact && sigExtra ) { - raiseFlags(flag_inexact); - } - return z; - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - invalid: - raiseFlags( flag_invalid ); - return sign ? i64_fromNegOverflow : i64_fromPosOverflow; -} - -uint_fast32_t - softfloat_roundToUI32( - bool sign, uint_fast64_t sig, uint_fast8_t roundingMode, bool exact ) -{ - bool roundNearEven; - uint_fast16_t roundIncrement, roundBits; - uint_fast32_t z; - - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - roundNearEven = (roundingMode == round_near_even); - roundIncrement = 0x800; - if ( ! roundNearEven && (roundingMode != round_near_maxMag) ) { - roundIncrement = - (roundingMode - == (sign ? round_min : round_max)) - ? 0xFFF - : 0; - } - roundBits = sig & 0xFFF; - sig += roundIncrement; - if ( sig & UINT64_C( 0xFFFFF00000000000 ) ) goto invalid; - z = (uint_fast32_t)(sig>>12); //fixed warning on type cast - z &= ~(uint_fast32_t) (! (roundBits ^ 0x800) & roundNearEven); - if ( sign && z ) goto invalid; - if ( exact && roundBits ) { - raiseFlags(flag_inexact); - } - return z; - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - invalid: - raiseFlags( flag_invalid ); - return sign ? 
ui32_fromNegOverflow : ui32_fromPosOverflow; -} - -uint_fast64_t - softfloat_roundToUI64( - bool sign, - uint_fast64_t sig, - uint_fast64_t sigExtra, - uint_fast8_t roundingMode, - bool exact - ) -{ - bool roundNearEven, doIncrement; - - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - roundNearEven = (roundingMode == round_near_even); - doIncrement = (UINT64_C( 0x8000000000000000 ) <= sigExtra); - if ( ! roundNearEven && (roundingMode != round_near_maxMag) ) { - doIncrement = - (roundingMode - == (sign ? round_min : round_max)) - && sigExtra; - } - if ( doIncrement ) { - ++sig; - if ( ! sig ) goto invalid; - sig &= - ~(uint_fast64_t) - (! (sigExtra & UINT64_C( 0x7FFFFFFFFFFFFFFF )) - & roundNearEven); - } - if ( sign && sig ) goto invalid; - if ( exact && sigExtra ) { - raiseFlags(flag_inexact); - } - return sig; - /*------------------------------------------------------------------------ - *------------------------------------------------------------------------*/ - invalid: - raiseFlags( flag_invalid ); - return sign ? ui64_fromNegOverflow : ui64_fromPosOverflow; -} - -struct uint128 +static struct uint128 softfloat_shiftRightJam128( uint64_t a64, uint64_t a0, uint_fast32_t dist ) { uint_fast8_t u8NegDist; @@ -3888,7 +3073,7 @@ struct uint128 return z; } -float32_t softfloat_subMagsF32( uint_fast32_t uiA, uint_fast32_t uiB ) +static float32_t softfloat_subMagsF32( uint_fast32_t uiA, uint_fast32_t uiB ) { int_fast16_t expA; uint_fast32_t sigA; @@ -3985,7 +3170,7 @@ float32_t softfloat_subMagsF32( uint_fast32_t uiA, uint_fast32_t uiB ) return float32_t::fromRaw(uiZ); } -float64_t +static float64_t softfloat_subMagsF64( uint_fast64_t uiA, uint_fast64_t uiB, bool signZ ) { int_fast16_t expA; @@ -4080,7 +3265,7 @@ float64_t return float64_t::fromRaw(uiZ); } -float32_t ui32_to_f32( uint32_t a ) +static float32_t ui32_to_f32( uint32_t a ) { if ( ! 
a ) { return float32_t::fromRaw(0); @@ -4092,7 +3277,7 @@ float32_t ui32_to_f32( uint32_t a ) } } -float64_t ui32_to_f64( uint32_t a ) +static float64_t ui32_to_f64( uint32_t a ) { uint_fast64_t uiZ; int_fast8_t shiftDist; @@ -4107,7 +3292,7 @@ float64_t ui32_to_f64( uint32_t a ) return float64_t::fromRaw(uiZ); } -float32_t ui64_to_f32( uint64_t a ) +static float32_t ui64_to_f32( uint64_t a ) { int_fast8_t shiftDist; uint_fast32_t sig; @@ -4124,7 +3309,7 @@ float32_t ui64_to_f32( uint64_t a ) } } -float64_t ui64_to_f64( uint64_t a ) +static float64_t ui64_to_f64( uint64_t a ) { if ( ! a ) { return float64_t::fromRaw(0); @@ -4222,7 +3407,7 @@ static const float64_t exp_prescale = float64_t::fromRaw(0x3ff71547652b82fe) * f static const float64_t exp_postscale = float64_t::one()/float64_t(1 << EXPTAB_SCALE); static const float64_t exp_max_val(3000*(1 << EXPTAB_SCALE)); // log10(DBL_MAX) < 3000 -float32_t f32_exp( float32_t x) +static float32_t f32_exp( float32_t x) { //special cases if(x.isNaN()) return float32_t::nan(); @@ -4250,7 +3435,7 @@ float32_t f32_exp( float32_t x) return (buf * EXPPOLY_32F_A0 * float64_t::fromRaw(expTab[val0 & EXPTAB_MASK]) * ((((x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4)); } -float64_t f64_exp(float64_t x) +static float64_t f64_exp(float64_t x) { //special cases if(x.isNaN()) return float64_t::nan(); @@ -4550,7 +3735,7 @@ static const uint64_t CV_DECL_ALIGNED(16) icvLogTab[] = { // 0.69314718055994530941723212145818 static const float64_t ln_2 = float64_t::fromRaw(0x3fe62e42fefa39ef); -float32_t f32_log(float32_t x) +static float32_t f32_log(float32_t x) { //special cases if(x.isNaN() || x < float32_t::zero()) return float32_t::nan(); @@ -4574,7 +3759,7 @@ float32_t f32_log(float32_t x) return y0; } -float64_t f64_log(float64_t x) +static float64_t f64_log(float64_t x) { //special cases if(x.isNaN() || x < float64_t::zero()) return float64_t::nan(); @@ -4612,7 +3797,7 @@ float64_t f64_log(float64_t x) Fast cube root by Ken Turkowski 
(http://www.worldserver.com/turk/computergraphics/papers.html) \* ************************************************************************** */ -float32_t f32_cbrt(float32_t x) +static float32_t f32_cbrt(float32_t x) { //special cases if (x.isNaN()) return float32_t::nan(); @@ -4649,9 +3834,9 @@ float32_t f32_cbrt(float32_t x) /// POW functions /// -float32_t f32_pow( float32_t x, float32_t y) +static float32_t f32_pow( float32_t x, float32_t y) { - const float32_t zero = float32_t::zero(), one = float32_t::one(), inf = float32_t::inf(), nan = float32_t::nan(); + static const float32_t zero = float32_t::zero(), one = float32_t::one(), inf = float32_t::inf(), nan = float32_t::nan(); bool xinf = x.isInf(), yinf = y.isInf(), xnan = x.isNaN(), ynan = y.isNaN(); float32_t ax = abs(x); bool useInf = (y > zero) == (ax > one); @@ -4676,9 +3861,9 @@ float32_t f32_pow( float32_t x, float32_t y) return v; } -float64_t f64_pow( float64_t x, float64_t y) +static float64_t f64_pow( float64_t x, float64_t y) { - const float64_t zero = float64_t::zero(), one = float64_t::one(), inf = float64_t::inf(), nan = float64_t::nan(); + static const float64_t zero = float64_t::zero(), one = float64_t::one(), inf = float64_t::inf(), nan = float64_t::nan(); bool xinf = x.isInf(), yinf = y.isInf(), xnan = x.isNaN(), ynan = y.isNaN(); float64_t ax = abs(x); bool useInf = (y > zero) == (ax > one); @@ -4705,7 +3890,7 @@ float64_t f64_pow( float64_t x, float64_t y) // These functions are for internal use only -float32_t f32_powi( float32_t x, int y) +static float32_t f32_powi( float32_t x, int y) { float32_t v; //special case: (0 ** 0) == 1 @@ -4731,7 +3916,7 @@ float32_t f32_powi( float32_t x, int y) return v; } -float64_t f64_powi( float64_t x, int y) +static float64_t f64_powi( float64_t x, int y) { float64_t v; //special case: (0 ** 0) == 1 diff --git a/modules/imgproc/src/color.cpp b/modules/imgproc/src/color.cpp index b40726dd96..283727bd66 100644 --- a/modules/imgproc/src/color.cpp +++ 
b/modules/imgproc/src/color.cpp @@ -94,6 +94,8 @@ #include "opencl_kernels_imgproc.hpp" #include #include "hal_replacement.hpp" +#include "opencv2/core/hal/intrin.hpp" +#include "opencv2/core/softfloat.hpp" #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n)) @@ -138,6 +140,8 @@ const int CB2GI = -5636; const int CR2GI = -11698; const int CR2RI = 22987; +static const double _1_3 = 0.333333333333; +static const float _1_3f = static_cast(_1_3); // computes cubic spline coefficients for a function: (xi=i, yi=f[i]), i=0..n template static void splineBuild(const _Tp* f, int n, _Tp* tab) @@ -164,6 +168,32 @@ template static void splineBuild(const _Tp* f, int n, _Tp* tab) } } +static void splineBuild(const softfloat* f, int n, float* tab) +{ + const softfloat f2(2), f3(3), f4(4); + softfloat cn(0); + softfloat* sftab = reinterpret_cast(tab); + int i; + tab[0] = tab[1] = 0.0f; + + for(i = 1; i < n-1; i++) + { + softfloat t = (f[i+1] - f[i]*f2 + f[i-1])*f3; + softfloat l = softfloat::one()/(f4 - sftab[(i-1)*4]); + sftab[i*4] = l; sftab[i*4+1] = (t - sftab[(i-1)*4+1])*l; + } + + for(i = n-1; i >= 0; i--) + { + softfloat c = sftab[i*4+1] - sftab[i*4]*cn; + softfloat b = f[i+1] - f[i] - (cn + c*f2)/f3; + softfloat d = (cn - c)/f3; + sftab[i*4] = f[i]; sftab[i*4+1] = b; + sftab[i*4+2] = c; sftab[i*4+3] = d; + cn = c; + } +} + // interpolates value of a function at x, 0 <= x <= n using a cubic spline. 
template static inline _Tp splineInterpolate(_Tp x, const _Tp* tab, int n) { @@ -3454,7 +3484,7 @@ static const float sRGB2XYZ_D65[] = static const float XYZ2sRGB_D65[] = { 3.240479f, -1.53715f, -0.498535f, - -0.969256f, 1.875991f, 0.041556f, + -0.969256f, 1.875991f, 0.041556f, 0.055648f, -0.204043f, 1.057311f }; @@ -5781,19 +5811,20 @@ struct HLS2RGB_b #endif }; - ///////////////////////////////////// RGB <-> L*a*b* ///////////////////////////////////// static const float D65[] = { 0.950456f, 1.f, 1.088754f }; enum { LAB_CBRT_TAB_SIZE = 1024, GAMMA_TAB_SIZE = 1024 }; static float LabCbrtTab[LAB_CBRT_TAB_SIZE*4]; -static const float LabCbrtTabScale = LAB_CBRT_TAB_SIZE/1.5f; +static const float LabCbrtTabScale = softfloat(LAB_CBRT_TAB_SIZE*2)/softfloat(3); static float sRGBGammaTab[GAMMA_TAB_SIZE*4], sRGBInvGammaTab[GAMMA_TAB_SIZE*4]; -static const float GammaTabScale = (float)GAMMA_TAB_SIZE; +static const float GammaTabScale((int)GAMMA_TAB_SIZE); static ushort sRGBGammaTab_b[256], linearGammaTab_b[256]; +enum { inv_gamma_shift = 12, INV_GAMMA_TAB_SIZE = (1 << inv_gamma_shift) }; +static ushort sRGBInvGammaTab_b[INV_GAMMA_TAB_SIZE], linearInvGammaTab_b[INV_GAMMA_TAB_SIZE]; #undef lab_shift #define lab_shift xyz_shift #define gamma_shift 3 @@ -5801,46 +5832,150 @@ static ushort sRGBGammaTab_b[256], linearGammaTab_b[256]; #define LAB_CBRT_TAB_SIZE_B (256*3/2*(1< 1.0f ? 1.0f : value; + +//all constants should be presented through integers to keep bit-exactness +static const softdouble gammaThreshold = softdouble(809)/softdouble(20000); // 0.04045 +static const softdouble gammaInvThreshold = softdouble(7827)/softdouble(2500000); // 0.0031308 +static const softdouble gammaLowScale = softdouble(323)/softdouble(25); // 12.92 +static const softdouble gammaPower = softdouble(12)/softdouble(5); // 2.4 +static const softdouble gammaXshift = softdouble(11)/softdouble(200); // 0.055 + +static inline softfloat applyGamma(softfloat x) +{ + //return x <= 0.04045f ? 
x*(1.f/12.92f) : (float)std::pow((double)(x + 0.055)*(1./1.055), 2.4); + softdouble xd = x; + return (xd <= gammaThreshold ? + xd/gammaLowScale : + pow((xd + gammaXshift)/(softdouble::one()+gammaXshift), gammaPower)); +} + +static inline softfloat applyInvGamma(softfloat x) +{ + //return x <= 0.0031308 ? x*12.92f : (float)(1.055*std::pow((double)x, 1./2.4) - 0.055); + softdouble xd = x; + return (xd <= gammaInvThreshold ? + xd*gammaLowScale : + pow(xd, softdouble::one()/gammaPower)*(softdouble::one()+gammaXshift) - gammaXshift); +} + static void initLabTabs() { static bool initialized = false; if(!initialized) { - float f[LAB_CBRT_TAB_SIZE+1], g[GAMMA_TAB_SIZE+1], ig[GAMMA_TAB_SIZE+1], scale = 1.f/LabCbrtTabScale; + static const softfloat lthresh = softfloat(216) / softfloat(24389); // 0.008856f = (6/29)^3 + static const softfloat lscale = softfloat(841) / softfloat(108); // 7.787f = (29/3)^3/(29*4) + static const softfloat lbias = softfloat(16) / softfloat(116); + static const softfloat f255(255); + + softfloat f[LAB_CBRT_TAB_SIZE+1], g[GAMMA_TAB_SIZE+1], ig[GAMMA_TAB_SIZE+1]; + softfloat scale = softfloat::one()/softfloat(LabCbrtTabScale); int i; for(i = 0; i <= LAB_CBRT_TAB_SIZE; i++) { - float x = i*scale; - f[i] = x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x); + softfloat x = scale*softfloat(i); + f[i] = x < lthresh ? mulAdd(x, lscale, lbias) : cbrt(x); } splineBuild(f, LAB_CBRT_TAB_SIZE, LabCbrtTab); - scale = 1.f/GammaTabScale; + scale = softfloat::one()/softfloat(GammaTabScale); for(i = 0; i <= GAMMA_TAB_SIZE; i++) { - float x = i*scale; - g[i] = x <= 0.04045f ? x*(1.f/12.92f) : (float)std::pow((double)(x + 0.055)*(1./1.055), 2.4); - ig[i] = x <= 0.0031308 ? 
x*12.92f : (float)(1.055*std::pow((double)x, 1./2.4) - 0.055); + softfloat x = scale*softfloat(i); + g[i] = applyGamma(x); + ig[i] = applyInvGamma(x); } splineBuild(g, GAMMA_TAB_SIZE, sRGBGammaTab); splineBuild(ig, GAMMA_TAB_SIZE, sRGBInvGammaTab); + static const softfloat intScale(255*(1 << gamma_shift)); for(i = 0; i < 256; i++) { - float x = i*(1.f/255.f); - sRGBGammaTab_b[i] = saturate_cast(255.f*(1 << gamma_shift)*(x <= 0.04045f ? x*(1.f/12.92f) : (float)std::pow((double)(x + 0.055)*(1./1.055), 2.4))); + softfloat x = softfloat(i)/f255; + sRGBGammaTab_b[i] = (ushort)(cvRound(intScale*applyGamma(x))); linearGammaTab_b[i] = (ushort)(i*(1 << gamma_shift)); } + static const softfloat invScale = softfloat::one()/softfloat((int)INV_GAMMA_TAB_SIZE); + for(i = 0; i < INV_GAMMA_TAB_SIZE; i++) + { + softfloat x = invScale*softfloat(i); + sRGBInvGammaTab_b[i] = (ushort)(cvRound(f255*applyInvGamma(x))); + linearInvGammaTab_b[i] = (ushort)(cvTrunc(f255*x)); + } + static const softfloat cbTabScale(softfloat::one()/(f255*(1 << gamma_shift))); + static const softfloat lshift2(1 << lab_shift2); for(i = 0; i < LAB_CBRT_TAB_SIZE_B; i++) { - float x = i*(1.f/(255.f*(1 << gamma_shift))); - LabCbrtTab_b[i] = saturate_cast((1 << lab_shift2)*(x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x))); + softfloat x = cbTabScale*softfloat(i); + LabCbrtTab_b[i] = (ushort)(cvRound(lshift2 * (x < lthresh ? 
mulAdd(x, lscale, lbias) : cbrt(x)))); } + + //Lookup table for L to y and ify calculations + static const int BASE = (1 << 14); + for(i = 0; i < 256; i++) + { + int y, ify; + //8 * 255.0 / 100.0 == 20.4 + if( i <= 20) + { + //yy = li / 903.3f; + //y = L*100/903.3f; 903.3f = (29/3)^3, 255 = 17*3*5 + y = cvRound(softfloat(i*BASE*20*9)/softfloat(17*29*29*29)); + //fy = 7.787f * yy + 16.0f / 116.0f; 7.787f = (29/3)^3/(29*4) + ify = cvRound(softfloat(BASE)*(softfloat(16)/softfloat(116) + softfloat(i*5)/softfloat(3*17*29))); + } + else + { + //fy = (li + 16.0f) / 116.0f; + softfloat fy = (softfloat(i*100*BASE)/softfloat(255*116) + + softfloat(16*BASE)/softfloat(116)); + ify = cvRound(fy); + //yy = fy * fy * fy; + y = cvRound(fy*fy*fy/softfloat(BASE*BASE)); + } + + LabToYF_b[i*2 ] = (ushort)y; // 2260 <= y <= BASE + LabToYF_b[i*2+1] = (ushort)ify; // 0 <= ify <= BASE + } + + //Lookup table for a,b to x,z conversion + for(i = minABvalue; i < LAB_BASE*9/4+minABvalue; i++) + { + int v; + //6.f/29.f*BASE = 3389.730 + if(i <= 3390) + { + //fxz[k] = (fxz[k] - 16.0f / 116.0f) / 7.787f; + // 7.787f = (29/3)^3/(29*4) + v = i*108/841 - BASE*16/116*108/841; + } + else + { + //fxz[k] = fxz[k] * fxz[k] * fxz[k]; + v = i*i/BASE*i/BASE; + } + abToXZ_b[i-minABvalue] = v; // -1335 <= v <= 88231 + } + initialized = true; } } + struct RGB2Lab_b { typedef uchar channel_type; @@ -5857,21 +5992,15 @@ struct RGB2Lab_b if (!_whitept) _whitept = D65; - float scale[] = - { - (1 << lab_shift)/_whitept[0], - (float)(1 << lab_shift), - (1 << lab_shift)/_whitept[2] - }; - + static const softfloat lshift(1 << lab_shift); for( int i = 0; i < _3; i++ ) { - coeffs[i*3+(blueIdx^2)] = cvRound(_coeffs[i*3]*scale[i]); - coeffs[i*3+1] = cvRound(_coeffs[i*3+1]*scale[i]); - coeffs[i*3+blueIdx] = cvRound(_coeffs[i*3+2]*scale[i]); + coeffs[i*3+(blueIdx^2)] = cvRound((lshift*softfloat(_coeffs[i*3 ]))/softfloat(_whitept[i])); + coeffs[i*3+1] = cvRound((lshift*softfloat(_coeffs[i*3+1]))/softfloat(_whitept[i])); + 
coeffs[i*3+blueIdx] = cvRound((lshift*softfloat(_coeffs[i*3+2]))/softfloat(_whitept[i])); - CV_Assert( coeffs[i] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 && - coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 2*(1 << lab_shift) ); + CV_Assert(coeffs[i*3] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 && + coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 2*(1 << lab_shift)); } } @@ -5886,7 +6015,8 @@ struct RGB2Lab_b C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; n *= 3; - for( i = 0; i < n; i += 3, src += scn ) + i = 0; + for(; i < n; i += 3, src += scn ) { int R = tab[src[0]], G = tab[src[1]], B = tab[src[2]]; int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)]; @@ -5909,9 +6039,6 @@ struct RGB2Lab_b }; -#define clip(value) \ - value < 0.0f ? 0.0f : value > 1.0f ? 1.0f : value; - struct RGB2Lab_f { typedef float channel_type; @@ -5928,17 +6055,22 @@ struct RGB2Lab_f if (!_whitept) _whitept = D65; - float scale[] = { 1.0f / _whitept[0], 1.0f, 1.0f / _whitept[2] }; + softfloat scale[] = { softfloat::one() / softfloat(_whitept[0]), + softfloat::one(), + softfloat::one() / softfloat(_whitept[2]) }; for( int i = 0; i < _3; i++ ) { int j = i * 3; - coeffs[j + (blueIdx ^ 2)] = _coeffs[j] * scale[i]; - coeffs[j + 1] = _coeffs[j + 1] * scale[i]; - coeffs[j + blueIdx] = _coeffs[j + 2] * scale[i]; + softfloat c0 = scale[i] * softfloat(_coeffs[j ]); + softfloat c1 = scale[i] * softfloat(_coeffs[j + 1]); + softfloat c2 = scale[i] * softfloat(_coeffs[j + 2]); + coeffs[j + (blueIdx ^ 2)] = c0; + coeffs[j + 1] = c1; + coeffs[j + blueIdx] = c2; - CV_Assert( coeffs[j] >= 0 && coeffs[j + 1] >= 0 && coeffs[j + 2] >= 0 && - coeffs[j] + coeffs[j + 1] + coeffs[j + 2] < 1.5f*LabCbrtTabScale ); + CV_Assert( c0 >= 0 && c1 >= 0 && c2 >= 0 && + c0 + c1 + c2 < softfloat((int)LAB_CBRT_TAB_SIZE) ); } } @@ -5952,8 +6084,7 @@ struct RGB2Lab_f C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; n *= 3; - static const float _1_3 = 1.0f / 3.0f; - static const float _a = 16.0f / 116.0f; 
+ static const float _a = (softfloat(16) / softfloat(116)); for (i = 0; i < n; i += 3, src += scn ) { float R = clip(src[0]); @@ -5969,10 +6100,10 @@ struct RGB2Lab_f float X = R*C0 + G*C1 + B*C2; float Y = R*C3 + G*C4 + B*C5; float Z = R*C6 + G*C7 + B*C8; - - float FX = X > 0.008856f ? std::pow(X, _1_3) : (7.787f * X + _a); - float FY = Y > 0.008856f ? std::pow(Y, _1_3) : (7.787f * Y + _a); - float FZ = Z > 0.008856f ? std::pow(Z, _1_3) : (7.787f * Z + _a); + // 7.787f = (29/3)^3/(29*4), 0.008856f = (6/29)^3, 903.3 = (29/3)^3 + float FX = X > 0.008856f ? std::pow(X, _1_3f) : (7.787f * X + _a); + float FY = Y > 0.008856f ? std::pow(Y, _1_3f) : (7.787f * Y + _a); + float FZ = Z > 0.008856f ? std::pow(Z, _1_3f) : (7.787f * Z + _a); float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y); float a = 500.f * (FX - FY); @@ -5989,13 +6120,15 @@ struct RGB2Lab_f bool srgb; }; -struct Lab2RGB_f + +// Performs conversion in floats +struct Lab2RGBfloat { typedef float channel_type; - Lab2RGB_f( int _dstcn, int blueIdx, const float* _coeffs, + Lab2RGBfloat( int _dstcn, int _blueIdx, const float* _coeffs, const float* _whitept, bool _srgb ) - : dstcn(_dstcn), srgb(_srgb) + : dstcn(_dstcn), srgb(_srgb), blueIdx(_blueIdx) { initLabTabs(); @@ -6006,13 +6139,14 @@ struct Lab2RGB_f for( int i = 0; i < 3; i++ ) { - coeffs[i+(blueIdx^2)*3] = _coeffs[i]*_whitept[i]; - coeffs[i+3] = _coeffs[i+3]*_whitept[i]; - coeffs[i+blueIdx*3] = _coeffs[i+6]*_whitept[i]; + coeffs[i+(blueIdx^2)*3] = (softfloat(_coeffs[i] )*softfloat(_whitept[i])); + coeffs[i+3] = (softfloat(_coeffs[i+3])*softfloat(_whitept[i])); + coeffs[i+blueIdx*3] = (softfloat(_coeffs[i+6])*softfloat(_whitept[i])); } - lThresh = 0.008856f * 903.3f; - fThresh = 7.787f * 0.008856f + 16.0f / 116.0f; + lThresh = softfloat(8); // 0.008856f * 903.3f = (6/29)^3*(29/3)^3 = 8 + fThresh = softfloat(6)/softfloat(29); // 7.787f * 0.008856f + 16.0f / 116.0f = 6/29 + #if CV_SSE2 haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif @@ 
-6022,6 +6156,7 @@ struct Lab2RGB_f void process(__m128& v_li0, __m128& v_li1, __m128& v_ai0, __m128& v_ai1, __m128& v_bi0, __m128& v_bi1) const { + // 903.3 = (29/3)^3, 7.787 = (29/3)^3/(29*4) __m128 v_y00 = _mm_mul_ps(v_li0, _mm_set1_ps(1.0f/903.3f)); __m128 v_y01 = _mm_mul_ps(v_li1, _mm_set1_ps(1.0f/903.3f)); __m128 v_fy00 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(7.787f), v_y00), _mm_set1_ps(16.0f/116.0f)); @@ -6187,6 +6322,7 @@ struct Lab2RGB_f float ai = src[i + 1]; float bi = src[i + 2]; + // 903.3 = (29/3)^3, 7.787 = (29/3)^3/(29*4) float y, fy; if (li <= lThresh) { @@ -6207,7 +6343,6 @@ struct Lab2RGB_f else fxz[j] = fxz[j] * fxz[j] * fxz[j]; - float x = fxz[0], z = fxz[1]; float ro = C0 * x + C1 * y + C2 * z; float go = C3 * x + C4 * y + C5 * z; @@ -6237,18 +6372,391 @@ struct Lab2RGB_f #if CV_SSE2 bool haveSIMD; #endif + int blueIdx; +}; + + +// Performs conversion in integers +struct Lab2RGBinteger +{ + typedef uchar channel_type; + + static const int base_shift = 14; + static const int BASE = (1 << base_shift); + // lThresh == (6/29)^3 * (29/3)^3 * BASE/100 + static const int lThresh = 1311; + // fThresh == ((29/3)^3/(29*4) * (6/29)^3 + 16/116)*BASE + static const int fThresh = 3390; + // base16_116 == BASE*16/116 + static const int base16_116 = 2260; + static const int shift = lab_shift+(base_shift-inv_gamma_shift); + + Lab2RGBinteger( int _dstcn, int blueIdx, const float* _coeffs, + const float* _whitept, bool srgb ) + : dstcn(_dstcn) + { + if(!_coeffs) + _coeffs = XYZ2sRGB_D65; + if(!_whitept) + _whitept = D65; + + static const softfloat lshift(1 << lab_shift); + for(int i = 0; i < 3; i++) + { + coeffs[i+(blueIdx)*3] = cvRound(lshift*softfloat(_coeffs[i ])*softfloat(_whitept[i])); + coeffs[i+3] = cvRound(lshift*softfloat(_coeffs[i+3])*softfloat(_whitept[i])); + coeffs[i+(blueIdx^2)*3] = cvRound(lshift*softfloat(_coeffs[i+6])*softfloat(_whitept[i])); + } + + tab = srgb ? 
sRGBInvGammaTab_b : linearInvGammaTab_b; + } + + // L, a, b should be in their natural range + inline void process(const uchar LL, const uchar aa, const uchar bb, int& ro, int& go, int& bo) const + { + int x, y, z; + int ify; + + y = LabToYF_b[LL*2 ]; + ify = LabToYF_b[LL*2+1]; + + //float fxz[] = { ai / 500.0f + fy, fy - bi / 200.0f }; + int adiv, bdiv; + adiv = aa*BASE/500 - 128*BASE/500, bdiv = bb*BASE/200 - 128*BASE/200; + + int ifxz[] = {ify + adiv, ify - bdiv}; + + for(int k = 0; k < 2; k++) + { + int& v = ifxz[k]; + v = abToXZ_b[v-minABvalue]; + } + x = ifxz[0]; /* y = y */; z = ifxz[1]; + + int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2]; + int C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5]; + int C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; + + ro = CV_DESCALE(C0 * x + C1 * y + C2 * z, shift); + go = CV_DESCALE(C3 * x + C4 * y + C5 * z, shift); + bo = CV_DESCALE(C6 * x + C7 * y + C8 * z, shift); + + ro = max(0, min((int)INV_GAMMA_TAB_SIZE-1, ro)); + go = max(0, min((int)INV_GAMMA_TAB_SIZE-1, go)); + bo = max(0, min((int)INV_GAMMA_TAB_SIZE-1, bo)); + + ro = tab[ro]; + go = tab[go]; + bo = tab[bo]; + } + + // L, a, b should be in their natural range + inline void processLabToXYZ(const v_uint8x16& lv, const v_uint8x16& av, const v_uint8x16& bv, + v_int32x4& xiv00, v_int32x4& yiv00, v_int32x4& ziv00, + v_int32x4& xiv01, v_int32x4& yiv01, v_int32x4& ziv01, + v_int32x4& xiv10, v_int32x4& yiv10, v_int32x4& ziv10, + v_int32x4& xiv11, v_int32x4& yiv11, v_int32x4& ziv11) const + { + v_uint16x8 lv0, lv1; + v_expand(lv, lv0, lv1); + // Load Y and IFY values from lookup-table + // y = LabToYF_b[L_value*2], ify = LabToYF_b[L_value*2 + 1] + // LabToYF_b[i*2 ] = y; // 2260 <= y <= BASE + // LabToYF_b[i*2+1] = ify; // 0 <= ify <= BASE + uint16_t CV_DECL_ALIGNED(16) v_lv0[8], v_lv1[8]; + v_store_aligned(v_lv0, (lv0 << 1)); v_store_aligned(v_lv1, (lv1 << 1)); + v_int16x8 ify0, ify1; + + yiv00 = v_int32x4(LabToYF_b[v_lv0[0] ], LabToYF_b[v_lv0[1] ], LabToYF_b[v_lv0[2] ], 
LabToYF_b[v_lv0[3] ]); + yiv01 = v_int32x4(LabToYF_b[v_lv0[4] ], LabToYF_b[v_lv0[5] ], LabToYF_b[v_lv0[6] ], LabToYF_b[v_lv0[7] ]); + yiv10 = v_int32x4(LabToYF_b[v_lv1[0] ], LabToYF_b[v_lv1[1] ], LabToYF_b[v_lv1[2] ], LabToYF_b[v_lv1[3] ]); + yiv11 = v_int32x4(LabToYF_b[v_lv1[4] ], LabToYF_b[v_lv1[5] ], LabToYF_b[v_lv1[6] ], LabToYF_b[v_lv1[7] ]); + + ify0 = v_int16x8(LabToYF_b[v_lv0[0]+1], LabToYF_b[v_lv0[1]+1], LabToYF_b[v_lv0[2]+1], LabToYF_b[v_lv0[3]+1], + LabToYF_b[v_lv0[4]+1], LabToYF_b[v_lv0[5]+1], LabToYF_b[v_lv0[6]+1], LabToYF_b[v_lv0[7]+1]); + ify1 = v_int16x8(LabToYF_b[v_lv1[0]+1], LabToYF_b[v_lv1[1]+1], LabToYF_b[v_lv1[2]+1], LabToYF_b[v_lv1[3]+1], + LabToYF_b[v_lv1[4]+1], LabToYF_b[v_lv1[5]+1], LabToYF_b[v_lv1[6]+1], LabToYF_b[v_lv1[7]+1]); + + v_int16x8 adiv0, adiv1, bdiv0, bdiv1; + v_uint16x8 av0, av1, bv0, bv1; + v_expand(av, av0, av1); v_expand(bv, bv0, bv1); + //adiv = aa*BASE/500 - 128*BASE/500, bdiv = bb*BASE/200 - 128*BASE/200; + //approximations with reasonable precision + v_uint16x8 mulA = v_setall_u16(53687); + v_uint32x4 ma00, ma01, ma10, ma11; + v_uint32x4 addA = v_setall_u32(1 << 7); + v_mul_expand((av0 + (av0 << 2)), mulA, ma00, ma01); + v_mul_expand((av1 + (av1 << 2)), mulA, ma10, ma11); + adiv0 = v_reinterpret_as_s16(v_pack(((ma00 + addA) >> 13), ((ma01 + addA) >> 13))); + adiv1 = v_reinterpret_as_s16(v_pack(((ma10 + addA) >> 13), ((ma11 + addA) >> 13))); + + v_uint16x8 mulB = v_setall_u16(41943); + v_uint32x4 mb00, mb01, mb10, mb11; + v_uint32x4 addB = v_setall_u32(1 << 4); + v_mul_expand(bv0, mulB, mb00, mb01); + v_mul_expand(bv1, mulB, mb10, mb11); + bdiv0 = v_reinterpret_as_s16(v_pack((mb00 + addB) >> 9, (mb01 + addB) >> 9)); + bdiv1 = v_reinterpret_as_s16(v_pack((mb10 + addB) >> 9, (mb11 + addB) >> 9)); + + // 0 <= adiv <= 8356, 0 <= bdiv <= 20890 + /* x = ifxz[0]; y = y; z = ifxz[1]; */ + v_uint16x8 xiv0, xiv1, ziv0, ziv1; + v_int16x8 vSubA = v_setall_s16(-128*BASE/500 - minABvalue), vSubB = v_setall_s16(128*BASE/200-1 - 
minABvalue); + + // int ifxz[] = {ify + adiv, ify - bdiv}; + // ifxz[k] = abToXZ_b[ifxz[k]-minABvalue]; + xiv0 = v_reinterpret_as_u16(v_add_wrap(v_add_wrap(ify0, adiv0), vSubA)); + xiv1 = v_reinterpret_as_u16(v_add_wrap(v_add_wrap(ify1, adiv1), vSubA)); + ziv0 = v_reinterpret_as_u16(v_add_wrap(v_sub_wrap(ify0, bdiv0), vSubB)); + ziv1 = v_reinterpret_as_u16(v_add_wrap(v_sub_wrap(ify1, bdiv1), vSubB)); + + uint16_t CV_DECL_ALIGNED(16) v_x0[8], v_x1[8], v_z0[8], v_z1[8]; + v_store_aligned(v_x0, xiv0 ); v_store_aligned(v_x1, xiv1 ); + v_store_aligned(v_z0, ziv0 ); v_store_aligned(v_z1, ziv1 ); + + xiv00 = v_int32x4(abToXZ_b[v_x0[0]], abToXZ_b[v_x0[1]], abToXZ_b[v_x0[2]], abToXZ_b[v_x0[3]]); + xiv01 = v_int32x4(abToXZ_b[v_x0[4]], abToXZ_b[v_x0[5]], abToXZ_b[v_x0[6]], abToXZ_b[v_x0[7]]); + xiv10 = v_int32x4(abToXZ_b[v_x1[0]], abToXZ_b[v_x1[1]], abToXZ_b[v_x1[2]], abToXZ_b[v_x1[3]]); + xiv11 = v_int32x4(abToXZ_b[v_x1[4]], abToXZ_b[v_x1[5]], abToXZ_b[v_x1[6]], abToXZ_b[v_x1[7]]); + ziv00 = v_int32x4(abToXZ_b[v_z0[0]], abToXZ_b[v_z0[1]], abToXZ_b[v_z0[2]], abToXZ_b[v_z0[3]]); + ziv01 = v_int32x4(abToXZ_b[v_z0[4]], abToXZ_b[v_z0[5]], abToXZ_b[v_z0[6]], abToXZ_b[v_z0[7]]); + ziv10 = v_int32x4(abToXZ_b[v_z1[0]], abToXZ_b[v_z1[1]], abToXZ_b[v_z1[2]], abToXZ_b[v_z1[3]]); + ziv11 = v_int32x4(abToXZ_b[v_z1[4]], abToXZ_b[v_z1[5]], abToXZ_b[v_z1[6]], abToXZ_b[v_z1[7]]); + // abToXZ_b[i-minABvalue] = v; // -1335 <= v <= 88231 + } + + void operator()(const float* src, float* dst, int n) const + { + int dcn = dstcn; + float alpha = ColorChannel::max(); + + int i = 0; + if(enablePackedLab) + { + v_float32x4 vldiv = v_setall_f32(256.f/100.0f); + v_float32x4 vf255 = v_setall_f32(255.f); + static const int nPixels = 16; + for(; i <= n*3-3*nPixels; i += 3*nPixels, dst += dcn*nPixels) + { + /* + int L = saturate_cast(src[i]*BASE/100.0f); + int a = saturate_cast(src[i + 1]*BASE/256); + int b = saturate_cast(src[i + 2]*BASE/256); + */ + v_float32x4 vl[4], va[4], vb[4]; + for(int k = 0; k < 4; 
k++) + { + v_load_deinterleave(src + i + k*3*4, vl[k], va[k], vb[k]); + vl[k] *= vldiv; + } + + v_int32x4 ivl[4], iva[4], ivb[4]; + for(int k = 0; k < 4; k++) + { + ivl[k] = v_round(vl[k]), iva[k] = v_round(va[k]), ivb[k] = v_round(vb[k]); + } + v_int16x8 ivl16[2], iva16[2], ivb16[2]; + ivl16[0] = v_pack(ivl[0], ivl[1]); iva16[0] = v_pack(iva[0], iva[1]); ivb16[0] = v_pack(ivb[0], ivb[1]); + ivl16[1] = v_pack(ivl[2], ivl[3]); iva16[1] = v_pack(iva[2], iva[3]); ivb16[1] = v_pack(ivb[2], ivb[3]); + v_uint8x16 ivl8, iva8, ivb8; + ivl8 = v_reinterpret_as_u8(v_pack(ivl16[0], ivl16[1])); + iva8 = v_reinterpret_as_u8(v_pack(iva16[0], iva16[1])); + ivb8 = v_reinterpret_as_u8(v_pack(ivb16[0], ivb16[1])); + + v_int32x4 ixv[4], iyv[4], izv[4]; + + processLabToXYZ(ivl8, iva8, ivb8, ixv[0], iyv[0], izv[0], + ixv[1], iyv[1], izv[1], + ixv[2], iyv[2], izv[2], + ixv[3], iyv[3], izv[3]); + /* + ro = CV_DESCALE(C0 * x + C1 * y + C2 * z, shift); + go = CV_DESCALE(C3 * x + C4 * y + C5 * z, shift); + bo = CV_DESCALE(C6 * x + C7 * y + C8 * z, shift); + */ + v_int32x4 C0 = v_setall_s32(coeffs[0]), C1 = v_setall_s32(coeffs[1]), C2 = v_setall_s32(coeffs[2]); + v_int32x4 C3 = v_setall_s32(coeffs[3]), C4 = v_setall_s32(coeffs[4]), C5 = v_setall_s32(coeffs[5]); + v_int32x4 C6 = v_setall_s32(coeffs[6]), C7 = v_setall_s32(coeffs[7]), C8 = v_setall_s32(coeffs[8]); + v_int32x4 descaleShift = v_setall_s32(1 << (shift-1)), tabsz = v_setall_s32((int)INV_GAMMA_TAB_SIZE-1); + for(int k = 0; k < 4; k++) + { + v_int32x4 i_r, i_g, i_b; + v_uint32x4 r_vecs, g_vecs, b_vecs; + i_r = (ixv[k]*C0 + iyv[k]*C1 + izv[k]*C2 + descaleShift) >> shift; + i_g = (ixv[k]*C3 + iyv[k]*C4 + izv[k]*C5 + descaleShift) >> shift; + i_b = (ixv[k]*C6 + iyv[k]*C7 + izv[k]*C8 + descaleShift) >> shift; + + //limit indices in table and then substitute + //ro = tab[ro]; go = tab[go]; bo = tab[bo]; + int32_t CV_DECL_ALIGNED(16) rshifts[4], gshifts[4], bshifts[4]; + v_int32x4 rs = v_max(v_setzero_s32(), v_min(tabsz, i_r)); + v_int32x4 
gs = v_max(v_setzero_s32(), v_min(tabsz, i_g)); + v_int32x4 bs = v_max(v_setzero_s32(), v_min(tabsz, i_b)); + + v_store_aligned(rshifts, rs); + v_store_aligned(gshifts, gs); + v_store_aligned(bshifts, bs); + + r_vecs = v_uint32x4(tab[rshifts[0]], tab[rshifts[1]], tab[rshifts[2]], tab[rshifts[3]]); + g_vecs = v_uint32x4(tab[gshifts[0]], tab[gshifts[1]], tab[gshifts[2]], tab[gshifts[3]]); + b_vecs = v_uint32x4(tab[bshifts[0]], tab[bshifts[1]], tab[bshifts[2]], tab[bshifts[3]]); + + v_float32x4 v_r, v_g, v_b; + v_r = v_cvt_f32(v_reinterpret_as_s32(r_vecs))/vf255; + v_g = v_cvt_f32(v_reinterpret_as_s32(g_vecs))/vf255; + v_b = v_cvt_f32(v_reinterpret_as_s32(b_vecs))/vf255; + + if(dcn == 4) + { + v_store_interleave(dst + k*dcn*4, v_b, v_g, v_r, v_setall_f32(alpha)); + } + else // dcn == 3 + { + v_store_interleave(dst + k*dcn*4, v_b, v_g, v_r); + } + } + } + } + + for(; i < n*3; i += 3, dst += dcn) + { + int ro, go, bo; + process((uchar)(src[i + 0]*255.f/100.f), (uchar)src[i + 1], (uchar)src[i + 2], ro, go, bo); + + dst[0] = bo/255.f; + dst[1] = go/255.f; + dst[2] = ro/255.f; + if(dcn == 4) + dst[3] = alpha; + } + } + + void operator()(const uchar* src, uchar* dst, int n) const + { + int i, dcn = dstcn; + uchar alpha = ColorChannel::max(); + i = 0; + + if(enablePackedLab) + { + static const int nPixels = 8*2; + for(; i <= n*3-3*nPixels; i += 3*nPixels, dst += dcn*nPixels) + { + /* + int L = src[i + 0]; + int a = src[i + 1]; + int b = src[i + 2]; + */ + v_uint8x16 u8l, u8a, u8b; + v_load_deinterleave(src + i, u8l, u8a, u8b); + + v_int32x4 xiv[4], yiv[4], ziv[4]; + processLabToXYZ(u8l, u8a, u8b, xiv[0], yiv[0], ziv[0], + xiv[1], yiv[1], ziv[1], + xiv[2], yiv[2], ziv[2], + xiv[3], yiv[3], ziv[3]); + /* + ro = CV_DESCALE(C0 * x + C1 * y + C2 * z, shift); + go = CV_DESCALE(C3 * x + C4 * y + C5 * z, shift); + bo = CV_DESCALE(C6 * x + C7 * y + C8 * z, shift); + */ + v_int32x4 C0 = v_setall_s32(coeffs[0]), C1 = v_setall_s32(coeffs[1]), C2 = v_setall_s32(coeffs[2]); + v_int32x4 C3 
= v_setall_s32(coeffs[3]), C4 = v_setall_s32(coeffs[4]), C5 = v_setall_s32(coeffs[5]); + v_int32x4 C6 = v_setall_s32(coeffs[6]), C7 = v_setall_s32(coeffs[7]), C8 = v_setall_s32(coeffs[8]); + v_int32x4 descaleShift = v_setall_s32(1 << (shift-1)); + v_int32x4 tabsz = v_setall_s32((int)INV_GAMMA_TAB_SIZE-1); + v_uint32x4 r_vecs[4], g_vecs[4], b_vecs[4]; + for(int k = 0; k < 4; k++) + { + v_int32x4 i_r, i_g, i_b; + i_r = (xiv[k]*C0 + yiv[k]*C1 + ziv[k]*C2 + descaleShift) >> shift; + i_g = (xiv[k]*C3 + yiv[k]*C4 + ziv[k]*C5 + descaleShift) >> shift; + i_b = (xiv[k]*C6 + yiv[k]*C7 + ziv[k]*C8 + descaleShift) >> shift; + + //limit indices in table and then substitute + //ro = tab[ro]; go = tab[go]; bo = tab[bo]; + int32_t CV_DECL_ALIGNED(16) rshifts[4], gshifts[4], bshifts[4]; + v_int32x4 rs = v_max(v_setzero_s32(), v_min(tabsz, i_r)); + v_int32x4 gs = v_max(v_setzero_s32(), v_min(tabsz, i_g)); + v_int32x4 bs = v_max(v_setzero_s32(), v_min(tabsz, i_b)); + + v_store_aligned(rshifts, rs); + v_store_aligned(gshifts, gs); + v_store_aligned(bshifts, bs); + + r_vecs[k] = v_uint32x4(tab[rshifts[0]], tab[rshifts[1]], tab[rshifts[2]], tab[rshifts[3]]); + g_vecs[k] = v_uint32x4(tab[gshifts[0]], tab[gshifts[1]], tab[gshifts[2]], tab[gshifts[3]]); + b_vecs[k] = v_uint32x4(tab[bshifts[0]], tab[bshifts[1]], tab[bshifts[2]], tab[bshifts[3]]); + } + + v_uint16x8 u_rvec0 = v_pack(r_vecs[0], r_vecs[1]), u_rvec1 = v_pack(r_vecs[2], r_vecs[3]); + v_uint16x8 u_gvec0 = v_pack(g_vecs[0], g_vecs[1]), u_gvec1 = v_pack(g_vecs[2], g_vecs[3]); + v_uint16x8 u_bvec0 = v_pack(b_vecs[0], b_vecs[1]), u_bvec1 = v_pack(b_vecs[2], b_vecs[3]); + + v_uint8x16 u8_b, u8_g, u8_r; + u8_b = v_pack(u_bvec0, u_bvec1); + u8_g = v_pack(u_gvec0, u_gvec1); + u8_r = v_pack(u_rvec0, u_rvec1); + + if(dcn == 4) + { + v_store_interleave(dst, u8_b, u8_g, u8_r, v_setall_u8(alpha)); + } + else + { + v_store_interleave(dst, u8_b, u8_g, u8_r); + } + } + } + + for (; i < n*3; i += 3, dst += dcn) + { + int ro, go, bo; + 
process(src[i + 0], src[i + 1], src[i + 2], ro, go, bo); + + dst[0] = saturate_cast(bo); + dst[1] = saturate_cast(go); + dst[2] = saturate_cast(ro); + if( dcn == 4 ) + dst[3] = alpha; + } + } + + int dstcn; + int coeffs[9]; + ushort* tab; +}; + + +struct Lab2RGB_f +{ + typedef float channel_type; + + Lab2RGB_f( int _dstcn, int _blueIdx, const float* _coeffs, + const float* _whitept, bool _srgb ) + : fcvt(_dstcn, _blueIdx, _coeffs, _whitept, _srgb), dstcn(_dstcn) + { } + + void operator()(const float* src, float* dst, int n) const + { + fcvt(src, dst, n); + } + + Lab2RGBfloat fcvt; + int dstcn; }; -#undef clip struct Lab2RGB_b { typedef uchar channel_type; - Lab2RGB_b( int _dstcn, int blueIdx, const float* _coeffs, + Lab2RGB_b( int _dstcn, int _blueIdx, const float* _coeffs, const float* _whitept, bool _srgb ) - : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb ) + : fcvt(3, _blueIdx, _coeffs, _whitept, _srgb ), icvt(_dstcn, _blueIdx, _coeffs, _whitept, _srgb), dstcn(_dstcn) { + useBitExactness = (!_coeffs && !_whitept && _srgb && enableBitExactness); + #if CV_NEON v_scale_inv = vdupq_n_f32(100.f/255.f); v_scale = vdupq_n_f32(255.f); @@ -6305,6 +6813,12 @@ struct Lab2RGB_b void operator()(const uchar* src, uchar* dst, int n) const { + if(useBitExactness) + { + icvt(src, dst, n); + return; + } + int i, j, dcn = dstcn; uchar alpha = ColorChannel::max(); float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE]; @@ -6313,7 +6827,8 @@ struct Lab2RGB_b __m128 v_res = _mm_set_ps(0.f, 128.f, 128.f, 0.f); #endif - for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) + i = 0; + for(; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 ) { int dn = std::min(n - i, (int)BLOCK_SIZE); j = 0; @@ -6360,7 +6875,7 @@ struct Lab2RGB_b buf[j+1] = (float)(src[j+1] - 128); buf[j+2] = (float)(src[j+2] - 128); } - cvt(buf, buf, dn); + fcvt(buf, buf, dn); j = 0; #if CV_NEON @@ -6453,9 +6968,8 @@ struct Lab2RGB_b } } - int dstcn; - Lab2RGB_f cvt; - + Lab2RGBfloat fcvt; + Lab2RGBinteger icvt; #if 
CV_NEON float32x4_t v_scale, v_scale_inv, v_128; uint8x8_t v_alpha; @@ -6465,8 +6979,11 @@ struct Lab2RGB_b __m128i v_zero; bool haveSIMD; #endif + bool useBitExactness; + int dstcn; }; +#undef clip ///////////////////////////////////// RGB <-> L*u*v* ///////////////////////////////////// @@ -6492,12 +7009,17 @@ struct RGB2Luv_f if( blueIdx == 0 ) std::swap(coeffs[i*3], coeffs[i*3+2]); CV_Assert( coeffs[i*3] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 && - coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 1.5f ); + softfloat(coeffs[i*3]) + + softfloat(coeffs[i*3+1]) + + softfloat(coeffs[i*3+2]) < softfloat(1.5f) ); } - float d = 1.f/std::max(whitept[0] + whitept[1]*15 + whitept[2]*3, FLT_EPSILON); - un = 4*whitept[0]*d*13; - vn = 9*whitept[1]*d*13; + softfloat d = softfloat(whitept[0]) + + softfloat(whitept[1])*softfloat(15) + + softfloat(whitept[2])*softfloat(3); + d = softfloat::one()/max(d, softfloat(FLT_EPSILON)); + un = d*softfloat(13*4)*softfloat(whitept[0]); + vn = d*softfloat(13*9)*softfloat(whitept[1]); #if CV_SSE2 haveSIMD = checkHardwareSupport(CV_CPU_SSE2); @@ -6790,9 +7312,12 @@ struct Luv2RGB_f coeffs[i+blueIdx*3] = _coeffs[i+6]; } - float d = 1.f/std::max(whitept[0] + whitept[1]*15 + whitept[2]*3, FLT_EPSILON); - un = 4*13*whitept[0]*d; - vn = 9*13*whitept[1]*d; + softfloat d = softfloat(whitept[0]) + + softfloat(whitept[1])*softfloat(15) + + softfloat(whitept[2])*softfloat(3); + d = softfloat::one()/max(d, softfloat(FLT_EPSILON)); + un = softfloat(4*13)*d*softfloat(whitept[0]); + vn = softfloat(9*13)*d*softfloat(whitept[1]); #if CV_SSE2 haveSIMD = checkHardwareSupport(CV_CPU_SSE2); #endif @@ -8534,21 +9059,15 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) { int coeffs[9]; const float * const _coeffs = sRGB2XYZ_D65, * const _whitept = D65; - const float scale[] = + static const softfloat lshift(1 << lab_shift); + for( int i = 0; i < 3; i++ ) { - (1 << lab_shift)/_whitept[0], - (float)(1 << lab_shift), - (1 << 
lab_shift)/_whitept[2] - }; + coeffs[i*3+(bidx^2)] = cvRound(lshift*softfloat(_coeffs[i*3 ])/softfloat(_whitept[i])); + coeffs[i*3+1] = cvRound(lshift*softfloat(_coeffs[i*3+1])/softfloat(_whitept[i])); + coeffs[i*3+bidx] = cvRound(lshift*softfloat(_coeffs[i*3+2])/softfloat(_whitept[i])); - for (int i = 0; i < 3; i++ ) - { - coeffs[i*3+(bidx^2)] = cvRound(_coeffs[i*3]*scale[i]); - coeffs[i*3+1] = cvRound(_coeffs[i*3+1]*scale[i]); - coeffs[i*3+bidx] = cvRound(_coeffs[i*3+2]*scale[i]); - - CV_Assert( coeffs[i] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 && - coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 2*(1 << lab_shift) ); + CV_Assert(coeffs[i*3] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 && + coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 2*(1 << lab_shift)); } Mat(1, 9, CV_32SC1, coeffs).copyTo(ucoeffs); } @@ -8573,36 +9092,47 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) { float coeffs[9]; const float * const _coeffs = sRGB2XYZ_D65, * const _whitept = D65; - float scale[] = { 1.0f / _whitept[0], 1.0f, 1.0f / _whitept[2] }; + + softfloat scale[] = { softfloat::one() / softfloat(_whitept[0]), + softfloat::one(), + softfloat::one() / softfloat(_whitept[2]) }; for (int i = 0; i < 3; i++) { int j = i * 3; - coeffs[j + (bidx ^ 2)] = _coeffs[j] * (lab ? scale[i] : 1); - coeffs[j + 1] = _coeffs[j + 1] * (lab ? scale[i] : 1); - coeffs[j + bidx] = _coeffs[j + 2] * (lab ? scale[i] : 1); - CV_Assert( coeffs[j] >= 0 && coeffs[j + 1] >= 0 && coeffs[j + 2] >= 0 && - coeffs[j] + coeffs[j + 1] + coeffs[j + 2] < 1.5f*(lab ? LabCbrtTabScale : 1) ); + softfloat c0 = (lab ? scale[i] : softfloat::one()) * softfloat(_coeffs[j ]); + softfloat c1 = (lab ? scale[i] : softfloat::one()) * softfloat(_coeffs[j + 1]); + softfloat c2 = (lab ? 
scale[i] : softfloat::one()) * softfloat(_coeffs[j + 2]); + + coeffs[j + (bidx ^ 2)] = c0; + coeffs[j + 1] = c1; + coeffs[j + bidx] = c2; + + CV_Assert( c0 >= 0 && c1 >= 0 && c2 >= 0 && + c0 + c1 + c2 < (lab ? softfloat((int)LAB_CBRT_TAB_SIZE) : softfloat(3)/softfloat(2))); } - float d = 1.f/std::max(_whitept[0] + _whitept[1]*15 + _whitept[2]*3, FLT_EPSILON); - un = 13*4*_whitept[0]*d; - vn = 13*9*_whitept[1]*d; + softfloat d = softfloat(_whitept[0]) + + softfloat(_whitept[1])*softfloat(15) + + softfloat(_whitept[2])*softfloat(3); + d = softfloat::one()/max(d, softfloat(FLT_EPSILON)); + un = d*softfloat(13*4)*softfloat(_whitept[0]); + vn = d*softfloat(13*9)*softfloat(_whitept[1]); Mat(1, 9, CV_32FC1, coeffs).copyTo(ucoeffs); } - float _1_3 = 1.0f / 3.0f, _a = 16.0f / 116.0f; + float _a = softfloat(16)/softfloat(116); ocl::KernelArg ucoeffsarg = ocl::KernelArg::PtrReadOnly(ucoeffs); if (lab) { if (srgb) k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBGammaTab), - ucoeffsarg, _1_3, _a); + ucoeffsarg, _1_3f, _a); else - k.args(srcarg, dstarg, ucoeffsarg, _1_3, _a); + k.args(srcarg, dstarg, ucoeffsarg, _1_3f, _a); } else { @@ -8648,14 +9178,17 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) for( int i = 0; i < 3; i++ ) { - coeffs[i+(bidx^2)*3] = _coeffs[i] * (lab ? _whitept[i] : 1); - coeffs[i+3] = _coeffs[i+3] * (lab ? _whitept[i] : 1); - coeffs[i+bidx*3] = _coeffs[i+6] * (lab ? _whitept[i] : 1); + coeffs[i+(bidx^2)*3] = softfloat(_coeffs[i] )*softfloat(lab ? _whitept[i] : 1); + coeffs[i+3] = softfloat(_coeffs[i+3])*softfloat(lab ? _whitept[i] : 1); + coeffs[i+bidx*3] = softfloat(_coeffs[i+6])*softfloat(lab ? 
_whitept[i] : 1); } - float d = 1.f/std::max(_whitept[0] + _whitept[1]*15 + _whitept[2]*3, FLT_EPSILON); - un = 4*13*_whitept[0]*d; - vn = 9*13*_whitept[1]*d; + softfloat d = softfloat(_whitept[0]) + + softfloat(_whitept[1])*softfloat(15) + + softfloat(_whitept[2])*softfloat(3); + d = softfloat::one()/max(d, softfloat(FLT_EPSILON)); + un = softfloat(4*13)*d*softfloat(_whitept[0]); + vn = softfloat(9*13)*d*softfloat(_whitept[1]); Mat(1, 9, CV_32FC1, coeffs).copyTo(ucoeffs); } @@ -8663,8 +9196,8 @@ static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn ) _dst.create(sz, CV_MAKETYPE(depth, dcn)); dst = _dst.getUMat(); - float lThresh = 0.008856f * 903.3f; - float fThresh = 7.787f * 0.008856f + 16.0f / 116.0f; + float lThresh = softfloat(8); // 0.008856f * 903.3f = (6/29)^3*(29/3)^3 = 8 + float fThresh = softfloat(6)/softfloat(29); // 7.787f * 0.008856f + 16.0f / 116.0f = 6/29 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src), dstarg = ocl::KernelArg::WriteOnly(dst), diff --git a/modules/imgproc/src/opencl/cvtcolor.cl b/modules/imgproc/src/opencl/cvtcolor.cl index 3fd1ad1250..52bef008f0 100644 --- a/modules/imgproc/src/opencl/cvtcolor.cl +++ b/modules/imgproc/src/opencl/cvtcolor.cl @@ -1755,6 +1755,7 @@ __kernel void BGR2Lab(__global const uchar * srcptr, int src_step, int src_offse B = splineInterpolate(B * GammaTabScale, gammaTab, GAMMA_TAB_SIZE); #endif + // 7.787f = (29/3)^3/(29*4), 0.008856f = (6/29)^3, 903.3 = (29/3)^3 float X = fma(R, C0, fma(G, C1, B*C2)); float Y = fma(R, C3, fma(G, C4, B*C5)); float Z = fma(R, C6, fma(G, C7, B*C8)); @@ -1794,6 +1795,7 @@ inline void Lab2BGR_f(const float * srcbuf, float * dstbuf, C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8]; float y, fy; + // 903.3 = (29/3)^3, 7.787 = (29/3)^3/(29*4) if (li <= lThresh) { y = li / 903.3f;