Abseil Common Libraries (C++) (grcp 依赖)
https://abseil.io/
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
504 lines
18 KiB
504 lines
18 KiB
// Copyright 2018 The Abseil Authors. |
|
// |
|
// Licensed under the Apache License, Version 2.0 (the "License"); |
|
// you may not use this file except in compliance with the License. |
|
// You may obtain a copy of the License at |
|
// |
|
// https://www.apache.org/licenses/LICENSE-2.0 |
|
// |
|
// Unless required by applicable law or agreed to in writing, software |
|
// distributed under the License is distributed on an "AS IS" BASIS, |
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
// See the License for the specific language governing permissions and |
|
// limitations under the License. |
|
|
|
#include "absl/strings/internal/charconv_parse.h" |
|
#include "absl/strings/charconv.h" |
|
|
|
#include <cassert> |
|
#include <cstdint> |
|
#include <limits> |
|
|
|
#include "absl/strings/internal/memutil.h" |
|
|
|
namespace absl { |
|
ABSL_NAMESPACE_BEGIN |
|
namespace { |
|
|
|
// ParseFloat<10> will read the first 19 significant digits of the mantissa. |
|
// This number was chosen for multiple reasons. |
|
// |
|
// (a) First, for whatever integer type we choose to represent the mantissa, we |
|
// want to choose the largest possible number of decimal digits for that integer |
|
// type. We are using uint64_t, which can express any 19-digit unsigned |
|
// integer. |
|
// |
|
// (b) Second, we need to parse enough digits that the binary value of any |
|
// mantissa we capture has more bits of resolution than the mantissa |
|
// representation in the target float. Our algorithm requires at least 3 bits |
|
// of headway, but 19 decimal digits give a little more than that. |
|
// |
|
// The following static assertions verify the above comments: |
|
constexpr int kDecimalMantissaDigitsMax = 19; |
|
|
|
static_assert(std::numeric_limits<uint64_t>::digits10 == |
|
kDecimalMantissaDigitsMax, |
|
"(a) above"); |
|
|
|
// IEEE doubles, which we assume in Abseil, have 53 binary bits of mantissa. |
|
static_assert(std::numeric_limits<double>::is_iec559, "IEEE double assumed"); |
|
static_assert(std::numeric_limits<double>::radix == 2, "IEEE double fact"); |
|
static_assert(std::numeric_limits<double>::digits == 53, "IEEE double fact"); |
|
|
|
// The lowest valued 19-digit decimal mantissa we can read still contains |
|
// sufficient information to reconstruct a binary mantissa. |
|
static_assert(1000000000000000000u > (uint64_t(1) << (53 + 3)), "(b) above"); |
|
|
|
// ParseFloat<16> will read the first 15 significant digits of the mantissa. |
|
// |
|
// Because a base-16-to-base-2 conversion can be done exactly, we do not need |
|
// to maximize the number of scanned hex digits to improve our conversion. What |
|
// is required is to scan two more bits than the mantissa can represent, so that |
|
// we always round correctly. |
|
// |
|
// (One extra bit does not suffice to perform correct rounding, since a number |
|
// exactly halfway between two representable floats has unique rounding rules, |
|
// so we need to differentiate between a "halfway between" number and a "closer |
|
// to the larger value" number.) |
|
constexpr int kHexadecimalMantissaDigitsMax = 15; |
|
|
|
// The minimum number of significant bits that will be read from |
|
// kHexadecimalMantissaDigitsMax hex digits. We must subtract by three, since |
|
// the most significant digit can be a "1", which only contributes a single |
|
// significant bit. |
|
constexpr int kGuaranteedHexadecimalMantissaBitPrecision = |
|
4 * kHexadecimalMantissaDigitsMax - 3; |
|
|
|
static_assert(kGuaranteedHexadecimalMantissaBitPrecision > |
|
std::numeric_limits<double>::digits + 2, |
|
"kHexadecimalMantissaDigitsMax too small"); |
|
|
|
// We also impose a limit on the number of significant digits we will read from |
|
// an exponent, to avoid having to deal with integer overflow. We use 9 for |
|
// this purpose. |
|
// |
|
// If we read a 9 digit exponent, the end result of the conversion will |
|
// necessarily be infinity or zero, depending on the sign of the exponent. |
|
// Therefore we can just drop extra digits on the floor without any extra |
|
// logic. |
|
constexpr int kDecimalExponentDigitsMax = 9; |
|
static_assert(std::numeric_limits<int>::digits10 >= kDecimalExponentDigitsMax, |
|
"int type too small"); |
|
|
|
// To avoid incredibly large inputs causing integer overflow for our exponent, |
|
// we impose an arbitrary but very large limit on the number of significant |
|
// digits we will accept. The implementation refuses to match a string with |
|
// more consecutive significant mantissa digits than this. |
|
constexpr int kDecimalDigitLimit = 50000000; |
|
|
|
// Corresponding limit for hexadecimal digit inputs. This is one fourth the |
|
// amount of kDecimalDigitLimit, since each dropped hexadecimal digit requires |
|
// a binary exponent adjustment of 4. |
|
constexpr int kHexadecimalDigitLimit = kDecimalDigitLimit / 4; |
|
|
|
// The largest exponent we can read is 999999999 (per |
|
// kDecimalExponentDigitsMax), and the largest exponent adjustment we can get |
|
// from dropped mantissa digits is 2 * kDecimalDigitLimit, and the sum of these |
|
// comfortably fits in an integer. |
|
// |
|
// We count kDecimalDigitLimit twice because there are independent limits for |
|
// numbers before and after the decimal point. (In the case where there are no |
|
// significant digits before the decimal point, there are independent limits for |
|
// post-decimal-point leading zeroes and for significant digits.) |
|
static_assert(999999999 + 2 * kDecimalDigitLimit < |
|
std::numeric_limits<int>::max(), |
|
"int type too small"); |
|
static_assert(999999999 + 2 * (4 * kHexadecimalDigitLimit) < |
|
std::numeric_limits<int>::max(), |
|
"int type too small"); |
|
|
|
// Returns true if the provided bitfield allows parsing an exponent value |
|
// (e.g., "1.5e100"). |
|
bool AllowExponent(chars_format flags) { |
|
bool fixed = (flags & chars_format::fixed) == chars_format::fixed; |
|
bool scientific = |
|
(flags & chars_format::scientific) == chars_format::scientific; |
|
return scientific || !fixed; |
|
} |
|
|
|
// Returns true if the provided bitfield requires an exponent value be present. |
|
bool RequireExponent(chars_format flags) { |
|
bool fixed = (flags & chars_format::fixed) == chars_format::fixed; |
|
bool scientific = |
|
(flags & chars_format::scientific) == chars_format::scientific; |
|
return scientific && !fixed; |
|
} |
|
|
|
const int8_t kAsciiToInt[256] = { |
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, |
|
9, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, |
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|
-1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|
-1, -1, -1, -1, -1, -1, -1, -1, -1}; |
|
|
|
// Returns true if `ch` is a digit in the given base |
|
template <int base> |
|
bool IsDigit(char ch); |
|
|
|
// Converts a valid `ch` to its digit value in the given base. |
|
template <int base> |
|
unsigned ToDigit(char ch); |
|
|
|
// Returns true if `ch` is the exponent delimiter for the given base. |
|
template <int base> |
|
bool IsExponentCharacter(char ch); |
|
|
|
// Returns the maximum number of significant digits we will read for a float |
|
// in the given base. |
|
template <int base> |
|
constexpr int MantissaDigitsMax(); |
|
|
|
// Returns the largest consecutive run of digits we will accept when parsing a |
|
// number in the given base. |
|
template <int base> |
|
constexpr int DigitLimit(); |
|
|
|
// Returns the amount the exponent must be adjusted by for each dropped digit. |
|
// (For decimal this is 1, since the digits are in base 10 and the exponent base |
|
// is also 10, but for hexadecimal this is 4, since the digits are base 16 but |
|
// the exponent base is 2.) |
|
template <int base> |
|
constexpr int DigitMagnitude(); |
|
|
|
template <> |
|
bool IsDigit<10>(char ch) { |
|
return ch >= '0' && ch <= '9'; |
|
} |
|
template <> |
|
bool IsDigit<16>(char ch) { |
|
return kAsciiToInt[static_cast<unsigned char>(ch)] >= 0; |
|
} |
|
|
|
template <> |
|
unsigned ToDigit<10>(char ch) { |
|
return ch - '0'; |
|
} |
|
template <> |
|
unsigned ToDigit<16>(char ch) { |
|
return kAsciiToInt[static_cast<unsigned char>(ch)]; |
|
} |
|
|
|
template <> |
|
bool IsExponentCharacter<10>(char ch) { |
|
return ch == 'e' || ch == 'E'; |
|
} |
|
|
|
template <> |
|
bool IsExponentCharacter<16>(char ch) { |
|
return ch == 'p' || ch == 'P'; |
|
} |
|
|
|
template <> |
|
constexpr int MantissaDigitsMax<10>() { |
|
return kDecimalMantissaDigitsMax; |
|
} |
|
template <> |
|
constexpr int MantissaDigitsMax<16>() { |
|
return kHexadecimalMantissaDigitsMax; |
|
} |
|
|
|
template <> |
|
constexpr int DigitLimit<10>() { |
|
return kDecimalDigitLimit; |
|
} |
|
template <> |
|
constexpr int DigitLimit<16>() { |
|
return kHexadecimalDigitLimit; |
|
} |
|
|
|
template <> |
|
constexpr int DigitMagnitude<10>() { |
|
return 1; |
|
} |
|
template <> |
|
constexpr int DigitMagnitude<16>() { |
|
return 4; |
|
} |
|
|
|
// Reads decimal digits from [begin, end) into *out. Returns the number of |
|
// digits consumed. |
|
// |
|
// After max_digits has been read, keeps consuming characters, but no longer |
|
// adjusts *out. If a nonzero digit is dropped this way, *dropped_nonzero_digit |
|
// is set; otherwise, it is left unmodified. |
|
// |
|
// If no digits are matched, returns 0 and leaves *out unchanged. |
|
// |
|
// ConsumeDigits does not protect against overflow on *out; max_digits must |
|
// be chosen with respect to type T to avoid the possibility of overflow. |
|
template <int base, typename T> |
|
int ConsumeDigits(const char* begin, const char* end, int max_digits, T* out, |
|
bool* dropped_nonzero_digit) { |
|
if (base == 10) { |
|
assert(max_digits <= std::numeric_limits<T>::digits10); |
|
} else if (base == 16) { |
|
assert(max_digits * 4 <= std::numeric_limits<T>::digits); |
|
} |
|
const char* const original_begin = begin; |
|
|
|
// Skip leading zeros, but only if *out is zero. |
|
// They don't cause an overflow so we don't have to count them for |
|
// `max_digits`. |
|
while (!*out && end != begin && *begin == '0') ++begin; |
|
|
|
T accumulator = *out; |
|
const char* significant_digits_end = |
|
(end - begin > max_digits) ? begin + max_digits : end; |
|
while (begin < significant_digits_end && IsDigit<base>(*begin)) { |
|
// Do not guard against *out overflow; max_digits was chosen to avoid this. |
|
// Do assert against it, to detect problems in debug builds. |
|
auto digit = static_cast<T>(ToDigit<base>(*begin)); |
|
assert(accumulator * base >= accumulator); |
|
accumulator *= base; |
|
assert(accumulator + digit >= accumulator); |
|
accumulator += digit; |
|
++begin; |
|
} |
|
bool dropped_nonzero = false; |
|
while (begin < end && IsDigit<base>(*begin)) { |
|
dropped_nonzero = dropped_nonzero || (*begin != '0'); |
|
++begin; |
|
} |
|
if (dropped_nonzero && dropped_nonzero_digit != nullptr) { |
|
*dropped_nonzero_digit = true; |
|
} |
|
*out = accumulator; |
|
return static_cast<int>(begin - original_begin); |
|
} |
|
|
|
// Returns true if `v` is one of the chars allowed inside parentheses following |
|
// a NaN. |
|
bool IsNanChar(char v) { |
|
return (v == '_') || (v >= '0' && v <= '9') || (v >= 'a' && v <= 'z') || |
|
(v >= 'A' && v <= 'Z'); |
|
} |
|
|
|
// Checks the range [begin, end) for a strtod()-formatted infinity or NaN. If |
|
// one is found, sets `out` appropriately and returns true. |
|
bool ParseInfinityOrNan(const char* begin, const char* end, |
|
strings_internal::ParsedFloat* out) { |
|
if (end - begin < 3) { |
|
return false; |
|
} |
|
switch (*begin) { |
|
case 'i': |
|
case 'I': { |
|
// An infinity string consists of the characters "inf" or "infinity", |
|
// case insensitive. |
|
if (strings_internal::memcasecmp(begin + 1, "nf", 2) != 0) { |
|
return false; |
|
} |
|
out->type = strings_internal::FloatType::kInfinity; |
|
if (end - begin >= 8 && |
|
strings_internal::memcasecmp(begin + 3, "inity", 5) == 0) { |
|
out->end = begin + 8; |
|
} else { |
|
out->end = begin + 3; |
|
} |
|
return true; |
|
} |
|
case 'n': |
|
case 'N': { |
|
// A NaN consists of the characters "nan", case insensitive, optionally |
|
// followed by a parenthesized sequence of zero or more alphanumeric |
|
// characters and/or underscores. |
|
if (strings_internal::memcasecmp(begin + 1, "an", 2) != 0) { |
|
return false; |
|
} |
|
out->type = strings_internal::FloatType::kNan; |
|
out->end = begin + 3; |
|
// NaN is allowed to be followed by a parenthesized string, consisting of |
|
// only the characters [a-zA-Z0-9_]. Match that if it's present. |
|
begin += 3; |
|
if (begin < end && *begin == '(') { |
|
const char* nan_begin = begin + 1; |
|
while (nan_begin < end && IsNanChar(*nan_begin)) { |
|
++nan_begin; |
|
} |
|
if (nan_begin < end && *nan_begin == ')') { |
|
// We found an extra NaN specifier range |
|
out->subrange_begin = begin + 1; |
|
out->subrange_end = nan_begin; |
|
out->end = nan_begin + 1; |
|
} |
|
} |
|
return true; |
|
} |
|
default: |
|
return false; |
|
} |
|
} |
|
} // namespace |
|
|
|
namespace strings_internal { |
|
|
|
template <int base> |
|
strings_internal::ParsedFloat ParseFloat(const char* begin, const char* end, |
|
chars_format format_flags) { |
|
strings_internal::ParsedFloat result; |
|
|
|
// Exit early if we're given an empty range. |
|
if (begin == end) return result; |
|
|
|
// Handle the infinity and NaN cases. |
|
if (ParseInfinityOrNan(begin, end, &result)) { |
|
return result; |
|
} |
|
|
|
const char* const mantissa_begin = begin; |
|
while (begin < end && *begin == '0') { |
|
++begin; // skip leading zeros |
|
} |
|
uint64_t mantissa = 0; |
|
|
|
int exponent_adjustment = 0; |
|
bool mantissa_is_inexact = false; |
|
int pre_decimal_digits = ConsumeDigits<base>( |
|
begin, end, MantissaDigitsMax<base>(), &mantissa, &mantissa_is_inexact); |
|
begin += pre_decimal_digits; |
|
int digits_left; |
|
if (pre_decimal_digits >= DigitLimit<base>()) { |
|
// refuse to parse pathological inputs |
|
return result; |
|
} else if (pre_decimal_digits > MantissaDigitsMax<base>()) { |
|
// We dropped some non-fraction digits on the floor. Adjust our exponent |
|
// to compensate. |
|
exponent_adjustment = |
|
static_cast<int>(pre_decimal_digits - MantissaDigitsMax<base>()); |
|
digits_left = 0; |
|
} else { |
|
digits_left = |
|
static_cast<int>(MantissaDigitsMax<base>() - pre_decimal_digits); |
|
} |
|
if (begin < end && *begin == '.') { |
|
++begin; |
|
if (mantissa == 0) { |
|
// If we haven't seen any nonzero digits yet, keep skipping zeros. We |
|
// have to adjust the exponent to reflect the changed place value. |
|
const char* begin_zeros = begin; |
|
while (begin < end && *begin == '0') { |
|
++begin; |
|
} |
|
int zeros_skipped = static_cast<int>(begin - begin_zeros); |
|
if (zeros_skipped >= DigitLimit<base>()) { |
|
// refuse to parse pathological inputs |
|
return result; |
|
} |
|
exponent_adjustment -= static_cast<int>(zeros_skipped); |
|
} |
|
int post_decimal_digits = ConsumeDigits<base>( |
|
begin, end, digits_left, &mantissa, &mantissa_is_inexact); |
|
begin += post_decimal_digits; |
|
|
|
// Since `mantissa` is an integer, each significant digit we read after |
|
// the decimal point requires an adjustment to the exponent. "1.23e0" will |
|
// be stored as `mantissa` == 123 and `exponent` == -2 (that is, |
|
// "123e-2"). |
|
if (post_decimal_digits >= DigitLimit<base>()) { |
|
// refuse to parse pathological inputs |
|
return result; |
|
} else if (post_decimal_digits > digits_left) { |
|
exponent_adjustment -= digits_left; |
|
} else { |
|
exponent_adjustment -= post_decimal_digits; |
|
} |
|
} |
|
// If we've found no mantissa whatsoever, this isn't a number. |
|
if (mantissa_begin == begin) { |
|
return result; |
|
} |
|
// A bare "." doesn't count as a mantissa either. |
|
if (begin - mantissa_begin == 1 && *mantissa_begin == '.') { |
|
return result; |
|
} |
|
|
|
if (mantissa_is_inexact) { |
|
// We dropped significant digits on the floor. Handle this appropriately. |
|
if (base == 10) { |
|
// If we truncated significant decimal digits, store the full range of the |
|
// mantissa for future big integer math for exact rounding. |
|
result.subrange_begin = mantissa_begin; |
|
result.subrange_end = begin; |
|
} else if (base == 16) { |
|
// If we truncated hex digits, reflect this fact by setting the low |
|
// ("sticky") bit. This allows for correct rounding in all cases. |
|
mantissa |= 1; |
|
} |
|
} |
|
result.mantissa = mantissa; |
|
|
|
const char* const exponent_begin = begin; |
|
result.literal_exponent = 0; |
|
bool found_exponent = false; |
|
if (AllowExponent(format_flags) && begin < end && |
|
IsExponentCharacter<base>(*begin)) { |
|
bool negative_exponent = false; |
|
++begin; |
|
if (begin < end && *begin == '-') { |
|
negative_exponent = true; |
|
++begin; |
|
} else if (begin < end && *begin == '+') { |
|
++begin; |
|
} |
|
const char* const exponent_digits_begin = begin; |
|
// Exponent is always expressed in decimal, even for hexadecimal floats. |
|
begin += ConsumeDigits<10>(begin, end, kDecimalExponentDigitsMax, |
|
&result.literal_exponent, nullptr); |
|
if (begin == exponent_digits_begin) { |
|
// there were no digits where we expected an exponent. We failed to read |
|
// an exponent and should not consume the 'e' after all. Rewind 'begin'. |
|
found_exponent = false; |
|
begin = exponent_begin; |
|
} else { |
|
found_exponent = true; |
|
if (negative_exponent) { |
|
result.literal_exponent = -result.literal_exponent; |
|
} |
|
} |
|
} |
|
|
|
if (!found_exponent && RequireExponent(format_flags)) { |
|
// Provided flags required an exponent, but none was found. This results |
|
// in a failure to scan. |
|
return result; |
|
} |
|
|
|
// Success! |
|
result.type = strings_internal::FloatType::kNumber; |
|
if (result.mantissa > 0) { |
|
result.exponent = result.literal_exponent + |
|
(DigitMagnitude<base>() * exponent_adjustment); |
|
} else { |
|
result.exponent = 0; |
|
} |
|
result.end = begin; |
|
return result; |
|
} |
|
|
|
template ParsedFloat ParseFloat<10>(const char* begin, const char* end, |
|
chars_format format_flags); |
|
template ParsedFloat ParseFloat<16>(const char* begin, const char* end, |
|
chars_format format_flags); |
|
|
|
} // namespace strings_internal |
|
ABSL_NAMESPACE_END |
|
} // namespace absl
|
|
|