Replace all regexes in TextFormat.Tokenizer with direct char scanning.

The JVM regex engine allocates garbage on every match (especially when calling Matcher.usePattern!). Since there are expected to be a lot of tokens, this caused substantial GC overhead.

Direct char scanning also opens the possibility of other optimizations that aren't possible with regexes. For example:
  - direct reads from a char[]
  - streaming tokenization (rather than reading the complete source text)

PiperOrigin-RevId: 689230675
pull/18915/head
nickreid 6 months ago committed by Copybara-Service
parent 47613cf7ad
commit 7f6e90ba46
  1. 236
      java/core/src/main/java/com/google/protobuf/TextFormat.java
  2. 5
      java/core/src/test/java/com/google/protobuf/TextFormatTest.java

@ -24,7 +24,6 @@ import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
@ -991,14 +990,9 @@ public final class TextFormat {
* Scanner} provides no way to inspect the contents of delimiters, making it impossible to
* keep track of line and column numbers.
* </ul>
*
* <p>Luckily, Java's regular expression support does manage to be useful to us. (Barely: We need
* {@code Matcher.usePattern()}, which is new in Java 1.5.) So, we can use that, at least.
* Unfortunately, this implies that we need to have the entire input in one contiguous string.
*/
private static final class Tokenizer {
private final CharSequence text;
private final Matcher matcher;
private String currentToken;
// The character index within this.text at which the current token begins.
@ -1007,29 +1001,13 @@ public final class TextFormat {
// The line and column numbers of the current token.
private int line = 0;
private int column = 0;
private int lineInfoTrackingPos = 0;
// The line and column numbers of the previous token (allows throwing
// errors *after* consuming).
private int previousLine = 0;
private int previousColumn = 0;
// We use possessive quantifiers (*+ and ++) because otherwise the Java
// regex matcher has stack overflows on large inputs.
private static final Pattern WHITESPACE = Pattern.compile("(\\s|(#.*$))++", Pattern.MULTILINE);
private static final Pattern TOKEN =
Pattern.compile(
"[a-zA-Z_][0-9a-zA-Z_+-]*+|" // an identifier
+ "[.]?[0-9+-][0-9a-zA-Z_.+-]*+|" // a number
+ "\"([^\"\n\\\\]|\\\\.)*+(\"|\\\\?$)|" // a double-quoted string
+ "\'([^\'\n\\\\]|\\\\.)*+(\'|\\\\?$)", // a single-quoted string
Pattern.MULTILINE);
private static final Pattern DOUBLE_INFINITY =
Pattern.compile("-?inf(inity)?", Pattern.CASE_INSENSITIVE);
private static final Pattern FLOAT_INFINITY =
Pattern.compile("-?inf(inity)?f?", Pattern.CASE_INSENSITIVE);
private static final Pattern FLOAT_NAN = Pattern.compile("nanf?", Pattern.CASE_INSENSITIVE);
/**
* {@link containsSilentMarkerAfterCurrentToken} indicates if there is a silent marker after the
* current token. This value is moved to {@link containsSilentMarkerAfterPrevToken} every time
@ -1042,7 +1020,6 @@ public final class TextFormat {
/**
 * Construct a tokenizer that parses tokens from the given text.
 *
 * @param text the text to tokenize
 */
private Tokenizer(final CharSequence text) {
this.text = text;
// Matcher over the text, initially bound to the WHITESPACE pattern.
this.matcher = WHITESPACE.matcher(text);
// Move past any leading whitespace/comments, then load the first token into currentToken.
skipWhitespace();
nextToken();
}
@ -1082,41 +1059,156 @@ public final class TextFormat {
previousColumn = column;
// Advance the line counter to the current position.
while (pos < matcher.regionStart()) {
if (text.charAt(pos) == '\n') {
while (lineInfoTrackingPos < pos) {
if (text.charAt(lineInfoTrackingPos) == '\n') {
++line;
column = 0;
} else {
++column;
}
++pos;
++lineInfoTrackingPos;
}
// Match the next token.
if (matcher.regionStart() == matcher.regionEnd()) {
// EOF
currentToken = "";
if (pos == text.length()) {
currentToken = ""; // EOF
} else {
matcher.usePattern(TOKEN);
if (matcher.lookingAt()) {
currentToken = matcher.group();
matcher.region(matcher.end(), matcher.regionEnd());
} else {
// Take one character.
currentToken = String.valueOf(text.charAt(pos));
matcher.region(pos + 1, matcher.regionEnd());
currentToken = nextTokenInternal();
skipWhitespace();
}
}
/**
 * Scans the token that begins at the current position and advances {@code pos} past it.
 *
 * <p>Token shapes recognized, chosen by the first character:
 *
 * <ul>
 *   <li>identifier: a letter or '_' followed by any run of letters, digits, '_', '+' or '-';
 *   <li>number: an optional leading '.', one digit/'+'/'-', then any run of letters, digits,
 *       '_', '.', '+' or '-';
 *   <li>string: a single- or double-quoted run. An unterminated string stops before a newline
 *       or at end-of-text and is returned as-is so that the parser can report the error;
 *   <li>anything else: a single-character token.
 * </ul>
 *
 * <p>This method does not skip whitespace and must not be called when the current position is
 * at end-of-text.
 *
 * @return the text of the scanned token
 */
private String nextTokenInternal() {
  final int textLength = this.text.length();
  final int startPos = this.pos;
  final char startChar = this.text.charAt(startPos);

  int endPos = startPos;
  if (isAlphaUnder(startChar)) { // Identifier
    while (++endPos != textLength) {
      final char c = this.text.charAt(endPos);
      if (!(isAlphaUnder(c) || isDigitPlusMinus(c))) {
        break;
      }
    }
  } else if (isDigitPlusMinus(startChar) || startChar == '.') { // Number
    if (startChar == '.') { // Optional leading dot
      if (++endPos == textLength) {
        return nextTokenSingleChar(); // A lone '.' at end-of-text
      }
      // The character right after the dot decides; no whitespace may be skipped in between.
      if (!isDigitPlusMinus(this.text.charAt(endPos))) { // Mandatory first digit
        return nextTokenSingleChar();
      }
    }
    while (++endPos != textLength) {
      final char c = this.text.charAt(endPos);
      if (!(isDigitPlusMinus(c) || isAlphaUnder(c) || c == '.')) {
        break;
      }
    }
  } else if (startChar == '"' || startChar == '\'') { // String
    while (++endPos != textLength) {
      final char c = this.text.charAt(endPos);
      if (c == startChar) {
        ++endPos; // Include the closing quote in the token
        break;
      }
      if (c == '\n') {
        break; // Newline terminates (error during parsing) (not consumed)
      }
      if (c == '\\') {
        // Step onto the escaped char; it is consumed unless it is missing or a newline.
        if (++endPos == textLength) {
          break; // Escape into end-of-text terminates (error during parsing)
        }
        if (this.text.charAt(endPos) == '\n') {
          break; // Escape into newline terminates (error during parsing) (not consumed)
        }
      }
      // Otherwise the char is legal and consumed.
    }
  } else {
    return nextTokenSingleChar(); // Unrecognized start character
  }

  this.pos = endPos;
  return this.text.subSequence(startPos, endPos).toString();
}
/**
 * Tells whether {@code c} is an ASCII letter or an underscore.
 *
 * @param c the character to classify
 * @return {@code true} for 'a'-'z', 'A'-'Z' and '_'
 */
private static boolean isAlphaUnder(char c) {
  // Plain numeric range checks on the char value keep this cheap compared to a regex.
  if (c == '_') {
    return true;
  }
  final boolean lowerCase = c >= 'a' && c <= 'z';
  final boolean upperCase = c >= 'A' && c <= 'Z';
  return lowerCase || upperCase;
}
/**
 * Tells whether {@code c} is an ASCII digit or a sign character.
 *
 * @param c the character to classify
 * @return {@code true} for '0'-'9', '+' and '-'
 */
private static boolean isDigitPlusMinus(char c) {
  // Direct comparisons on the char value keep this cheap compared to a regex.
  switch (c) {
    case '+':
    case '-':
      return true;
    default:
      return c >= '0' && c <= '9';
  }
}
/**
 * Tells whether {@code c} is one of the ASCII whitespace characters: space, horizontal tab,
 * line feed, vertical tab, form feed or carriage return.
 *
 * <p>This is the same set as the regex class {@code \s} ({@code [ \t\n\x0B\f\r]}). Vertical
 * tab is included so that input using it as a separator is still skipped as whitespace rather
 * than being turned into a one-character token.
 *
 * @param c the character to classify
 * @return {@code true} if {@code c} is whitespace
 */
private static boolean isWhitespace(char c) {
  // Defining this char-class with numeric comparisons is much faster than using a regex.
  return c == ' '
      || c == '\t'
      || c == '\n'
      || c == '\u000B' // vertical tab; Java has no '\v' escape
      || c == '\f'
      || c == '\r';
}
/**
 * Consumes the single char at the current position and returns it as a token.
 *
 * <p>The punctuation chars the text format is expected to contain are mapped to string
 * literals, which come from the class constant pool, so no new string is allocated for them.
 * Any other char is wrapped in a fresh string and left for the parser to reject.
 *
 * <p>This method must not be called if the current position is after the end-of-text.
 */
private String nextTokenSingleChar() {
  final int at = this.pos++;
  final char symbol = this.text.charAt(at);
  if (symbol == ':') {
    return ":";
  } else if (symbol == ',') {
    return ",";
  } else if (symbol == '[') {
    return "[";
  } else if (symbol == ']') {
    return "]";
  } else if (symbol == '{') {
    return "{";
  } else if (symbol == '}') {
    return "}";
  } else if (symbol == '<') {
    return "<";
  } else if (symbol == '>') {
    return ">";
  }
  // If we don't recognize the char, create a string and let the parser report any errors
  return String.valueOf(symbol);
}
/** Skip over any whitespace so that the matcher region starts at the next token. */
private void skipWhitespace() {
matcher.usePattern(WHITESPACE);
if (matcher.lookingAt()) {
matcher.region(matcher.end(), matcher.regionEnd());
final int textLength = this.text.length();
final int startPos = this.pos;
int endPos = this.pos - 1;
while (++endPos != textLength) {
char c = this.text.charAt(endPos);
if (c == '#') {
while (++endPos != textLength) {
if (this.text.charAt(endPos) == '\n') {
break; // Consume the newline as whitespace.
}
}
if (endPos == textLength) {
break;
}
} else if (isWhitespace(c)) {
// OK
} else {
break;
}
}
this.pos = endPos;
}
/**
@ -1148,8 +1240,7 @@ public final class TextFormat {
return false;
}
final char c = currentToken.charAt(0);
return ('0' <= c && c <= '9') || c == '-' || c == '+';
return isDigitPlusMinus(currentToken.charAt(0));
}
/** Returns {@code true} if the current token's text is equal to that specified. */
@ -1164,11 +1255,7 @@ public final class TextFormat {
String consumeIdentifier() throws ParseException {
for (int i = 0; i < currentToken.length(); i++) {
final char c = currentToken.charAt(i);
if (('a' <= c && c <= 'z')
|| ('A' <= c && c <= 'Z')
|| ('0' <= c && c <= '9')
|| (c == '_')
|| (c == '.')) {
if (isAlphaUnder(c) || ('0' <= c && c <= '9') || (c == '.')) {
// OK
} else {
throw parseException("Expected identifier. Found '" + currentToken + "'");
@ -1282,15 +1369,22 @@ public final class TextFormat {
public double consumeDouble() throws ParseException {
// We need to parse infinity and nan separately because
// Double.parseDouble() does not accept "inf", "infinity", or "nan".
if (DOUBLE_INFINITY.matcher(currentToken).matches()) {
final boolean negative = currentToken.startsWith("-");
nextToken();
return negative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
}
if (currentToken.equalsIgnoreCase("nan")) {
nextToken();
return Double.NaN;
switch (currentToken.toLowerCase(Locale.ROOT)) {
case "-inf":
case "-infinity":
nextToken();
return Double.NEGATIVE_INFINITY;
case "inf":
case "infinity":
nextToken();
return Double.POSITIVE_INFINITY;
case "nan":
nextToken();
return Double.NaN;
default:
// fall through
}
try {
final double result = Double.parseDouble(currentToken);
nextToken();
@ -1320,15 +1414,27 @@ public final class TextFormat {
public float consumeFloat() throws ParseException {
// We need to parse infinity and nan separately because
// Float.parseFloat() does not accept "inf", "infinity", or "nan".
if (FLOAT_INFINITY.matcher(currentToken).matches()) {
final boolean negative = currentToken.startsWith("-");
nextToken();
return negative ? Float.NEGATIVE_INFINITY : Float.POSITIVE_INFINITY;
}
if (FLOAT_NAN.matcher(currentToken).matches()) {
nextToken();
return Float.NaN;
switch (currentToken.toLowerCase(Locale.ROOT)) {
case "-inf":
case "-inff":
case "-infinity":
case "-infinityf":
nextToken();
return Float.NEGATIVE_INFINITY;
case "inf":
case "inff":
case "infinity":
case "infinityf":
nextToken();
return Float.POSITIVE_INFINITY;
case "nan":
case "nanf":
nextToken();
return Float.NaN;
default:
// fall through
}
try {
final float result = Float.parseFloat(currentToken);
nextToken();

@ -81,6 +81,7 @@ public class TextFormatTest {
+ "repeated_double: 0.125\n"
+ "repeated_double: .125\n"
+ "repeated_double: -.125\n"
+ "repeated_double: .0\n"
+ "repeated_double: 1.23E17\n"
+ "repeated_double: 1.23E+17\n"
+ "repeated_double: -1.23e-17\n"
@ -314,6 +315,7 @@ public class TextFormatTest {
.addRepeatedDouble(0.125)
.addRepeatedDouble(.125)
.addRepeatedDouble(-.125)
.addRepeatedDouble(.0)
.addRepeatedDouble(123e15)
.addRepeatedDouble(123e15)
.addRepeatedDouble(-1.23e-17)
@ -949,6 +951,9 @@ public class TextFormatTest {
"1:23: Enum type \"protobuf_unittest.TestAllTypes.NestedEnum\" has no "
+ "value with number 123.",
"optional_nested_enum: 123");
assertParseError("1:18: Couldn't parse number: For input string: \".\"", "repeated_double: .");
assertParseError(
"1:18: Couldn't parse number: For input string: \".+\"", "repeated_double: .+");
// Delimiters must match.
assertParseError("1:22: Expected identifier. Found '}'", "OptionalGroup < a: 1 }");

Loading…
Cancel
Save