Replace all regexes in TextFormat.Tokenizer with direct char scanning.

The JVM regex engine allocates garbage on every match (especially when calling Matcher.usePattern!). Since there are expected to be a lot of tokens, this caused substantial GC overhead. Direct char scanning also opens the possibility of other optimizations that aren't possible with regexes. For example: - direct reads from a char[] - streaming tokenization (rather than reading the complete source text) PiperOrigin-RevId: 689230675
6 months ago · 7f6e90ba46
parent 47613cf7ad
commit 7f6e90ba46
2 changed files with 176 additions and 65 deletions
--- a/java/core/src/main/java/com/google/protobuf/TextFormat.java
+++ b/java/core/src/main/java/com/google/protobuf/TextFormat.java
@ -24,7 +24,6 @@ import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 import java.util.logging.Logger;
-import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 /**
@ -991,14 +990,9 @@ public final class TextFormat {
   *       Scanner} provides no way to inspect the contents of delimiters, making it impossible to
   *       keep track of line and column numbers.
   * </ul>
-   *
-   * <p>Luckily, Java's regular expression support does manage to be useful to us. (Barely: We need
-   * {@code Matcher.usePattern()}, which is new in Java 1.5.) So, we can use that, at least.
-   * Unfortunately, this implies that we need to have the entire input in one contiguous string.
   */
  private static final class Tokenizer {
    private final CharSequence text;
-    private final Matcher matcher;
    private String currentToken;

    // The character index within this.text at which the current token begins.
@ -1007,29 +1001,13 @@ public final class TextFormat {
    // The line and column numbers of the current token.
    private int line = 0;
    private int column = 0;
+    private int lineInfoTrackingPos = 0;

    // The line and column numbers of the previous token (allows throwing
    // errors *after* consuming).
    private int previousLine = 0;
    private int previousColumn = 0;

-    // We use possessive quantifiers (*+ and ++) because otherwise the Java
-    // regex matcher has stack overflows on large inputs.
-    private static final Pattern WHITESPACE = Pattern.compile("(\\s|(#.*$))++", Pattern.MULTILINE);
-    private static final Pattern TOKEN =
-        Pattern.compile(
-            "[a-zA-Z_][0-9a-zA-Z_+-]*+|" // an identifier
-                + "[.]?[0-9+-][0-9a-zA-Z_.+-]*+|" // a number
-                + "\"([^\"\n\\\\]|\\\\.)*+(\"|\\\\?$)|" // a double-quoted string
-                + "\'([^\'\n\\\\]|\\\\.)*+(\'|\\\\?$)", // a single-quoted string
-            Pattern.MULTILINE);
-
-    private static final Pattern DOUBLE_INFINITY =
-        Pattern.compile("-?inf(inity)?", Pattern.CASE_INSENSITIVE);
-    private static final Pattern FLOAT_INFINITY =
-        Pattern.compile("-?inf(inity)?f?", Pattern.CASE_INSENSITIVE);
-    private static final Pattern FLOAT_NAN = Pattern.compile("nanf?", Pattern.CASE_INSENSITIVE);
-
    /**
     * {@link containsSilentMarkerAfterCurrentToken} indicates if there is a silent marker after the
     * current token. This value is moved to {@link containsSilentMarkerAfterPrevToken} every time
@ -1042,7 +1020,6 @@ public final class TextFormat {
    /** Construct a tokenizer that parses tokens from the given text. */
    private Tokenizer(final CharSequence text) {
      this.text = text;
-      this.matcher = WHITESPACE.matcher(text);
      skipWhitespace();
      nextToken();
    }
@ -1082,41 +1059,156 @@ public final class TextFormat {
      previousColumn = column;

      // Advance the line counter to the current position.
-      while (pos < matcher.regionStart()) {
-        if (text.charAt(pos) == '\n') {
+      while (lineInfoTrackingPos < pos) {
+        if (text.charAt(lineInfoTrackingPos) == '\n') {
          ++line;
          column = 0;
        } else {
          ++column;
        }
-        ++pos;
+        ++lineInfoTrackingPos;
      }

      // Match the next token.
-      if (matcher.regionStart() == matcher.regionEnd()) {
-        // EOF
-        currentToken = "";
+      if (pos == text.length()) {
+        currentToken = ""; // EOF
      } else {
-        matcher.usePattern(TOKEN);
-        if (matcher.lookingAt()) {
-          currentToken = matcher.group();
-          matcher.region(matcher.end(), matcher.regionEnd());
-        } else {
-          // Take one character.
-          currentToken = String.valueOf(text.charAt(pos));
-          matcher.region(pos + 1, matcher.regionEnd());
+        currentToken = nextTokenInternal();
+        skipWhitespace();
+      }
+    }
+
+    private String nextTokenInternal() {
+      final int textLength = this.text.length();
+      final int startPos = this.pos;
+      final char startChar = this.text.charAt(startPos);
+
+      int endPos = pos;
+      if (isAlphaUnder(startChar)) { // Identifier
+        while (++endPos != textLength) {
+          char c = this.text.charAt(endPos);
+          if (!(isAlphaUnder(c) || isDigitPlusMinus(c))) {
+            break;
+          }
        }
+      } else if (isDigitPlusMinus(startChar) || startChar == '.') { // Number
+        if (startChar == '.') { // Optional leading dot
+          if (++endPos == textLength) {
+            return nextTokenSingleChar();
+          }

-        skipWhitespace();
+          if (!isDigitPlusMinus(this.text.charAt(endPos))) { // Mandatory first digit
+            return nextTokenSingleChar();
+          }
+        }
+
+        while (++endPos != textLength) {
+          char c = this.text.charAt(endPos);
+          if (!(isDigitPlusMinus(c) || isAlphaUnder(c) || c == '.')) {
+            break;
+          }
+        }
+      } else if (startChar == '"' || startChar == '\'') { // String
+        while (++endPos != textLength) {
+          char c = this.text.charAt(endPos);
+          if (c == startChar) {
+            ++endPos;
+            break; // Quote terminates
+          } else if (c == '\n') {
+            break; // Newline terminates (error during parsing) (not consumed)
+          } else if (c == '\\') {
+            if (++endPos == textLength) {
+              break; // Escape into end-of-text terminates (error during parsing)
+            } else if (this.text.charAt(endPos) == '\n') {
+              break; // Escape into newline terminates (error during parsing) (not consumed)
+            } else {
+              // Otherwise the escaped char is legal and consumed
+            }
+          } else {
+            // Otherwise the char is a legal and consumed
+          }
+        }
+      } else {
+        return nextTokenSingleChar(); // Unrecognized start character
+      }
+
+      this.pos = endPos;
+      return this.text.subSequence(startPos, endPos).toString();
+    }
+
+    private static boolean isAlphaUnder(char c) {
+      // Defining this char-class with numeric comparisons is much faster than using a regex.
+      return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_';
+    }
+
+    private static boolean isDigitPlusMinus(char c) {
+      // Defining this char-class with numeric comparisons is much faster than using a regex.
+      return ('0' <= c && c <= '9') || c == '+' || c == '-';
+    }
+
+    private static boolean isWhitespace(char c) {
+      // Defining this char-class with numeric comparisons is much faster than using a regex.
+      return c == ' ' || c == '\f' || c == '\n' || c == '\r' || c == '\t';
+    }
+
+    /**
+     * Produce a token for the single char at the current position.
+     *
+     * <p>We hardcode the expected single-char tokens to avoid allocating a unique string every
+     * time, which is a GC risk. String-literals are always loaded from the class constant pool.
+     *
+     * <p>This method must not be called if the current position is after the end-of-text.
+     */
+    private String nextTokenSingleChar() {
+      final char c = this.text.charAt(this.pos++);
+      switch (c) {
+        case ':':
+          return ":";
+        case ',':
+          return ",";
+        case '[':
+          return "[";
+        case ']':
+          return "]";
+        case '{':
+          return "{";
+        case '}':
+          return "}";
+        case '<':
+          return "<";
+        case '>':
+          return ">";
+        default:
+          // If we don't recognize the char, create a string and let the parser report any errors
+          return String.valueOf(c);
      }
    }

    /** Skip over any whitespace so that the matcher region starts at the next token. */
    private void skipWhitespace() {
-      matcher.usePattern(WHITESPACE);
-      if (matcher.lookingAt()) {
-        matcher.region(matcher.end(), matcher.regionEnd());
+      final int textLength = this.text.length();
+      final int startPos = this.pos;
+
+      int endPos = this.pos - 1;
+      while (++endPos != textLength) {
+        char c = this.text.charAt(endPos);
+        if (c == '#') {
+          while (++endPos != textLength) {
+            if (this.text.charAt(endPos) == '\n') {
+              break; // Consume the newline as whitespace.
+            }
+          }
+          if (endPos == textLength) {
+            break;
+          }
+        } else if (isWhitespace(c)) {
+          // OK
+        } else {
+          break;
+        }
      }
+
+      this.pos = endPos;
    }

    /**
@ -1148,8 +1240,7 @@ public final class TextFormat {
        return false;
      }

-      final char c = currentToken.charAt(0);
-      return ('0' <= c && c <= '9') || c == '-' || c == '+';
+      return isDigitPlusMinus(currentToken.charAt(0));
    }

    /** Returns {@code true} if the current token's text is equal to that specified. */
@ -1164,11 +1255,7 @@ public final class TextFormat {
    String consumeIdentifier() throws ParseException {
      for (int i = 0; i < currentToken.length(); i++) {
        final char c = currentToken.charAt(i);
-        if (('a' <= c && c <= 'z')
-            || ('A' <= c && c <= 'Z')
-            || ('0' <= c && c <= '9')
-            || (c == '_')
-            || (c == '.')) {
+        if (isAlphaUnder(c) || ('0' <= c && c <= '9') || (c == '.')) {
          // OK
        } else {
          throw parseException("Expected identifier. Found '" + currentToken + "'");
@ -1282,15 +1369,22 @@ public final class TextFormat {
    public double consumeDouble() throws ParseException {
      // We need to parse infinity and nan separately because
      // Double.parseDouble() does not accept "inf", "infinity", or "nan".
-      if (DOUBLE_INFINITY.matcher(currentToken).matches()) {
-        final boolean negative = currentToken.startsWith("-");
-        nextToken();
-        return negative ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
-      }
-      if (currentToken.equalsIgnoreCase("nan")) {
-        nextToken();
-        return Double.NaN;
+      switch (currentToken.toLowerCase(Locale.ROOT)) {
+        case "-inf":
+        case "-infinity":
+          nextToken();
+          return Double.NEGATIVE_INFINITY;
+        case "inf":
+        case "infinity":
+          nextToken();
+          return Double.POSITIVE_INFINITY;
+        case "nan":
+          nextToken();
+          return Double.NaN;
+        default:
+          // fall through
      }
+
      try {
        final double result = Double.parseDouble(currentToken);
        nextToken();
@ -1320,15 +1414,27 @@ public final class TextFormat {
    public float consumeFloat() throws ParseException {
      // We need to parse infinity and nan separately because
      // Float.parseFloat() does not accept "inf", "infinity", or "nan".
-      if (FLOAT_INFINITY.matcher(currentToken).matches()) {
-        final boolean negative = currentToken.startsWith("-");
-        nextToken();
-        return negative ? Float.NEGATIVE_INFINITY : Float.POSITIVE_INFINITY;
-      }
-      if (FLOAT_NAN.matcher(currentToken).matches()) {
-        nextToken();
-        return Float.NaN;
+      switch (currentToken.toLowerCase(Locale.ROOT)) {
+        case "-inf":
+        case "-inff":
+        case "-infinity":
+        case "-infinityf":
+          nextToken();
+          return Float.NEGATIVE_INFINITY;
+        case "inf":
+        case "inff":
+        case "infinity":
+        case "infinityf":
+          nextToken();
+          return Float.POSITIVE_INFINITY;
+        case "nan":
+        case "nanf":
+          nextToken();
+          return Float.NaN;
+        default:
+          // fall through
      }
+
      try {
        final float result = Float.parseFloat(currentToken);
        nextToken();
--- a/java/core/src/test/java/com/google/protobuf/TextFormatTest.java
+++ b/java/core/src/test/java/com/google/protobuf/TextFormatTest.java
@ -81,6 +81,7 @@ public class TextFormatTest {
          + "repeated_double: 0.125\n"
          + "repeated_double: .125\n"
          + "repeated_double: -.125\n"
+          + "repeated_double: .0\n"
          + "repeated_double: 1.23E17\n"
          + "repeated_double: 1.23E+17\n"
          + "repeated_double: -1.23e-17\n"
@ -314,6 +315,7 @@ public class TextFormatTest {
            .addRepeatedDouble(0.125)
            .addRepeatedDouble(.125)
            .addRepeatedDouble(-.125)
+            .addRepeatedDouble(.0)
            .addRepeatedDouble(123e15)
            .addRepeatedDouble(123e15)
            .addRepeatedDouble(-1.23e-17)
@ -949,6 +951,9 @@ public class TextFormatTest {
        "1:23: Enum type \"protobuf_unittest.TestAllTypes.NestedEnum\" has no "
            + "value with number 123.",
        "optional_nested_enum: 123");
+    assertParseError("1:18: Couldn't parse number: For input string: \".\"", "repeated_double: .");
+    assertParseError(
+        "1:18: Couldn't parse number: For input string: \".+\"", "repeated_double: .+");

    // Delimiters must match.
    assertParseError("1:22: Expected identifier. Found '}'", "OptionalGroup < a: 1 }");