@ -24,7 +24,6 @@ import java.util.Locale;
import java.util.Map ;
import java.util.Set ;
import java.util.logging.Logger ;
import java.util.regex.Matcher ;
import java.util.regex.Pattern ;
/ * *
@ -991,14 +990,9 @@ public final class TextFormat {
* Scanner } provides no way to inspect the contents of delimiters , making it impossible to
* keep track of line and column numbers .
* < / ul >
*
* < p > Luckily , Java ' s regular expression support does manage to be useful to us . ( Barely : We need
* { @code Matcher . usePattern ( ) } , which is new in Java 1 . 5 . ) So , we can use that , at least .
* Unfortunately , this implies that we need to have the entire input in one contiguous string .
* /
private static final class Tokenizer {
private final CharSequence text ;
private final Matcher matcher ;
private String currentToken ;
// The character index within this.text at which the current token begins.
@ -1007,29 +1001,13 @@ public final class TextFormat {
// The line and column numbers of the current token.
private int line = 0 ;
private int column = 0 ;
// Index into the text up to which the line/column counters above have already been advanced.
private int lineInfoTrackingPos = 0 ;
// The line and column numbers of the previous token (allows throwing
// errors *after* consuming).
private int previousLine = 0 ;
private int previousColumn = 0 ;
// We use possessive quantifiers (*+ and ++) because otherwise the Java
// regex matcher has stack overflows on large inputs.
// WHITESPACE matches a run of whitespace characters and '#' comments that extend to end of line.
private static final Pattern WHITESPACE = Pattern . compile ( "(\\s|(#.*$))++" , Pattern . MULTILINE ) ;
// TOKEN matches one identifier, number, or quoted string. A string may end at its closing quote
// or, unterminated, at the end of the line (optionally preceded by a trailing backslash).
private static final Pattern TOKEN =
Pattern . compile (
"[a-zA-Z_][0-9a-zA-Z_+-]*+|" // an identifier
+ "[.]?[0-9+-][0-9a-zA-Z_.+-]*+|" // a number
+ "\"([^\"\n\\\\]|\\\\.)*+(\"|\\\\?$)|" // a double-quoted string
+ "\'([^\'\n\\\\]|\\\\.)*+(\'|\\\\?$)" , // a single-quoted string
Pattern . MULTILINE ) ;
// Case-insensitive spellings of infinity: "inf" or "infinity", with an optional leading '-'.
private static final Pattern DOUBLE_INFINITY =
Pattern . compile ( "-?inf(inity)?" , Pattern . CASE_INSENSITIVE ) ;
// Same as DOUBLE_INFINITY, but additionally allows a trailing 'f' suffix.
private static final Pattern FLOAT_INFINITY =
Pattern . compile ( "-?inf(inity)?f?" , Pattern . CASE_INSENSITIVE ) ;
// Case-insensitive "nan" with an optional trailing 'f' suffix.
private static final Pattern FLOAT_NAN = Pattern . compile ( "nanf?" , Pattern . CASE_INSENSITIVE ) ;
/ * *
* { @link containsSilentMarkerAfterCurrentToken } indicates if there is a silent marker after the
* current token . This value is moved to { @link containsSilentMarkerAfterPrevToken } every time
@ -1042,7 +1020,6 @@ public final class TextFormat {
/**
 * Construct a tokenizer that parses tokens from the given text.
 *
 * <p>After construction the first token has already been read: leading whitespace is skipped and
 * {@code nextToken()} is called once.
 *
 * @param text the complete input to tokenize
 */
private Tokenizer ( final CharSequence text ) {
this . text = text ;
// The matcher starts out bound to the WHITESPACE pattern over the whole input.
this . matcher = WHITESPACE . matcher ( text ) ;
// Move past any leading whitespace before reading the first token.
skipWhitespace ( ) ;
// Prime the tokenizer by loading the first token.
nextToken ( ) ;
}
@ -1082,41 +1059,156 @@ public final class TextFormat {
previousColumn = column ;
// Advance the line counter to the current position.
while ( p os < matcher . regionStart ( ) ) {
if ( text . charAt ( p os) = = '\n' ) {
while ( lineInfoTrackingP os < pos ) {
if ( text . charAt ( lineInfoTrackingP os) = = '\n' ) {
+ + line ;
column = 0 ;
} else {
+ + column ;
}
+ + p os;
+ + lineInfoTrackingP os;
}
// Match the next token.
if ( matcher . regionStart ( ) = = matcher . regionEnd ( ) ) {
// EOF
currentToken = "" ;
if ( pos = = text . length ( ) ) {
currentToken = "" ; // EOF
} else {
matcher . usePattern ( TOKEN ) ;
if ( matcher . lookingAt ( ) ) {
currentToken = matcher . group ( ) ;
matcher . region ( matcher . end ( ) , matcher . regionEnd ( ) ) ;
} else {
// Take one character.
currentToken = String . valueOf ( text . charAt ( pos ) ) ;
matcher . region ( pos + 1 , matcher . regionEnd ( ) ) ;
currentToken = nextTokenInternal ( ) ;
skipWhitespace ( ) ;
}
}
private String nextTokenInternal ( ) {
final int textLength = this . text . length ( ) ;
final int startPos = this . pos ;
final char startChar = this . text . charAt ( startPos ) ;
int endPos = pos ;
if ( isAlphaUnder ( startChar ) ) { // Identifier
while ( + + endPos ! = textLength ) {
char c = this . text . charAt ( endPos ) ;
if ( ! ( isAlphaUnder ( c ) | | isDigitPlusMinus ( c ) ) ) {
break ;
}
}
} else if ( isDigitPlusMinus ( startChar ) | | startChar = = '.' ) { // Number
if ( startChar = = '.' ) { // Optional leading dot
if ( + + endPos = = textLength ) {
return nextTokenSingleChar ( ) ;
}
skipWhitespace ( ) ;
if ( ! isDigitPlusMinus ( this . text . charAt ( endPos ) ) ) { // Mandatory first digit
return nextTokenSingleChar ( ) ;
}
}
while ( + + endPos ! = textLength ) {
char c = this . text . charAt ( endPos ) ;
if ( ! ( isDigitPlusMinus ( c ) | | isAlphaUnder ( c ) | | c = = '.' ) ) {
break ;
}
}
} else if ( startChar = = '"' | | startChar = = '\'' ) { // String
while ( + + endPos ! = textLength ) {
char c = this . text . charAt ( endPos ) ;
if ( c = = startChar ) {
+ + endPos ;
break ; // Quote terminates
} else if ( c = = '\n' ) {
break ; // Newline terminates (error during parsing) (not consumed)
} else if ( c = = '\\' ) {
if ( + + endPos = = textLength ) {
break ; // Escape into end-of-text terminates (error during parsing)
} else if ( this . text . charAt ( endPos ) = = '\n' ) {
break ; // Escape into newline terminates (error during parsing) (not consumed)
} else {
// Otherwise the escaped char is legal and consumed
}
} else {
// Otherwise the char is a legal and consumed
}
}
} else {
return nextTokenSingleChar ( ) ; // Unrecognized start character
}
this . pos = endPos ;
return this . text . subSequence ( startPos , endPos ) . toString ( ) ;
}
private static boolean isAlphaUnder ( char c ) {
// Defining this char-class with numeric comparisons is much faster than using a regex.
return ( 'a' < = c & & c < = 'z' ) | | ( 'A' < = c & & c < = 'Z' ) | | c = = '_' ;
}
private static boolean isDigitPlusMinus ( char c ) {
// Defining this char-class with numeric comparisons is much faster than using a regex.
return ( '0' < = c & & c < = '9' ) | | c = = '+' | | c = = '-' ;
}
private static boolean isWhitespace ( char c ) {
// Defining this char-class with numeric comparisons is much faster than using a regex.
return c = = ' ' | | c = = '\f' | | c = = '\n' | | c = = '\r' | | c = = '\t' ;
}
/ * *
* Produce a token for the single char at the current position .
*
* < p > We hardcode the expected single - char tokens to avoid allocating a unique string every
* time , which is a GC risk . String - literals are always loaded from the class constant pool .
*
* < p > This method must not be called if the current position is after the end - of - text .
* /
private String nextTokenSingleChar ( ) {
final char c = this . text . charAt ( this . pos + + ) ;
switch ( c ) {
case ':' :
return ":" ;
case ',' :
return "," ;
case '[' :
return "[" ;
case ']' :
return "]" ;
case '{' :
return "{" ;
case '}' :
return "}" ;
case '<' :
return "<" ;
case '>' :
return ">" ;
default :
// If we don't recognize the char, create a string and let the parser report any errors
return String . valueOf ( c ) ;
}
}
/** Skip over any whitespace so that the matcher region starts at the next token. */
private void skipWhitespace ( ) {
matcher . usePattern ( WHITESPACE ) ;
if ( matcher . lookingAt ( ) ) {
matcher . region ( matcher . end ( ) , matcher . regionEnd ( ) ) ;
final int textLength = this . text . length ( ) ;
final int startPos = this . pos ;
int endPos = this . pos - 1 ;
while ( + + endPos ! = textLength ) {
char c = this . text . charAt ( endPos ) ;
if ( c = = '#' ) {
while ( + + endPos ! = textLength ) {
if ( this . text . charAt ( endPos ) = = '\n' ) {
break ; // Consume the newline as whitespace.
}
}
if ( endPos = = textLength ) {
break ;
}
} else if ( isWhitespace ( c ) ) {
// OK
} else {
break ;
}
}
this . pos = endPos ;
}
/ * *
@ -1148,8 +1240,7 @@ public final class TextFormat {
return false ;
}
final char c = currentToken . charAt ( 0 ) ;
return ( '0' < = c & & c < = '9' ) | | c = = '-' | | c = = '+' ;
return isDigitPlusMinus ( currentToken . charAt ( 0 ) ) ;
}
/** Returns {@code true} if the current token's text is equal to that specified. */
@ -1164,11 +1255,7 @@ public final class TextFormat {
String consumeIdentifier ( ) throws ParseException {
for ( int i = 0 ; i < currentToken . length ( ) ; i + + ) {
final char c = currentToken . charAt ( i ) ;
if ( ( 'a' < = c & & c < = 'z' )
| | ( 'A' < = c & & c < = 'Z' )
| | ( '0' < = c & & c < = '9' )
| | ( c = = '_' )
| | ( c = = '.' ) ) {
if ( isAlphaUnder ( c ) | | ( '0' < = c & & c < = '9' ) | | ( c = = '.' ) ) {
// OK
} else {
throw parseException ( "Expected identifier. Found '" + currentToken + "'" ) ;
@ -1282,15 +1369,22 @@ public final class TextFormat {
public double consumeDouble ( ) throws ParseException {
// We need to parse infinity and nan separately because
// Double.parseDouble() does not accept "inf", "infinity", or "nan".
if ( DOUBLE_INFINITY . matcher ( currentToken ) . matches ( ) ) {
final boolean negative = currentToken . startsWith ( "-" ) ;
nextToken ( ) ;
return negative ? Double . NEGATIVE_INFINITY : Double . POSITIVE_INFINITY ;
}
if ( currentToken . equalsIgnoreCase ( "nan" ) ) {
nextToken ( ) ;
return Double . NaN ;
switch ( currentToken . toLowerCase ( Locale . ROOT ) ) {
case "-inf" :
case "-infinity" :
nextToken ( ) ;
return Double . NEGATIVE_INFINITY ;
case "inf" :
case "infinity" :
nextToken ( ) ;
return Double . POSITIVE_INFINITY ;
case "nan" :
nextToken ( ) ;
return Double . NaN ;
default :
// fall through
}
try {
final double result = Double . parseDouble ( currentToken ) ;
nextToken ( ) ;
@ -1320,15 +1414,27 @@ public final class TextFormat {
public float consumeFloat ( ) throws ParseException {
// We need to parse infinity and nan separately because
// Float.parseFloat() does not accept "inf", "infinity", or "nan".
if ( FLOAT_INFINITY . matcher ( currentToken ) . matches ( ) ) {
final boolean negative = currentToken . startsWith ( "-" ) ;
nextToken ( ) ;
return negative ? Float . NEGATIVE_INFINITY : Float . POSITIVE_INFINITY ;
}
if ( FLOAT_NAN . matcher ( currentToken ) . matches ( ) ) {
nextToken ( ) ;
return Float . NaN ;
switch ( currentToken . toLowerCase ( Locale . ROOT ) ) {
case "-inf" :
case "-inff" :
case "-infinity" :
case "-infinityf" :
nextToken ( ) ;
return Float . NEGATIVE_INFINITY ;
case "inf" :
case "inff" :
case "infinity" :
case "infinityf" :
nextToken ( ) ;
return Float . POSITIVE_INFINITY ;
case "nan" :
case "nanf" :
nextToken ( ) ;
return Float . NaN ;
default :
// fall through
}
try {
final float result = Float . parseFloat ( currentToken ) ;
nextToken ( ) ;