Ignore UTF-8 BOM if it is at the beginning of a proto file

10 years ago · b2d2cf8b48
parent fde6e89f99
commit b2d2cf8b48
2 changed files with 35 additions and 0 deletions
--- a/src/google/protobuf/compiler/parser_unittest.cc
+++ b/src/google/protobuf/compiler/parser_unittest.cc
@ -229,6 +229,32 @@ TEST_F(ParserTest, WarnIfSyntaxIdentifierOmmitted) {
 typedef ParserTest ParseMessageTest;
 TEST_F(ParseMessageTest, IgnoreBOM) {
   // Input that begins with the three-byte UTF-8 byte order mark
   // (0xEF 0xBB 0xBF) is expected to parse to the same descriptor as the
   // plain message definition that follows the mark.
   //
   // The mark lives in its own literal so that the hex escapes cannot
   // absorb any characters of the text that comes after them.
   char input[] =
       "\xEF\xBB\xBF"
       "message TestMessage {\n"
       "  required int32 foo = 1;\n"
       "}\n";

   ExpectParsesTo(
       input,
       "message_type {"
       "  name: \"TestMessage\""
       "  field { name:\"foo\" label:LABEL_REQUIRED type:TYPE_INT32 number:1 }"
       "}");
 }
 TEST_F(ParseMessageTest, BOMError) {
   // Only the first byte of the UTF-8 byte order mark (0xEF) is present; the
   // next two bytes are ordinary spaces. The parser is expected to report
   // the incomplete mark and then fail to find a top-level statement.
   //
   // The 0xEF byte lives in its own literal so that the hex escape cannot
   // absorb any characters of the text that comes after it.
   char input[] =
       "\xEF"
       "  message TestMessage {\n"
       "  required int32 foo = 1;\n"
       "}\n";

   ExpectHasErrors(
       input,
       "0:1: Proto file starts with 0xEF but not UTF-8 BOM. "
       "Only UTF-8 is accepted for proto file.\n"
       "0:0: Expected top-level statement (e.g. \"message\").\n");
 }
 TEST_F(ParseMessageTest, SimpleMessage) {
  ExpectParsesTo(
    "message TestMessage {\n"
--- a/src/google/protobuf/io/tokenizer.cc
+++ b/src/google/protobuf/io/tokenizer.cc
@ -762,6 +762,15 @@ bool Tokenizer::NextWithComments(string* prev_trailing_comments,
                             next_leading_comments);
  if (current_.type == TYPE_START) {
    // Ignore unicode byte order mark(BOM) if it appears at the file
    // beginning. Only UTF-8 BOM (0xEF 0xBB 0xBF) is accepted.
    if (TryConsume((char)0xEF)) {
      if (!TryConsume((char)0xBB) || !TryConsume((char)0xBF)) {
        AddError("Proto file starts with 0xEF but not UTF-8 BOM. "
                 "Only UTF-8 is accepted for proto file.");
        return false;
      }
    }
    collector.DetachFromPrev();
  } else {
    // A comment appearing on the same line must be attached to the previous