diff --git a/src/google/protobuf/stubs/structurally_valid.cc b/src/google/protobuf/stubs/structurally_valid.cc index e385a81ec6..0f6afe6dc8 100644 --- a/src/google/protobuf/stubs/structurally_valid.cc +++ b/src/google/protobuf/stubs/structurally_valid.cc @@ -371,36 +371,44 @@ int UTF8GenericScan(const UTF8ScanObj* st, // Do state-table scan int e = 0; uint8 c; - - // Do fast for groups of 8 identity bytes. - // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop, - // including slowing slightly on cr/lf/ht - //---------------------------- const uint8* Tbl2 = &st->fast_state[0]; - uint32 losub = st->losub; - uint32 hiadd = st->hiadd; - while (src < srclimit8) { - uint32 s0123 = (reinterpret_cast(src))[0]; - uint32 s4567 = (reinterpret_cast(src))[1]; - src += 8; - // This is a fast range check for all bytes in [lowsub..0x80-hiadd) - uint32 temp = (s0123 - losub) | (s0123 + hiadd) | - (s4567 - losub) | (s4567 + hiadd); - if ((temp & 0x80808080) != 0) { - // We typically end up here on cr/lf/ht; src was incremented - int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) | - (Tbl2[src[-6]] | Tbl2[src[-5]]); - if (e0123 != 0) { - src -= 8; - break; - } // Exit on Non-interchange - e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) | - (Tbl2[src[-2]] | Tbl2[src[-1]]); - if (e0123 != 0) { - src -= 4; - break; - } // Exit on Non-interchange - // Else OK, go around again + const uint32 losub = st->losub; + const uint32 hiadd = st->hiadd; + // Check initial few bytes one at a time until 8-byte aligned + //---------------------------- + while ((((uintptr_t)src & 0x07) != 0) && + (src < srclimit) && + Tbl2[src[0]] == 0) { + src++; + } + if (((uintptr_t)src & 0x07) == 0) { + // Do fast for groups of 8 identity bytes. + // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop, + // including slowing slightly on cr/lf/ht + //---------------------------- + while (src < srclimit8) { + uint32 s0123 = (reinterpret_cast(src))[0]; + uint32 s4567 = (reinterpret_cast(src))[1]; + src += 8; + // This is a fast range check for all bytes in [lowsub..0x80-hiadd) + uint32 temp = (s0123 - losub) | (s0123 + hiadd) | + (s4567 - losub) | (s4567 + hiadd); + if ((temp & 0x80808080) != 0) { + // We typically end up here on cr/lf/ht; src was incremented + int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) | + (Tbl2[src[-6]] | Tbl2[src[-5]]); + if (e0123 != 0) { + src -= 8; + break; + } // Exit on Non-interchange + e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) | + (Tbl2[src[-2]] | Tbl2[src[-1]]); + if (e0123 != 0) { + src -= 4; + break; + } // Exit on Non-interchange + // Else OK, go around again + } } } //---------------------------- @@ -470,10 +478,17 @@ int UTF8GenericScanFastAscii(const UTF8ScanObj* st, int rest_consumed; int exit_reason; do { - while ((src < srclimit8) && - (((reinterpret_cast(src)[0] | - reinterpret_cast(src)[1]) & 0x80808080) == 0)) { - src += 8; + // Check initial few bytes one at a time until 8-byte aligned + while ((((uintptr_t)src & 0x07) != 0) && + (src < srclimit) && (src[0] < 0x80)) { + src++; + } + if (((uintptr_t)src & 0x07) == 0) { + while ((src < srclimit8) && + (((reinterpret_cast(src)[0] | + reinterpret_cast(src)[1]) & 0x80808080) == 0)) { + src += 8; + } } while ((src < srclimit) && (src[0] < 0x80)) { src++; diff --git a/src/google/protobuf/stubs/structurally_valid_unittest.cc b/src/google/protobuf/stubs/structurally_valid_unittest.cc index 22825516e2..90888885ad 100644 --- a/src/google/protobuf/stubs/structurally_valid_unittest.cc +++ b/src/google/protobuf/stubs/structurally_valid_unittest.cc @@ -13,15 +13,25 @@ TEST(StructurallyValidTest, ValidUTF8String) { // On GCC, this string can be written as: // "abcd 1234 - \u2014\u2013\u2212" // MSVC seems to interpret \u differently. - string valid_str("abcd 1234 - \342\200\224\342\200\223\342\210\222"); + string valid_str("abcd 1234 - \342\200\224\342\200\223\342\210\222 - xyz789"); EXPECT_TRUE(IsStructurallyValidUTF8(valid_str.data(), valid_str.size())); + // Additional check for pointer alignment + for (int i = 1; i < 8; ++i) { + EXPECT_TRUE(IsStructurallyValidUTF8(valid_str.data() + i, + valid_str.size() - i)); + } } TEST(StructurallyValidTest, InvalidUTF8String) { - string invalid_str("\xA0\xB0"); + const string invalid_str("abcd\xA0\xB0\xA0\xB0\xA0\xB0 - xyz789"); EXPECT_FALSE(IsStructurallyValidUTF8(invalid_str.data(), invalid_str.size())); + // Additional check for pointer alignment + for (int i = 1; i < 8; ++i) { + EXPECT_FALSE(IsStructurallyValidUTF8(invalid_str.data() + i, + invalid_str.size() - i)); + } } } // namespace