Merge pull request #17510 from protocolbuffers/cherrypick-ruby-utf8
[Ruby] Warn if assigning a "UTF-8" string with invalid UTF-8. (#17253)pull/17530/head
commit
fa8dbaec86
7 changed files with 219 additions and 21 deletions
@ -0,0 +1,9 @@ |
|||||||
|
syntax = "proto2"; |
||||||
|
|
||||||
|
package utf8_test_protos; |
||||||
|
|
||||||
|
message TestUtf8 { |
||||||
|
optional string optional_string = 1; |
||||||
|
repeated string repeated_string = 2; |
||||||
|
map<string, string> map_string_string = 3; |
||||||
|
} |
@ -0,0 +1,136 @@ |
|||||||
|
#!/usr/bin/ruby |
||||||
|
|
||||||
|
require 'google/protobuf' |
||||||
|
require 'utf8_pb' |
||||||
|
require 'test/unit' |
||||||
|
|
||||||
|
module CaptureWarnings |
||||||
|
@@warnings = nil |
||||||
|
|
||||||
|
module_function |
||||||
|
|
||||||
|
def warn(message, category: nil, **kwargs) |
||||||
|
if @@warnings |
||||||
|
@@warnings << message |
||||||
|
else |
||||||
|
super |
||||||
|
end |
||||||
|
end |
||||||
|
|
||||||
|
def capture |
||||||
|
@@warnings = [] |
||||||
|
yield |
||||||
|
@@warnings |
||||||
|
ensure |
||||||
|
@@warnings = nil |
||||||
|
end |
||||||
|
end |
||||||
|
|
||||||
|
Warning.extend CaptureWarnings |
||||||
|
|
||||||
|
module Utf8Test |
||||||
|
def test_scalar |
||||||
|
msg = Utf8TestProtos::TestUtf8.new |
||||||
|
assert_bad_utf8 { msg.optional_string = bad_utf8_string() } |
||||||
|
end |
||||||
|
|
||||||
|
def test_repeated |
||||||
|
msg = Utf8TestProtos::TestUtf8.new |
||||||
|
assert_bad_utf8 { msg.repeated_string << bad_utf8_string() } |
||||||
|
end |
||||||
|
|
||||||
|
def test_map_key |
||||||
|
msg = Utf8TestProtos::TestUtf8.new |
||||||
|
assert_bad_utf8 { msg.map_string_string[bad_utf8_string()] = "abc" } |
||||||
|
end |
||||||
|
|
||||||
|
def test_map_value |
||||||
|
msg = Utf8TestProtos::TestUtf8.new |
||||||
|
assert_bad_utf8 { msg.map_string_string["abc"] = bad_utf8_string() } |
||||||
|
end |
||||||
|
end |
||||||
|
|
||||||
|
# Tests the case of string objects that are marked UTF-8, but contain invalid |
||||||
|
# UTF-8. |
||||||
|
# |
||||||
|
# For now these only warn, but in the next major version they will throw an |
||||||
|
# exception. |
||||||
|
class MarkedUtf8Test < Test::Unit::TestCase |
||||||
|
def assert_bad_utf8(&block) |
||||||
|
warnings = CaptureWarnings.capture(&block) |
||||||
|
assert_equal 1, warnings.length |
||||||
|
assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0]) |
||||||
|
end |
||||||
|
|
||||||
|
def bad_utf8_string |
||||||
|
str = "\x80" |
||||||
|
assert_false str.valid_encoding? |
||||||
|
str |
||||||
|
end |
||||||
|
|
||||||
|
include Utf8Test |
||||||
|
end |
||||||
|
|
||||||
|
# This test doesn't work in JRuby because JRuby appears to have a bug where |
||||||
|
# the "valid" bit on a string's data is not invalidated properly when the |
||||||
|
# string is modified: https://github.com/jruby/jruby/issues/8316 |
||||||
|
if !defined? JRUBY_VERSION |
||||||
|
# Tests the case of string objects that are marked UTF-8, and initially contain |
||||||
|
# valid UTF-8, but are later modified to be invalid UTF-8. This may put the |
||||||
|
# string into an state of "unknown" validity. |
||||||
|
# |
||||||
|
# For now these only warn, but in the next major version they will throw an |
||||||
|
# exception. |
||||||
|
class MarkedModifiedUtf8Test < Test::Unit::TestCase |
||||||
|
def assert_bad_utf8(&block) |
||||||
|
warnings = CaptureWarnings.capture(&block) |
||||||
|
assert_equal 1, warnings.length |
||||||
|
assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0]) |
||||||
|
end |
||||||
|
|
||||||
|
def bad_utf8_string |
||||||
|
str = " " |
||||||
|
assert_true str.valid_encoding? |
||||||
|
str[0] = "\x80" |
||||||
|
str |
||||||
|
end |
||||||
|
|
||||||
|
include Utf8Test |
||||||
|
end |
||||||
|
end |
||||||
|
|
||||||
|
# Tests the case of string objects that are marked with a non-UTF-8 encoding, |
||||||
|
# but contain invalid UTF-8. |
||||||
|
# |
||||||
|
# This case will raise Encoding::UndefinedConversionError. |
||||||
|
class MarkedNonUtf8Test < Test::Unit::TestCase |
||||||
|
def assert_bad_utf8 |
||||||
|
assert_raises(Encoding::UndefinedConversionError) { yield } |
||||||
|
end |
||||||
|
|
||||||
|
def bad_utf8_string |
||||||
|
str = "\x80".force_encoding(Encoding::ASCII_8BIT) |
||||||
|
assert_true str.valid_encoding? |
||||||
|
str |
||||||
|
end |
||||||
|
|
||||||
|
include Utf8Test |
||||||
|
end |
||||||
|
|
||||||
|
# Tests the case of string objects that are marked with a non-UTF-8 encoding, |
||||||
|
# but are invalid even in their source encoding. |
||||||
|
# |
||||||
|
# This case will raise Encoding::InvalidByteSequenceError |
||||||
|
class MarkedNonUtf8Test < Test::Unit::TestCase |
||||||
|
def assert_bad_utf8(&block) |
||||||
|
assert_raises(Encoding::InvalidByteSequenceError, &block) |
||||||
|
end |
||||||
|
|
||||||
|
def bad_utf8_string |
||||||
|
str = "\x80".force_encoding(Encoding::ASCII) |
||||||
|
assert_false str.valid_encoding? |
||||||
|
str |
||||||
|
end |
||||||
|
|
||||||
|
include Utf8Test |
||||||
|
end |
Loading…
Reference in new issue