Merge pull request #17510 from protocolbuffers/cherrypick-ruby-utf8
[Ruby] Warn if assigning a "UTF-8" string with invalid UTF-8. (#17253)pull/17530/head
commit
fa8dbaec86
7 changed files with 219 additions and 21 deletions
@ -0,0 +1,9 @@ |
||||
syntax = "proto2"; |
||||
|
||||
package utf8_test_protos; |
||||
|
||||
// Message with one string-typed field of each shape: singular, repeated,
// and a map whose keys and values are both strings.
message TestUtf8 {
  // Singular string field.
  optional string optional_string = 1;

  // List of strings.
  repeated string repeated_string = 2;

  // String-to-string map.
  map<string, string> map_string_string = 3;
}
@ -0,0 +1,136 @@ |
||||
#!/usr/bin/ruby |
||||
|
||||
require 'google/protobuf' |
||||
require 'utf8_pb' |
||||
require 'test/unit' |
||||
|
||||
# Test helper that intercepts warnings so that tests can assert on them.
#
# While a capture is in progress, every message handed to +warn+ is appended
# to an in-memory list; when no capture is active the call is forwarded to
# the next +warn+ in the method lookup chain.
module CaptureWarnings
  # nil means "not capturing"; an Array means "collect messages here".
  # This is a class variable so that the module-level methods and the copy of
  # +warn+ that Warning gains through `extend` all see the same state.
  @@warnings = nil

  module_function

  # Records +message+ when a capture is active; otherwise defers to super
  # with the arguments exactly as they were received.
  def warn(message, category: nil, **kwargs)
    return super unless @@warnings

    @@warnings << message
  end

  # Runs the given block with capturing switched on and returns the list of
  # collected messages. Capturing is switched off again afterwards, even when
  # the block raises.
  def capture
    @@warnings = []
    begin
      yield
      @@warnings
    ensure
      @@warnings = nil
    end
  end
end

# Hook the capturing +warn+ into Ruby's Warning module.
Warning.extend CaptureWarnings
||||
|
||||
# Shared test cases for assigning a bad string to each kind of string field.
#
# A class that includes this module must supply:
#   * bad_utf8_string - returns the string to assign
#   * assert_bad_utf8 - takes a block performing the assignment and checks
#                       the outcome
module Utf8Test
  # Assignment to a singular string field.
  def test_scalar
    message = Utf8TestProtos::TestUtf8.new
    assert_bad_utf8 do
      message.optional_string = bad_utf8_string
    end
  end

  # Append to a repeated string field.
  def test_repeated
    message = Utf8TestProtos::TestUtf8.new
    assert_bad_utf8 do
      message.repeated_string << bad_utf8_string
    end
  end

  # Bad string used as a map key.
  def test_map_key
    message = Utf8TestProtos::TestUtf8.new
    assert_bad_utf8 do
      message.map_string_string[bad_utf8_string] = "abc"
    end
  end

  # Bad string used as a map value.
  def test_map_value
    message = Utf8TestProtos::TestUtf8.new
    assert_bad_utf8 do
      message.map_string_string["abc"] = bad_utf8_string
    end
  end
end
||||
|
||||
# Covers string objects that are tagged as UTF-8 but whose bytes are not
# valid UTF-8.
#
# For now these only warn, but in the next major version they will throw an
# exception.
class MarkedUtf8Test < Test::Unit::TestCase
  include Utf8Test

  # Runs the block and expects exactly one captured warning, matching the
  # invalid-UTF-8 message.
  def assert_bad_utf8
    captured = CaptureWarnings.capture { yield }
    assert_equal(1, captured.size)
    assert_match(/String is invalid UTF-8. This will be an error in a future version./, captured.first)
  end

  # Returns a string literal holding a lone 0x80 byte, after checking that
  # Ruby reports its encoding as invalid.
  def bad_utf8_string
    "\x80".tap { |text| assert_false(text.valid_encoding?) }
  end
end
||||
|
||||
# Skipped on JRuby: JRuby appears to have a bug where the "valid" bit on a
# string's data is not invalidated properly when the string is modified.
# See https://github.com/jruby/jruby/issues/8316
unless defined?(JRUBY_VERSION)
  # Covers string objects that are tagged as UTF-8 and start out as valid
  # UTF-8, but are later modified so that they become invalid. This may put
  # the string into a state of "unknown" validity.
  #
  # For now these only warn, but in the next major version they will throw an
  # exception.
  class MarkedModifiedUtf8Test < Test::Unit::TestCase
    include Utf8Test

    # Runs the block and expects exactly one captured warning, matching the
    # invalid-UTF-8 message.
    def assert_bad_utf8
      captured = CaptureWarnings.capture { yield }
      assert_equal(1, captured.size)
      assert_match(/String is invalid UTF-8. This will be an error in a future version./, captured.first)
    end

    # Builds the bad string in two steps. Validity is queried while the
    # string is still valid, and only then is the first character replaced
    # with a 0x80 byte.
    def bad_utf8_string
      text = " "
      assert_true(text.valid_encoding?)
      text[0] = "\x80"
      text
    end
  end
end
||||
|
||||
# Covers string objects that are tagged with a non-UTF-8 encoding and whose
# bytes are not valid UTF-8.
#
# This case will raise Encoding::UndefinedConversionError.
class MarkedNonUtf8Test < Test::Unit::TestCase
  include Utf8Test

  # Expects the block to raise Encoding::UndefinedConversionError.
  def assert_bad_utf8(&block)
    assert_raises(Encoding::UndefinedConversionError, &block)
  end

  # Returns a single 0x80 byte tagged as binary (ASCII-8BIT), after checking
  # that Ruby reports it as valid in that encoding.
  def bad_utf8_string
    bytes = "\x80".force_encoding(Encoding::BINARY)
    assert_true(bytes.valid_encoding?)
    bytes
  end
end
||||
|
||||
# Tests the case of string objects that are marked with a non-UTF-8 encoding,
# but are invalid even in their source encoding.
#
# This case will raise Encoding::InvalidByteSequenceError
#
# NOTE: this class needs a name of its own. Declaring it as MarkedNonUtf8Test
# would reopen the TestCase of that name defined earlier in this file and
# replace its assert_bad_utf8 / bad_utf8_string helpers, so the
# Encoding::UndefinedConversionError scenario would never be exercised.
class MarkedInvalidNonUtf8Test < Test::Unit::TestCase
  # Expects the block to raise Encoding::InvalidByteSequenceError.
  def assert_bad_utf8(&block)
    assert_raises(Encoding::InvalidByteSequenceError, &block)
  end

  # Returns a single 0x80 byte tagged as US-ASCII, after checking that Ruby
  # reports it as invalid in that encoding.
  def bad_utf8_string
    str = "\x80".force_encoding(Encoding::ASCII)
    assert_false str.valid_encoding?
    str
  end

  include Utf8Test
end
Loading…
Reference in new issue