Merge pull request #17510 from protocolbuffers/cherrypick-ruby-utf8

[Ruby] Warn if assigning a "UTF-8" string with invalid UTF-8. (#17253)
pull/17530/head
Joshua Haberman 4 months ago committed by GitHub
commit fa8dbaec86
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 1
      ruby/Rakefile
  2. 50
      ruby/ext/google/protobuf_c/convert.c
  3. 15
      ruby/lib/google/protobuf/ffi/internal/convert.rb
  4. 19
      ruby/src/main/java/com/google/protobuf/jruby/Utils.java
  5. 10
      ruby/tests/BUILD.bazel
  6. 9
      ruby/tests/utf8.proto
  7. 136
      ruby/tests/utf8.rb

@ -35,6 +35,7 @@ test_protos = %w[
tests/test_import_proto2.proto
tests/test_ruby_package.proto
tests/test_ruby_package_proto2.proto
tests/utf8.proto
]
# These are omitted for now because we don't support proto2.

@ -104,6 +104,41 @@ unknownval:
rb_raise(rb_eRangeError, "Unknown symbol value for enum field '%s'.", name);
}
VALUE Convert_CheckStringUtf8(VALUE str) {
VALUE utf8 = rb_enc_from_encoding(rb_utf8_encoding());
if (rb_obj_encoding(str) == utf8) {
// Note: Just because a string is marked as having UTF-8 encoding does
// not mean that it is *valid* UTF-8. We have to check separately
// whether it is valid.
if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
// TODO: For now
// we only warn for this case. We will remove the warning and throw an
// exception below in the 30.x release
rb_warn(
"String is invalid UTF-8. This will be an error in a future "
"version.");
// VALUE exc = rb_const_get_at(
// rb_cEncoding, rb_intern("InvalidByteSequenceError"));
// rb_raise(exc, "String is invalid UTF-8");
}
} else {
// Note: this will not duplicate underlying string data unless
// necessary.
//
// This will throw an exception if the conversion cannot be performed:
// - Encoding::UndefinedConversionError if certain characters cannot be
// converted to UTF-8.
// - Encoding::InvalidByteSequenceError if certain characters were invalid
// in the source encoding.
str = rb_str_encode(str, utf8, 0, Qnil);
PBRUBY_ASSERT(rb_enc_str_coderange(str) != ENC_CODERANGE_BROKEN);
}
return str;
}
upb_MessageValue Convert_RubyToUpb(VALUE value, const char* name,
TypeInfo type_info, upb_Arena* arena) {
upb_MessageValue ret;
@ -137,8 +172,7 @@ upb_MessageValue Convert_RubyToUpb(VALUE value, const char* name,
}
break;
}
case kUpb_CType_String: {
VALUE utf8 = rb_enc_from_encoding(rb_utf8_encoding());
case kUpb_CType_String:
if (rb_obj_class(value) == rb_cSymbol) {
value = rb_funcall(value, rb_intern("to_s"), 0);
} else if (!rb_obj_is_kind_of(value, rb_cString)) {
@ -147,19 +181,9 @@ upb_MessageValue Convert_RubyToUpb(VALUE value, const char* name,
rb_class2name(CLASS_OF(value)));
}
if (rb_obj_encoding(value) != utf8) {
// Note: this will not duplicate underlying string data unless
// necessary.
value = rb_str_encode(value, utf8, 0, Qnil);
if (rb_enc_str_coderange(value) == ENC_CODERANGE_BROKEN) {
rb_raise(rb_eEncodingError, "String is invalid UTF-8");
}
}
value = Convert_CheckStringUtf8(value);
ret.str_val = Convert_StringData(value, arena);
break;
}
case kUpb_CType_Bytes: {
VALUE bytes = rb_enc_from_encoding(rb_ascii8bit_encoding());
if (rb_obj_class(value) != rb_cString) {

@ -33,11 +33,18 @@ module Google
return_value[:bool_val] = value
when :string
raise TypeError.new "Invalid argument for string field '#{name}' (given #{value.class})." unless value.is_a?(String) or value.is_a?(Symbol)
begin
value = value.to_s if value.is_a?(Symbol)
if value.encoding == Encoding::UTF_8
unless value.valid_encoding?
# TODO:
# For now we only warn for this case. We will remove the
# warning and throw an exception below in the 30.x release
warn "String is invalid UTF-8. This will be an error in a future version."
# raise Encoding::InvalidByteSequenceError.new "String is invalid UTF-8"
end
string_value = value
else
string_value = value.to_s.encode("UTF-8")
rescue Encoding::UndefinedConversionError
# TODO - why not include the field name here?
raise Encoding::UndefinedConversionError.new "String is invalid UTF-8"
end
return_value[:str_val][:size] = string_value.bytesize
return_value[:str_val][:data] = Google::Protobuf::FFI.arena_malloc(arena, string_value.bytesize)

@ -40,6 +40,7 @@ import org.jcodings.Encoding;
import org.jcodings.specific.ASCIIEncoding;
import org.jcodings.specific.UTF8Encoding;
import org.jruby.*;
import org.jruby.common.RubyWarnings;
import org.jruby.exceptions.RaiseException;
import org.jruby.ext.bigdecimal.RubyBigDecimal;
import org.jruby.runtime.Block;
@ -389,11 +390,21 @@ public class Utils {
if (!(value instanceof RubyString))
throw createInvalidTypeError(context, fieldType, fieldName, value);
RubyString string = (RubyString) value;
if (encoding == UTF8Encoding.INSTANCE && string.getEncoding().isUTF8()) {
if (string.isCodeRangeBroken()) {
// TODO: For now we only warn for
// this case. We will remove the warning and throw an exception in the 30.x release
context
.runtime
.getWarnings()
.warn("String is invalid UTF-8. This will be an error in a future version.");
}
}
value =
((RubyString) value)
.encode(
context,
context.runtime.getEncodingService().convertEncodingToRubyEncoding(encoding));
string.encode(
context, context.runtime.getEncodingService().convertEncodingToRubyEncoding(encoding));
value.setFrozen(true);
return value;
}

@ -142,6 +142,16 @@ ruby_test(
],
)
ruby_test(
name = "utf8",
srcs = ["utf8.rb"],
deps = [
":test_ruby_protos",
"//ruby:protobuf",
"@protobuf_bundle//:test-unit",
],
)
ruby_test(
name = "well_known_types_test",
srcs = ["well_known_types_test.rb"],

@ -0,0 +1,9 @@
syntax = "proto2";
package utf8_test_protos;
message TestUtf8 {
optional string optional_string = 1;
repeated string repeated_string = 2;
map<string, string> map_string_string = 3;
}

@ -0,0 +1,136 @@
#!/usr/bin/ruby
require 'google/protobuf'
require 'utf8_pb'
require 'test/unit'
module CaptureWarnings
@@warnings = nil
module_function
def warn(message, category: nil, **kwargs)
if @@warnings
@@warnings << message
else
super
end
end
def capture
@@warnings = []
yield
@@warnings
ensure
@@warnings = nil
end
end
Warning.extend CaptureWarnings
module Utf8Test
def test_scalar
msg = Utf8TestProtos::TestUtf8.new
assert_bad_utf8 { msg.optional_string = bad_utf8_string() }
end
def test_repeated
msg = Utf8TestProtos::TestUtf8.new
assert_bad_utf8 { msg.repeated_string << bad_utf8_string() }
end
def test_map_key
msg = Utf8TestProtos::TestUtf8.new
assert_bad_utf8 { msg.map_string_string[bad_utf8_string()] = "abc" }
end
def test_map_value
msg = Utf8TestProtos::TestUtf8.new
assert_bad_utf8 { msg.map_string_string["abc"] = bad_utf8_string() }
end
end
# Tests the case of string objects that are marked UTF-8, but contain invalid
# UTF-8.
#
# For now these only warn, but in the next major version they will throw an
# exception.
class MarkedUtf8Test < Test::Unit::TestCase
def assert_bad_utf8(&block)
warnings = CaptureWarnings.capture(&block)
assert_equal 1, warnings.length
assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0])
end
def bad_utf8_string
str = "\x80"
assert_false str.valid_encoding?
str
end
include Utf8Test
end
# This test doesn't work in JRuby because JRuby appears to have a bug where
# the "valid" bit on a string's data is not invalidated properly when the
# string is modified: https://github.com/jruby/jruby/issues/8316
if !defined? JRUBY_VERSION
# Tests the case of string objects that are marked UTF-8, and initially contain
# valid UTF-8, but are later modified to be invalid UTF-8. This may put the
# string into an state of "unknown" validity.
#
# For now these only warn, but in the next major version they will throw an
# exception.
class MarkedModifiedUtf8Test < Test::Unit::TestCase
def assert_bad_utf8(&block)
warnings = CaptureWarnings.capture(&block)
assert_equal 1, warnings.length
assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0])
end
def bad_utf8_string
str = " "
assert_true str.valid_encoding?
str[0] = "\x80"
str
end
include Utf8Test
end
end
# Tests the case of string objects that are marked with a non-UTF-8 encoding,
# but contain invalid UTF-8.
#
# This case will raise Encoding::UndefinedConversionError.
class MarkedNonUtf8Test < Test::Unit::TestCase
def assert_bad_utf8
assert_raises(Encoding::UndefinedConversionError) { yield }
end
def bad_utf8_string
str = "\x80".force_encoding(Encoding::ASCII_8BIT)
assert_true str.valid_encoding?
str
end
include Utf8Test
end
# Tests the case of string objects that are marked with a non-UTF-8 encoding,
# but are invalid even in their source encoding.
#
# This case will raise Encoding::InvalidByteSequenceError
class MarkedNonUtf8Test < Test::Unit::TestCase
def assert_bad_utf8(&block)
assert_raises(Encoding::InvalidByteSequenceError, &block)
end
def bad_utf8_string
str = "\x80".force_encoding(Encoding::ASCII)
assert_false str.valid_encoding?
str
end
include Utf8Test
end
Loading…
Cancel
Save