Merge pull request #17510 from protocolbuffers/cherrypick-ruby-utf8

[Ruby] Warn if assigning a "UTF-8" string with invalid UTF-8. (#17253)
4 months ago · fa8dbaec86
parent ca75771fe7 fe9a53cb51
commit fa8dbaec86
7 changed files with 219 additions and 21 deletions
--- a/ruby/Rakefile
+++ b/ruby/Rakefile
@ -35,6 +35,7 @@ test_protos = %w[
  tests/test_import_proto2.proto
  tests/test_ruby_package.proto
  tests/test_ruby_package_proto2.proto
  tests/utf8.proto
 ]
 # These are omitted for now because we don't support proto2.
--- a/ruby/ext/google/protobuf_c/convert.c
+++ b/ruby/ext/google/protobuf_c/convert.c
@ -104,6 +104,41 @@ unknownval:
  rb_raise(rb_eRangeError, "Unknown symbol value for enum field '%s'.", name);
 }
 VALUE Convert_CheckStringUtf8(VALUE str) {
  VALUE utf8 = rb_enc_from_encoding(rb_utf8_encoding());
  if (rb_obj_encoding(str) == utf8) {
    // Note: Just because a string is marked as having UTF-8 encoding does
    // not mean that it is *valid* UTF-8.  We have to check separately
    // whether it is valid.
    if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
      // TODO: For now
      // we only warn for this case.  We will remove the warning and throw an
      // exception below in the 30.x release
      rb_warn(
          "String is invalid UTF-8. This will be an error in a future "
          "version.");
      // VALUE exc = rb_const_get_at(
      //     rb_cEncoding, rb_intern("InvalidByteSequenceError"));
      // rb_raise(exc, "String is invalid UTF-8");
    }
  } else {
    // Note: this will not duplicate underlying string data unless
    // necessary.
    //
    // This will throw an exception if the conversion cannot be performed:
    // - Encoding::UndefinedConversionError if certain characters cannot be
    //   converted to UTF-8.
    // - Encoding::InvalidByteSequenceError if certain characters were invalid
    //   in the source encoding.
    str = rb_str_encode(str, utf8, 0, Qnil);
    PBRUBY_ASSERT(rb_enc_str_coderange(str) != ENC_CODERANGE_BROKEN);
  }
  return str;
 }
 upb_MessageValue Convert_RubyToUpb(VALUE value, const char* name,
                                   TypeInfo type_info, upb_Arena* arena) {
  upb_MessageValue ret;
@ -137,8 +172,7 @@ upb_MessageValue Convert_RubyToUpb(VALUE value, const char* name,
      }
      break;
    }
-    case kUpb_CType_String: {
+    case kUpb_CType_String:
      VALUE utf8 = rb_enc_from_encoding(rb_utf8_encoding());
      if (rb_obj_class(value) == rb_cSymbol) {
        value = rb_funcall(value, rb_intern("to_s"), 0);
      } else if (!rb_obj_is_kind_of(value, rb_cString)) {
@ -147,19 +181,9 @@ upb_MessageValue Convert_RubyToUpb(VALUE value, const char* name,
                 rb_class2name(CLASS_OF(value)));
      }
-      if (rb_obj_encoding(value) != utf8) {
+      value = Convert_CheckStringUtf8(value);
        // Note: this will not duplicate underlying string data unless
        // necessary.
        value = rb_str_encode(value, utf8, 0, Qnil);
        if (rb_enc_str_coderange(value) == ENC_CODERANGE_BROKEN) {
          rb_raise(rb_eEncodingError, "String is invalid UTF-8");
        }
      }
      ret.str_val = Convert_StringData(value, arena);
      break;
    }
    case kUpb_CType_Bytes: {
      VALUE bytes = rb_enc_from_encoding(rb_ascii8bit_encoding());
      if (rb_obj_class(value) != rb_cString) {
--- a/ruby/lib/google/protobuf/ffi/internal/convert.rb
+++ b/ruby/lib/google/protobuf/ffi/internal/convert.rb
@ -33,11 +33,18 @@ module Google
            return_value[:bool_val] = value
          when :string
            raise TypeError.new "Invalid argument for string field '#{name}' (given #{value.class})." unless value.is_a?(String) or value.is_a?(Symbol)
-            begin
+            value = value.to_s if value.is_a?(Symbol)
            if value.encoding == Encoding::UTF_8
              unless value.valid_encoding?
                # TODO:
                # For now we only warn for this case.  We will remove the
                # warning and throw an exception below in the 30.x release
                warn "String is invalid UTF-8. This will be an error in a future version."
                # raise Encoding::InvalidByteSequenceError.new "String is invalid UTF-8"
              end
              string_value = value
            else
              string_value = value.to_s.encode("UTF-8")
            rescue Encoding::UndefinedConversionError
              # TODO - why not include the field name here?
              raise Encoding::UndefinedConversionError.new "String is invalid UTF-8"
            end
            return_value[:str_val][:size] = string_value.bytesize
            return_value[:str_val][:data] = Google::Protobuf::FFI.arena_malloc(arena, string_value.bytesize)
--- a/ruby/src/main/java/com/google/protobuf/jruby/Utils.java
+++ b/ruby/src/main/java/com/google/protobuf/jruby/Utils.java
@ -40,6 +40,7 @@ import org.jcodings.Encoding;
 import org.jcodings.specific.ASCIIEncoding;
 import org.jcodings.specific.UTF8Encoding;
 import org.jruby.*;
 import org.jruby.common.RubyWarnings;
 import org.jruby.exceptions.RaiseException;
 import org.jruby.ext.bigdecimal.RubyBigDecimal;
 import org.jruby.runtime.Block;
@ -389,11 +390,21 @@ public class Utils {
    if (!(value instanceof RubyString))
      throw createInvalidTypeError(context, fieldType, fieldName, value);
    RubyString string = (RubyString) value;
    if (encoding == UTF8Encoding.INSTANCE && string.getEncoding().isUTF8()) {
      if (string.isCodeRangeBroken()) {
        // TODO: For now we only warn for
        // this case.  We will remove the warning and throw an exception in the 30.x release
        context
            .runtime
            .getWarnings()
            .warn("String is invalid UTF-8. This will be an error in a future version.");
      }
    }
    value =
-        ((RubyString) value)
+        string.encode(
-            .encode(
+            context, context.runtime.getEncodingService().convertEncodingToRubyEncoding(encoding));
                context,
                context.runtime.getEncodingService().convertEncodingToRubyEncoding(encoding));
    value.setFrozen(true);
    return value;
  }
--- a/ruby/tests/BUILD.bazel
+++ b/ruby/tests/BUILD.bazel
@ -142,6 +142,16 @@ ruby_test(
    ],
 )
 ruby_test(
    name = "utf8",
    srcs = ["utf8.rb"],
    deps = [
        ":test_ruby_protos",
        "//ruby:protobuf",
        "@protobuf_bundle//:test-unit",
    ],
 )
 ruby_test(
    name = "well_known_types_test",
    srcs = ["well_known_types_test.rb"],
--- a/ruby/tests/utf8.proto
+++ b/ruby/tests/utf8.proto
@ -0,0 +1,9 @@
 syntax = "proto2";
 package utf8_test_protos;
 message TestUtf8 {
  optional string optional_string = 1;
  repeated string repeated_string = 2;
  map<string, string> map_string_string = 3;
 }
--- a/ruby/tests/utf8.rb
+++ b/ruby/tests/utf8.rb
@ -0,0 +1,136 @@
 #!/usr/bin/ruby
 require 'google/protobuf'
 require 'utf8_pb'
 require 'test/unit'
 module CaptureWarnings
  @@warnings = nil
  module_function
  def warn(message, category: nil, **kwargs)
    if @@warnings
      @@warnings << message
    else
      super
    end
  end
  def capture
    @@warnings = []
    yield
    @@warnings
  ensure
    @@warnings = nil
  end
 end
 Warning.extend CaptureWarnings
 module Utf8Test
  def test_scalar
    msg = Utf8TestProtos::TestUtf8.new
    assert_bad_utf8 { msg.optional_string = bad_utf8_string() }
  end
  def test_repeated
    msg = Utf8TestProtos::TestUtf8.new
    assert_bad_utf8 { msg.repeated_string << bad_utf8_string() }
  end
  def test_map_key
    msg = Utf8TestProtos::TestUtf8.new
    assert_bad_utf8 { msg.map_string_string[bad_utf8_string()] = "abc" }
  end
  def test_map_value
    msg = Utf8TestProtos::TestUtf8.new
    assert_bad_utf8 { msg.map_string_string["abc"] = bad_utf8_string() }
  end
 end
 # Tests the case of string objects that are marked UTF-8, but contain invalid
 # UTF-8.
 #
 # For now these only warn, but in the next major version they will throw an
 # exception.
 class MarkedUtf8Test < Test::Unit::TestCase
  def assert_bad_utf8(&block)
    warnings = CaptureWarnings.capture(&block)
    assert_equal 1, warnings.length
    assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0])
  end
  def bad_utf8_string
    str = "\x80"
    assert_false str.valid_encoding?
    str
  end
  include Utf8Test
 end
 # This test doesn't work in JRuby because JRuby appears to have a bug where
 # the "valid" bit on a string's data is not invalidated properly when the
 # string is modified: https://github.com/jruby/jruby/issues/8316
 if !defined? JRUBY_VERSION
  # Tests the case of string objects that are marked UTF-8, and initially contain
  # valid UTF-8, but are later modified to be invalid UTF-8.  This may put the
  # string into an state of "unknown" validity.
  #
  # For now these only warn, but in the next major version they will throw an
  # exception.
  class MarkedModifiedUtf8Test < Test::Unit::TestCase
    def assert_bad_utf8(&block)
      warnings = CaptureWarnings.capture(&block)
      assert_equal 1, warnings.length
      assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0])
    end
    def bad_utf8_string
      str = " "
      assert_true str.valid_encoding?
      str[0] = "\x80"
      str
    end
    include Utf8Test
  end
 end
 # Tests the case of string objects that are marked with a non-UTF-8 encoding,
 # but contain invalid UTF-8.
 #
 # This case will raise Encoding::UndefinedConversionError.
 class MarkedNonUtf8Test < Test::Unit::TestCase
  def assert_bad_utf8
    assert_raises(Encoding::UndefinedConversionError) { yield }
  end
  def bad_utf8_string
    str = "\x80".force_encoding(Encoding::ASCII_8BIT)
    assert_true str.valid_encoding?
    str
  end
  include Utf8Test
 end
 # Tests the case of string objects that are marked with a non-UTF-8 encoding,
 # but are invalid even in their source encoding.
 #
 # This case will raise Encoding::InvalidByteSequenceError
 class MarkedNonUtf8Test < Test::Unit::TestCase
  def assert_bad_utf8(&block)
    assert_raises(Encoding::InvalidByteSequenceError, &block)
  end
  def bad_utf8_string
    str = "\x80".force_encoding(Encoding::ASCII)
    assert_false str.valid_encoding?
    str
  end
  include Utf8Test
 end