Error if assigning a "UTF-8" string with invalid UTF-8.

This has been a warning since v28.x (see commit fa8dbaec86). See also https://protobuf.dev/news/2024-10-02/#utf-8-enforcement, which pre-announces this change for v30.x.

Fixes https://github.com/protocolbuffers/protobuf/issues/17484

PiperOrigin-RevId: 691917648
6 months ago · 2f505a7aa1
parent 752173215a
commit 2f505a7aa1
4 changed files with 19 additions and 31 deletions
--- a/ruby/ext/google/protobuf_c/convert.c
+++ b/ruby/ext/google/protobuf_c/convert.c
@@ -112,16 +112,9 @@ VALUE Convert_CheckStringUtf8(VALUE str) {
    // not mean that it is *valid* UTF-8.  We have to check separately
    // whether it is valid.
    if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
-      // TODO: For now
-      // we only warn for this case.  We will remove the warning and throw an
-      // exception below in the 30.x release
-
-      rb_warn(
-          "String is invalid UTF-8. This will be an error in a future "
-          "version.");
-      // VALUE exc = rb_const_get_at(
-      //     rb_cEncoding, rb_intern("InvalidByteSequenceError"));
-      // rb_raise(exc, "String is invalid UTF-8");
+      VALUE exc = rb_const_get_at(
+          rb_cEncoding, rb_intern("InvalidByteSequenceError"));
+      rb_raise(exc, "String is invalid UTF-8");
    }
  } else {
    // Note: this will not duplicate underlying string data unless
--- a/ruby/lib/google/protobuf/ffi/internal/convert.rb
+++ b/ruby/lib/google/protobuf/ffi/internal/convert.rb
@@ -36,11 +36,7 @@ module Google
            value = value.to_s if value.is_a?(Symbol)
            if value.encoding == Encoding::UTF_8
              unless value.valid_encoding?
-                # TODO:
-                # For now we only warn for this case.  We will remove the
-                # warning and throw an exception below in the 30.x release
-                warn "String is invalid UTF-8. This will be an error in a future version."
-                # raise Encoding::InvalidByteSequenceError.new "String is invalid UTF-8"
+                raise Encoding::InvalidByteSequenceError.new "String is invalid UTF-8"
              end
              string_value = value
            else
--- a/ruby/src/main/java/com/google/protobuf/jruby/Utils.java
+++ b/ruby/src/main/java/com/google/protobuf/jruby/Utils.java
@@ -331,6 +331,15 @@ public class Utils {
        && fieldDescriptor.getMessageType().getOptions().getMapEntry();
  }

+  public static RaiseException createInvalidByteSequenceError(
+      ThreadContext context, String message) {
+    if (cInvalidByteSequenceError == null) {
+      cInvalidByteSequenceError =
+          (RubyClass) context.runtime.getClassFromPath("Encoding::InvalidByteSequenceError");
+    }
+    return RaiseException.from(context.runtime, cInvalidByteSequenceError, message);
+  }
+
  public static RaiseException createTypeError(ThreadContext context, String message) {
    if (cTypeError == null) {
      cTypeError = (RubyClass) context.runtime.getClassFromPath("Google::Protobuf::TypeError");
@@ -393,12 +402,7 @@ public class Utils {
    RubyString string = (RubyString) value;
    if (encoding == UTF8Encoding.INSTANCE && string.getEncoding().isUTF8()) {
      if (string.isCodeRangeBroken()) {
-        // TODO: For now we only warn for
-        // this case.  We will remove the warning and throw an exception in the 30.x release
-        context
-            .runtime
-            .getWarnings()
-            .warn("String is invalid UTF-8. This will be an error in a future version.");
+        throw createInvalidByteSequenceError(context, "String is invalid UTF-8.");
      }
    }

@@ -424,4 +428,5 @@ public class Utils {
  private static final long UINT_MAX = 0xffffffffl;

  private static RubyClass cTypeError;
+  private static RubyClass cInvalidByteSequenceError;
 }
--- a/ruby/tests/utf8.rb
+++ b/ruby/tests/utf8.rb
@@ -53,13 +53,10 @@ end
 # Tests the case of string objects that are marked UTF-8, but contain invalid
 # UTF-8.
 #
-# For now these only warn, but in the next major version they will throw an
-# exception.
+# This case will raise Encoding::InvalidByteSequenceError
 class MarkedUtf8Test < Test::Unit::TestCase
  def assert_bad_utf8(&block)
-    warnings = CaptureWarnings.capture(&block)
-    assert_equal 1, warnings.length
-    assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0])
+    assert_raises(Encoding::InvalidByteSequenceError, &block)
  end

  def bad_utf8_string
@@ -79,13 +76,10 @@ if !defined? JRUBY_VERSION
  # valid UTF-8, but are later modified to be invalid UTF-8.  This may put the
  # string into an state of "unknown" validity.
  #
-  # For now these only warn, but in the next major version they will throw an
-  # exception.
+  # This case will raise Encoding::InvalidByteSequenceError
  class MarkedModifiedUtf8Test < Test::Unit::TestCase
    def assert_bad_utf8(&block)
-      warnings = CaptureWarnings.capture(&block)
-      assert_equal 1, warnings.length
-      assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0])
+      assert_raises(Encoding::InvalidByteSequenceError, &block)
    end

    def bad_utf8_string