Error if assigning a "UTF-8" string with invalid UTF-8.

This has been a warning since v28.x: fa8dbaec86

See also https://protobuf.dev/news/2024-10-02/#utf-8-enforcement which pre-announces this for v30.x.

Fixes https://github.com/protocolbuffers/protobuf/issues/17484

PiperOrigin-RevId: 691917648
pull/19054/head
Sandy Zhang 6 months ago committed by Copybara-Service
parent 752173215a
commit 2f505a7aa1
  1. 13
      ruby/ext/google/protobuf_c/convert.c
  2. 6
      ruby/lib/google/protobuf/ffi/internal/convert.rb
  3. 17
      ruby/src/main/java/com/google/protobuf/jruby/Utils.java
  4. 14
      ruby/tests/utf8.rb

@ -112,16 +112,9 @@ VALUE Convert_CheckStringUtf8(VALUE str) {
// not mean that it is *valid* UTF-8. We have to check separately
// whether it is valid.
if (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN) {
// TODO: For now
// we only warn for this case. We will remove the warning and throw an
// exception below in the 30.x release
rb_warn(
"String is invalid UTF-8. This will be an error in a future "
"version.");
// VALUE exc = rb_const_get_at(
// rb_cEncoding, rb_intern("InvalidByteSequenceError"));
// rb_raise(exc, "String is invalid UTF-8");
VALUE exc = rb_const_get_at(
rb_cEncoding, rb_intern("InvalidByteSequenceError"));
rb_raise(exc, "String is invalid UTF-8");
}
} else {
// Note: this will not duplicate underlying string data unless

@ -36,11 +36,7 @@ module Google
value = value.to_s if value.is_a?(Symbol)
if value.encoding == Encoding::UTF_8
unless value.valid_encoding?
# TODO:
# For now we only warn for this case. We will remove the
# warning and throw an exception below in the 30.x release
warn "String is invalid UTF-8. This will be an error in a future version."
# raise Encoding::InvalidByteSequenceError.new "String is invalid UTF-8"
raise Encoding::InvalidByteSequenceError.new "String is invalid UTF-8"
end
string_value = value
else

@ -331,6 +331,15 @@ public class Utils {
&& fieldDescriptor.getMessageType().getOptions().getMapEntry();
}
public static RaiseException createInvalidByteSequenceError(
ThreadContext context, String message) {
if (cInvalidByteSequenceError == null) {
cInvalidByteSequenceError =
(RubyClass) context.runtime.getClassFromPath("Encoding::InvalidByteSequenceError");
}
return RaiseException.from(context.runtime, cInvalidByteSequenceError, message);
}
public static RaiseException createTypeError(ThreadContext context, String message) {
if (cTypeError == null) {
cTypeError = (RubyClass) context.runtime.getClassFromPath("Google::Protobuf::TypeError");
@ -393,12 +402,7 @@ public class Utils {
RubyString string = (RubyString) value;
if (encoding == UTF8Encoding.INSTANCE && string.getEncoding().isUTF8()) {
if (string.isCodeRangeBroken()) {
// TODO: For now we only warn for
// this case. We will remove the warning and throw an exception in the 30.x release
context
.runtime
.getWarnings()
.warn("String is invalid UTF-8. This will be an error in a future version.");
throw createInvalidByteSequenceError(context, "String is invalid UTF-8.");
}
}
@ -424,4 +428,5 @@ public class Utils {
private static final long UINT_MAX = 0xffffffffl;
private static RubyClass cTypeError;
private static RubyClass cInvalidByteSequenceError;
}

@ -53,13 +53,10 @@ end
# Tests the case of string objects that are marked UTF-8, but contain invalid
# UTF-8.
#
# For now these only warn, but in the next major version they will throw an
# exception.
# This case will raise Encoding::InvalidByteSequenceError
class MarkedUtf8Test < Test::Unit::TestCase
def assert_bad_utf8(&block)
warnings = CaptureWarnings.capture(&block)
assert_equal 1, warnings.length
assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0])
assert_raises(Encoding::InvalidByteSequenceError, &block)
end
def bad_utf8_string
@ -79,13 +76,10 @@ if !defined? JRUBY_VERSION
# valid UTF-8, but are later modified to be invalid UTF-8. This may put the
# string into a state of "unknown" validity.
#
# For now these only warn, but in the next major version they will throw an
# exception.
# This case will raise Encoding::InvalidByteSequenceError
class MarkedModifiedUtf8Test < Test::Unit::TestCase
def assert_bad_utf8(&block)
warnings = CaptureWarnings.capture(&block)
assert_equal 1, warnings.length
assert_match(/String is invalid UTF-8. This will be an error in a future version./, warnings[0])
assert_raises(Encoding::InvalidByteSequenceError, &block)
end
def bad_utf8_string

Loading…
Cancel
Save