protobuf/rust/string.rs

// Protocol Buffers - Google's data interchange format
// Copyright 2023 Google LLC.  All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file or at
// https://developers.google.com/open-source/licenses/bsd

//! Items specific to `bytes` and `string` fields.
#![allow(dead_code)]
#![allow(unused)]

use crate::__internal::Private;
use crate::__runtime::{InnerProtoString, PtrAndLen, RawMessage};
use crate::{IntoProxied, Mut, MutProxied, MutProxy, Optional, Proxied, Proxy, View, ViewProxy};
use std::borrow::Cow;
use std::cmp::{Eq, Ord, Ordering, PartialEq, PartialOrd};
use std::convert::{AsMut, AsRef};
use std::ffi::{OsStr, OsString};
use std::fmt;
use std::hash::{Hash, Hasher};
use std::iter;
use std::ops::{Deref, DerefMut};
use std::ptr;
use std::rc::Rc;
use std::sync::Arc;
use utf8::Utf8Chunks;

/// An owned type representing protobuf `bytes` field's contents.
///
/// The data is stored in the runtime's `InnerProtoString` container.
pub struct ProtoBytes {
    pub(crate) inner: InnerProtoString,
}

impl ProtoBytes {
    // Consumes `self` and hands back the kernel-specific container. Private in
    // spirit: user code must not call this.
    #[doc(hidden)]
    pub fn into_inner(self, _private: Private) -> InnerProtoString {
        let Self { inner } = self;
        inner
    }
}

/// Exposes the stored data as a plain byte slice.
impl AsRef<[u8]> for ProtoBytes {
    fn as_ref(&self) -> &[u8] {
        let Self { inner } = self;
        inner.as_bytes()
    }
}

/// Builds a `ProtoBytes` from a borrowed byte slice.
impl From<&[u8]> for ProtoBytes {
    fn from(v: &[u8]) -> ProtoBytes {
        let inner = InnerProtoString::from(v);
        Self { inner }
    }
}

/// Builds a `ProtoBytes` from a borrowed fixed-size byte array (for example a
/// `b"..."` literal).
impl<const N: usize> From<&[u8; N]> for ProtoBytes {
    fn from(v: &[u8; N]) -> ProtoBytes {
        // View the array as a slice so the slice conversion can be reused.
        let bytes: &[u8] = v;
        Self { inner: InnerProtoString::from(bytes) }
    }
}

/// `ProtoBytes` is viewed through a plain borrowed byte slice.
impl Proxied for ProtoBytes {
    type View<'msg> = &'msg [u8];
}

/// Identity conversion: an owned `ProtoBytes` is passed through unchanged.
impl IntoProxied<ProtoBytes> for ProtoBytes {
    fn into_proxied(self, _private: Private) -> ProtoBytes {
        self
    }
}

/// Converts a borrowed byte slice via `From<&[u8]>`.
impl IntoProxied<ProtoBytes> for &[u8] {
    fn into_proxied(self, _private: Private) -> ProtoBytes {
        <ProtoBytes as From<&[u8]>>::from(self)
    }
}

/// Converts a borrowed fixed-size byte array by viewing it as a slice.
impl<const N: usize> IntoProxied<ProtoBytes> for &[u8; N] {
    fn into_proxied(self, _private: Private) -> ProtoBytes {
        let bytes: &[u8] = &self[..];
        ProtoBytes::from(bytes)
    }
}

/// Converts an owned vector. The conversion only borrows the vector's bytes;
/// the vector itself is dropped when this call returns.
impl IntoProxied<ProtoBytes> for Vec<u8> {
    fn into_proxied(self, _private: Private) -> ProtoBytes {
        let bytes: &[u8] = self.as_slice();
        ProtoBytes::from(bytes)
    }
}

/// Converts a borrowed vector by viewing it as a slice.
impl IntoProxied<ProtoBytes> for &Vec<u8> {
    fn into_proxied(self, _private: Private) -> ProtoBytes {
        let bytes: &[u8] = self.as_slice();
        ProtoBytes::from(bytes)
    }
}

/// Converts a boxed slice. Only the bytes are borrowed for the conversion; the
/// box is dropped when this call returns.
impl IntoProxied<ProtoBytes> for Box<[u8]> {
    fn into_proxied(self, _private: Private) -> ProtoBytes {
        let bytes: &[u8] = &self;
        ProtoBytes::from(bytes)
    }
}

/// Converts a `Cow` of bytes, whether it is currently borrowed or owned.
impl IntoProxied<ProtoBytes> for Cow<'_, [u8]> {
    fn into_proxied(self, _private: Private) -> ProtoBytes {
        // Deref through the `Cow` to reach the underlying slice.
        let bytes: &[u8] = &self;
        ProtoBytes::from(bytes)
    }
}

/// Converts a reference-counted slice by borrowing its bytes.
impl IntoProxied<ProtoBytes> for Rc<[u8]> {
    fn into_proxied(self, _private: Private) -> ProtoBytes {
        let bytes: &[u8] = &*self;
        ProtoBytes::from(bytes)
    }
}

/// Converts an atomically reference-counted slice by borrowing its bytes.
impl IntoProxied<ProtoBytes> for Arc<[u8]> {
    fn into_proxied(self, _private: Private) -> ProtoBytes {
        let bytes: &[u8] = &*self;
        ProtoBytes::from(bytes)
    }
}

/// A borrowed byte slice is its own view of `ProtoBytes`.
impl<'msg> Proxy<'msg> for &'msg [u8] {
    type Proxied = ProtoBytes;

    fn as_view(&self) -> &[u8] {
        // `self` is a reference to the slice reference; peel off one layer.
        *self
    }

    fn into_view<'shorter>(self) -> &'shorter [u8]
    where
        'msg: 'shorter,
    {
        // Shared references are covariant, so this only shortens the lifetime.
        let shortened: &'shorter [u8] = self;
        shortened
    }
}

/// Marks `&[u8]` as the view proxy for `bytes` fields; no extra methods are
/// defined here.
impl<'msg> ViewProxy<'msg> for &'msg [u8] {}

/// The bytes were not valid UTF-8.
///
/// This is an opaque marker: it records only that validation failed, not
/// where. It implements [`std::error::Error`] so it can be propagated with `?`
/// into `Box<dyn Error>` and similar error containers.
#[derive(Debug, PartialEq)]
pub struct Utf8Error(pub(crate) ());

impl fmt::Display for Utf8Error {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("the bytes were not valid UTF-8")
    }
}

impl std::error::Error for Utf8Error {}

/// Discards the position details carried by the standard library's error and
/// keeps only the fact that validation failed.
impl From<std::str::Utf8Error> for Utf8Error {
    fn from(_: std::str::Utf8Error) -> Utf8Error {
        Utf8Error(())
    }
}

/// An owned type representing protobuf `string` field's contents.
///
/// # UTF-8
///
/// Protobuf [docs] state that a `string` field contains UTF-8 encoded text.
/// However, not every runtime enforces this, and the Rust runtime is designed
/// to integrate with other runtimes with FFI, like C++.
///
/// `ProtoString` represents a string type that is expected to contain valid
/// UTF-8. However, `ProtoString` is not validated, so users must
/// call [`ProtoString::to_string`] to perform a (possibly runtime-elided) UTF-8
/// validation check. This validation should rarely fail in pure Rust programs,
/// but is necessary to prevent UB when interacting with C++, or other languages
/// with looser restrictions.
///
/// # `Display` and `ToString`
/// `ProtoString` is ordinarily UTF-8 and so implements `Display`. If there are
/// any invalid UTF-8 sequences, they are replaced with [`U+FFFD REPLACEMENT
/// CHARACTER`]. Because anything implementing `Display` also implements
/// `ToString`, `ProtoString::to_string()` is equivalent to
/// `String::from_utf8_lossy(proto_string.as_bytes()).into_owned()`.
///
/// [docs]: https://protobuf.dev/programming-guides/proto2/#scalar
/// [`U+FFFD REPLACEMENT CHARACTER`]: std::char::REPLACEMENT_CHARACTER
// NOTE(review): no `Display` impl for `ProtoString` is visible in this file;
// confirm one exists elsewhere, otherwise the `to_string` references above are
// stale.
pub struct ProtoString {
    pub(crate) inner: InnerProtoString,
}

impl ProtoString {
    /// Borrows the stored bytes. No UTF-8 validation is performed here.
    pub fn as_bytes(&self) -> &[u8] {
        let Self { inner } = self;
        inner.as_bytes()
    }

    // Consumes `self` and hands back the kernel-specific container. Private in
    // spirit: user code must not call this.
    #[doc(hidden)]
    pub fn into_inner(self, _private: Private) -> InnerProtoString {
        let Self { inner } = self;
        inner
    }
}

impl From<ProtoString> for ProtoBytes {
    fn from(v: ProtoString) -> Self {
        ProtoBytes { inner: v.inner }
    }
}

/// Builds a `ProtoString` from a string slice by delegating to the byte-slice
/// conversion.
impl From<&str> for ProtoString {
    fn from(v: &str) -> Self {
        let bytes: &[u8] = v.as_bytes();
        <Self as From<&[u8]>>::from(bytes)
    }
}

/// Builds a `ProtoString` from raw bytes. The bytes are not checked for UTF-8
/// validity here.
impl From<&[u8]> for ProtoString {
    fn from(v: &[u8]) -> Self {
        let inner = InnerProtoString::from(v);
        Self { inner }
    }
}

/// Identity conversion: an owned `ProtoString` is passed through unchanged.
impl IntoProxied<ProtoString> for ProtoString {
    fn into_proxied(self, _private: Private) -> ProtoString {
        self
    }
}

/// Converts a string slice via `From<&str>`.
impl IntoProxied<ProtoString> for &str {
    fn into_proxied(self, _private: Private) -> ProtoString {
        <ProtoString as From<&str>>::from(self)
    }
}

impl IntoProxied<ProtoString> for &ProtoStr {
    fn into_proxied(self, _private: Private) -> ProtoString {
        ProtoString::from(self.as_bytes())
    }
}

/// Converts an owned `String`. Only its contents are borrowed for the
/// conversion; the `String` is dropped when this call returns.
impl IntoProxied<ProtoString> for String {
    fn into_proxied(self, _private: Private) -> ProtoString {
        let text: &str = &self;
        ProtoString::from(text)
    }
}

/// Converts a borrowed `String` through its byte representation.
impl IntoProxied<ProtoString> for &String {
    fn into_proxied(self, _private: Private) -> ProtoString {
        let bytes: &[u8] = String::as_bytes(self);
        ProtoString::from(bytes)
    }
}

impl IntoProxied<ProtoString> for OsString {
    fn into_proxied(self, private: Private) -> ProtoString {
        self.as_os_str().into_proxied(private)
    }
}

/// Converts a borrowed OS string using its encoded bytes.
///
/// The bytes are passed through without validation. `OsStr::as_encoded_bytes`
/// does not guarantee UTF-8, so an OS string that is not valid Unicode yields
/// a `ProtoString` that is not valid UTF-8.
impl IntoProxied<ProtoString> for &OsStr {
    fn into_proxied(self, _private: Private) -> ProtoString {
        let encoded: &[u8] = OsStr::as_encoded_bytes(self);
        ProtoString::from(encoded)
    }
}

/// Converts a boxed string slice by borrowing its contents.
impl IntoProxied<ProtoString> for Box<str> {
    fn into_proxied(self, _private: Private) -> ProtoString {
        let text: &str = &self;
        ProtoString::from(text)
    }
}

/// Converts a `Cow` string, whether it is currently borrowed or owned.
impl IntoProxied<ProtoString> for Cow<'_, str> {
    fn into_proxied(self, _private: Private) -> ProtoString {
        // Deref through the `Cow` to reach the underlying `str`.
        let text: &str = &self;
        ProtoString::from(text)
    }
}

/// Converts a reference-counted string slice by borrowing its contents.
impl IntoProxied<ProtoString> for Rc<str> {
    fn into_proxied(self, _private: Private) -> ProtoString {
        let text: &str = &*self;
        ProtoString::from(text)
    }
}

/// Converts an atomically reference-counted string slice by borrowing its
/// contents.
impl IntoProxied<ProtoString> for Arc<str> {
    fn into_proxied(self, _private: Private) -> ProtoString {
        let text: &str = &*self;
        ProtoString::from(text)
    }
}

/// A shared immutable view of a protobuf `string` field's contents.
///
/// Like a `str`, it can be cheaply accessed as bytes and
/// is [dynamically sized][dst], requiring it be accessed through a pointer.
///
/// # UTF-8 and `&str` access
///
/// Protobuf [docs] state that a `string` field contains UTF-8 encoded text.
/// However, not every runtime enforces this, and the Rust runtime is designed
/// to integrate with other runtimes with FFI, like C++.
///
/// Because of this, in order to access the contents as a `&str`, users must
/// call [`ProtoStr::to_str`] to perform a (possibly runtime-elided) UTF-8
/// validation check. However, the Rust API only allows `set()`ting a `string`
/// field with data that should be valid UTF-8, like a `&str` or a
/// `&ProtoStr`. This means that this check should rarely fail, but is necessary
/// to prevent UB when interacting with C++, which has looser restrictions.
///
/// Most of the time, users should not perform direct `&str` access to the
/// contents - this type implements `Display` and comparison with `str`,
/// so it's best to avoid a UTF-8 check by working directly with `&ProtoStr`
/// or converting to `&[u8]`.
///
/// # `Display` and `ToString`
/// `ProtoStr` is ordinarily UTF-8 and so implements `Display`. If there are
/// any invalid UTF-8 sequences, they are replaced with [`U+FFFD REPLACEMENT
/// CHARACTER`]. Because anything implementing `Display` also implements
/// `ToString`, `proto_str.to_string()` is equivalent to
/// `String::from_utf8_lossy(proto_str.as_bytes()).into_owned()`.
///
/// [docs]: https://protobuf.dev/programming-guides/proto2/#scalar
/// [dst]: https://doc.rust-lang.org/reference/dynamically-sized-types.html
/// [`U+FFFD REPLACEMENT CHARACTER`]: std::char::REPLACEMENT_CHARACTER
#[repr(transparent)]
pub struct ProtoStr([u8]);

impl ProtoStr {
    /// Converts `self` to a byte slice.
    ///
    /// Note: this type does not implement `Deref`; you must call `as_bytes()`
    /// or `AsRef<[u8]>` to get access to bytes.
    pub fn as_bytes(&self) -> &[u8] {
        &self.0
    }

    /// Yields a `&str` slice if `self` contains valid UTF-8.
    ///
    /// This may perform a runtime check, dependent on runtime.
    ///
    /// `String::from_utf8_lossy(proto_str.as_bytes())` can be used to
    /// infallibly construct a string, replacing invalid UTF-8 with
    /// [`U+FFFD REPLACEMENT CHARACTER`].
    ///
    /// # Errors
    /// Returns [`Utf8Error`] if the bytes are not valid UTF-8.
    ///
    /// [`U+FFFD REPLACEMENT CHARACTER`]: std::char::REPLACEMENT_CHARACTER
    // This is not `try_to_str` since `to_str` is shorter, with `CStr` as precedent.
    pub fn to_str(&self) -> Result<&str, Utf8Error> {
        Ok(std::str::from_utf8(&self.0)?)
    }

    /// Converts `self` to a string, including invalid characters.
    ///
    /// Invalid UTF-8 sequences are replaced with
    /// [`U+FFFD REPLACEMENT CHARACTER`].
    ///
    /// Users should prefer this to `.to_string()` provided by `Display`.
    /// `.to_cow_lossy()` is the same operation, but it may avoid an
    /// allocation if the string is already UTF-8.
    ///
    /// [`U+FFFD REPLACEMENT CHARACTER`]: std::char::REPLACEMENT_CHARACTER
    //
    // This method is named `to_string_lossy` in `CStr`, but since `to_string`
    // also exists on this type, this name was chosen to avoid confusion.
    pub fn to_cow_lossy(&self) -> Cow<'_, str> {
        String::from_utf8_lossy(&self.0)
    }

    /// Returns `true` if `self` has a length of zero bytes.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    /// Returns the length of `self`.
    ///
    /// Like `&str`, this is a length in bytes, not `char`s or graphemes.
    pub fn len(&self) -> usize {
        self.0.len()
    }

    /// Iterates over the `char`s in this protobuf `string`.
    ///
    /// Invalid UTF-8 sequences are replaced with
    /// [`U+FFFD REPLACEMENT CHARACTER`].
    ///
    /// [`U+FFFD REPLACEMENT CHARACTER`]: std::char::REPLACEMENT_CHARACTER
    pub fn chars(&self) -> impl Iterator<Item = char> + '_ + fmt::Debug {
        Utf8Chunks::new(self.as_bytes()).flat_map(|chunk| {
            // Every non-empty invalid sequence collapses to exactly one
            // replacement character, emitted after the chunk's valid prefix.
            let has_invalid = !chunk.invalid().is_empty();
            let replacement = has_invalid.then_some(char::REPLACEMENT_CHARACTER);
            chunk.valid().chars().chain(replacement)
        })
    }

    /// Returns an iterator over chunks of UTF-8 data in the string.
    ///
    /// An `Ok(&str)` is yielded for every valid UTF-8 chunk, and an
    /// `Err(&[u8])` for each non-UTF-8 chunk. An `Err` will be emitted
    /// multiple times in a row for contiguous invalid chunks. Each invalid
    /// chunk in an `Err` has a maximum length of 3 bytes.
    pub fn utf8_chunks(&self) -> impl Iterator<Item = Result<&str, &[u8]>> + '_ {
        Utf8Chunks::new(self.as_bytes()).flat_map(|chunk| {
            let valid = chunk.valid();
            let invalid = chunk.invalid();
            // Empty halves are skipped so callers never see `Ok("")` or
            // `Err(b"")`.
            (!valid.is_empty())
                .then_some(Ok(valid))
                .into_iter()
                .chain((!invalid.is_empty()).then_some(Err(invalid)))
        })
    }

    /// Converts known-UTF-8 bytes to a `ProtoStr` without a check.
    ///
    /// # Safety
    /// `bytes` must be valid UTF-8 if the current runtime requires it.
    pub unsafe fn from_utf8_unchecked(bytes: &[u8]) -> &Self {
        // SAFETY:
        // - `ProtoStr` is `#[repr(transparent)]` over `[u8]`, so it has the same
        //   layout.
        // - `ProtoStr` has the same pointer metadata and element size as `[u8]`.
        unsafe { &*(bytes as *const [u8] as *const Self) }
    }

    /// Interprets a string slice as a `&ProtoStr`.
    pub fn from_str(string: &str) -> &Self {
        // SAFETY: `string.as_bytes()` is valid UTF-8.
        unsafe { Self::from_utf8_unchecked(string.as_bytes()) }
    }
}

/// Exposes the underlying bytes, mirroring `ProtoStr::as_bytes`.
impl AsRef<[u8]> for ProtoStr {
    fn as_ref(&self) -> &[u8] {
        &self.0
    }
}

/// Views a `ProtoStr` as its raw bytes, keeping the original lifetime.
impl<'msg> From<&'msg ProtoStr> for &'msg [u8] {
    fn from(val: &'msg ProtoStr) -> &'msg [u8] {
        &val.0
    }
}

/// Views a string slice as a `ProtoStr`. No check is needed because a `&str`
/// is always valid UTF-8.
impl<'msg> From<&'msg str> for &'msg ProtoStr {
    fn from(val: &'msg str) -> &'msg ProtoStr {
        // SAFETY: `val` is a `&str`, so `val.as_bytes()` is valid UTF-8.
        unsafe { ProtoStr::from_utf8_unchecked(val.as_bytes()) }
    }
}

impl<'msg> TryFrom<&'msg ProtoStr> for &'msg str {
    type Error = Utf8Error;

    fn try_from(val: &'msg ProtoStr) -> Result<&'msg str, Utf8Error> {
        val.to_str()
    }
}

impl<'msg> TryFrom<&'msg [u8]> for &'msg ProtoStr {
    type Error = Utf8Error;

    fn try_from(val: &'msg [u8]) -> Result<&'msg ProtoStr, Utf8Error> {
        Ok(ProtoStr::from_str(std::str::from_utf8(val)?))
    }
}

/// Debug-formats the string by delegating to the `Utf8Chunks` debug adapter.
impl fmt::Debug for ProtoStr {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Forward the caller's formatter as-is. Per the tests in this file,
        // valid text is quoted like a `str` and invalid bytes appear as `\xNN`
        // escapes.
        fmt::Debug::fmt(&Utf8Chunks::new(self.as_bytes()).debug(), f)
    }
}

/// Lossy display: valid text is written through unchanged and every invalid
/// sequence is shown as a single U+FFFD replacement character.
impl fmt::Display for ProtoStr {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        use std::fmt::Write as _;
        for chunk in Utf8Chunks::new(self.as_bytes()) {
            let (valid, invalid) = (chunk.valid(), chunk.invalid());
            fmt::Display::fmt(valid, f)?;
            if invalid.is_empty() {
                continue;
            }
            // One invalid chunk is emitted per detected invalid sequence, so
            // one replacement character per chunk is enough.
            f.write_char(char::REPLACEMENT_CHARACTER)?;
        }
        Ok(())
    }
}

/// Hashes exactly like the underlying byte slice.
impl Hash for ProtoStr {
    fn hash<H: Hasher>(&self, state: &mut H) {
        Hash::hash(self.as_bytes(), state)
    }
}

// Equality is total; the matching `PartialEq<ProtoStr>` impl is generated by
// `impl_bytes_partial_cmp!` and compares bytes.
impl Eq for ProtoStr {}
/// Orders by comparing the underlying bytes lexicographically.
impl Ord for ProtoStr {
    fn cmp(&self, other: &ProtoStr) -> Ordering {
        let (lhs, rhs) = (self.as_bytes(), other.as_bytes());
        Ord::cmp(lhs, rhs)
    }
}

/// `ProtoString` is viewed through a borrowed `ProtoStr`.
impl Proxied for ProtoString {
    type View<'msg> = &'msg ProtoStr;
}

/// A borrowed `ProtoStr` is its own view of `ProtoString`.
impl<'msg> Proxy<'msg> for &'msg ProtoStr {
    type Proxied = ProtoString;

    fn as_view(&self) -> &ProtoStr {
        // `self` is a reference to the view reference; peel off one layer.
        *self
    }

    fn into_view<'shorter>(self) -> &'shorter ProtoStr
    where
        'msg: 'shorter,
    {
        // Shared references are covariant, so this only shortens the lifetime.
        let shortened: &'shorter ProtoStr = self;
        shortened
    }
}

/// Marks `&ProtoStr` as the view proxy for `string` fields; no extra methods
/// are defined here.
impl<'msg> ViewProxy<'msg> for &'msg ProtoStr {}

/// Implements `PartialEq` and `PartialOrd` for each `lhs` against its `rhs`
/// by comparing both sides as bytes through `AsRef<[u8]>`.
///
/// Each entry has the form `<(generics)> Lhs => Rhs`; use `<()>` when the impl
/// needs no generic parameters.
// TODO: consider improving to not require a `<()>` if no generics are
// needed
macro_rules! impl_bytes_partial_cmp {
    ($(<($($generics:tt)*)> $lhs:ty => $rhs:ty),+ $(,)?) => {
        $(
            impl<$($generics)*> PartialEq<$rhs> for $lhs {
                fn eq(&self, other: &$rhs) -> bool {
                    let lhs_bytes: &[u8] = <$lhs as AsRef<[u8]>>::as_ref(self);
                    let rhs_bytes: &[u8] = <$rhs as AsRef<[u8]>>::as_ref(other);
                    lhs_bytes == rhs_bytes
                }
            }
            impl<$($generics)*> PartialOrd<$rhs> for $lhs {
                fn partial_cmp(&self, other: &$rhs) -> Option<Ordering> {
                    let lhs_bytes: &[u8] = <$lhs as AsRef<[u8]>>::as_ref(self);
                    let rhs_bytes: &[u8] = <$rhs as AsRef<[u8]>>::as_ref(other);
                    PartialOrd::partial_cmp(lhs_bytes, rhs_bytes)
                }
            }
        )+
    };
}

// Generates the byte-wise `PartialEq`/`PartialOrd` impls for `ProtoStr`, both
// against itself and, in both directions, against `str`.
impl_bytes_partial_cmp!(
    // `ProtoStr` against protobuf types
    <()> ProtoStr => ProtoStr,

    // `ProtoStr` against foreign types
    <()> ProtoStr => str,
    <()> str => ProtoStr,
);

#[cfg(test)]
mod tests {
    use super::*;

    // TODO: Add unit tests

    // U+FFFD REPLACEMENT CHARACTER. Expected values below spell it as an
    // escape (`\u{FFFD}`) or via this constant so that they survive any
    // re-encoding of this source file.
    const R: char = char::REPLACEMENT_CHARACTER;

    // Shorter and safe utility function to construct `ProtoStr` from bytes for
    // testing.
    fn test_proto_str(bytes: &[u8]) -> &ProtoStr {
        // SAFETY: The runtime that this test executes under does not elide UTF-8 checks
        // inside of `ProtoStr`.
        unsafe { ProtoStr::from_utf8_unchecked(bytes) }
    }

    // UTF-8 test cases copied from:
    // https://github.com/rust-lang/rust/blob/e8ee0b7/library/core/tests/str_lossy.rs

    #[test]
    fn proto_str_debug() {
        assert_eq!(&format!("{:?}", test_proto_str(b"Hello There")), "\"Hello There\"");
        assert_eq!(
            &format!(
                "{:?}",
                test_proto_str(b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa"),
            ),
            "\"Hello\\xC0\\x80 There\\xE6\\x83 Goodbye\\u{10d4ea}\"",
        );
    }

    #[test]
    fn proto_str_display() {
        assert_eq!(&test_proto_str(b"Hello There").to_string(), "Hello There");
        assert_eq!(
            &test_proto_str(b"Hello\xC0\x80 There\xE6\x83 Goodbye\xf4\x8d\x93\xaa").to_string(),
            "Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye\u{10d4ea}",
        );
    }

    #[test]
    fn proto_str_to_rust_str() {
        assert_eq!(test_proto_str(b"hello").to_str(), Ok("hello"));
        assert_eq!(test_proto_str("ศไทย中华Việt Nam".as_bytes()).to_str(), Ok("ศไทย中华Việt Nam"));
        for expect_fail in [
            &b"Hello\xC2 There\xFF Goodbye"[..],
            b"Hello\xC0\x80 There\xE6\x83 Goodbye",
            b"\xF5foo\xF5\x80bar",
            b"\xF1foo\xF1\x80bar\xF1\x80\x80baz",
            b"\xF4foo\xF4\x80bar\xF4\xBFbaz",
            b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar",
            b"\xED\xA0\x80foo\xED\xBF\xBFbar",
        ] {
            assert_eq!(test_proto_str(expect_fail).to_str(), Err(Utf8Error(())), "{expect_fail:?}");
        }
    }

    #[test]
    fn proto_str_to_cow() {
        assert_eq!(test_proto_str(b"hello").to_cow_lossy(), Cow::Borrowed("hello"));
        assert_eq!(
            test_proto_str("ศไทย中华Việt Nam".as_bytes()).to_cow_lossy(),
            Cow::Borrowed("ศไทย中华Việt Nam")
        );
        for (bytes, lossy_str) in [
            (&b"Hello\xC2 There\xFF Goodbye"[..], "Hello\u{FFFD} There\u{FFFD} Goodbye"),
            (
                b"Hello\xC0\x80 There\xE6\x83 Goodbye",
                "Hello\u{FFFD}\u{FFFD} There\u{FFFD} Goodbye",
            ),
            (b"\xF5foo\xF5\x80bar", "\u{FFFD}foo\u{FFFD}\u{FFFD}bar"),
            (b"\xF1foo\xF1\x80bar\xF1\x80\x80baz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}baz"),
            (b"\xF4foo\xF4\x80bar\xF4\xBFbaz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}\u{FFFD}baz"),
            (
                b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar",
                "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}foo\u{10000}bar",
            ),
            (
                b"\xED\xA0\x80foo\xED\xBF\xBFbar",
                "\u{FFFD}\u{FFFD}\u{FFFD}foo\u{FFFD}\u{FFFD}\u{FFFD}bar",
            ),
        ] {
            let cow = test_proto_str(bytes).to_cow_lossy();
            assert!(matches!(cow, Cow::Owned(_)));
            assert_eq!(&*cow, lossy_str, "{bytes:?}");
        }
    }

    #[test]
    fn proto_str_utf8_chunks() {
        macro_rules! assert_chunks {
            ($bytes:expr, $($chunks:expr),* $(,)?) => {
                let bytes = $bytes;
                let chunks: &[Result<&str, &[u8]>] = &[$($chunks),*];
                let s = test_proto_str(bytes);
                let got_chunks = s.utf8_chunks();
                let expected_chunks = chunks.iter().copied();
                assert!(got_chunks.eq(expected_chunks), "{bytes:?} -> {chunks:?}");
            };
        }
        assert_chunks!(b"hello", Ok("hello"));
        assert_chunks!("ศไทย中华Việt Nam".as_bytes(), Ok("ศไทย中华Việt Nam"));
        assert_chunks!(
            b"Hello\xC2 There\xFF Goodbye",
            Ok("Hello"),
            Err(b"\xC2"),
            Ok(" There"),
            Err(b"\xFF"),
            Ok(" Goodbye"),
        );
        assert_chunks!(
            b"Hello\xC0\x80 There\xE6\x83 Goodbye",
            Ok("Hello"),
            Err(b"\xC0"),
            Err(b"\x80"),
            Ok(" There"),
            Err(b"\xE6\x83"),
            Ok(" Goodbye"),
        );
        assert_chunks!(
            b"\xF5foo\xF5\x80bar",
            Err(b"\xF5"),
            Ok("foo"),
            Err(b"\xF5"),
            Err(b"\x80"),
            Ok("bar"),
        );
        assert_chunks!(
            b"\xF1foo\xF1\x80bar\xF1\x80\x80baz",
            Err(b"\xF1"),
            Ok("foo"),
            Err(b"\xF1\x80"),
            Ok("bar"),
            Err(b"\xF1\x80\x80"),
            Ok("baz"),
        );
        assert_chunks!(
            b"\xF4foo\xF4\x80bar\xF4\xBFbaz",
            Err(b"\xF4"),
            Ok("foo"),
            Err(b"\xF4\x80"),
            Ok("bar"),
            Err(b"\xF4"),
            Err(b"\xBF"),
            Ok("baz"),
        );
        assert_chunks!(
            b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar",
            Err(b"\xF0"),
            Err(b"\x80"),
            Err(b"\x80"),
            Err(b"\x80"),
            Ok("foo\u{10000}bar"),
        );
        assert_chunks!(
            b"\xED\xA0\x80foo\xED\xBF\xBFbar",
            Err(b"\xED"),
            Err(b"\xA0"),
            Err(b"\x80"),
            Ok("foo"),
            Err(b"\xED"),
            Err(b"\xBF"),
            Err(b"\xBF"),
            Ok("bar"),
        );
    }

    #[test]
    fn proto_str_chars() {
        macro_rules! assert_chars {
            ($bytes:expr, $chars:expr) => {
                let bytes = $bytes;
                let chars = $chars;
                let s = test_proto_str(bytes);
                let got_chars = s.chars();
                let expected_chars = chars.into_iter();
                assert!(got_chars.eq(expected_chars), "{bytes:?} -> {chars:?}");
            };
        }
        assert_chars!(b"hello", ['h', 'e', 'l', 'l', 'o']);
        assert_chars!(
            "ศไทย中华Việt Nam".as_bytes(),
            ['ศ', 'ไ', 'ท', 'ย', '中', '华', 'V', 'i', 'ệ', 't', ' ', 'N', 'a', 'm']
        );
        assert_chars!(
            b"Hello\xC2 There\xFF Goodbye",
            [
                'H', 'e', 'l', 'l', 'o', R, ' ', 'T', 'h', 'e', 'r', 'e', R, ' ', 'G', 'o', 'o',
                'd', 'b', 'y', 'e'
            ]
        );
        assert_chars!(
            b"Hello\xC0\x80 There\xE6\x83 Goodbye",
            [
                'H', 'e', 'l', 'l', 'o', R, R, ' ', 'T', 'h', 'e', 'r', 'e', R, ' ', 'G', 'o',
                'o', 'd', 'b', 'y', 'e'
            ]
        );
        assert_chars!(b"\xF5foo\xF5\x80bar", [R, 'f', 'o', 'o', R, R, 'b', 'a', 'r']);
        assert_chars!(
            b"\xF1foo\xF1\x80bar\xF1\x80\x80baz",
            [R, 'f', 'o', 'o', R, 'b', 'a', 'r', R, 'b', 'a', 'z']
        );
        assert_chars!(
            b"\xF4foo\xF4\x80bar\xF4\xBFbaz",
            [R, 'f', 'o', 'o', R, 'b', 'a', 'r', R, R, 'b', 'a', 'z']
        );
        assert_chars!(
            b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar",
            [R, R, R, R, 'f', 'o', 'o', '\u{10000}', 'b', 'a', 'r']
        );
        assert_chars!(
            b"\xED\xA0\x80foo\xED\xBF\xBFbar",
            [R, R, R, 'f', 'o', 'o', R, R, R, 'b', 'a', 'r']
        );
    }
}