+
+int utf8_naive(const unsigned char *data, int len);
+
+static const int8_t _first_len_tbl[] = { /* indexed by a byte's high nibble: 1 for 0xC/0xD, 2 for 0xE, 3 for 0xF, 0 otherwise */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3,
+};
+
+static const int8_t _first_range_tbl[] = { /* indexed by high nibble: starting range index 8 for nibbles 0xC..0xF, 0 otherwise */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8,
+};
+
+static const int8_t _range_min_tbl[] = { /* smallest byte accepted for each range index; compared as signed 8-bit values */
+ 0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80, /* index 0: 00..7F, 1-3: 80..BF, 4: A0..BF, 5: 80..9F, 6: 90..BF, 7: 80..8F */
+ 0xC2, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, /* index 8: C2..F4; 9-15: min 0x7F with max 0x80 is an empty signed range, so every byte fails */
+};
+static const int8_t _range_max_tbl[] = { /* largest byte accepted for each range index; pairs with _range_min_tbl */
+ 0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F,
+ 0xF4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+};
+
+static const int8_t _df_ee_tbl[] = { /* range-index adjustment keyed by the previous byte: +2 after 0xE0 (slot 1), +3 after 0xED (slot 14) */
+ 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0,
+};
+static const int8_t _ef_fe_tbl[] = { /* range-index adjustment keyed by the previous byte: +3 after 0xF0 (slot 1), +4 after 0xF4 (slot 5) */
+ 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+/* Return 0 on success, -1 on error. Whole 32-byte chunks are range-checked with SSE; the remaining tail is handed to utf8_naive(), whose result is returned. */
+int utf8_range2(const unsigned char *data, int len)
+{
+ if (len >= 32) { /* shorter inputs skip the SIMD path and go straight to utf8_naive() */
+ __m128i prev_input = _mm_set1_epi8(0); /* the 16 bytes preceding the current block; zero before the first block */
+ __m128i prev_first_len = _mm_set1_epi8(0); /* first_len lookup of those preceding bytes */
+
+ const __m128i first_len_tbl =
+ _mm_loadu_si128((const __m128i *)_first_len_tbl);
+ const __m128i first_range_tbl =
+ _mm_loadu_si128((const __m128i *)_first_range_tbl);
+ const __m128i range_min_tbl =
+ _mm_loadu_si128((const __m128i *)_range_min_tbl);
+ const __m128i range_max_tbl =
+ _mm_loadu_si128((const __m128i *)_range_max_tbl);
+ const __m128i df_ee_tbl =
+ _mm_loadu_si128((const __m128i *)_df_ee_tbl);
+ const __m128i ef_fe_tbl =
+ _mm_loadu_si128((const __m128i *)_ef_fe_tbl);
+
+ __m128i error = _mm_set1_epi8(0); /* accumulates out-of-range lanes; only inspected after the loop */
+
+ while (len >= 32) { /* two 16-byte blocks (a, b) per iteration */
+ /***************************** block 1 ****************************/
+ const __m128i input_a = _mm_loadu_si128((const __m128i *)data);
+
+ __m128i high_nibbles =
+ _mm_and_si128(_mm_srli_epi16(input_a, 4), _mm_set1_epi8(0x0F)); /* 16-bit shift, then mask to keep each byte's own high nibble */
+
+ __m128i first_len_a = _mm_shuffle_epi8(first_len_tbl, high_nibbles); /* per byte: 0..3 from _first_len_tbl */
+
+ __m128i range_a = _mm_shuffle_epi8(first_range_tbl, high_nibbles); /* per byte: 8 for nibbles 0xC..0xF, else 0 */
+
+ range_a = _mm_or_si128(
+ range_a, _mm_alignr_epi8(first_len_a, prev_first_len, 15)); /* OR in first_len of the byte one position earlier (crossing into the previous block) */
+
+ __m128i tmp;
+ tmp = _mm_alignr_epi8(first_len_a, prev_first_len, 14); /* first_len of the byte two positions earlier */
+ tmp = _mm_subs_epu8(tmp, _mm_set1_epi8(1)); /* minus 1, saturating at 0 */
+ range_a = _mm_or_si128(range_a, tmp);
+
+ tmp = _mm_alignr_epi8(first_len_a, prev_first_len, 13); /* first_len of the byte three positions earlier */
+ tmp = _mm_subs_epu8(tmp, _mm_set1_epi8(2)); /* minus 2, saturating at 0 */
+ range_a = _mm_or_si128(range_a, tmp);
+
+ __m128i shift1, pos, range2;
+ shift1 = _mm_alignr_epi8(input_a, prev_input, 15); /* for each lane, the input byte immediately before it */
+ pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF)); /* wrapping (previous byte - 0xEF) */
+ tmp = _mm_subs_epu8(pos, _mm_set1_epi8(0xF0)); /* non-zero only when the previous byte is 0xE0..0xEE; everything else saturates to slot 0 */
+ range2 = _mm_shuffle_epi8(df_ee_tbl, tmp); /* +2 after 0xE0, +3 after 0xED */
+ tmp = _mm_adds_epu8(pos, _mm_set1_epi8(0x70)); /* previous bytes 0xEF..0xFE land in 0x70..0x7F; all others get the top bit set, for which the shuffle yields 0 */
+ range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_tbl, tmp)); /* +3 after 0xF0, +4 after 0xF4 */
+
+ range_a = _mm_add_epi8(range_a, range2); /* final range index per byte */
+
+ __m128i minv = _mm_shuffle_epi8(range_min_tbl, range_a); /* allowed minimum for each byte */
+ __m128i maxv = _mm_shuffle_epi8(range_max_tbl, range_a); /* allowed maximum for each byte */
+
+ tmp = _mm_or_si128(
+ _mm_cmplt_epi8(input_a, minv),
+ _mm_cmpgt_epi8(input_a, maxv)
+ ); /* signed compares: a lane is set when its byte falls outside [min, max] */
+ error = _mm_or_si128(error, tmp);
+
+ /***************************** block 2 ****************************/
+ const __m128i input_b = _mm_loadu_si128((const __m128i *)(data+16)); /* same steps as block 1, with block a acting as the preceding bytes */
+
+ high_nibbles =
+ _mm_and_si128(_mm_srli_epi16(input_b, 4), _mm_set1_epi8(0x0F));
+
+ __m128i first_len_b = _mm_shuffle_epi8(first_len_tbl, high_nibbles);
+
+ __m128i range_b = _mm_shuffle_epi8(first_range_tbl, high_nibbles);
+
+ range_b = _mm_or_si128(
+ range_b, _mm_alignr_epi8(first_len_b, first_len_a, 15));
+
+
+ tmp = _mm_alignr_epi8(first_len_b, first_len_a, 14);
+ tmp = _mm_subs_epu8(tmp, _mm_set1_epi8(1));
+ range_b = _mm_or_si128(range_b, tmp);
+
+ tmp = _mm_alignr_epi8(first_len_b, first_len_a, 13);
+ tmp = _mm_subs_epu8(tmp, _mm_set1_epi8(2));
+ range_b = _mm_or_si128(range_b, tmp);
+
+ shift1 = _mm_alignr_epi8(input_b, input_a, 15);
+ pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF));
+ tmp = _mm_subs_epu8(pos, _mm_set1_epi8(0xF0));
+ range2 = _mm_shuffle_epi8(df_ee_tbl, tmp);
+ tmp = _mm_adds_epu8(pos, _mm_set1_epi8(0x70));
+ range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_tbl, tmp));
+
+ range_b = _mm_add_epi8(range_b, range2);
+
+ minv = _mm_shuffle_epi8(range_min_tbl, range_b);
+ maxv = _mm_shuffle_epi8(range_max_tbl, range_b);
+
+
+ tmp = _mm_or_si128(
+ _mm_cmplt_epi8(input_b, minv),
+ _mm_cmpgt_epi8(input_b, maxv)
+ );
+ error = _mm_or_si128(error, tmp);
+
+ /************************ next iteration **************************/
+ prev_input = input_b; /* block b becomes the preceding context of the next iteration */
+ prev_first_len = first_len_b;
+
+ data += 32;
+ len -= 32;
+ }
+
+ if (!_mm_testz_si128(error, error)) /* any lane flagged in any block means invalid input */
+ return -1;
+
+ int32_t token4 = _mm_extract_epi32(prev_input, 3); /* the last four bytes consumed by the SIMD loop */
+ const int8_t *token = (const int8_t *)&token4;
+ int lookahead = 0;
+ if (token[3] > (int8_t)0xBF) /* signed compare: true for every byte outside 0x80..0xBF */
+ lookahead = 1;
+ else if (token[2] > (int8_t)0xBF)
+ lookahead = 2;
+ else if (token[1] > (int8_t)0xBF)
+ lookahead = 3;
+
+ data -= lookahead; /* step back to the nearest such byte among the last three so utf8_naive() sees it together with the tail */
+ len += lookahead;
+ }
+
+ return utf8_naive(data, len); /* scalar validation of whatever is left */
+}
+
+#endif
diff --git a/third_party/utf8_range/utf8_corpus_dir/utf8_corpus_durst.txt b/third_party/utf8_range/utf8_corpus_dir/utf8_corpus_durst.txt
new file mode 100644
index 0000000000..b8157dbf45
--- /dev/null
+++ b/third_party/utf8_range/utf8_corpus_dir/utf8_corpus_durst.txt
@@ -0,0 +1,213 @@
+
+
+ UTF-8 test file
+
+ Original by Markus Kuhn, adapted for HTML by Martin Dürst.
+
+UTF-8 encoded sample plain-text file
+‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
+
+Markus Kuhn [ˈmaʳkʊs kuːn] <mkuhn@acm.org> — 1999-08-20
+
+
+The ASCII compatible UTF-8 encoding of ISO 10646 and Unicode
+plain-text files is defined in RFC 2279 and in ISO 10646-1 Annex R.
+
+
+Using Unicode/UTF-8, you can write in emails and source code things such as
+
+Mathematics and Sciences:
+
+ ∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β),
+
+ ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (A ⇔ B),
+
+ 2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm
+
+Linguistics and dictionaries:
+
+ ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
+ Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
+
+APL:
+
+ ((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
+
+Nicer typography in plain text files:
+
+ ╔══════════════════════════════════════════╗
+ ║ ║
+ ║ • ‘single’ and “double” quotes ║
+ ║ ║
+ ║ • Curly apostrophes: “We’ve been here” ║
+ ║ ║
+ ║ • Latin-1 apostrophe and accents: '´` ║
+ ║ ║
+ ║ • ‚deutsche‘ „Anführungszeichen“ ║
+ ║ ║
+ ║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║
+ ║ ║
+ ║ • ASCII safety test: 1lI|, 0OD, 8B ║
+ ║ ╭─────────╮ ║
+ ║ • the euro symbol: │ 14.95 € │ ║
+ ║ ╰─────────╯ ║
+ ╚══════════════════════════════════════════╝
+
+Greek (in Polytonic):
+
+ The Greek anthem:
+
+ Σὲ γνωρίζω ἀπὸ τὴν κόψη
+ τοῦ σπαθιοῦ τὴν τρομερή,
+ σὲ γνωρίζω ἀπὸ τὴν ὄψη
+ ποὺ μὲ βία μετράει τὴ γῆ.
+
+ ᾿Απ᾿ τὰ κόκκαλα βγαλμένη
+ τῶν ῾Ελλήνων τὰ ἱερά
+ καὶ σὰν πρῶτα ἀνδρειωμένη
+ χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
+
+ From a speech of Demosthenes in the 4th century BC:
+
+ Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
+ ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
+ λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
+ τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
+ εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
+ πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
+ οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
+ οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
+ ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
+ τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
+ γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
+ προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
+ σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
+ τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
+ τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
+ τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
+
+ Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
+
+Georgian:
+
+ From a Unicode conference invitation:
+
+ გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
+ კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
+ ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
+ ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
+ ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
+ ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
+ ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
+
+Russian:
+
+ From a Unicode conference invitation:
+
+ Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
+ Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
+ Конференция соберет широкий круг экспертов по вопросам глобального
+ Интернета и Unicode, локализации и интернационализации, воплощению и
+ применению Unicode в различных операционных системах и программных
+ приложениях, шрифтах, верстке и многоязычных компьютерных системах.
+
+Thai (UCS Level 2):
+
+ Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
+ classic 'San Gua'):
+
+ [----------------------------|------------------------]
+ ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่
+ สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา
+ ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา
+ โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ
+ เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ
+ ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
+ พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้
+ ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
+
+ (The above is a two-column text. If combining characters are handled
+ correctly, the lines of the second column should be aligned with the
+ | character above.)
+
+Ethiopian:
+
+ Proverbs in the Amharic language:
+
+ ሰማይ አይታረስ ንጉሥ አይከሰስ።
+ ብላ ካለኝ እንደአባቴ በቆመጠኝ።
+ ጌጥ ያለቤቱ ቁምጥና ነው።
+ ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
+ የአፍ ወለምታ በቅቤ አይታሽም።
+ አይጥ በበላ ዳዋ ተመታ።
+ ሲተረጉሙ ይደረግሙ።
+ ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
+ ድር ቢያብር አንበሳ ያስር።
+ ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
+ እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
+ የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
+ ሥራ ከመፍታት ልጄን ላፋታት።
+ ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
+ የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
+ ተንጋሎ ቢተፉ ተመልሶ ባፉ።
+ ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
+ እግርህን በፍራሽህ ልክ ዘርጋ።
+
+Runes:
+
+ ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
+
+ (Old English, which transcribed into Latin reads 'He cwaeth that he
+ bude thaem lande northweardum with tha Westsae.' and means 'He said
+ that he lived in the northern land near the Western Sea.')
+
+Braille:
+
+ ⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
+
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
+ ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
+ ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
+ ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
+ ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
+ ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
+
+ ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
+ ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
+ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
+ ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
+ ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
+ ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
+ ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
+ ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
+ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+ (The first couple of paragraphs of "A Christmas Carol" by Dickens)
+
+Compact font selection example text:
+
+ ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
+ abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
+ –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
+ ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
+
+Greetings in various languages:
+
+ Hello world, Καλημέρα κόσμε, コンニチハ
+
+Box drawing alignment tests: █
+ ▉
+ ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳
+ ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳
+ ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳
+ ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
+ ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎
+ ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏
+ ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█
+
+
+
+
diff --git a/third_party/utf8_range/utf8_corpus_dir/utf8_corpus_kuhn.txt b/third_party/utf8_range/utf8_corpus_dir/utf8_corpus_kuhn.txt
new file mode 100644
index 0000000000..e8708178c1
Binary files /dev/null and b/third_party/utf8_range/utf8_corpus_dir/utf8_corpus_kuhn.txt differ
diff --git a/third_party/utf8_range/utf8_range.h b/third_party/utf8_range/utf8_range.h
new file mode 100644
index 0000000000..24d5c77d2f
--- /dev/null
+++ b/third_party/utf8_range/utf8_range.h
@@ -0,0 +1,21 @@
+#ifndef THIRD_PARTY_UTF8_RANGE_UTF8_RANGE_H_
+#define THIRD_PARTY_UTF8_RANGE_UTF8_RANGE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if (defined(__ARM_NEON) && defined(__aarch64__)) || defined(__SSE4_1__) /* AArch64 NEON or SSE4.1 build */
+int utf8_range2(const unsigned char* data, int len); /* declared only; defined out of line in this configuration */
+#else
+int utf8_naive(const unsigned char* data, int len); /* validator used by the fallback below */
+static inline int utf8_range2(const unsigned char* data, int len) { /* fallback: same name, forwards to utf8_naive() */
+ return utf8_naive(data, len);
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // THIRD_PARTY_UTF8_RANGE_UTF8_RANGE_H_
diff --git a/third_party/utf8_range/utf8_to_utf16/Makefile b/third_party/utf8_range/utf8_to_utf16/Makefile
new file mode 100644
index 0000000000..853ffa4162
--- /dev/null
+++ b/third_party/utf8_range/utf8_to_utf16/Makefile
@@ -0,0 +1,11 @@
+CC = gcc
+CPPFLAGS = -g -O3 -Wall -march=native
+
+OBJS = main.o iconv.o naive.o
+
+utf8to16: ${OBJS} # link with the compiler selected by CC instead of a hard-coded gcc
+	${CC} $^ -o $@
+
+.PHONY: clean
+clean: # remove the binary and all object files
+	rm -f utf8to16 *.o
diff --git a/third_party/utf8_range/utf8_to_utf16/iconv.c b/third_party/utf8_range/utf8_to_utf16/iconv.c
new file mode 100644
index 0000000000..35aebb6b78
--- /dev/null
+++ b/third_party/utf8_range/utf8_to_utf16/iconv.c
@@ -0,0 +1,51 @@
+#include <stdio.h> /* perror */
+#include <stdlib.h> /* exit */
+#include <errno.h> /* errno, E2BIG */
+#include <iconv.h> /* iconv_t, iconv_open, iconv */
+
+static iconv_t s_cd; /* conversion descriptor shared by every utf8_to16_iconv() call */
+
+/* Open the UTF-8 to UTF-16 converter once, from a constructor, presumably to keep iconv_open out of the timed benchmark calls. */
+static void __attribute__ ((constructor)) init_iconv(void)
+{
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ /* pick the UTF-16 flavour matching the host byte order */
+ s_cd = iconv_open("UTF-16LE", "UTF-8");
+#else
+ s_cd = iconv_open("UTF-16BE", "UTF-8");
+#endif
+ if (s_cd == (iconv_t)-1) { /* iconv_open failed: report and terminate */
+ perror("iconv_open");
+ exit(1);
+ }
+}
+
+/*
+ * Convert UTF-8 to UTF-16 through the shared iconv descriptor s_cd.
+ * - buf8, len8: input utf-8 string
+ * - buf16: buffer to store decoded utf-16 string
+ * - *len16: on entry - utf-16 buffer length in bytes
+ * on exit - length in bytes of valid decoded utf-16 string
+ * Returns:
+ * - 0: success
+ * - >0: error position of input utf-8 string (1-based offset where iconv stopped)
+ * - -1: utf-16 buffer overflow
+ * LE/BE depends on host
+ */
+int utf8_to16_iconv(const unsigned char *buf8, size_t len8,
+ unsigned short *buf16, size_t *len16)
+{
+ size_t ret, len16_save = *len16; /* remember the capacity; *len16 is treated as bytes still free after the call */
+ const unsigned char *buf8_0 = buf8; /* start of input, used to compute the error offset */
+
+ ret = iconv(s_cd, (char **)&buf8, &len8, (char **)&buf16, len16); /* local copies of the pointers are passed by address; the caller's pointers are untouched */
+
+ *len16 = len16_save - *len16; /* capacity minus what is left = bytes produced */
+
+ if (ret != (size_t)-1)
+ return 0;
+
+ if (errno == E2BIG)
+ return -1; /* Output buffer full */
+
+ return buf8 - buf8_0 + 1; /* EILSEQ, EINVAL, error position */
+}
diff --git a/third_party/utf8_range/utf8_to_utf16/main.c b/third_party/utf8_range/utf8_to_utf16/main.c
new file mode 100644
index 0000000000..cb8976406d
--- /dev/null
+++ b/third_party/utf8_range/utf8_to_utf16/main.c
@@ -0,0 +1,424 @@
+#include <stdio.h> /* printf, fprintf */
+#include <stdlib.h> /* malloc, free, exit, atoi */
+#include <stdint.h> /* uint64_t */
+#include <string.h> /* memcpy, memset, memcmp, strcmp */
+#include <sys/types.h> /* types used by the stat interface */
+#include <sys/stat.h> /* struct stat, fstat */
+#include <sys/time.h> /* struct timeval, gettimeofday */
+#include <fcntl.h> /* open, O_RDONLY */
+#include <unistd.h> /* read, close */
+
+int utf8_to16_iconv(const unsigned char *buf8, size_t len8,
+ unsigned short *buf16, size_t *len16); /* iconv-backed converter; test() uses it as the reference answer */
+int utf8_to16_naive(const unsigned char *buf8, size_t len8,
+ unsigned short *buf16, size_t *len16); /* second converter registered below; same signature */
+
+static struct ftab { /* table of selectable algorithms: command-line name plus converter */
+ const char *name; /* matched against the optional [alg] argument in main() */
+ int (*func)(const unsigned char *buf8, size_t len8,
+ unsigned short *buf16, size_t *len16);
+} ftab[] = {
+ {
+ .name = "iconv",
+ .func = utf8_to16_iconv,
+ }, {
+ .name = "naive",
+ .func = utf8_to16_naive,
+ },
+};
+
+static unsigned char *load_test_buf(int len) /* returns a malloc'd buffer of len bytes filled with synthetic UTF-8 */
+{
+ const char utf8[] = "\xF0\x90\xBF\x80"; /* one 4-byte sequence used as the repeating pattern */
+ const int utf8_len = sizeof(utf8)/sizeof(utf8[0]) - 1; /* pattern length without the terminating NUL */
+
+ unsigned char *data = malloc(len); /* NOTE(review): the malloc result is not checked before use */
+ unsigned char *p = data;
+
+ while (len >= utf8_len) { /* copy whole patterns while they still fit */
+ memcpy(p, utf8, utf8_len);
+ p += utf8_len;
+ len -= utf8_len;
+ }
+
+ while (len--) /* pad the remaining bytes with 0x7F */
+ *p++ = 0x7F;
+
+ return data;
+}
+
+static unsigned char *load_test_file(int *len) /* reads ../UTF-8-demo.txt into a malloc'd buffer and stores its size in *len */
+{
+ unsigned char *data;
+ int fd;
+ struct stat stat;
+
+ fd = open("../UTF-8-demo.txt", O_RDONLY); /* path is relative to the current working directory */
+ if (fd == -1) {
+ printf("Failed to open ../UTF-8-demo.txt!\n");
+ exit(1);
+ }
+ if (fstat(fd, &stat) == -1) {
+ printf("Failed to get file size!\n");
+ exit(1);
+ }
+
+ *len = stat.st_size; /* file size is narrowed to int here */
+ data = malloc(*len); /* NOTE(review): the malloc result is not checked before read() */
+ if (read(fd, data, *len) != *len) { /* a single read() is expected to return the whole file; anything else exits */
+ printf("Failed to read file!\n");
+ exit(1);
+ }
+
+ close(fd);
+
+ return data;
+}
+
+static void print_test(const unsigned char *data, int len) /* prints len and the bytes of data as a quoted \xNN string */
+{
+ printf(" [len=%d] \"", len);
+ while (len--) /* one \xNN escape per byte */
+ printf("\\x%02X", *data++);
+
+ printf("\"\n");
+}
+
+struct test { /* one test vector: a byte string and its length */
+ const unsigned char *data; /* bytes of the vector; may contain embedded zero bytes */
+ int len; /* number of bytes at data */
+};
+
+static void prepare_test_buf(unsigned char *buf, const struct test *pos,
+ int pos_len, int pos_idx) /* fills buf[0..1023] from the pos_len vectors in pos, starting with vector pos_idx */
+{
+ /* Round concatenate correct tokens to 1024 bytes */
+ int buf_idx = 0; /* bytes written so far */
+ while (buf_idx < 1024) {
+ int buf_len = 1024 - buf_idx; /* room left in the 1024-byte window */
+
+ if (buf_len >= pos[pos_idx].len) { /* vector fits: append it */
+ memcpy(buf+buf_idx, pos[pos_idx].data, pos[pos_idx].len);
+ buf_idx += pos[pos_idx].len;
+ } else { /* vector does not fit: zero-fill the rest, which ends the loop */
+ memset(buf+buf_idx, 0, buf_len);
+ buf_idx += buf_len;
+ }
+
+ if (++pos_idx == pos_len) /* cycle through the vectors round-robin */
+ pos_idx = 0;
+ }
+}
+
+/* Return 0 on success, -1 on error */
+static int test_manual(const struct ftab *ftab, unsigned short *buf16,
+ unsigned short *_buf16)
+{
+#define LEN16 4096
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpointer-sign"
+ /* positive tests */
+ static const struct test pos[] = {
+ {"", 0},
+ {"\x00", 1},
+ {"\x66", 1},
+ {"\x7F", 1},
+ {"\x00\x7F", 2},
+ {"\x7F\x00", 2},
+ {"\xC2\x80", 2},
+ {"\xDF\xBF", 2},
+ {"\xE0\xA0\x80", 3},
+ {"\xE0\xA0\xBF", 3},
+ {"\xED\x9F\x80", 3},
+ {"\xEF\x80\xBF", 3},
+ {"\xF0\x90\xBF\x80", 4},
+ {"\xF2\x81\xBE\x99", 4},
+ {"\xF4\x8F\x88\xAA", 4},
+ };
+
+ /* negative tests */
+ static const struct test neg[] = {
+ {"\x80", 1},
+ {"\xBF", 1},
+ {"\xC0\x80", 2},
+ {"\xC1\x00", 2},
+ {"\xC2\x7F", 2},
+ {"\xDF\xC0", 2},
+ {"\xE0\x9F\x80", 3},
+ {"\xE0\xC2\x80", 3},
+ {"\xED\xA0\x80", 3},
+ {"\xED\x7F\x80", 3},
+ {"\xEF\x80\x00", 3},
+ {"\xF0\x8F\x80\x80", 4},
+ {"\xF0\xEE\x80\x80", 4},
+ {"\xF2\x90\x91\x7F", 4},
+ {"\xF4\x90\x88\xAA", 4},
+ {"\xF4\x00\xBF\xBF", 4},
+ {"\x00\x00\x00\x00\x00\xC2\x80\x00\x00\x00\xE1\x80\x80\x00\x00\xC2" \
+ "\xC2\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
+ 32},
+ {"\x00\x00\x00\x00\x00\xC2\xC2\x80\x00\x00\xE1\x80\x80\x00\x00\x00",
+ 16},
+ {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \
+ "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1\x80",
+ 32},
+ {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \
+ "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1",
+ 32},
+ {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \
+ "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1\x80" \
+ "\x80", 33},
+ {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \
+ "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1\x80" \
+ "\xC2\x80", 34},
+ {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \
+ "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF0" \
+ "\x80\x80\x80", 35},
+ };
+#pragma GCC diagnostic push
+
+ size_t len16 = LEN16, _len16 = LEN16;
+ int ret, _ret;
+
+ /* Test single token */
+ for (int i = 0; i < sizeof(pos)/sizeof(pos[0]); ++i) {
+ ret = ftab->func(pos[i].data, pos[i].len, buf16, &len16);
+ _ret = utf8_to16_iconv(pos[i].data, pos[i].len, _buf16, &_len16);
+ if (ret != _ret || len16 != _len16 || memcmp(buf16, _buf16, len16)) {
+ printf("FAILED positive test(%d:%d, %lu:%lu): ",
+ ret, _ret, len16, _len16);
+ print_test(pos[i].data, pos[i].len);
+ return -1;
+ }
+ len16 = _len16 = LEN16;
+ }
+ for (int i = 0; i < sizeof(neg)/sizeof(neg[0]); ++i) {
+ ret = ftab->func(neg[i].data, neg[i].len, buf16, &len16);
+ _ret = utf8_to16_iconv(neg[i].data, neg[i].len, _buf16, &_len16);
+ if (ret != _ret || len16 != _len16 || memcmp(buf16, _buf16, len16)) {
+ printf("FAILED negitive test(%d:%d, %lu:%lu): ",
+ ret, _ret, len16, _len16);
+ print_test(neg[i].data, neg[i].len);
+ return -1;
+ }
+ len16 = _len16 = LEN16;
+ }
+
+ /* Test shifted buffer to cover 1k length */
+ /* buffer size must be greater than 1024 + 16 + max(test string length) */
+ const int max_size = 1024*2;
+ uint64_t buf64[max_size/8 + 2];
+ /* Offset 8 bytes by 1 byte */
+ unsigned char *buf = ((unsigned char *)buf64) + 1;
+ int buf_len;
+
+ for (int i = 0; i < sizeof(pos)/sizeof(pos[0]); ++i) {
+ /* Positive test: shift 16 bytes, validate each shift */
+ prepare_test_buf(buf, pos, sizeof(pos)/sizeof(pos[0]), i);
+ buf_len = 1024;
+ for (int j = 0; j < 16; ++j) {
+ ret = ftab->func(buf, buf_len, buf16, &len16);
+ _ret = utf8_to16_iconv(buf, buf_len, _buf16, &_len16);
+ if (ret != _ret || len16 != _len16 || \
+ memcmp(buf16, _buf16, len16)) {
+ printf("FAILED positive test(%d:%d, %lu:%lu): ",
+ ret, _ret, len16, _len16);
+ print_test(buf, buf_len);
+ return -1;
+ }
+ len16 = _len16 = LEN16;
+ for (int k = buf_len; k >= 1; --k)
+ buf[k] = buf[k-1];
+ buf[0] = '\x55';
+ ++buf_len;
+ }
+
+ /* Negative test: trunk last non ascii */
+ while (buf_len >= 1 && buf[buf_len-1] <= 0x7F)
+ --buf_len;
+ if (buf_len) {
+ ret = ftab->func(buf, buf_len-1, buf16, &len16);
+ _ret = utf8_to16_iconv(buf, buf_len-1, _buf16, &_len16);
+ if (ret != _ret || len16 != _len16 || \
+ memcmp(buf16, _buf16, len16)) {
+ printf("FAILED negative test(%d:%d, %lu:%lu): ",
+ ret, _ret, len16, _len16);
+ print_test(buf, buf_len-1);
+ return -1;
+ }
+ len16 = _len16 = LEN16;
+ }
+ }
+
+ /* Negative test */
+ for (int i = 0; i < sizeof(neg)/sizeof(neg[0]); ++i) {
+ /* Append one error token, shift 16 bytes, validate each shift */
+ int pos_idx = i % (sizeof(pos)/sizeof(pos[0]));
+ prepare_test_buf(buf, pos, sizeof(pos)/sizeof(pos[0]), pos_idx);
+ memcpy(buf+1024, neg[i].data, neg[i].len);
+ buf_len = 1024 + neg[i].len;
+ for (int j = 0; j < 16; ++j) {
+ ret = ftab->func(buf, buf_len, buf16, &len16);
+ _ret = utf8_to16_iconv(buf, buf_len, _buf16, &_len16);
+ if (ret != _ret || len16 != _len16 || \
+ memcmp(buf16, _buf16, len16)) {
+ printf("FAILED negative test(%d:%d, %lu:%lu): ",
+ ret, _ret, len16, _len16);
+ print_test(buf, buf_len);
+ return -1;
+ }
+ len16 = _len16 = LEN16;
+ for (int k = buf_len; k >= 1; --k)
+ buf[k] = buf[k-1];
+ buf[0] = '\x66';
+ ++buf_len;
+ }
+ }
+
+ return 0;
+}
+
+static void test(const unsigned char *buf8, size_t len8,
+ unsigned short *buf16, size_t len16, const struct ftab *ftab) /* compares ftab->func with iconv on the loaded input, then runs test_manual() */
+{
+ /* Use iconv as the reference answer */
+ if (strcmp(ftab->name, "iconv") == 0)
+ return;
+
+ printf("%s\n", ftab->name);
+
+ /* Test file or buffer */
+ size_t _len16 = len16;
+ unsigned short *_buf16 = (unsigned short *)malloc(_len16); /* holds the iconv reference output */
+ if (utf8_to16_iconv(buf8, len8, _buf16, &_len16)) { /* the input itself must convert cleanly with iconv */
+ printf("Invalid test file or buffer!\n");
+ exit(1);
+ }
+ printf("standard test: ");
+ if (ftab->func(buf8, len8, buf16, &len16) || len16 != _len16 || \
+ memcmp(buf16, _buf16, len16) != 0)
+ printf("FAIL\n");
+ else
+ printf("pass\n");
+ free(_buf16);
+
+ /* Manual cases */
+ unsigned short *mbuf8 = (unsigned short *)malloc(LEN16); /* despite the name, this is the UTF-16 output buffer for the algorithm under test */
+ unsigned short *mbuf16 = (unsigned short *)malloc(LEN16); /* UTF-16 output buffer for the iconv reference */
+ printf("manual test: %s\n",
+ test_manual(ftab, mbuf8, mbuf16) ? "FAIL" : "pass");
+ free(mbuf8);
+ free(mbuf16);
+ printf("\n");
+}
+
+static void bench(const unsigned char *buf8, size_t len8,
+ unsigned short *buf16, size_t len16, const struct ftab *ftab) /* times repeated conversions of buf8 and prints throughput */
+{
+ const int loops = 1024*1024*1024/len8; /* enough repetitions to process about 1 GiB of input in total */
+ int ret = 0; /* OR of all return codes: non-zero if any run failed */
+ double time, size;
+ struct timeval tv1, tv2;
+
+ fprintf(stderr, "bench %s... ", ftab->name);
+ gettimeofday(&tv1, 0);
+ for (int i = 0; i < loops; ++i)
+ ret |= ftab->func(buf8, len8, buf16, &len16); /* len16 is passed by address and reused as-is on the next iteration */
+ gettimeofday(&tv2, 0);
+ printf("%s\n", ret?"FAIL":"pass");
+
+ time = tv2.tv_usec - tv1.tv_usec; /* microsecond part of the elapsed time; may be negative */
+ time = time / 1000000 + tv2.tv_sec - tv1.tv_sec; /* elapsed seconds as a double */
+ size = ((double)len8 * loops) / (1024*1024); /* input volume processed, in MB */
+ printf("time: %.4f s\n", time);
+ printf("data: %.0f MB\n", size);
+ printf("BW: %.2f MB/s\n", size / time);
+ printf("\n");
+}
+
+static void usage(const char *bin) /* prints the command-line synopsis; bin is the program name to show */
+{
+ printf("Usage:\n");
+ printf("%s test [alg] ==> test all or one algorithm\n", bin);
+ printf("%s bench [alg] ==> benchmark all or one algorithm\n", bin);
+ printf("%s bench size NUM ==> benchmark with specific buffer size\n", bin);
+ printf("alg = ");
+ for (int i = 0; i < sizeof(ftab)/sizeof(ftab[0]); ++i) /* list every registered algorithm name */
+ printf("%s ", ftab[i].name);
+ printf("\nNUM = buffer size in bytes, 1 ~ 67108864(64M)\n");
+}
+
+int main(int argc, char *argv[]) /* parses test/bench [alg] or size NUM, loads the input and runs the chosen mode for each matching algorithm */
+{
+ int len8 = 0, len16; /* len8 stays 0 unless a size was given on the command line */
+ unsigned char *buf8;
+ unsigned short *buf16;
+ const char *alg = NULL; /* NULL means run every algorithm in ftab */
+ void (*tb)(const unsigned char *buf8, size_t len8,
+ unsigned short *buf16, size_t len16, const struct ftab *ftab); /* selected mode: test or bench */
+
+ tb = NULL;
+ if (argc >= 2) {
+ if (strcmp(argv[1], "test") == 0)
+ tb = test;
+ else if (strcmp(argv[1], "bench") == 0)
+ tb = bench;
+ if (argc >= 3) {
+ alg = argv[2];
+ if (strcmp(alg, "size") == 0) { /* "size NUM" replaces the algorithm filter with a synthetic buffer size */
+ if (argc < 4) {
+ tb = NULL;
+ } else {
+ alg = NULL;
+ len8 = atoi(argv[3]);
+ if (len8 <= 0 || len8 > 67108864) {
+ printf("Buffer size error!\n\n");
+ tb = NULL;
+ }
+ }
+ }
+ }
+ }
+
+ if (tb == NULL) { /* unknown mode or bad arguments */
+ usage(argv[0]);
+ return 1;
+ }
+
+ /* Load UTF8 test buffer */
+ if (len8)
+ buf8 = load_test_buf(len8);
+ else
+ buf8 = load_test_file(&len8);
+
+ /* Prepare UTF16 buffer large enough */
+ len16 = len8 * 2; /* two output bytes per input byte */
+ buf16 = (unsigned short *)malloc(len16);
+
+ if (tb == bench)
+ printf("============== Bench UTF8 (%d bytes) ==============\n", len8);
+ for (int i = 0; i < sizeof(ftab)/sizeof(ftab[0]); ++i) {
+ if (alg && strcmp(alg, ftab[i].name) != 0) /* skip algorithms that do not match the requested name */
+ continue;
+ tb((const unsigned char *)buf8, len8, buf16, len16, &ftab[i]);
+ }
+
+#if 0 /* disabled ASCII benchmark; it refers to data and len, which are not declared in this function */
+ if (tb == bench) {
+ printf("==================== Bench ASCII ====================\n");
+ /* Change test buffer to ascii */
+ for (int i = 0; i < len; i++)
+ data[i] &= 0x7F;
+
+ for (int i = 0; i < sizeof(ftab)/sizeof(ftab[0]); ++i) {
+ if (alg && strcmp(alg, ftab[i].name) != 0)
+ continue;
+ tb((const unsigned char *)data, len, &ftab[i]);
+ printf("\n");
+ }
+ }
+#endif
+
+ return 0;
+}
diff --git a/third_party/utf8_range/utf8_to_utf16/naive.c b/third_party/utf8_range/utf8_to_utf16/naive.c
new file mode 100644
index 0000000000..05ab07b76e
--- /dev/null
+++ b/third_party/utf8_range/utf8_to_utf16/naive.c
@@ -0,0 +1,133 @@
+#include <stddef.h> /* size_t */
+
+/*
+ * UTF-8 to UTF-16
+ * Table from https://woboq.com/blog/utf-8-processing-using-simd.html
+ *
+ * +-------------------------------------+-------------------+
+ * | UTF-8 | UTF-16LE (HI LO) |
+ * +-------------------------------------+-------------------+
+ * | 0aaaaaaa | 00000000 0aaaaaaa |
+ * +-------------------------------------+-------------------+
+ * | 110bbbbb 10aaaaaa | 00000bbb bbaaaaaa |
+ * +-------------------------------------+-------------------+
+ * | 1110cccc 10bbbbbb 10aaaaaa | ccccbbbb bbaaaaaa |
+ * +-------------------------------------+-------------------+
+ * | 11110ddd 10ddcccc 10bbbbbb 10aaaaaa | 110110uu uuccccbb |
+ * + uuuu = ddddd - 1 | 110111bb bbaaaaaa |
+ * +-------------------------------------+-------------------+
+ */
+
+/*
+ * Decode UTF-8 to UTF-16 one character at a time, rejecting malformed input.
+ * - buf8, len8: input utf-8 string and its length in bytes
+ * - buf16: buffer to store decoded utf-16 string
+ * - *len16: on entry - utf-16 buffer length in bytes
+ * on exit - length in bytes of valid decoded utf-16 string
+ * Returns:
+ * - 0: success
+ * - >0: 1-based byte position where the offending utf-8 character starts
+ * - -1: utf-16 buffer overflow
+ * Code units are stored as native unsigned short, so LE/BE depends on host
+ */
+int utf8_to16_naive(const unsigned char *buf8, size_t len8,
+ unsigned short *buf16, size_t *len16)
+{
+ int err_pos = 1; /* 1-based position of the character currently being decoded */
+ size_t len16_left = *len16; /* output bytes still available */
+
+ *len16 = 0; /* from here on *len16 counts the bytes produced */
+
+ while (len8) {
+ unsigned char b0, b1, b2, b3;
+ unsigned int u; /* decoded code point */
+
+ /* Output buffer full */
+ if (len16_left < 2)
+ return -1;
+
+ /* 1st byte */
+ b0 = buf8[0];
+
+ if ((b0 & 0x80) == 0) {
+ /* 0aaaaaaa -> 00000000 0aaaaaaa */
+ *buf16++ = b0;
+ ++buf8;
+ --len8;
+ ++err_pos;
+ *len16 += 2;
+ len16_left -= 2;
+ continue;
+ }
+
+ /* Character length */
+ size_t clen = b0 & 0xF0;
+ clen >>= 4; /* 10xx, 110x, 1110, 1111 */
+ clen -= 12; /* -4~-1, 0/1, 2, 3 */
+ clen += !clen; /* -4~-1, 1, 2, 3 */
+
+ /* String too short or invalid 1st byte (10xxxxxx) */
+ if (len8 <= clen) /* clen is unsigned, so the "negative" values above are huge and always land here */
+ return err_pos;
+
+ /* Trailing bytes must be within 0x80 ~ 0xBF */
+ b1 = buf8[1];
+ if ((signed char)b1 >= (signed char)0xC0) /* signed compare: only 0x80..0xBF are below (signed char)0xC0 */
+ return err_pos;
+ b1 &= 0x3F;
+
+ ++clen; /* from now on clen is the full character length in bytes */
+ if (clen == 2) {
+ u = b0 & 0x1F;
+ u <<= 6;
+ u |= b1;
+ if (u <= 0x7F) /* overlong encoding */
+ return err_pos;
+ *buf16++ = u;
+ } else {
+ b2 = buf8[2];
+ if ((signed char)b2 >= (signed char)0xC0)
+ return err_pos;
+ b2 &= 0x3F;
+ if (clen == 3) {
+ u = b0 & 0x0F;
+ u <<= 6;
+ u |= b1;
+ u <<= 6;
+ u |= b2;
+ if (u <= 0x7FF || (u >= 0xD800 && u <= 0xDFFF)) /* overlong encoding or surrogate range */
+ return err_pos;
+ *buf16++ = u;
+ } else {
+ /* clen == 4 */
+ if (len16_left < 4)
+ return -1; /* Output buffer full */
+ b3 = buf8[3];
+ if ((signed char)b3 >= (signed char)0xC0)
+ return err_pos;
+ u = b0 & 0x07; /* keeps only 3 bits, so lead bytes 0xF8..0xFF alias 0xF0..0xF7 here */
+ u <<= 6;
+ u |= b1;
+ u <<= 6;
+ u |= b2;
+ u <<= 6;
+ u |= (b3 & 0x3F);
+ if (b0 > 0xF4 || u <= 0xFFFF || u > 0x10FFFF) /* lead byte checked explicitly: after masking, 0xF8..0xFC could decode to an in-range code point */
+ return err_pos;
+ u -= 0x10000;
+ *buf16++ = (((u >> 10) & 0x3FF) | 0xD800); /* high surrogate */
+ *buf16++ = ((u & 0x3FF) | 0xDC00); /* low surrogate */
+ *len16 += 2; /* extra code unit; the common accounting below adds the other one */
+ len16_left -= 2;
+ }
+ }
+
+ buf8 += clen;
+ len8 -= clen;
+ err_pos += clen;
+ *len16 += 2;
+ len16_left -= 2;
+ }
+
+ return 0;
+}
diff --git a/third_party/utf8_range/utf8_validity.cc b/third_party/utf8_range/utf8_validity.cc
new file mode 100644
index 0000000000..db811993f8
--- /dev/null
+++ b/third_party/utf8_range/utf8_validity.cc
@@ -0,0 +1,458 @@
+// Copyright 2022 Google LLC
+//
+// Use of this source code is governed by an MIT-style
+// license that can be found in the LICENSE file or at
+// https://opensource.org/licenses/MIT.
+
+/* This is a wrapper for the Google range-sse.cc algorithm which checks whether a
+ * sequence of bytes is a valid UTF-8 sequence and finds the longest valid prefix of
+ * the UTF-8 sequence.
+ *
+ * The key difference is that it checks for as many ASCII symbols as possible
+ * and then falls back to the range-sse.cc algorithm. The changes to the
+ * algorithm are cosmetic, mostly to trick the clang compiler into producing
+ * optimal code.
+ *
+ * For API see the utf8_validity.h header.
+ */
+#include "utf8_validity.h"
+
+#include <cstddef>
+#include <cstdint>
+
+#include "absl/strings/ascii.h"
+#include "absl/strings/string_view.h"
+
+#ifdef __SSE4_1__
+#include <emmintrin.h>
+#include <smmintrin.h>
+#include <tmmintrin.h>
+#endif
+
+namespace utf8_range {
+namespace {
+
+inline uint64_t UNALIGNED_LOAD64(const void* p) {  // |p| may be unaligned
+  uint64_t word;
+  memcpy(&word, p, sizeof(word));  // byte copy instead of a pointer cast
+  return word;
+}
+
+inline bool TrailByteOk(const char c) {  // True iff c is in [0x80, 0xBF].
+  return static_cast<int8_t>(c) <= static_cast<int8_t>(0xBF);  // -128..-65
+}
+
+/* Scalar validator for [data, end). If ReturnPosition is false, returns 1 if
+ * the range is valid UTF-8 and 0 otherwise. If ReturnPosition is true, returns
+ * the length in bytes of the longest prefix of the range that is structurally
+ * valid UTF-8.
+ */
+template <bool ReturnPosition>
+size_t ValidUTF8Span(const char* data, const char* end) {
+  /* We return err_pos in the loop which is always 0 if !ReturnPosition */
+  size_t err_pos = 0;
+  size_t codepoint_bytes = 0;
+  /* Each branch below records the length of the codepoint it accepted in
+   * codepoint_bytes and continues; data (and err_pos) are only advanced at the
+   * top of the next iteration. This keeps the per-length checks flat, which
+   * reduces indentation and improves readability of the validity check.
+   */
+  while (data + codepoint_bytes < end) {
+    if (ReturnPosition) {
+      err_pos += codepoint_bytes;
+    }
+    data += codepoint_bytes;
+    const size_t len = end - data;
+    const unsigned char byte1 = data[0];
+
+    /* We do not skip many ascii bytes at the same time as this function is
+       used for tail checking (< 16 bytes) and for non x86 platforms. We also
+       don't think that cases where non-ASCII codepoints are followed by ascii
+       happen often. For small strings it also introduces some penalty. For
+       purely ascii UTF8 strings (which is the overwhelming case) we call
+       SkipAscii function which is multiplatform and extremely fast.
+     */
+    /* [00..7F] ASCII -> 1 byte */
+    if (absl::ascii_isascii(byte1)) {
+      codepoint_bytes = 1;
+      continue;
+    }
+    /* [C2..DF], [80..BF] -> 2 bytes */
+    if (len >= 2 && byte1 >= 0xC2 && byte1 <= 0xDF && TrailByteOk(data[1])) {
+      codepoint_bytes = 2;
+      continue;
+    }
+    if (len >= 3) {
+      const unsigned char byte2 = data[1];
+      const unsigned char byte3 = data[2];
+
+      /* byte2 and byte3 must both be trail bytes in [0x80, 0xBF]; every
+       * 3 and 4 byte form accepted below requires that.
+       */
+      if (!TrailByteOk(byte2) || !TrailByteOk(byte3)) {
+        return err_pos;
+      }
+
+      if (/* E0, A0..BF, 80..BF */
+          ((byte1 == 0xE0 && byte2 >= 0xA0) ||
+           /* E1..EC, 80..BF, 80..BF */
+           (byte1 >= 0xE1 && byte1 <= 0xEC) ||
+           /* ED, 80..9F, 80..BF */
+           (byte1 == 0xED && byte2 <= 0x9F) ||
+           /* EE..EF, 80..BF, 80..BF */
+           (byte1 >= 0xEE && byte1 <= 0xEF))) {
+        codepoint_bytes = 3;
+        continue;
+      }
+      if (len >= 4) {
+        const unsigned char byte4 = data[3];
+        /* Is byte4 between 0x80 ~ 0xBF */
+        if (!TrailByteOk(byte4)) {
+          return err_pos;
+        }
+
+        if (/* F0, 90..BF, 80..BF, 80..BF */
+            ((byte1 == 0xF0 && byte2 >= 0x90) ||
+             /* F1..F3, 80..BF, 80..BF, 80..BF */
+             (byte1 >= 0xF1 && byte1 <= 0xF3) ||
+             /* F4, 80..8F, 80..BF, 80..BF */
+             (byte1 == 0xF4 && byte2 <= 0x8F))) {
+          codepoint_bytes = 4;
+          continue;
+        }
+      }
+    }
+    return err_pos;
+  }
+  if (ReturnPosition) {
+    err_pos += codepoint_bytes;
+  }
+  /* if ReturnPosition is false, this returns 1.
+   * if ReturnPosition is true, this returns err_pos.
+   */
+  return err_pos + (1 - ReturnPosition);
+}
+
+/* Given the last 4 bytes of the previous 16-byte block, returns how many bytes
+ * to step back to reach the most recent non-trail byte among the last three
+ * (1, 2 or 3), or 0 if the last three bytes are all trail bytes (80~BF). */
+inline int CodepointSkipBackwards(int32_t codepoint_word) {
+  const int8_t* const codepoint =
+      reinterpret_cast<const int8_t*>(&codepoint_word);
+  if (!TrailByteOk(codepoint[3])) {
+    return 1;
+  } else if (!TrailByteOk(codepoint[2])) {
+    return 2;
+  } else if (!TrailByteOk(codepoint[1])) {
+    return 3;
+  }
+  return 0;
+}
+
+/* Returns a pointer to the first non-ASCII byte in [data, end), or |end| if
+ * every byte is ASCII. Scans 8 bytes per step while a full word remains, as
+ * most strings being validated consist only of 1-byte codepoints. */
+inline const char* SkipAscii(const char* data, const char* end) {
+  const uint64_t kHighBits = 0x8080808080808080;  // bit 7 of each of 8 bytes
+  for (; end - data >= 8; data += 8) {
+    if ((UNALIGNED_LOAD64(data) & kHighBits) != 0) break;
+  }
+  for (; data < end; ++data) {
+    if (!absl::ascii_isascii(*data)) break;
+  }
+  return data;
+}
+
+template <bool ReturnPosition>  // false: 1/0 validity; true: valid-prefix length
+size_t ValidUTF8(const char* data, size_t len) {
+  if (len == 0) return 1 - ReturnPosition;
+  const char* const end = data + len;
+  data = SkipAscii(data, end);
+  /* SIMD algorithm always outperforms the naive version for any data of
+     length >=16.
+   */
+  if (end - data < 16) {
+    return (ReturnPosition ? (data - (end - len)) : 0) +
+           ValidUTF8Span<ReturnPosition>(data, end);
+  }
+#ifndef __SSE4_1__
+  return (ReturnPosition ? (data - (end - len)) : 0) +
+         ValidUTF8Span<ReturnPosition>(data, end);
+#else
+  /* This code checks that utf-8 ranges are structurally valid 16 bytes at once
+   * using SIMD instructions.
+   * The mapping between ranges of codepoint and their corresponding utf-8
+   * sequences is below.
+   */
+
+  /*
+   * U+0000...U+007F     00...7F
+   * U+0080...U+07FF     C2...DF 80...BF
+   * U+0800...U+0FFF     E0 A0...BF 80...BF
+   * U+1000...U+CFFF     E1...EC 80...BF 80...BF
+   * U+D000...U+D7FF     ED 80...9F 80...BF
+   * U+E000...U+FFFF     EE...EF 80...BF 80...BF
+   * U+10000...U+3FFFF   F0 90...BF 80...BF 80...BF
+   * U+40000...U+FFFFF   F1...F3 80...BF 80...BF 80...BF
+   * U+100000...U+10FFFF F4 80...8F 80...BF 80...BF
+   */
+
+  /* First we compute the type for each byte, as given by the table below.
+   * This type will be used as an index later on.
+   */
+
+  /*
+   * Index  Min Max Byte Type
+   * 0      00  7F  Single byte sequence
+   * 1,2,3  80  BF  Second, third and fourth byte for many of the sequences.
+   * 4      A0  BF  Second byte after E0
+   * 5      80  9F  Second byte after ED
+   * 6      90  BF  Second byte after F0
+   * 7      80  8F  Second byte after F4
+   * 8      C2  F4  First non ASCII byte
+   * 9..15  7F  80  Invalid byte
+   */
+
+  /* After the first step we compute the index for all bytes, then we permute
+     the bytes according to their indices to check the ranges from the range
+     table.
+   * The range for a given type can be found in the range_min_table and
+     range_max_table, the range for type/index X is in range_min_table[X] ...
+     range_max_table[X].
+   */
+
+  /* Algorithm:
+   * Put index zero to all bytes.
+   * Find all non ASCII characters, give them index 8.
+   * For each tail byte in a codepoint sequence, give it an index corresponding
+     to the 1 based index from the end.
+   * If the first byte of the codepoint is in the [C0...DF] range, we write
+     index 1 in the following byte.
+   * If the first byte of the codepoint is in the range [E0...EF], we write
+     indices 2 and 1 in the next two bytes.
+   * If the first byte of the codepoint is in the range [F0...FF] we write
+     indices 3,2,1 into the next three bytes.
+   * For finding the number of bytes we need to look at high nibbles (4 bits)
+     and do the lookup from the table, it can be done with shift by 4 + shuffle
+     instructions. We call it `first_len`.
+   * Then we shift first_len by 8 bits to get the indices of the 2nd bytes.
+   * Saturating sub 1 and shift by 8 bits to get the indices of the 3rd bytes.
+   * Again to get the indices of the 4th bytes.
+   * Take OR of all 4 values and check within range.
+   */
+  /* For example:
+   * input       C3 80 68 E2 80 20 A6 F0 A0 80 AC 20 F0 93 80 80
+   * first_len   1  0  0  2  0  0  0  3  0  0  0  0  3  0  0  0
+   * 1st byte    8  0  0  8  0  0  0  8  0  0  0  0  8  0  0  0
+   * 2nd byte    0  1  0  0  2  0  0  0  3  0  0  0  0  3  0  0 // Shift + sub
+   * 3rd byte    0  0  0  0  0  1  0  0  0  2  0  0  0  0  2  0 // Shift + sub
+   * 4th byte    0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  1 // Shift + sub
+   * Index       8  1  0  8  2  1  0  8  3  2  1  0  8  3  2  1 // OR of results
+   */
+
+  /* Checking for errors:
+   * Error checking is done by looking up the high nibble (4 bits) of each byte
+     against an error checking table.
+   * Because the lookup value for the second byte depends on the value of the
+     first byte in codepoint, we use saturated operations to adjust the index.
+   * Specifically we need to add 2 for E0, 3 for ED, 3 for F0 and 4 for F4 to
+     match the correct index.
+   * If we subtract from all bytes EF then E0 -> 241, ED -> 254, F0 -> 1,
+     F4 -> 5
+   * Do saturating sub 240, then E0 -> 1, ED -> 14 and we can do lookup to
+     match the adjustment
+   * Add saturating 112, then F0 -> 113, F4 -> 117, all that were >= 16 will
+     be >= 128 and lookup in ef_fe_table will return 0, but F0 and F4
+     select entries 1 and 5, giving adjustments 3 and 4 accordingly
+   */
+  /*
+   * Then just check the appropriate ranges with greater/smaller equal
+     instructions. Check tail with a naive algorithm.
+   * To save from previous 16 byte checks we just align previous_first_len to
+     get correct continuations of the codepoints.
+   */
+
+  /*
+   * Map high nibble of "First Byte" to legal character length minus 1
+   * 0x00 ~ 0xBF --> 0
+   * 0xC0 ~ 0xDF --> 1
+   * 0xE0 ~ 0xEF --> 2
+   * 0xF0 ~ 0xFF --> 3
+   */
+  const __m128i first_len_table =
+      _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3);
+
+  /* Map "First Byte" to 8-th item of range table (0xC2 ~ 0xF4) */
+  const __m128i first_range_table =
+      _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8);
+
+  /*
+   * Range table, map range index to min and max values
+   */
+  const __m128i range_min_table =
+      _mm_setr_epi8(0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80, 0xC2, 0x7F,
+                    0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F);
+
+  const __m128i range_max_table =
+      _mm_setr_epi8(0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F, 0xF4, 0x80,
+                    0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+
+  /*
+   * Tables for fast handling of four special First Bytes(E0,ED,F0,F4), after
+   * which the Second Byte are not 80~BF. It contains "range index adjustment".
+   * +------------+---------------+------------------+----------------+
+   * | First Byte | original range| range adjustment | adjusted range |
+   * +------------+---------------+------------------+----------------+
+   * | E0         | 2             | 2                | 4              |
+   * +------------+---------------+------------------+----------------+
+   * | ED         | 2             | 3                | 5              |
+   * +------------+---------------+------------------+----------------+
+   * | F0         | 3             | 3                | 6              |
+   * +------------+---------------+------------------+----------------+
+   * | F4         | 3             | 4                | 7              |
+   * +------------+---------------+------------------+----------------+
+   */
+
+  /* df_ee_table[1] -> E0, df_ee_table[14] -> ED as ED - E0 = 13 */
+  // The values represent the adjustment in the Range Index table for a correct
+  // index.
+  const __m128i df_ee_table =
+      _mm_setr_epi8(0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0);
+
+  /* ef_fe_table[1] -> F0, ef_fe_table[5] -> F4, F4 - F0 = 4 */
+  // The values represent the adjustment in the Range Index table for a correct
+  // index.
+  const __m128i ef_fe_table =
+      _mm_setr_epi8(0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+
+  __m128i prev_input = _mm_set1_epi8(0);
+  __m128i prev_first_len = _mm_set1_epi8(0);
+  __m128i error = _mm_set1_epi8(0);
+  while (end - data >= 16) {
+    const __m128i input =
+        _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
+
+    /* high_nibbles = input >> 4 */
+    const __m128i high_nibbles =
+        _mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0F));
+
+    /* first_len = legal character length minus 1 */
+    /* 0 for 00~BF, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
+    /* first_len = first_len_table[high_nibbles] */
+    __m128i first_len = _mm_shuffle_epi8(first_len_table, high_nibbles);
+
+    /* First Byte: set range index to 8 for bytes within 0xC0 ~ 0xFF */
+    /* range = first_range_table[high_nibbles] */
+    __m128i range = _mm_shuffle_epi8(first_range_table, high_nibbles);
+
+    /* Second Byte: set range index to first_len */
+    /* 0 for 00~BF, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */
+    /* range |= (first_len, prev_first_len) << 1 byte */
+    range = _mm_or_si128(range, _mm_alignr_epi8(first_len, prev_first_len, 15));
+
+    /* Third Byte: set range index to saturate_sub(first_len, 1) */
+    /* 0 for 00~BF, 0 for C0~DF, 1 for E0~EF, 2 for F0~FF */
+    __m128i tmp1;
+    __m128i tmp2;
+    /* tmp1 = saturate_sub(first_len, 1) */
+    tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(1));
+    /* tmp2 = saturate_sub(prev_first_len, 1) */
+    tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(1));
+    /* range |= (tmp1, tmp2) << 2 bytes */
+    range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 14));
+
+    /* Fourth Byte: set range index to saturate_sub(first_len, 2) */
+    /* 0 for 00~BF, 0 for C0~DF, 0 for E0~EF, 1 for F0~FF */
+    /* tmp1 = saturate_sub(first_len, 2) */
+    tmp1 = _mm_subs_epu8(first_len, _mm_set1_epi8(2));
+    /* tmp2 = saturate_sub(prev_first_len, 2) */
+    tmp2 = _mm_subs_epu8(prev_first_len, _mm_set1_epi8(2));
+    /* range |= (tmp1, tmp2) << 3 bytes */
+    range = _mm_or_si128(range, _mm_alignr_epi8(tmp1, tmp2, 13));
+
+    /*
+     * Now we have below range indices calculated
+     * Correct cases:
+     * - 8 for C0~FF
+     * - 3 for 1st byte after F0~FF
+     * - 2 for 1st byte after E0~EF or 2nd byte after F0~FF
+     * - 1 for 1st byte after C0~DF or 2nd byte after E0~EF or
+     *   3rd byte after F0~FF
+     * - 0 for others
+     * Error cases:
+     *   9~15 for non ascii First Byte overlapping
+     *   E.g., F1 80 C2 90 --> 8 3 10 2, where 10 indicates error
+     */
+
+    /* Adjust Second Byte range for special First Bytes(E0,ED,F0,F4) */
+    /* Overlaps lead to index 9~15, which are illegal in range table */
+    __m128i shift1;
+    __m128i pos;
+    __m128i range2;
+    /* shift1 = (input, prev_input) << 1 byte */
+    shift1 = _mm_alignr_epi8(input, prev_input, 15);
+    pos = _mm_sub_epi8(shift1, _mm_set1_epi8(0xEF));
+    /*
+     * shift1:  | EF  F0 ... FE | FF  00  ... ...  DE | DF  E0 ... EE |
+     * pos:     | 0   1      15 | 16  17           239| 240 241    255|
+     * pos-240: | 0   0      0  | 0   0            0  | 0   1      15 |
+     * pos+112: | 112 113    127|       >= 128        |     >= 128    |
+     */
+    tmp1 = _mm_subs_epu8(pos, _mm_set1_epi8(-16));
+    range2 = _mm_shuffle_epi8(df_ee_table, tmp1);
+    tmp2 = _mm_adds_epu8(pos, _mm_set1_epi8(112));
+    range2 = _mm_add_epi8(range2, _mm_shuffle_epi8(ef_fe_table, tmp2));
+
+    range = _mm_add_epi8(range, range2);
+
+    /* Load min and max values per calculated range index */
+    __m128i min_range = _mm_shuffle_epi8(range_min_table, range);
+    __m128i max_range = _mm_shuffle_epi8(range_max_table, range);
+
+    /* Check value range */
+    if (ReturnPosition) {
+      error = _mm_cmplt_epi8(input, min_range);
+      error = _mm_or_si128(error, _mm_cmpgt_epi8(input, max_range));
+      /* 5% performance drop from this conditional branch */
+      if (!_mm_testz_si128(error, error)) {
+        break;
+      }
+    } else {
+      error = _mm_or_si128(error, _mm_cmplt_epi8(input, min_range));
+      error = _mm_or_si128(error, _mm_cmpgt_epi8(input, max_range));
+    }
+
+    prev_input = input;
+    prev_first_len = first_len;
+
+    data += 16;
+  }
+  /* Still at the start of the input: no previous bytes to skip back over */
+  if (ReturnPosition && (data - (end - len)) == 0) {
+    return ValidUTF8Span<ReturnPosition>(data, end);
+  }
+  /* Find previous codepoint (not 80~BF) */
+  data -= CodepointSkipBackwards(_mm_extract_epi32(prev_input, 3));
+  if (ReturnPosition) {
+    return (data - (end - len)) + ValidUTF8Span<ReturnPosition>(data, end);
+  }
+  /* Test if there was any error */
+  if (!_mm_testz_si128(error, error)) {
+    return 0;
+  }
+  /* Check the tail */
+  return ValidUTF8Span<ReturnPosition>(data, end);
+#endif
+}
+
+} // namespace
+
+bool IsStructurallyValid(absl::string_view str) {  // 1/0 result becomes bool
+  return ValidUTF8</*ReturnPosition=*/false>(str.data(), str.size());
+}
+
+size_t SpanStructurallyValid(absl::string_view str) {  // valid-prefix length
+  return ValidUTF8</*ReturnPosition=*/true>(str.data(), str.size());
+}
+
+} // namespace utf8_range
diff --git a/third_party/utf8_range/utf8_validity.h b/third_party/utf8_range/utf8_validity.h
new file mode 100644
index 0000000000..4a8d75b3b4
--- /dev/null
+++ b/third_party/utf8_range/utf8_validity.h
@@ -0,0 +1,23 @@
+// Copyright 2022 Google LLC
+//
+// Use of this source code is governed by an MIT-style
+// license that can be found in the LICENSE file or at
+// https://opensource.org/licenses/MIT.
+
+#ifndef THIRD_PARTY_UTF8_RANGE_UTF8_VALIDITY_H_
+#define THIRD_PARTY_UTF8_RANGE_UTF8_VALIDITY_H_
+
+#include "absl/strings/string_view.h"
+
+namespace utf8_range {
+
+// Returns true if str is entirely structurally valid UTF-8 (true if empty).
+bool IsStructurallyValid(absl::string_view str);
+
+// Returns the length in bytes of the longest prefix of str that is
+// structurally valid UTF-8 (str.size() when all of str is valid).
+size_t SpanStructurallyValid(absl::string_view str);
+
+} // namespace utf8_range
+
+#endif // THIRD_PARTY_UTF8_RANGE_UTF8_VALIDITY_H_
diff --git a/third_party/utf8_range/utf8_validity_test.cc b/third_party/utf8_range/utf8_validity_test.cc
new file mode 100644
index 0000000000..2648df674b
--- /dev/null
+++ b/third_party/utf8_range/utf8_validity_test.cc
@@ -0,0 +1,76 @@
+#include "utf8_validity.h"
+
+#include "gtest/gtest.h"
+#include "absl/strings/string_view.h"
+
+namespace utf8_range {
+
+TEST(Utf8Validity, SpanStructurallyValid) {
+ // Fully valid inputs: the span equals the input length (4 bytes each)
+ EXPECT_EQ(4, SpanStructurallyValid("abcd"));
+ EXPECT_EQ(4, SpanStructurallyValid(absl::string_view("a\0cd", 4))); // embedded NUL
+ EXPECT_EQ(4, SpanStructurallyValid("ab\xc2\x81")); // 2-byte
+ EXPECT_EQ(4, SpanStructurallyValid("a\xe2\x81\x81")); // 3-byte
+ EXPECT_EQ(4, SpanStructurallyValid("\xf2\x81\x81\x81")); // 4-byte
+
+ // Invalid inputs: the span ends just before the first bad sequence
+ EXPECT_EQ(3, SpanStructurallyValid("abc\x80")); // stray trail byte
+ EXPECT_EQ(3, SpanStructurallyValid("abc\xc2")); // truncated 2-byte
+ EXPECT_EQ(2, SpanStructurallyValid("ab\xe2\x81")); // truncated 3-byte
+ EXPECT_EQ(1, SpanStructurallyValid("a\xf2\x81\x81")); // truncated 4-byte
+ EXPECT_EQ(2, SpanStructurallyValid("ab\xc0\x81")); // overlong 2-byte form
+ EXPECT_EQ(1, SpanStructurallyValid("a\xe0\x81\x81")); // overlong 3-byte form
+ EXPECT_EQ(0, SpanStructurallyValid("\xf0\x81\x81\x81")); // overlong 4-byte form
+ EXPECT_EQ(0, SpanStructurallyValid("\xf4\xbf\xbf\xbf")); // above U+10FFFF
+ // surrogate min, max
+ EXPECT_EQ(0, SpanStructurallyValid("\xED\xA0\x80")); // U+D800
+ EXPECT_EQ(0, SpanStructurallyValid("\xED\xBF\xBF")); // U+DFFF
+
+ // non-shortest forms should all yield an empty valid prefix (0)
+ EXPECT_EQ(0, SpanStructurallyValid("\xc0\x80"));
+ EXPECT_EQ(0, SpanStructurallyValid("\xc1\xbf"));
+ EXPECT_EQ(0, SpanStructurallyValid("\xe0\x80\x80"));
+ EXPECT_EQ(0, SpanStructurallyValid("\xe0\x9f\xbf"));
+ EXPECT_EQ(0, SpanStructurallyValid("\xf0\x80\x80\x80"));
+ EXPECT_EQ(0, SpanStructurallyValid("\xf0\x83\xbf\xbf"));
+
+ // This string unchecked caused GWS to crash 7/2006:
+ // invalid sequence 0xc7 0xc8 0xcd 0xcb
+ EXPECT_EQ(0, SpanStructurallyValid("\xc7\xc8\xcd\xcb"));
+}
+
+TEST(Utf8Validity, IsStructurallyValid) {
+ // Fully valid inputs
+ EXPECT_TRUE(IsStructurallyValid("abcd"));
+ EXPECT_TRUE(IsStructurallyValid(absl::string_view("a\0cd", 4))); // embedded NUL
+ EXPECT_TRUE(IsStructurallyValid("ab\xc2\x81")); // 2-byte
+ EXPECT_TRUE(IsStructurallyValid("a\xe2\x81\x81")); // 3-byte
+ EXPECT_TRUE(IsStructurallyValid("\xf2\x81\x81\x81")); // 4-byte
+
+ // Invalid inputs: a single bad sequence makes the whole string invalid
+ EXPECT_FALSE(IsStructurallyValid("abc\x80")); // stray trail byte
+ EXPECT_FALSE(IsStructurallyValid("abc\xc2")); // truncated 2-byte
+ EXPECT_FALSE(IsStructurallyValid("ab\xe2\x81")); // truncated 3-byte
+ EXPECT_FALSE(IsStructurallyValid("a\xf2\x81\x81")); // truncated 4-byte
+ EXPECT_FALSE(IsStructurallyValid("ab\xc0\x81")); // overlong 2-byte form
+ EXPECT_FALSE(IsStructurallyValid("a\xe0\x81\x81")); // overlong 3-byte form
+ EXPECT_FALSE(IsStructurallyValid("\xf0\x81\x81\x81")); // overlong 4-byte form
+ EXPECT_FALSE(IsStructurallyValid("\xf4\xbf\xbf\xbf")); // above U+10FFFF
+ // surrogate min, max
+ EXPECT_FALSE(IsStructurallyValid("\xED\xA0\x80")); // U+D800
+ EXPECT_FALSE(IsStructurallyValid("\xED\xBF\xBF")); // U+DFFF
+
+ // non-shortest forms should all return false
+ EXPECT_FALSE(IsStructurallyValid("\xc0\x80"));
+ EXPECT_FALSE(IsStructurallyValid("\xc1\xbf"));
+ EXPECT_FALSE(IsStructurallyValid("\xe0\x80\x80"));
+ EXPECT_FALSE(IsStructurallyValid("\xe0\x9f\xbf"));
+ EXPECT_FALSE(IsStructurallyValid("\xf0\x80\x80\x80"));
+ EXPECT_FALSE(IsStructurallyValid("\xf0\x83\xbf\xbf"));
+
+ // This string unchecked caused GWS to crash 7/2006:
+ // invalid sequence 0xc7 0xc8 0xcd 0xcb
+ EXPECT_FALSE(IsStructurallyValid("\xc7\xc8\xcd\xcb"));
+}
+
+} // namespace utf8_range
diff --git a/third_party/utf8_range/workspace_deps.bzl b/third_party/utf8_range/workspace_deps.bzl
new file mode 100644
index 0000000000..d296f9ff51
--- /dev/null
+++ b/third_party/utf8_range/workspace_deps.bzl
@@ -0,0 +1,11 @@
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe")
+
+def utf8_range_deps():  # Declares the external repository this library needs.
+ maybe(  # NOTE(review): presumably a no-op if "com_google_absl" already exists -- confirm
+ http_archive,
+ name = "com_google_absl",  # fetched from the commit archive URL below
+ url = "https://github.com/abseil/abseil-cpp/archive/8c0b94e793a66495e0b1f34a5eb26bd7dc672db0.zip",
+ strip_prefix = "abseil-cpp-8c0b94e793a66495e0b1f34a5eb26bd7dc672db0",
+ sha256 = "b9f490fae1c0d89a19073a081c3c588452461e5586e4ae31bc50a8f36339135e",
+ )
diff --git a/update_subtrees.sh b/update_subtrees.sh
old mode 100755
new mode 100644