From 8c0d1916a41f0fb32340ce5257de780acf598353 Mon Sep 17 00:00:00 2001 From: Behdad Esfahbod Date: Fri, 5 Jan 2018 12:46:12 +0000 Subject: [PATCH] Improve CGJ skipping logic Previously we made CGJ unskippable. Now, if CGJ did NOT prevent any reordering, allow skipping over it. To make this work we had to make changes to the Arabic mark reordering algorithm implementation to renumber moved MCM marks. See comments. Fixes https://github.com/harfbuzz/harfbuzz/issues/554 --- src/hb-buffer-private.hh | 1 + src/hb-ot-layout-private.hh | 11 ++++++- src/hb-ot-shape-complex-arabic.cc | 37 ++++++++++++++++------ src/hb-ot-shape-normalize.cc | 30 ++++++++++-------- test/shaping/tests/arabic-mark-order.tests | 4 +++ 5 files changed, 58 insertions(+), 25 deletions(-) diff --git a/src/hb-buffer-private.hh b/src/hb-buffer-private.hh index 97bdc1be3..fa5ca166d 100644 --- a/src/hb-buffer-private.hh +++ b/src/hb-buffer-private.hh @@ -69,6 +69,7 @@ enum hb_buffer_scratch_flags_t { HB_BUFFER_SCRATCH_FLAG_HAS_SPACE_FALLBACK = 0x00000004u, HB_BUFFER_SCRATCH_FLAG_HAS_GPOS_ATTACHMENT = 0x00000008u, HB_BUFFER_SCRATCH_FLAG_HAS_UNSAFE_TO_BREAK = 0x00000010u, + HB_BUFFER_SCRATCH_FLAG_HAS_CGJ = 0x00000020u, /* Reserved for complex shapers' internal use. */ HB_BUFFER_SCRATCH_FLAG_COMPLEX0 = 0x01000000u, diff --git a/src/hb-ot-layout-private.hh b/src/hb-ot-layout-private.hh index 0f0926f89..be5695d98 100644 --- a/src/hb-ot-layout-private.hh +++ b/src/hb-ot-layout-private.hh @@ -280,7 +280,11 @@ _hb_glyph_info_set_unicode_props (hb_glyph_info_t *info, hb_buffer_t *buffer) else if (unlikely (hb_in_range (u, 0xE0020u, 0xE007Fu))) props |= UPROPS_MASK_HIDDEN; /* COMBINING GRAPHEME JOINER should not be skipped; at least some times. * https://github.com/harfbuzz/harfbuzz/issues/554 */ - else if (unlikely (u == 0x034Fu)) props |= UPROPS_MASK_HIDDEN; + else if (unlikely (u == 0x034Fu)) + { + buffer->scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_CGJ; + props |= UPROPS_MASK_HIDDEN; + } } else if (unlikely (HB_UNICODE_GENERAL_CATEGORY_IS_NON_ENCLOSING_MARK_OR_MODIFIER_SYMBOL (gen_cat))) { @@ -388,6 +392,11 @@ _hb_glyph_info_is_default_ignorable_and_not_hidden (const hb_glyph_info_t *info) == UPROPS_MASK_IGNORABLE) && !_hb_glyph_info_ligated (info); } +static inline void +_hb_glyph_info_unhide (hb_glyph_info_t *info) +{ + info->unicode_props() &= ~ UPROPS_MASK_HIDDEN; +} static inline bool _hb_glyph_info_is_unicode_format (const hb_glyph_info_t *info) diff --git a/src/hb-ot-shape-complex-arabic.cc b/src/hb-ot-shape-complex-arabic.cc index eb9d36ff1..47961bfd5 100644 --- a/src/hb-ot-shape-complex-arabic.cc +++ b/src/hb-ot-shape-complex-arabic.cc @@ -644,13 +644,15 @@ reorder_marks_arabic (const hb_ot_shape_plan_t *plan, { hb_glyph_info_t *info = buffer->info; + DEBUG_MSG (ARABIC, buffer, "Reordering marks from %d to %d", start, end); + unsigned int i = start; for (unsigned int cc = 220; cc <= 230; cc += 10) { - DEBUG_MSG (ARABIC, buffer, "Looking for %d's starting at %d\n", cc, i); + DEBUG_MSG (ARABIC, buffer, "Looking for %d's starting at %d", cc, i); while (i < end && info_cc(info[i]) < cc) i++; - DEBUG_MSG (ARABIC, buffer, "Looking for %d's stopped at %d\n", cc, i); + DEBUG_MSG (ARABIC, buffer, "Looking for %d's stopped at %d", cc, i); if (i == end) break; @@ -658,20 +660,17 @@ reorder_marks_arabic (const hb_ot_shape_plan_t *plan, if (info_cc(info[i]) > cc) continue; - /* Technically we should also check "info_cc(info[j]) == cc" - * in the following loop. But not doing it is safe; we might - * end up moving all the 220 MCMs and 230 MCMs together in one - * move and be done. */ unsigned int j = i; - while (j < end && info_is_mcm (info[j])) + while (j < end && info_cc(info[j]) == cc && info_is_mcm (info[j])) j++; - DEBUG_MSG (ARABIC, buffer, "Found %d's from %d to %d\n", cc, i, j); if (i == j) continue; + DEBUG_MSG (ARABIC, buffer, "Found %d's from %d to %d", cc, i, j); + /* Shift it! */ - DEBUG_MSG (ARABIC, buffer, "Shifting %d's: %d %d\n", cc, i, j); + DEBUG_MSG (ARABIC, buffer, "Shifting %d's: %d %d", cc, i, j); hb_glyph_info_t temp[HB_OT_SHAPE_COMPLEX_MAX_COMBINING_MARKS]; assert (j - i <= ARRAY_LENGTH (temp)); buffer->merge_clusters (start, j); @@ -679,7 +678,25 @@ reorder_marks_arabic (const hb_ot_shape_plan_t *plan, memmove (&info[start + j - i], &info[start], (i - start) * sizeof (hb_glyph_info_t)); memmove (&info[start], temp, (j - i) * sizeof (hb_glyph_info_t)); - start += j - i; + /* Renumber CC such that the reordered sequence is still sorted. + * 22 and 26 are chosen because they are smaller than all Arabic categories, + * and are folded back to 220/230 respectively during fallback mark positioning. + * + * We do this because the CGJ-handling logic in the normalizer relies on + * mark sequences having an increasing order even after this reordering. + * https://github.com/harfbuzz/harfbuzz/issues/554 + * This, however, does break some obscure sequences, where the normalizer + * might compose a sequence that it should not. For example, in the seequence + * ALEF, HAMZAH, MADDAH, we should NOT try to compose ALEF+MADDAH, but with this + * renumbering, we will. + */ + unsigned int new_start = start + j - i; + unsigned int new_cc = cc == 220 ? HB_MODIFIED_COMBINING_CLASS_CCC22 : HB_MODIFIED_COMBINING_CLASS_CCC26; + while (start < new_start) + { + _hb_glyph_info_set_modified_combining_class (&info[start], new_cc); + start++; + } i = j; } diff --git a/src/hb-ot-shape-normalize.cc b/src/hb-ot-shape-normalize.cc index fd9e7c2a8..62cbb9de9 100644 --- a/src/hb-ot-shape-normalize.cc +++ b/src/hb-ot-shape-normalize.cc @@ -345,9 +345,8 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan, if (_hb_glyph_info_get_modified_combining_class (&buffer->info[end]) == 0) break; - /* We are going to do a O(n^2). Only do this if the sequence is short, - * but not too short ;). */ - if (end - i < 2 || end - i > HB_OT_SHAPE_COMPLEX_MAX_COMBINING_MARKS) { + /* We are going to do a O(n^2). Only do this if the sequence is short. */ + if (end - i > HB_OT_SHAPE_COMPLEX_MAX_COMBINING_MARKS) { i = end; continue; } @@ -373,13 +372,11 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan, buffer->clear_output (); count = buffer->len; unsigned int starter = 0; - bool combine = true; buffer->next_glyph (); while (buffer->idx < count && !buffer->in_error) { hb_codepoint_t composed, glyph; - if (combine && - /* We don't try to compose a non-mark character with it's preceding starter. + if (/* We don't try to compose a non-mark character with it's preceding starter. * This is both an optimization to avoid trying to compose every two neighboring * glyphs in most scripts AND a desired feature for Hangul. Apparently Hangul * fonts are not designed to mix-and-match pre-composed syllables and Jamo. */ @@ -410,22 +407,27 @@ _hb_ot_shape_normalize (const hb_ot_shape_plan_t *plan, continue; } - else if (/* We sometimes custom-tailor the sorted order of marks. In that case, stop - * trying to combine as soon as combining-class drops. */ - starter < buffer->out_len - 1 && - info_cc (buffer->prev()) > info_cc (buffer->cur())) - combine = false; } /* Blocked, or doesn't compose. */ buffer->next_glyph (); if (info_cc (buffer->prev()) == 0) - { starter = buffer->out_len - 1; - combine = true; - } } buffer->swap_buffers (); + if (buffer->scratch_flags & HB_BUFFER_SCRATCH_FLAG_HAS_CGJ) + { + /* For all CGJ, check if it prevented any reordering at all. + * If it did NOT, then make it skippable. + * https://github.com/harfbuzz/harfbuzz/issues/554 + */ + for (unsigned int i = 1; i + 1 < buffer->len; i++) + if (buffer->info[i].codepoint == 0x034Fu/*CGJ*/ && + info_cc(buffer->info[i-1]) <= info_cc(buffer->info[i+1])) + { + _hb_glyph_info_unhide (&buffer->info[i]); + } + } } diff --git a/test/shaping/tests/arabic-mark-order.tests b/test/shaping/tests/arabic-mark-order.tests index b48c82fd3..cc5226bb2 100644 --- a/test/shaping/tests/arabic-mark-order.tests +++ b/test/shaping/tests/arabic-mark-order.tests @@ -1,2 +1,6 @@ fonts/sha1sum/94a5d6fb15a27521fba9ea4aee9cb39b2d03322a.ttf::U+064A,U+064E,U+0670,U+0653,U+0640,U+0654,U+064E,U+0627:[afii57415.zz04=7+481|afii57454=4@25,975+0|uni0654=4@-50,50+0|afii57440=4+650|uni0670_uni0653=0@75,400+0|afii57454=0@750,1125+0|afii57450.calt=0+1331] fonts/sha1sum/24b8d24d00ae86f49791b746da4c9d3f717a51a8.ttf::U+0628,U+0618,U+0619,U+064E,U+064F,U+0654,U+0658,U+0653,U+0654,U+0651,U+0656,U+0651,U+065C,U+0655,U+0650:[uni0653.small=0@266,2508+0|uni0654=0@308,2151+0|uni0655=0@518,-1544+0|uni065C=0@501,-1453+0|uni0656=0@573,-659+0|uni0650=0@500,133+0|uni0619=0@300,1807+0|uni0618=0@357,1674+0|uni0651064E=0@387,1178+0|uni0651=0@402,764+0|uni0658=0@424,404+0|uni0654064F=0@540,-435+0|uni0628=0+1352] +fonts/sha1sum/21b7fb9c1eeae260473809fbc1fe330f66a507cd.ttf::U+0649,U+0655,U+034F,U+0650:[uni0650.small2=0@727,-774+0|space=0+0|uni0655=0@727,-209+0|uni0649=0+1566] +fonts/sha1sum/21b7fb9c1eeae260473809fbc1fe330f66a507cd.ttf::U+0649,U+0655,U+0650:[uni0650.small2=0@727,-774+0|uni0655=0@727,-209+0|uni0649=0+1566] +fonts/sha1sum/21b7fb9c1eeae260473809fbc1fe330f66a507cd.ttf::U+0649,U+0650,U+0655:[uni0650.small2=0@727,-774+0|uni0655=0@727,-209+0|uni0649=0+1566] +fonts/sha1sum/21b7fb9c1eeae260473809fbc1fe330f66a507cd.ttf::U+0649,U+0650,U+034F,U+0655:[uni0655=0+0|space=0+0|uni0650=0@166,0+0|uni0649=0+1566]