diff --git a/src/hb-unicode-private.hh b/src/hb-unicode-private.hh index 14d8131c4..5c5fff974 100644 --- a/src/hb-unicode-private.hh +++ b/src/hb-unicode-private.hh @@ -1,7 +1,7 @@ /* * Copyright © 2009 Red Hat, Inc. * Copyright © 2011 Codethink Limited - * Copyright © 2010,2011 Google, Inc. + * Copyright © 2010,2011,2012 Google, Inc. * * This is part of HarfBuzz, a text shaping library. * @@ -37,6 +37,7 @@ #include "hb-object-private.hh" +extern HB_INTERNAL const uint8_t _hb_modified_combining_class[256]; /* * hb_unicode_funcs_t @@ -143,8 +144,11 @@ HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS_SIMPLE } - HB_INTERNAL unsigned int - modified_combining_class (hb_codepoint_t unicode); + unsigned int + modified_combining_class (hb_codepoint_t unicode) + { + return _hb_modified_combining_class[combining_class (unicode)]; + } inline hb_bool_t is_variation_selector (hb_codepoint_t unicode) diff --git a/src/hb-unicode.cc b/src/hb-unicode.cc index b1bd147e5..f4eae0d5f 100644 --- a/src/hb-unicode.cc +++ b/src/hb-unicode.cc @@ -1,7 +1,7 @@ /* * Copyright © 2009 Red Hat, Inc. - * Copyright © 2011 Codethink Limited - * Copyright © 2010,2011 Google, Inc. + * Copyright © 2011 Codethink Limited + * Copyright © 2010,2011,2012 Google, Inc. * * This is part of HarfBuzz, a text shaping library. * @@ -287,69 +287,148 @@ hb_unicode_decompose_compatibility (hb_unicode_funcs_t *ufuncs, } -unsigned int -hb_unicode_funcs_t::modified_combining_class (hb_codepoint_t unicode) +const uint8_t +_hb_modified_combining_class[256] = { - int c = combining_class (unicode); - - if (unlikely (hb_in_range (c, 27, 33))) - { - /* Modify the combining-class to suit Arabic better. See: - * http://unicode.org/faq/normalization.html#8 - * http://unicode.org/faq/normalization.html#9 - */ - c = c == 33 ? 27 : c + 1; - } - else if (unlikely (hb_in_range (c, 10, 26))) - { - /* The equivalent fix for Hebrew is more complex. 
- * - * We permute the "fixed-position" classes 10-26 into the order - * described in the SBL Hebrew manual: - * - * http://www.sbl-site.org/Fonts/SBLHebrewUserManual1.5x.pdf - * - * (as recommended by: - * http://forum.fontlab.com/archive-old-microsoft-volt-group/vista-and-diacritic-ordering-t6751.0.html) - * - * More details here: - * https://bugzilla.mozilla.org/show_bug.cgi?id=662055 - */ - static const int permuted_hebrew_classes[26 - 10 + 1] = { - /* 10 sheva */ 22, - /* 11 hataf segol */ 15, - /* 12 hataf patah */ 16, - /* 13 hataf qamats */ 17, - /* 14 hiriq */ 23, - /* 15 tsere */ 18, - /* 16 segol */ 19, - /* 17 patah */ 20, - /* 18 qamats */ 21, - /* 19 holam */ 14, - /* 20 qubuts */ 24, - /* 21 dagesh */ 12, - /* 22 meteg */ 25, - /* 23 rafe */ 13, - /* 24 shin dot */ 10, - /* 25 sin dot */ 11, - /* 26 point varika */ 26, - }; - c = permuted_hebrew_classes[c - 10]; - } - else if (unlikely (unicode == 0x0E3A)) /* THAI VOWEL SIGN PHINTHU */ - { - /* Assign 104, so it reorders after the THAI ccc=103 marks. - * Uniscribe does this. */ - c = 104; - } - else if (unlikely (hb_in_range (unicode, 0x0C55, 0x0C56))) - { - /* Telugu length marks. - * These are the only matras in the main Indic script range that have - * a non-zero ccc. That makes them reorder with the Halant that is - * ccc=9. Just zero them, we don't need them in our Indic shaper. 
*/ - c = 0; - } - - return c; -} + 0, /* HB_UNICODE_COMBINING_CLASS_NOT_REORDERED */ + 1, /* HB_UNICODE_COMBINING_CLASS_OVERLAY */ + 2, 3, 4, 5, 6, + 7, /* HB_UNICODE_COMBINING_CLASS_NUKTA */ + 8, /* HB_UNICODE_COMBINING_CLASS_KANA_VOICING */ + 9, /* HB_UNICODE_COMBINING_CLASS_VIRAMA */ + + /* Hebrew */ + + /* + * We permute the "fixed-position" classes 10-26 into the order + * described in the SBL Hebrew manual: + * + * http://www.sbl-site.org/Fonts/SBLHebrewUserManual1.5x.pdf + * + * (as recommended by: + * http://forum.fontlab.com/archive-old-microsoft-volt-group/vista-and-diacritic-ordering-t6751.0.html) + * + * More details here: + * https://bugzilla.mozilla.org/show_bug.cgi?id=662055 + */ + 22, /* HB_UNICODE_COMBINING_CLASS_CCC10 sheva */ + 15, /* HB_UNICODE_COMBINING_CLASS_CCC11 hataf segol */ + 16, /* HB_UNICODE_COMBINING_CLASS_CCC12 hataf patah */ + 17, /* HB_UNICODE_COMBINING_CLASS_CCC13 hataf qamats */ + 23, /* HB_UNICODE_COMBINING_CLASS_CCC14 hiriq */ + 18, /* HB_UNICODE_COMBINING_CLASS_CCC15 tsere */ + 19, /* HB_UNICODE_COMBINING_CLASS_CCC16 segol */ + 20, /* HB_UNICODE_COMBINING_CLASS_CCC17 patah */ + 21, /* HB_UNICODE_COMBINING_CLASS_CCC18 qamats */ + 14, /* HB_UNICODE_COMBINING_CLASS_CCC19 holam */ + 24, /* HB_UNICODE_COMBINING_CLASS_CCC20 qubuts */ + 12, /* HB_UNICODE_COMBINING_CLASS_CCC21 dagesh */ + 25, /* HB_UNICODE_COMBINING_CLASS_CCC22 meteg */ + 13, /* HB_UNICODE_COMBINING_CLASS_CCC23 rafe */ + 10, /* HB_UNICODE_COMBINING_CLASS_CCC24 shin dot */ + 11, /* HB_UNICODE_COMBINING_CLASS_CCC25 sin dot */ + + 26, /* HB_UNICODE_COMBINING_CLASS_CCC26 point varika */ + + /* Arabic */ + + /* + * Modify to move Shadda (ccc=33) before other marks. 
See: + * http://unicode.org/faq/normalization.html#8 + * http://unicode.org/faq/normalization.html#9 + */ + 28, /* HB_UNICODE_COMBINING_CLASS_CCC27 */ + 29, /* HB_UNICODE_COMBINING_CLASS_CCC28 */ + 30, /* HB_UNICODE_COMBINING_CLASS_CCC29 */ + 31, /* HB_UNICODE_COMBINING_CLASS_CCC30 */ + 32, /* HB_UNICODE_COMBINING_CLASS_CCC31 */ + 33, /* HB_UNICODE_COMBINING_CLASS_CCC32 */ + 27, /* HB_UNICODE_COMBINING_CLASS_CCC33 shadda */ + + 34, /* HB_UNICODE_COMBINING_CLASS_CCC34 */ + 35, /* HB_UNICODE_COMBINING_CLASS_CCC35 */ + + /* Syriac */ + 36, /* HB_UNICODE_COMBINING_CLASS_CCC36 */ + + 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, + 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, + + /* Telugu */ + + /* + * Modify Telugu length marks (ccc=84, ccc=91). + * These are the only matras in the main Indic scripts range that have + * a non-zero ccc. That makes them reorder with the Halant that is + * ccc=9. Just zero them, we don't need them in our Indic shaper. + */ + 0, /* HB_UNICODE_COMBINING_CLASS_CCC84 */ + 85, 86, 87, 88, 89, 90, + 0, /* HB_UNICODE_COMBINING_CLASS_CCC91 */ + 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, + + /* Thai */ + + /* + * Modify U+0E38 and U+0E39 (ccc=103) to be reordered before U+0E3A (ccc=9). + * Uniscribe does this too. 
+ */ + 3, /* HB_UNICODE_COMBINING_CLASS_CCC103 */ + + 104, 105, 106, + 107, /* HB_UNICODE_COMBINING_CLASS_CCC107 */ + 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, + + /* Lao */ + 118, /* HB_UNICODE_COMBINING_CLASS_CCC118 */ + 119, 120, 121, + 122, /* HB_UNICODE_COMBINING_CLASS_CCC122 */ + 123, 124, 125, 126, 127, 128, + + /* Tibetan */ + 129, /* HB_UNICODE_COMBINING_CLASS_CCC129 */ + 130, /* HB_UNICODE_COMBINING_CLASS_CCC130 */ + 131, + 132, /* HB_UNICODE_COMBINING_CLASS_CCC133 -- NOTE(review): this is the entry for ccc=132; confirm the enum name */ + 133, 134, 135, 136, 137, 138, 139, + + + 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, + 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, + 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, + 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, + 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, + + 200, /* HB_UNICODE_COMBINING_CLASS_ATTACHED_BELOW_LEFT */ + 201, + 202, /* HB_UNICODE_COMBINING_CLASS_ATTACHED_BELOW */ + 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, + 214, /* HB_UNICODE_COMBINING_CLASS_ATTACHED_ABOVE */ + 215, + 216, /* HB_UNICODE_COMBINING_CLASS_ATTACHED_ABOVE_RIGHT */ + 217, + 218, /* HB_UNICODE_COMBINING_CLASS_BELOW_LEFT */ + 219, + 220, /* HB_UNICODE_COMBINING_CLASS_BELOW */ + 221, + 222, /* HB_UNICODE_COMBINING_CLASS_BELOW_RIGHT */ + 223, + 224, /* HB_UNICODE_COMBINING_CLASS_LEFT */ + 225, + 226, /* HB_UNICODE_COMBINING_CLASS_RIGHT */ + 227, + 228, /* HB_UNICODE_COMBINING_CLASS_ABOVE_LEFT */ + 229, + 230, /* HB_UNICODE_COMBINING_CLASS_ABOVE */ + 231, + 232, /* HB_UNICODE_COMBINING_CLASS_ABOVE_RIGHT */ + 233, /* HB_UNICODE_COMBINING_CLASS_DOUBLE_BELOW */ + 234, /* HB_UNICODE_COMBINING_CLASS_DOUBLE_ABOVE */ + 235, 236, 237, 238, 239, + 240, /* HB_UNICODE_COMBINING_CLASS_IOTA_SUBSCRIPT */ + 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, + 255, /* HB_UNICODE_COMBINING_CLASS_INVALID */ +}; diff --git a/src/hb-unicode.h b/src/hb-unicode.h index 47084dab9..2e10d98a3 100644 --- 
a/src/hb-unicode.h +++ b/src/hb-unicode.h @@ -79,6 +79,10 @@ typedef enum /* hb_unicode_combining_class_t */ +/* Note: newer versions of Unicode may add new values. Clients should be ready to handle + * any value in the 0..254 range being returned from hb_unicode_combining_class(). + */ + /* Unicode Character Database property: Canonical_Combining_Class (ccc) */ typedef enum {