From 7c7cb2a98907d99ca86bdbfca0bf9c48bfa4ed49 Mon Sep 17 00:00:00 2001 From: David Corbett Date: Sat, 20 Jan 2018 15:53:09 -0500 Subject: [PATCH] Match extlang subtags If the second subtag of a BCP 47 tag is three letters long, it denotes an extended language. The tag converter ignores the language subtag and uses the extended language instead. There are some grandfathered exceptions, which are handled earlier. --- src/gen-tag-table.py | 2 +- src/hb-ot-tag-table.hh | 21 +++++++++++++++++++++ src/hb-ot-tag.cc | 10 +++++++++- test/api/test-ot-tag.c | 7 +++++++ 4 files changed, 38 insertions(+), 2 deletions(-) diff --git a/src/gen-tag-table.py b/src/gen-tag-table.py index 925ffb439..7cbf3a79e 100755 --- a/src/gen-tag-table.py +++ b/src/gen-tag-table.py @@ -884,7 +884,7 @@ def print_subtag_matches (subtag): for language, tags in sorted (ot.from_bcp_47.items (), key=lambda i: (-len (i[0]), i[0])): lt = LanguageTag (language) - if len (lt.subtags) == 1 or lt.grandfathered and ot.from_bcp_47[lt.subtags[0]] == tags: + if len (lt.subtags) == 1 or lt.grandfathered and len (lt.subtags[1]) != 3 and ot.from_bcp_47[lt.subtags[0]] == tags: continue print (' if (', end='') if (lt.language == 'und' or diff --git a/src/hb-ot-tag-table.hh b/src/hb-ot-tag-table.hh index 0d06255a3..721136883 100644 --- a/src/hb-ot-tag-table.hh +++ b/src/hb-ot-tag-table.hh @@ -1279,6 +1279,13 @@ hb_ot_tags_from_complex_language (const char *lang_str, *count = 1; return true; } + if (0 == strcmp (lang_str, "zh-min-nan")) + { + /* Minnan, Hokkien, Amoy, Taiwanese, Southern Min, Southern Fujian, Hoklo, Southern Fukien, Ho-lo */ + tags[0] = HB_TAG('Z','H','S',' '); /* Chinese Simplified */ + *count = 1; + return true; + } if (lang_matches (lang_str, "cdo-hans")) { /* Min Dong Chinese; Han (Simplified variant) */ @@ -1791,6 +1798,13 @@ hb_ot_tags_from_complex_language (const char *lang_str, *count = 1; return true; } + if (0 == strcmp (lang_str, "no-bok")) + { + /* Norwegian Bokmal */ + tags[0] = HB_TAG('N','O','R',' '); /* Norwegian */ + *count = 1; + return true; + } if (0 == strcmp (lang_str, "no-nyn")) { /* Norwegian Nynorsk */ @@ -1822,6 +1836,13 @@ hb_ot_tags_from_complex_language (const char *lang_str, *count = 1; return true; } + if (0 == strcmp (lang_str, "zh-min")) + { + /* Min, Fuzhou, Hokkien, Amoy, or Taiwanese */ + tags[0] = HB_TAG('Z','H','S',' '); /* Chinese Simplified */ + *count = 1; + return true; + } if (0 == strcmp (lang_str, "i-hak")) { /* Hakka */ diff --git a/src/hb-ot-tag.cc b/src/hb-ot-tag.cc index 4d8cb5989..0d4c06030 100644 --- a/src/hb-ot-tag.cc +++ b/src/hb-ot-tag.cc @@ -249,8 +249,17 @@ hb_ot_tags_from_language (const char *lang_str, return; /* Find a language matching in the first component. */ + s = strchr (lang_str, '-'); { const LangTag *lang_tag; + if (s && limit - lang_str >= 6) + { + const char *extlang_end = strchr (s + 1, '-'); + /* If there is an extended language tag, use it. */ + if (3 == (extlang_end ? extlang_end - s - 1 : strlen (s + 1)) && + ISALPHA (s[1])) + lang_str = s + 1; + } lang_tag = (LangTag *) bsearch (lang_str, ot_languages, ARRAY_LENGTH (ot_languages), sizeof (LangTag), lang_compare_first_component); @@ -264,7 +273,6 @@ hb_ot_tags_from_language (const char *lang_str, } } - s = strchr (lang_str, '-'); if (!s) s = lang_str + strlen (lang_str); if (s - lang_str == 3) { diff --git a/test/api/test-ot-tag.c b/test/api/test-ot-tag.c index 6d64d131b..350289812 100644 --- a/test/api/test-ot-tag.c +++ b/test/api/test-ot-tag.c @@ -369,9 +369,13 @@ test_ot_tag_language (void) test_tag_from_language ("ABC", "xyz-xy-x-hbotabc-zxc"); /* Unnormalized BCP 47 tags */ + test_tag_from_language ("ARA", "ar-aao"); test_tag_from_language ("JBO", "art-lojban"); + test_tag_from_language ("KOK", "kok-gom"); test_tag_from_language ("LTZ", "i-lux"); test_tag_from_language ("MNG", "drh"); + test_tag_from_language ("MOR", "ar-ary"); + test_tag_from_language ("MOR", "ar-ary-DZ"); test_tag_from_language ("NOR", "no-bok"); test_tag_from_language ("NYN", "no-nyn"); test_tag_from_language ("ZHS", "i-hak"); @@ -379,6 +383,9 @@ test_ot_tag_language (void) test_tag_from_language ("ZHS", "zh-min"); test_tag_from_language ("ZHS", "zh-min-nan"); test_tag_from_language ("ZHS", "zh-xiang"); + + /* A UN M.49 region code, not an extended language subtag */ + test_tag_from_language ("ARA", "ar-001"); } static void