Match extlang subtags

If the second subtag of a BCP 47 tag is three letters long, it denotes
an extended language (extlang) subtag. In that case the tag converter
ignores the primary language subtag and uses the extended language
subtag instead.

There are some grandfathered exceptions (e.g. "zh-min", "zh-min-nan",
"no-bok"), which are matched exactly earlier, before the extlang rule
is applied.
pull/1238/head
David Corbett 7 years ago committed by Behdad Esfahbod
parent 2f1f961cc0
commit 7c7cb2a989
  1. 2
      src/gen-tag-table.py
  2. 21
      src/hb-ot-tag-table.hh
  3. 10
      src/hb-ot-tag.cc
  4. 7
      test/api/test-ot-tag.c

@ -884,7 +884,7 @@ def print_subtag_matches (subtag):
for language, tags in sorted (ot.from_bcp_47.items (), key=lambda i: (-len (i[0]), i[0])):
lt = LanguageTag (language)
if len (lt.subtags) == 1 or lt.grandfathered and ot.from_bcp_47[lt.subtags[0]] == tags:
if len (lt.subtags) == 1 or lt.grandfathered and len (lt.subtags[1]) != 3 and ot.from_bcp_47[lt.subtags[0]] == tags:
continue
print (' if (', end='')
if (lt.language == 'und' or

@ -1279,6 +1279,13 @@ hb_ot_tags_from_complex_language (const char *lang_str,
*count = 1;
return true;
}
if (0 == strcmp (lang_str, "zh-min-nan"))
{
/* Minnan, Hokkien, Amoy, Taiwanese, Southern Min, Southern Fujian, Hoklo, Southern Fukien, Ho-lo */
tags[0] = HB_TAG('Z','H','S',' '); /* Chinese Simplified */
*count = 1;
return true;
}
if (lang_matches (lang_str, "cdo-hans"))
{
/* Min Dong Chinese; Han (Simplified variant) */
@ -1791,6 +1798,13 @@ hb_ot_tags_from_complex_language (const char *lang_str,
*count = 1;
return true;
}
if (0 == strcmp (lang_str, "no-bok"))
{
/* Norwegian Bokmal */
tags[0] = HB_TAG('N','O','R',' '); /* Norwegian */
*count = 1;
return true;
}
if (0 == strcmp (lang_str, "no-nyn"))
{
/* Norwegian Nynorsk */
@ -1822,6 +1836,13 @@ hb_ot_tags_from_complex_language (const char *lang_str,
*count = 1;
return true;
}
if (0 == strcmp (lang_str, "zh-min"))
{
/* Min, Fuzhou, Hokkien, Amoy, or Taiwanese */
tags[0] = HB_TAG('Z','H','S',' '); /* Chinese Simplified */
*count = 1;
return true;
}
if (0 == strcmp (lang_str, "i-hak"))
{
/* Hakka */

@ -249,8 +249,17 @@ hb_ot_tags_from_language (const char *lang_str,
return;
/* Find a language matching in the first component. */
s = strchr (lang_str, '-');
{
const LangTag *lang_tag;
if (s && limit - lang_str >= 6)
{
const char *extlang_end = strchr (s + 1, '-');
/* If there is an extended language tag, use it. */
if (3 == (extlang_end ? extlang_end - s - 1 : strlen (s + 1)) &&
ISALPHA (s[1]))
lang_str = s + 1;
}
lang_tag = (LangTag *) bsearch (lang_str, ot_languages,
ARRAY_LENGTH (ot_languages), sizeof (LangTag),
lang_compare_first_component);
@ -264,7 +273,6 @@ hb_ot_tags_from_language (const char *lang_str,
}
}
s = strchr (lang_str, '-');
if (!s)
s = lang_str + strlen (lang_str);
if (s - lang_str == 3) {

@ -369,9 +369,13 @@ test_ot_tag_language (void)
test_tag_from_language ("ABC", "xyz-xy-x-hbotabc-zxc");
/* Unnormalized BCP 47 tags */
test_tag_from_language ("ARA", "ar-aao");
test_tag_from_language ("JBO", "art-lojban");
test_tag_from_language ("KOK", "kok-gom");
test_tag_from_language ("LTZ", "i-lux");
test_tag_from_language ("MNG", "drh");
test_tag_from_language ("MOR", "ar-ary");
test_tag_from_language ("MOR", "ar-ary-DZ");
test_tag_from_language ("NOR", "no-bok");
test_tag_from_language ("NYN", "no-nyn");
test_tag_from_language ("ZHS", "i-hak");
@ -379,6 +383,9 @@ test_ot_tag_language (void)
test_tag_from_language ("ZHS", "zh-min");
test_tag_from_language ("ZHS", "zh-min-nan");
test_tag_from_language ("ZHS", "zh-xiang");
/* A UN M.49 region code, not an extended language subtag */
test_tag_from_language ("ARA", "ar-001");
}
static void

Loading…
Cancel
Save