Switch on the first char of a complex language tag

This results in a tenfold speed-up for the common case of tags that are not complex, in the sense of `hb_ot_tags_from_complex_language`.
6 years ago · 3f8877473f
parent a754d44195
commit 3f8877473f
2 changed files with 922 additions and 829 deletions
--- a/src/gen-tag-table.py
+++ b/src/gen-tag-table.py
@ -288,6 +288,37 @@ class LanguageTag (object):
 		except StopIteration:
 			return None

+	def is_complex (self):
+		"""Return whether this tag is too complex to represent as a
+		``LangTag`` in the generated code.
+
+		Complex tags need to be handled in
+		``hb_ot_tags_from_complex_language``.
+
+		A tag is treated as simple (not complex) when it has exactly one
+		subtag, or when it is grandfathered, its second subtag is not
+		three characters long, and ``ot.from_bcp_47`` holds the same
+		value for its first subtag as for ``self.language``.
+
+		Returns:
+			Whether this tag is complex.
+		"""
+		# ``and`` binds tighter than ``or``, so the condition reads
+		# "single subtag, or (grandfathered and ... and ...)".
+		# NOTE(review): relies on the module-level ``ot`` object;
+		# presumably both keys are always present in
+		# ``ot.from_bcp_47`` when this is reached -- confirm.
+		return not (len (self.subtags) == 1
+			or self.grandfathered
+			and len (self.subtags[1]) != 3
+			and ot.from_bcp_47[self.subtags[0]] == ot.from_bcp_47[self.language])
+
+	def get_group (self):
+		"""Return the group into which this tag should be categorized in
+		``hb_ot_tags_from_complex_language``.
+
+		The group is the first letter of the tag, or ``'und'`` if this tag
+		should not be matched in a ``switch`` statement in the generated
+		code.
+
+		A tag goes into the ``'und'`` group when its language is ``'und'``
+		or when its variant is listed in ``bcp_47.prefixes`` with exactly
+		one prefix.
+
+		Returns:
+			This tag's group.
+		"""
+		# ``and`` binds tighter than ``or``: the single-prefix test only
+		# applies to the variant lookup, not to the ``'und'`` comparison.
+		return ('und'
+			if (self.language == 'und'
+				or self.variant in bcp_47.prefixes and len (bcp_47.prefixes[self.variant]) == 1)
+			else self.language[0])
+
 class OpenTypeRegistryParser (HTMLParser):
 	"""A parser for the OpenType language system tag registry.

@ -598,16 +629,15 @@ class BCP47Parser (object):
 				for macrolanguage in macrolanguages:
 					self._add_macrolanguage (biggest_macrolanguage, macrolanguage)

-	def get_name (self, tag):
+	def get_name (self, lt):
 		"""Return the names of the subtags in a language tag.

 		Args:
-			tag (str): A BCP 47 language tag.
+			lt (LanguageTag): A BCP 47 language tag.

 		Returns:
-			The name form of ``tag``.
+			The name form of ``lt``.
 		"""
-		lt = LanguageTag (tag)
 		name = self.names[lt.language].split ('\n')[0]
 		if lt.script:
 			name += '; ' + self.names[lt.script.title ()].split ('\n')[0]
@ -909,58 +939,101 @@ print ('\t\t\t\t  unsigned int *count /* IN/OUT */,')
 print ('\t\t\t\t  hb_tag_t     *tags /* OUT */)')
 print ('{')

-def print_subtag_matches (subtag):
+def print_subtag_matches (subtag, new_line):
+	"""Print a C ``subtag_matches`` call for a subtag, if there is one.
+
+	Nothing is printed when ``subtag`` is falsy. The call is printed
+	without a trailing newline so the caller can close the condition.
+
+	Args:
+		subtag: The subtag to test for, without its leading hyphen, or a
+			falsy value to print nothing.
+		new_line: Whether to first end the current output line and start
+			a new one with a tab and ``&& ``.
+	"""
+	# NOTE(review): with ``new_line`` false no ``&&`` is printed, so two
+	# consecutive calls that both print would be emitted back to back
+	# without an operator -- confirm callers never hit that case.
 	if subtag:
-		print ()
-		print ('      && subtag_matches (lang_str, limit, "-%s")' % subtag, end='')
-
-for language, tags in sorted (ot.from_bcp_47.items (), key=lambda i: (-len (i[0]), i[0])):
-	lt = LanguageTag (language)
-	if len (lt.subtags) == 1 or lt.grandfathered and len (lt.subtags[1]) != 3 and ot.from_bcp_47[lt.subtags[0]] == tags:
+		if new_line:
+			print ()
+			print ('\t&& ', end='')
+		print ('subtag_matches (lang_str, limit, "-%s")' % subtag, end='')
+
+# Map each group (see ``LanguageTag.get_group``) to the list of
+# ``(LanguageTag, tags)`` pairs of the complex tags in that group.
+# The pairs are visited longest tag string first, then alphabetically, and
+# that order is kept within each group.
+# ``itertools.groupby`` only groups consecutive items, and the input is not
+# sorted by group; accumulating with ``+=`` into a ``defaultdict`` merges the
+# separate runs that share a group.
+complex_tags = collections.defaultdict (list)
+for initial, group in itertools.groupby ((lt_tags for lt_tags in [
+			(LanguageTag (language), tags)
+			for language, tags in sorted (ot.from_bcp_47.items (),
+				key=lambda i: (-len (i[0]), i[0]))
+		] if lt_tags[0].is_complex ()),
+		key=lambda lt_tags: lt_tags[0].get_group ()):
+	complex_tags[initial] += group
+
+# Emit the tests for the 'und' group ahead of the ``switch`` statement.
+# These tags are tested through their subtags only, so they are not
+# dispatched on the first character of the language string.
+for initial, items in sorted (complex_tags.items ()):
+	if initial != 'und':
 		continue
-	print ('  if (', end='')
-	if (lt.language == 'und' or
-			lt.variant in bcp_47.prefixes and
-			len (bcp_47.prefixes[lt.variant]) == 1):
+	for lt, tags in items:
 		if lt.variant in bcp_47.prefixes:
 			expect (next (iter (bcp_47.prefixes[lt.variant])) == lt.language,
 					'%s is not a valid prefix of %s' % (lt.language, lt.variant))
-		print ('1', end='')
-	elif lt.grandfathered:
-		print ('0 == strcmp (lang_str, "%s")' % lt.language, end='')
-	else:
-		print ('lang_matches (lang_str, "%s' % lt.language, end='')
-		if lt.script:
-			print ('-%s' % lt.script, end='')
-			lt.script = None
-			if lt.region:
-				print ('-%s' % lt.region, end='')
-				lt.region = None
-		print ('")', end='')
-	print_subtag_matches (lt.script)
-	print_subtag_matches (lt.region)
-	print_subtag_matches (lt.variant)
-	print (')')
-	print ('  {')
-	write ('    /* %s */' % bcp_47.get_name (language))
-	print ()
-	if len (tags) == 1:
-		write ('    tags[0] = %s;  /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
+		# NOTE(review): no ``&&`` is printed between these calls, so a tag
+		# with more than one of script/region/variant would produce
+		# adjacent calls without an operator -- presumably no such tag
+		# ends up in the 'und' group; confirm.
+		print ('  if (', end='')
+		print_subtag_matches (lt.script, False)
+		print_subtag_matches (lt.region, False)
+		print_subtag_matches (lt.variant, False)
+		print (')')
+		print ('  {')
+		write ('    /* %s */' % bcp_47.get_name (lt))
+		print ()
+		if len (tags) == 1:
+			write ('    tags[0] = %s;  /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
+			print ()
+			print ('    *count = 1;')
+		else:
+			# Declare the counter used by the generated ``for`` loop below;
+			# without it the generated C refers to an undeclared ``i``.
+			print ('    unsigned int i;')
+			print ('    hb_tag_t possible_tags[] = {')
+			for tag in tags:
+				write ('      %s,  /* %s */' % (hb_tag (tag), ot.names[tag]))
+				print ()
+			print ('    };')
+			print ('    for (i = 0; i < %s && i < *count; i++)' % len (tags))
+			print ('      tags[i] = possible_tags[i];')
+			print ('    *count = i;')
+		print ('    return true;')
+		print ('  }')
+
+# Emit one ``case`` per first letter; within a case the tags are tested in
+# the order stored in ``complex_tags``. The first character has already been
+# matched by the ``switch``, so every comparison starts at ``&lang_str[1]``.
+print ('  switch (lang_str[0])')
+print ('  {')
+for initial, items in sorted (complex_tags.items ()):
+	if initial == 'und':
+		continue
+	print ("  case '%s':" % initial)
+	for lt, tags in items:
+		print ('    if (', end='')
+		# Work on local copies: ``lt`` itself must stay intact because
+		# ``bcp_47.get_name (lt)`` below reads its script to build the
+		# comment.
+		script = lt.script
+		region = lt.region
+		if lt.grandfathered:
+			print ('0 == strcmp (&lang_str[1], "%s")' % lt.language[1:], end='')
+		else:
+			string_literal = lt.language[1:] + '-'
+			# Fold the script, and the region right after it, into the
+			# literal; whatever is folded in no longer needs its own
+			# ``subtag_matches`` test.
+			if script:
+				string_literal += script
+				script = None
+				if region:
+					string_literal += '-' + region
+					region = None
+			if string_literal[-1] == '-':
+				print ('0 == strncmp (&lang_str[1], "%s", %i)' % (string_literal, len (string_literal)), end='')
+			else:
+				print ('lang_matches (&lang_str[1], "%s")' % string_literal, end='')
+		print_subtag_matches (script, True)
+		print_subtag_matches (region, True)
+		print_subtag_matches (lt.variant, True)
+		print (')')
+		print ('    {')
+		write ('      /* %s */' % bcp_47.get_name (lt))
 		print ()
-		print ('    *count = 1;')
-	else:
-		print ('    unsigned int i;')
-		print ('    hb_tag_t possible_tags[] = {')
-		for tag in tags:
-			write ('      %s,  /* %s */' % (hb_tag (tag), ot.names[tag]))
+		if len (tags) == 1:
+			write ('      tags[0] = %s;  /* %s */' % (hb_tag (tags[0]), ot.names[tags[0]]))
 			print ()
-		print ('    };')
-		print ('    for (i = 0; i < %s && i < *count; i++)' % len (tags))
-		print ('      tags[i] = possible_tags[i];')
-		print ('    *count = i;')
-	print ('    return true;')
-	print ('  }')
+			print ('      *count = 1;')
+		else:
+			print ('      unsigned int i;')
+			print ('      hb_tag_t possible_tags[] = {')
+			for tag in tags:
+				write ('\t%s,  /* %s */' % (hb_tag (tag), ot.names[tag]))
+				print ()
+			print ('      };')
+			print ('      for (i = 0; i < %s && i < *count; i++)' % len (tags))
+			print ('\ttags[i] = possible_tags[i];')
+			print ('      *count = i;')
+		print ('      return true;')
+		print ('    }')
+	print ('    break;')

+print ('  }')
 print ('  return false;')
 print ('}')
 print ()
@ -1030,7 +1103,7 @@ verify_disambiguation_dict ()
 for ot_tag, bcp_47_tag in sorted (disambiguation.items ()):
 	write ('  case %s:  /* %s */' % (hb_tag (ot_tag), ot.names[ot_tag]))
 	print ()
-	write ('    return hb_language_from_string (\"%s\", -1);  /* %s */' % (bcp_47_tag, bcp_47.get_name (bcp_47_tag)))
+	write ('    return hb_language_from_string (\"%s\", -1);  /* %s */' % (bcp_47_tag, bcp_47.get_name (LanguageTag (bcp_47_tag))))
 	print ()

 print ('  default:')
--- a/src/hb-ot-tag-table.hh
+++ b/src/hb-ot-tag-table.hh