[Indic] Add per-script configuration tables

This concludes the Indic shape_plan work. May do for Arabic also...
12 years ago · 11b0e20ba4
parent 85fc6c483f
commit 11b0e20ba4
2 changed files with 159 additions and 166 deletions
--- a/src/hb-ot-shape-complex-indic-private.hh
+++ b/src/hb-ot-shape-complex-indic-private.hh
@ -157,8 +157,8 @@ enum indic_matra_category_t {
 #define IS_DEVA(u) (IN_HALF_BLOCK (u, 0x0900))
 #define IS_BENG(u) (IN_HALF_BLOCK (u, 0x0980))
-#define IS_GURM(u) (IN_HALF_BLOCK (u, 0x0A00))
+#define IS_GURU(u) (IN_HALF_BLOCK (u, 0x0A00))
-#define IS_GUJA(u) (IN_HALF_BLOCK (u, 0x0A80))
+#define IS_GUJR(u) (IN_HALF_BLOCK (u, 0x0A80))
 #define IS_ORYA(u) (IN_HALF_BLOCK (u, 0x0B00))
 #define IS_TAML(u) (IN_HALF_BLOCK (u, 0x0B80))
 #define IS_TELU(u) (IN_HALF_BLOCK (u, 0x0C00))
@ -172,8 +172,8 @@ enum indic_matra_category_t {
 #define MATRA_POS_RIGHT(u)	( \
 				  IS_DEVA(u) ? POS_AFTER_SUB  : \
 				  IS_BENG(u) ? POS_AFTER_POST : \
-				  IS_GURM(u) ? POS_AFTER_POST : \
+				  IS_GURU(u) ? POS_AFTER_POST : \
-				  IS_GUJA(u) ? POS_AFTER_POST : \
+				  IS_GUJR(u) ? POS_AFTER_POST : \
 				  IS_ORYA(u) ? POS_AFTER_POST : \
 				  IS_TAML(u) ? POS_AFTER_POST : \
 				  IS_TELU(u) ? (u <= 0x0C42 ? POS_BEFORE_SUB : POS_AFTER_SUB) : \
@ -185,8 +185,8 @@ enum indic_matra_category_t {
 				)
 #define MATRA_POS_TOP(u)	( /* BENG and MLYM don't have top matras. */ \
 				  IS_DEVA(u) ? POS_AFTER_SUB  : \
-				  IS_GURM(u) ? POS_AFTER_POST : /* Deviate from spec */ \
+				  IS_GURU(u) ? POS_AFTER_POST : /* Deviate from spec */ \
-				  IS_GUJA(u) ? POS_AFTER_SUB  : \
+				  IS_GUJR(u) ? POS_AFTER_SUB  : \
 				  IS_ORYA(u) ? POS_AFTER_MAIN : \
 				  IS_TAML(u) ? POS_AFTER_SUB  : \
 				  IS_TELU(u) ? POS_BEFORE_SUB : \
@ -198,8 +198,8 @@ enum indic_matra_category_t {
 #define MATRA_POS_BOTTOM(u)	( \
 				  IS_DEVA(u) ? POS_AFTER_SUB  : \
 				  IS_BENG(u) ? POS_AFTER_SUB  : \
-				  IS_GURM(u) ? POS_AFTER_POST : \
+				  IS_GURU(u) ? POS_AFTER_POST : \
-				  IS_GUJA(u) ? POS_AFTER_POST : \
+				  IS_GUJR(u) ? POS_AFTER_POST : \
 				  IS_ORYA(u) ? POS_AFTER_SUB  : \
 				  IS_TAML(u) ? POS_AFTER_POST : \
 				  IS_TELU(u) ? POS_BEFORE_SUB : \
--- a/src/hb-ot-shape-complex-indic.cc
+++ b/src/hb-ot-shape-complex-indic.cc
@ -25,23 +25,12 @@
 */
 #include "hb-ot-shape-complex-indic-private.hh"
 #include "hb-ot-shape-private.hh"
 #include "hb-ot-layout-private.hh"
-#define OLD_INDIC_TAG(script) (((hb_tag_t) script) | 0x20000000)
+/*
-#define IS_OLD_INDIC_TAG(tag) ( \
+ * Global Indic shaper options.
-				(tag) == OLD_INDIC_TAG (HB_SCRIPT_BENGALI)	|| \
+ */
 				(tag) == OLD_INDIC_TAG (HB_SCRIPT_DEVANAGARI)	|| \
 				(tag) == OLD_INDIC_TAG (HB_SCRIPT_GUJARATI)	|| \
 				(tag) == OLD_INDIC_TAG (HB_SCRIPT_GURMUKHI)	|| \
 				(tag) == OLD_INDIC_TAG (HB_SCRIPT_KANNADA)	|| \
 				(tag) == OLD_INDIC_TAG (HB_SCRIPT_MALAYALAM)	|| \
 				(tag) == OLD_INDIC_TAG (HB_SCRIPT_ORIYA)	|| \
 				(tag) == OLD_INDIC_TAG (HB_SCRIPT_TAMIL)	|| \
 				(tag) == OLD_INDIC_TAG (HB_SCRIPT_TELUGU)	|| \
 			      0)
 struct indic_options_t
 {
@ -82,6 +71,65 @@ indic_options (void)
 }
 /*
 * Indic configurations.  Note that we do not necessarily want to keep every single
 * script-specific behavior in these tables.  They should mainly be used for per-script
 * properties that are cheaper to keep here than in the code.  I.e. if, say, one and
 * only one script has an exception, that script can be if'ed directly in the code
 * instead of adding a new flag to these structs.
 */
 /* Direction in which the base consonant of a syllable is searched for.
  *
  * NOTE(review): the base search in initial_reordering_consonant_syllable()
  * switches on the expression `base_pos == BASE_POS_LAST` and then matches
  * `case BASE_POS_LAST` / `case BASE_POS_FIRST`.  That dispatches correctly
  * only while BASE_POS_FIRST is 0 and BASE_POS_LAST is 1, so do not reorder
  * or renumber these enumerators without fixing that switch. */
 enum base_position_t {
  BASE_POS_FIRST, /* Base is searched scanning forward from the start of the syllable. */
  BASE_POS_LAST   /* Base is searched scanning backward from the end of the syllable. */
 };
 /* Where a reph is moved to during final reordering.  Each enumerator is an
  * alias of one of the POS_* position values, so REPH_POS_DEFAULT and
  * REPH_POS_BEFORE_POST compare equal (both are POS_BEFORE_POST). */
 enum reph_position_t {
  REPH_POS_DEFAULT     = POS_BEFORE_POST,
  REPH_POS_AFTER_MAIN  = POS_AFTER_MAIN,
  REPH_POS_BEFORE_SUB  = POS_BEFORE_SUB,
  REPH_POS_AFTER_SUB   = POS_AFTER_SUB,
  REPH_POS_BEFORE_POST = POS_BEFORE_POST,
  REPH_POS_AFTER_POST  = POS_AFTER_POST
 };
 /* How a script requests a reph.
  *
  * NOTE(review): the reph test in initial_reordering_consonant_syllable() only
  * has branches for REPH_MODE_IMPLICIT and REPH_MODE_EXPLICIT; with the two
  * REPHA modes neither branch matches, so no reph is detected there (the code
  * carries a TODO for the other modes). */
 enum reph_mode_t {
  REPH_MODE_IMPLICIT,  /* Reph formed out of initial Ra,H sequence. */
  REPH_MODE_EXPLICIT,  /* Reph formed out of initial Ra,H,ZWJ sequence. */
  REPH_MODE_VIS_REPHA, /* Encoded Repha character, no reordering needed. */
  REPH_MODE_LOG_REPHA  /* Encoded Repha character, needs reordering. */
 };
 /* Per-script shaping properties; one entry per script in indic_configs[]. */
 struct indic_config_t
 {
  hb_script_t     script;       /* Script this entry applies to; compared against plan->props.script. */
  bool            has_old_spec; /* Plan uses the old spec only if this is set and the low byte of the chosen script tag is not '2'. */
  hb_codepoint_t  virama;       /* Virama codepoint; 0 means none, in which case the glyph lookup is skipped. */
  base_position_t base_pos;     /* Which base-consonant search the initial reordering uses. */
  reph_position_t reph_pos;     /* Target position for reph in final reordering. */
  reph_mode_t     reph_mode;    /* Which sequence, if any, forms a reph. */
 };
 /* Per-script configuration table.
  * Columns: script, has_old_spec, virama, base_pos, reph_pos, reph_mode.
  * data_create_indic() scans from index 1 for an entry whose script equals
  * plan->props.script and falls back to entry 0 when none matches, which is
  * why the default entry has to stay first. */
 static const indic_config_t indic_configs[] =
 {
  /* Default.  Should be first. */
  {HB_SCRIPT_INVALID,	false,     0,BASE_POS_LAST, REPH_POS_DEFAULT,    REPH_MODE_IMPLICIT},
  {HB_SCRIPT_DEVANAGARI,true, 0x094D,BASE_POS_LAST, REPH_POS_BEFORE_POST,REPH_MODE_IMPLICIT},
  {HB_SCRIPT_BENGALI,	true, 0x09CD,BASE_POS_LAST, REPH_POS_AFTER_SUB,  REPH_MODE_IMPLICIT},
  {HB_SCRIPT_GURMUKHI,	true, 0x0A4D,BASE_POS_LAST, REPH_POS_BEFORE_SUB, REPH_MODE_IMPLICIT},
  {HB_SCRIPT_GUJARATI,	true, 0x0ACD,BASE_POS_LAST, REPH_POS_BEFORE_POST,REPH_MODE_IMPLICIT},
  {HB_SCRIPT_ORIYA,	true, 0x0B4D,BASE_POS_LAST, REPH_POS_AFTER_MAIN, REPH_MODE_IMPLICIT},
  {HB_SCRIPT_TAMIL,	true, 0x0BCD,BASE_POS_LAST, REPH_POS_AFTER_POST, REPH_MODE_IMPLICIT},
  {HB_SCRIPT_TELUGU,	true, 0x0C4D,BASE_POS_LAST, REPH_POS_AFTER_POST, REPH_MODE_EXPLICIT},
  {HB_SCRIPT_KANNADA,	true, 0x0CCD,BASE_POS_LAST, REPH_POS_AFTER_POST, REPH_MODE_IMPLICIT},
  {HB_SCRIPT_MALAYALAM,	true, 0x0D4D,BASE_POS_LAST, REPH_POS_AFTER_MAIN, REPH_MODE_LOG_REPHA},
  {HB_SCRIPT_SINHALA,	false,0x0DCA,BASE_POS_FIRST,REPH_POS_AFTER_MAIN, REPH_MODE_EXPLICIT},
  {HB_SCRIPT_KHMER,	false,0x17D2,BASE_POS_FIRST,REPH_POS_DEFAULT,    REPH_MODE_VIS_REPHA},
 };
 /*
 * Indic shaper.
 */
 struct feature_list_t {
  hb_tag_t tag;
@ -228,7 +276,7 @@ struct indic_shape_plan_t
    hb_codepoint_t glyph = virama_glyph;
    if (unlikely (virama_glyph == (hb_codepoint_t) -1))
    {
-      if (!font->get_glyph (virama, 0, &glyph))
+      if (!config->virama || !font->get_glyph (config->virama, 0, &glyph))
 	glyph = 0;
      /* Technically speaking, the spec says we should apply 'locl' to virama too.
       * Maybe one day... */
@ -242,10 +290,9 @@ struct indic_shape_plan_t
    return glyph != 0;
  }
  const indic_config_t *config;
  bool is_old_spec;
  hb_codepoint_t virama;
  hb_codepoint_t virama_glyph;
  would_substitute_feature_t pref;
@ -262,26 +309,15 @@ data_create_indic (const hb_ot_shape_plan_t *plan)
  if (unlikely (!indic_plan))
    return NULL;
-  indic_plan->is_old_spec = IS_OLD_INDIC_TAG (plan->map.get_chosen_script (0));
+  indic_plan->config = &indic_configs[0];
-  {
+  for (unsigned int i = 1; i < ARRAY_LENGTH (indic_configs); i++)
-    hb_codepoint_t virama;
+    if (plan->props.script == indic_configs[i].script) {
-    switch ((int) plan->props.script) {
+      indic_plan->config = &indic_configs[i];
-      case HB_SCRIPT_DEVANAGARI:virama = 0x094D; break;
+      break;
      case HB_SCRIPT_BENGALI:	virama = 0x09CD; break;
      case HB_SCRIPT_GURMUKHI:	virama = 0x0A4D; break;
      case HB_SCRIPT_GUJARATI:	virama = 0x0ACD; break;
      case HB_SCRIPT_ORIYA:	virama = 0x0B4D; break;
      case HB_SCRIPT_TAMIL:	virama = 0x0BCD; break;
      case HB_SCRIPT_TELUGU:	virama = 0x0C4D; break;
      case HB_SCRIPT_KANNADA:	virama = 0x0CCD; break;
      case HB_SCRIPT_MALAYALAM:	virama = 0x0D4D; break;
      case HB_SCRIPT_SINHALA:	virama = 0x0DCA; break;
      case HB_SCRIPT_KHMER:	virama = 0x17D2; break;
      default:			virama = 0;      break;
    }
-    indic_plan->virama = virama;
+
-  }
+  indic_plan->is_old_spec = indic_plan->config->has_old_spec && ((plan->map.get_chosen_script (0) & 0x000000FF) != '2');
-  indic_plan->virama_glyph = indic_plan->virama ? (hb_codepoint_t) -1 : 0;
+  indic_plan->virama_glyph = (hb_codepoint_t) -1;
  indic_plan->pref.init (&plan->map, HB_TAG('p','r','e','f'));
  indic_plan->blwf.init (&plan->map, HB_TAG('b','l','w','f'));
@ -397,9 +433,9 @@ initial_reordering_consonant_syllable (const hb_ot_shape_plan_t *plan, hb_buffer
 	start + 3 <= end &&
 	info[start].indic_category() == OT_Ra &&
 	info[start + 1].indic_category() == OT_H &&
-	(unlikely (buffer->props.script == HB_SCRIPT_SINHALA || buffer->props.script == HB_SCRIPT_TELUGU) ?
+	(/* TODO Handle other Reph modes. */
-	 info[start + 2].indic_category() == OT_ZWJ /* In Sinhala & Telugu, form Reph only if ZWJ is present */:
+	 (indic_plan->config->reph_mode == REPH_MODE_IMPLICIT && !is_joiner (info[start + 2])) ||
-	 !is_joiner (info[start + 2] /* In other scripts, any joiner blocks Reph formation */ )
+	 (indic_plan->config->reph_mode == REPH_MODE_EXPLICIT && info[start + 2].indic_category() == OT_ZWJ)
 	))
    {
      limit += 2;
@ -409,92 +445,84 @@ initial_reordering_consonant_syllable (const hb_ot_shape_plan_t *plan, hb_buffer
      has_reph = true;
    };
-     enum base_position_t {
+    switch (indic_plan->config->base_pos == BASE_POS_LAST)
       BASE_FIRST,
       BASE_LAST
     } base_pos;
    switch ((hb_tag_t) buffer->props.script)
    {
      case HB_SCRIPT_SINHALA:
      case HB_SCRIPT_KHMER:
 	base_pos = BASE_FIRST;
 	break;
      default:
 	base_pos = BASE_LAST;
 	break;
    }
    if (base_pos == BASE_LAST)
    {
-      /* -> starting from the end of the syllable, move backwards */
+      case BASE_POS_LAST:
-      unsigned int i = end;
+      {
-      bool seen_below = false;
+	/* -> starting from the end of the syllable, move backwards */
-      do {
+	unsigned int i = end;
-	i--;
+	bool seen_below = false;
-	/* -> until a consonant is found */
+	do {
-	if (is_consonant (info[i]))
+	  i--;
-	{
+	  /* -> until a consonant is found */
-	  /* -> that does not have a below-base or post-base form
+	  if (is_consonant (info[i]))
 	   * (post-base forms have to follow below-base forms), */
 	  if (info[i].indic_position() != POS_BELOW_C &&
 	      (info[i].indic_position() != POS_POST_C || seen_below))
 	  {
 	    /* -> that does not have a below-base or post-base form
 	     * (post-base forms have to follow below-base forms), */
 	    if (info[i].indic_position() != POS_BELOW_C &&
 		(info[i].indic_position() != POS_POST_C || seen_below))
 	    {
 	      base = i;
 	      break;
 	    }
 	    if (info[i].indic_position() == POS_BELOW_C)
 	      seen_below = true;
 	    /* -> or that is not a pre-base reordering Ra,
 	     *
 	     * IMPLEMENTATION NOTES:
 	     *
 	     * Our pre-base reordering Ra's are marked POS_BELOW, so will be skipped
 	     * by the logic above already.
 	     */
 	    /* -> or arrive at the first consonant. The consonant stopped at will
 	     * be the base. */
 	    base = i;
 	    break;
 	  }
-	  if (info[i].indic_position() == POS_BELOW_C)
+	  else
-	    seen_below = true;
+	  {
 	    /* A ZWJ after a Halant stops the base search, and requests an explicit
 	     * half form.
 	     * A ZWJ before a Halant, requests a subjoined form instead, and hence
 	     * search continues.  This is particularly important for Bengali
 	     * sequence Ra,H,Ya that should form Ya-Phalaa by subjoining Ya. */
 	    if (start < i &&
 		info[i].indic_category() == OT_ZWJ &&
 		info[i - 1].indic_category() == OT_H)
 	      break;
 	  }
 	} while (i > limit);
      }
      break;
-	  /* -> or that is not a pre-base reordering Ra,
+      case BASE_POS_FIRST:
-	   *
+      {
-	   * IMPLEMENTATION NOTES:
+	/* In scripts without half forms (eg. Khmer), the first consonant is always the base. */
 	   *
 	   * Our pre-base reordering Ra's are marked POS_BELOW, so will be skipped
 	   * by the logic above already.
 	   */
-	  /* -> or arrive at the first consonant. The consonant stopped at will
+	if (!has_reph)
-	   * be the base. */
+	  base = limit;
 	  base = i;
 	}
 	else
 	{
 	  /* A ZWJ after a Halant stops the base search, and requests an explicit
 	   * half form.
 	   * A ZWJ before a Halant, requests a subjoined form instead, and hence
 	   * search continues.  This is particularly important for Bengali
 	   * sequence Ra,H,Ya that should form Ya-Phalaa by subjoining Ya. */
 	  if (start < i &&
 	      info[i].indic_category() == OT_ZWJ &&
 	      info[i - 1].indic_category() == OT_H)
 	    break;
 	}
      } while (i > limit);
    }
    else
    {
      /* In scripts without half forms (eg. Khmer), the first consonant is always the base. */
-      if (!has_reph)
+	/* Find the last base consonant that is not blocked by ZWJ.  If there is
-	base = limit;
+	 * a ZWJ right before a base consonant, that would request a subjoined form. */
 	for (unsigned int i = limit; i < end; i++)
 	  if (is_consonant (info[i]) && info[i].indic_position() == POS_BASE_C)
 	  {
 	    if (limit < i && info[i - 1].indic_category() == OT_ZWJ)
 	      break;
 	    else
 	      base = i;
 	  }
-      /* Find the last base consonant that is not blocked by ZWJ.  If there is
+	/* Mark all subsequent consonants as below. */
-       * a ZWJ right before a base consonant, that would request a subjoined form. */
+	for (unsigned int i = base + 1; i < end; i++)
-      for (unsigned int i = limit; i < end; i++)
+	  if (is_consonant (info[i]) && info[i].indic_position() == POS_BASE_C)
-        if (is_consonant (info[i]) && info[i].indic_position() == POS_BASE_C)
+	    info[i].indic_position() = POS_BELOW_C;
-	{
+      }
-	  if (limit < i && info[i - 1].indic_category() == OT_ZWJ)
+      break;
 	    break;
          else
 	    base = i;
 	}
-      /* Mark all subsequent consonants as below. */
+      default:
-      for (unsigned int i = base + 1; i < end; i++)
+      abort ();
        if (is_consonant (info[i]) && info[i].indic_position() == POS_BASE_C)
 	  info[i].indic_position() = POS_BELOW_C;
    }
    /* -> If the syllable starts with Ra + Halant (in a script that has Reph)
@ -864,50 +892,15 @@ final_reordering_syllable (const hb_ot_shape_plan_t *plan,
      info[start].indic_position() == POS_RA_TO_BECOME_REPH &&
      info[start + 1].indic_position() != POS_RA_TO_BECOME_REPH)
  {
-      unsigned int new_reph_pos;
+    unsigned int new_reph_pos;
-
+    reph_position_t reph_pos = indic_plan->config->reph_pos;
-     enum reph_position_t {
+
-       REPH_AFTER_MAIN,
+    /* XXX Figure out old behavior too */
       REPH_BEFORE_SUBSCRIPT,
       REPH_AFTER_SUBSCRIPT,
       REPH_BEFORE_POSTSCRIPT,
       REPH_AFTER_POSTSCRIPT
     } reph_pos;
     /* XXX Figure out old behavior too */
     switch ((hb_tag_t) buffer->props.script)
     {
       case HB_SCRIPT_MALAYALAM:
       case HB_SCRIPT_ORIYA:
       case HB_SCRIPT_SINHALA:
 	 reph_pos = REPH_AFTER_MAIN;
 	 break;
       case HB_SCRIPT_GURMUKHI:
 	 reph_pos = REPH_BEFORE_SUBSCRIPT;
 	 break;
       case HB_SCRIPT_BENGALI:
 	 reph_pos = REPH_AFTER_SUBSCRIPT;
 	 break;
       default:
       case HB_SCRIPT_DEVANAGARI:
       case HB_SCRIPT_GUJARATI:
 	 reph_pos = REPH_BEFORE_POSTSCRIPT;
 	 break;
       case HB_SCRIPT_KANNADA:
       case HB_SCRIPT_TAMIL:
       case HB_SCRIPT_TELUGU:
 	 reph_pos = REPH_AFTER_POSTSCRIPT;
 	 break;
     }
    /*       1. If reph should be positioned after post-base consonant forms,
     *          proceed to step 5.
     */
-    if (reph_pos == REPH_AFTER_POSTSCRIPT)
+    if (reph_pos == REPH_POS_AFTER_POST)
    {
      goto reph_step_5;
    }
@ -940,7 +933,7 @@ final_reordering_syllable (const hb_ot_shape_plan_t *plan,
     *          first consonant not ligated with main, or find the first
     *          consonant that is not a potential pre-base reordering Ra.
     */
-    if (reph_pos == REPH_AFTER_MAIN)
+    if (reph_pos == REPH_POS_AFTER_MAIN)
    {
      new_reph_pos = base;
      /* XXX Skip potential pre-base reordering Ra. */
@ -956,7 +949,7 @@ final_reordering_syllable (const hb_ot_shape_plan_t *plan,
     *          first matra, syllable modifier sign or vedic sign.
     */
    /* This is our take on what step 4 is trying to say (and failing, BADLY). */
-    if (reph_pos == REPH_AFTER_SUBSCRIPT)
+    if (reph_pos == REPH_POS_AFTER_SUB)
    {
      new_reph_pos = base;
      while (new_reph_pos < end &&