parent
0e292eb2a2
commit
3f33f0d1f2
9 changed files with 5498 additions and 1 deletions
@ -0,0 +1,207 @@ |
|||||||
|
/*
|
||||||
|
* Copyright (C) 2012 Grigori Goronzy <greg@kinoho.net> |
||||||
|
* |
||||||
|
* Permission to use, copy, modify, and/or distribute this software for any |
||||||
|
* purpose with or without fee is hereby granted, provided that the above |
||||||
|
* copyright notice and this permission notice appear in all copies. |
||||||
|
* |
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
||||||
|
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
||||||
|
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
||||||
|
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
||||||
|
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
||||||
|
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
||||||
|
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
||||||
|
*/ |
||||||
|
|
||||||
|
#include "hb-private.hh" |
||||||
|
|
||||||
|
#include "hb-unicode-private.hh" |
||||||
|
|
||||||
|
HB_BEGIN_DECLS |
||||||
|
#include "ucdn.h" |
||||||
|
HB_END_DECLS |
||||||
|
|
||||||
|
/*
 * Translation table from UCDN script identifiers to HarfBuzz script values.
 *
 * hb_ucdn_script() uses the value returned by ucdn_get_script() directly as
 * the index into this array, so entry N must correspond to the UCDN_SCRIPT_*
 * constant whose value is N (see ucdn.h).  Do not reorder.
 */
static const hb_script_t ucdn_script_translate[] =
{
    /*   0 */ HB_SCRIPT_COMMON,
    /*   1 */ HB_SCRIPT_LATIN,
    /*   2 */ HB_SCRIPT_GREEK,
    /*   3 */ HB_SCRIPT_CYRILLIC,
    /*   4 */ HB_SCRIPT_ARMENIAN,
    /*   5 */ HB_SCRIPT_HEBREW,
    /*   6 */ HB_SCRIPT_ARABIC,
    /*   7 */ HB_SCRIPT_SYRIAC,
    /*   8 */ HB_SCRIPT_THAANA,
    /*   9 */ HB_SCRIPT_DEVANAGARI,
    /*  10 */ HB_SCRIPT_BENGALI,
    /*  11 */ HB_SCRIPT_GURMUKHI,
    /*  12 */ HB_SCRIPT_GUJARATI,
    /*  13 */ HB_SCRIPT_ORIYA,
    /*  14 */ HB_SCRIPT_TAMIL,
    /*  15 */ HB_SCRIPT_TELUGU,
    /*  16 */ HB_SCRIPT_KANNADA,
    /*  17 */ HB_SCRIPT_MALAYALAM,
    /*  18 */ HB_SCRIPT_SINHALA,
    /*  19 */ HB_SCRIPT_THAI,
    /*  20 */ HB_SCRIPT_LAO,
    /*  21 */ HB_SCRIPT_TIBETAN,
    /*  22 */ HB_SCRIPT_MYANMAR,
    /*  23 */ HB_SCRIPT_GEORGIAN,
    /*  24 */ HB_SCRIPT_HANGUL,
    /*  25 */ HB_SCRIPT_ETHIOPIC,
    /*  26 */ HB_SCRIPT_CHEROKEE,
    /*  27 */ HB_SCRIPT_CANADIAN_ABORIGINAL,
    /*  28 */ HB_SCRIPT_OGHAM,
    /*  29 */ HB_SCRIPT_RUNIC,
    /*  30 */ HB_SCRIPT_KHMER,
    /*  31 */ HB_SCRIPT_MONGOLIAN,
    /*  32 */ HB_SCRIPT_HIRAGANA,
    /*  33 */ HB_SCRIPT_KATAKANA,
    /*  34 */ HB_SCRIPT_BOPOMOFO,
    /*  35 */ HB_SCRIPT_HAN,
    /*  36 */ HB_SCRIPT_YI,
    /*  37 */ HB_SCRIPT_OLD_ITALIC,
    /*  38 */ HB_SCRIPT_GOTHIC,
    /*  39 */ HB_SCRIPT_DESERET,
    /*  40 */ HB_SCRIPT_INHERITED,
    /*  41 */ HB_SCRIPT_TAGALOG,
    /*  42 */ HB_SCRIPT_HANUNOO,
    /*  43 */ HB_SCRIPT_BUHID,
    /*  44 */ HB_SCRIPT_TAGBANWA,
    /*  45 */ HB_SCRIPT_LIMBU,
    /*  46 */ HB_SCRIPT_TAI_LE,
    /*  47 */ HB_SCRIPT_LINEAR_B,
    /*  48 */ HB_SCRIPT_UGARITIC,
    /*  49 */ HB_SCRIPT_SHAVIAN,
    /*  50 */ HB_SCRIPT_OSMANYA,
    /*  51 */ HB_SCRIPT_CYPRIOT,
    /*  52 */ HB_SCRIPT_BRAILLE,
    /*  53 */ HB_SCRIPT_BUGINESE,
    /*  54 */ HB_SCRIPT_COPTIC,
    /*  55 */ HB_SCRIPT_NEW_TAI_LUE,
    /*  56 */ HB_SCRIPT_GLAGOLITIC,
    /*  57 */ HB_SCRIPT_TIFINAGH,
    /*  58 */ HB_SCRIPT_SYLOTI_NAGRI,
    /*  59 */ HB_SCRIPT_OLD_PERSIAN,
    /*  60 */ HB_SCRIPT_KHAROSHTHI,
    /*  61 */ HB_SCRIPT_BALINESE,
    /*  62 */ HB_SCRIPT_CUNEIFORM,
    /*  63 */ HB_SCRIPT_PHOENICIAN,
    /*  64 */ HB_SCRIPT_PHAGS_PA,
    /*  65 */ HB_SCRIPT_NKO,
    /*  66 */ HB_SCRIPT_SUNDANESE,
    /*  67 */ HB_SCRIPT_LEPCHA,
    /*  68 */ HB_SCRIPT_OL_CHIKI,
    /*  69 */ HB_SCRIPT_VAI,
    /*  70 */ HB_SCRIPT_SAURASHTRA,
    /*  71 */ HB_SCRIPT_KAYAH_LI,
    /*  72 */ HB_SCRIPT_REJANG,
    /*  73 */ HB_SCRIPT_LYCIAN,
    /*  74 */ HB_SCRIPT_CARIAN,
    /*  75 */ HB_SCRIPT_LYDIAN,
    /*  76 */ HB_SCRIPT_CHAM,
    /*  77 */ HB_SCRIPT_TAI_THAM,
    /*  78 */ HB_SCRIPT_TAI_VIET,
    /*  79 */ HB_SCRIPT_AVESTAN,
    /*  80 */ HB_SCRIPT_EGYPTIAN_HIEROGLYPHS,
    /*  81 */ HB_SCRIPT_SAMARITAN,
    /*  82 */ HB_SCRIPT_LISU,
    /*  83 */ HB_SCRIPT_BAMUM,
    /*  84 */ HB_SCRIPT_JAVANESE,
    /*  85 */ HB_SCRIPT_MEETEI_MAYEK,
    /*  86 */ HB_SCRIPT_IMPERIAL_ARAMAIC,
    /*  87 */ HB_SCRIPT_OLD_SOUTH_ARABIAN,
    /*  88 */ HB_SCRIPT_INSCRIPTIONAL_PARTHIAN,
    /*  89 */ HB_SCRIPT_INSCRIPTIONAL_PAHLAVI,
    /*  90 */ HB_SCRIPT_OLD_TURKIC,
    /*  91 */ HB_SCRIPT_KAITHI,
    /*  92 */ HB_SCRIPT_BATAK,
    /*  93 */ HB_SCRIPT_BRAHMI,
    /*  94 */ HB_SCRIPT_MANDAIC,
    /*  95 */ HB_SCRIPT_CHAKMA,
    /*  96 */ HB_SCRIPT_MEROITIC_CURSIVE,
    /*  97 */ HB_SCRIPT_MEROITIC_HIEROGLYPHS,
    /*  98 */ HB_SCRIPT_MIAO,
    /*  99 */ HB_SCRIPT_SHARADA,
    /* 100 */ HB_SCRIPT_SORA_SOMPENG,
    /* 101 */ HB_SCRIPT_TAKRI,
    /* 102 */ HB_SCRIPT_UNKNOWN,
};
||||||
|
|
||||||
|
/*
 * combining_class callback: asks UCDN for the canonical combining class of
 * `unicode` and converts the integer result to the HarfBuzz enum type with a
 * plain cast.  `ufuncs` and `user_data` are not used.
 */
static hb_unicode_combining_class_t
hb_ucdn_combining_class(hb_unicode_funcs_t *ufuncs, hb_codepoint_t unicode,
                        void *user_data)
{
    const int ccc = ucdn_get_combining_class(unicode);

    return (hb_unicode_combining_class_t) ccc;
}
||||||
|
|
||||||
|
static unsigned int |
||||||
|
hb_ucdn_eastasian_width(hb_unicode_funcs_t *ufuncs, hb_codepoint_t unicode, |
||||||
|
void *user_data) |
||||||
|
{ |
||||||
|
int w = ucdn_get_east_asian_width(unicode); |
||||||
|
return (w == UCDN_EAST_ASIAN_F || w == UCDN_EAST_ASIAN_W) ? 2 : 1; |
||||||
|
} |
||||||
|
|
||||||
|
/*
 * general_category callback: the UCDN category value is cast directly to
 * hb_unicode_general_category_t, i.e. this relies on the numeric values of
 * UCDN_GENERAL_CATEGORY_* matching the HarfBuzz enum.
 * `ufuncs` and `user_data` are not used.
 */
static hb_unicode_general_category_t
hb_ucdn_general_category(hb_unicode_funcs_t *ufuncs,
                         hb_codepoint_t unicode, void *user_data)
{
    const int category = ucdn_get_general_category(unicode);

    return (hb_unicode_general_category_t) category;
}
||||||
|
|
||||||
|
/*
 * mirroring callback: returns whatever ucdn_mirror() reports for `unicode`
 * (the mirrored codepoint, or `unicode` itself when there is none).
 * `ufuncs` and `user_data` are not used.
 */
static hb_codepoint_t
hb_ucdn_mirroring(hb_unicode_funcs_t *ufuncs, hb_codepoint_t unicode,
                  void *user_data)
{
    hb_codepoint_t mirrored = ucdn_mirror(unicode);

    return mirrored;
}
||||||
|
|
||||||
|
/*
 * script callback: translates the UCDN script identifier of `unicode` into
 * the matching HarfBuzz script via ucdn_script_translate.
 *
 * The identifier comes straight out of the generated Unicode database.  If
 * that database is ever regenerated with scripts this table does not know
 * about, an unchecked lookup would read past the end of the array, so any
 * out-of-range identifier is reported as HB_SCRIPT_UNKNOWN instead.
 * `ufuncs` and `user_data` are not used.
 */
static hb_script_t
hb_ucdn_script(hb_unicode_funcs_t *ufuncs, hb_codepoint_t unicode,
               void *user_data)
{
    const unsigned int table_len =
        sizeof (ucdn_script_translate) / sizeof (ucdn_script_translate[0]);
    const int script = ucdn_get_script(unicode);

    if (script < 0 || (unsigned int) script >= table_len)
        return HB_SCRIPT_UNKNOWN;

    return ucdn_script_translate[script];
}
||||||
|
|
||||||
|
/*
 * compose callback: pairwise canonical composition of `a` and `b`.
 * Note the argument order: UCDN takes the output pointer first.
 * Returns the (boolean) result of ucdn_compose().
 * `ufuncs` and `user_data` are not used.
 */
static hb_bool_t
hb_ucdn_compose(hb_unicode_funcs_t *ufuncs, hb_codepoint_t a,
                hb_codepoint_t b, hb_codepoint_t *ab, void *user_data)
{
    const int composed = ucdn_compose(ab, a, b);

    return composed;
}
||||||
|
|
||||||
|
/*
 * decompose callback: pairwise canonical decomposition of `ab` into `*a`
 * and `*b`.  Returns the (boolean) result of ucdn_decompose().
 * `ufuncs` and `user_data` are not used.
 */
static hb_bool_t
hb_ucdn_decompose(hb_unicode_funcs_t *ufuncs, hb_codepoint_t ab,
                  hb_codepoint_t *a, hb_codepoint_t *b, void *user_data)
{
    const int decomposed = ucdn_decompose(ab, a, b);

    return decomposed;
}
||||||
|
|
||||||
|
static unsigned int |
||||||
|
hb_ucdn_decompose_compatibility(hb_unicode_funcs_t *ufuncs, hb_codepoint_t u, |
||||||
|
hb_codepoint_t *decomposed, void *user_data) |
||||||
|
{ |
||||||
|
return ucdn_compat_decompose(u, decomposed); |
||||||
|
} |
||||||
|
|
||||||
|
/*
 * Returns the UCDN-backed Unicode function table.
 *
 * The table is a function-local static constant initialised with a NULL
 * parent, the immutable flag set, and one hb_ucdn_<name> callback per entry
 * generated by HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS.  The const qualifier is
 * cast away on return because the API hands out a non-const pointer.
 */
extern "C" HB_INTERNAL
hb_unicode_funcs_t *
hb_ucdn_get_unicode_funcs (void)
{
    static const hb_unicode_funcs_t ucdn_funcs = {
        HB_OBJECT_HEADER_STATIC,

        NULL, /* parent */
        true, /* immutable */
        {
#define HB_UNICODE_FUNC_IMPLEMENT(name) hb_ucdn_##name,
            HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS
#undef HB_UNICODE_FUNC_IMPLEMENT
        }
    };
    const hb_unicode_funcs_t *funcs = &ucdn_funcs;

    return const_cast<hb_unicode_funcs_t *> (funcs);
}
||||||
|
|
@ -0,0 +1,18 @@ |
|||||||
|
## Process this file with automake to produce Makefile.in

noinst_LTLIBRARIES = libhb-ucdn.la

libhb_ucdn_la_SOURCES = \
	ucdn.h \
	ucdn.c \
	unicodedata_db.h
# Per-target variables must be prefixed with the canonicalized target name.
# For libhb-ucdn.la that is "libhb_ucdn_la"; the previous "libhb_ucdn_"
# prefix matched no target, so these include paths were never applied.
libhb_ucdn_la_CPPFLAGS = \
	-I$(top_srcdir) \
	-I$(top_srcdir)/src \
	-I$(top_builddir)/src
libhb_ucdn_la_LIBADD =

EXTRA_DIST = README

-include $(top_srcdir)/git.mk
@ -0,0 +1,33 @@ |
|||||||
|
UCDN - Unicode Database and Normalization |
||||||
|
|
||||||
|
UCDN is a Unicode support library. Currently, it provides access |
||||||
|
to basic character properties contained in the Unicode Character |
||||||
|
Database and low-level normalization functions (pairwise canonical |
||||||
|
composition/decomposition and compatibility decomposition). More |
||||||
|
functionality might be provided in the future, such as additional |
||||||
|
properties, string normalization and encoding conversion. |
||||||
|
|
||||||
|
UCDN uses standard C89 with no particular dependencies or requirements |
||||||
|
except for stdint.h, and can be easily integrated into existing |
||||||
|
projects. However, it can also be used as a standalone library, |
||||||
|
and a CMake build script is provided for this. The first motivation |
||||||
|
behind UCDN development was to provide a standalone set of Unicode |
||||||
|
functions for the HarfBuzz OpenType shaping library. For this purpose, |
||||||
|
a HarfBuzz-specific wrapper is shipped along with it (hb-ucdn.h). |
||||||
|
|
||||||
|
UCDN is published under the ISC license; please see the license header
in the C source code for more information. The makeunicodedata.py script
required for parsing Unicode database files is licensed under the
PSF license; please see PYTHON-LICENSE for more information.
||||||
|
|
||||||
|
UCDN was written by Grigori Goronzy <greg@kinoho.net>. |
||||||
|
|
||||||
|
How to Use |
||||||
|
|
||||||
|
Include ucdn.c, ucdn.h and unicodedata_db.h in your project. Now, |
||||||
|
just use the functions as documented in ucdn.h. |
||||||
|
|
||||||
|
In some cases, it might be necessary to regenerate the Unicode |
||||||
|
database file. The script makeunicodedata.py (Python 3.x required) |
||||||
|
fetches the appropriate files and dumps the compressed database into |
||||||
|
unicodedata_db.h. |
@ -0,0 +1,282 @@ |
|||||||
|
/*
|
||||||
|
* Copyright (C) 2012 Grigori Goronzy <greg@kinoho.net> |
||||||
|
* |
||||||
|
* Permission to use, copy, modify, and/or distribute this software for any |
||||||
|
* purpose with or without fee is hereby granted, provided that the above |
||||||
|
* copyright notice and this permission notice appear in all copies. |
||||||
|
* |
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
||||||
|
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
||||||
|
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
||||||
|
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
||||||
|
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
||||||
|
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
||||||
|
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
||||||
|
*/ |
||||||
|
|
||||||
|
#include <stdio.h> |
||||||
|
#include <stdlib.h> |
||||||
|
#include <stdint.h> |
||||||
|
#include "ucdn.h" |
||||||
|
|
||||||
|
/*
 * One entry of the character property table.  Every field is a small
 * integer code; the ucdn_get_*() accessors return these values unchanged.
 */
typedef struct {
    const unsigned char category;            /* UCDN_GENERAL_CATEGORY_* */
    const unsigned char combining;           /* canonical combining class */
    const unsigned char bidi_class;          /* UCDN_BIDI_CLASS_* */
    const unsigned char mirrored;            /* non-zero if mirrored */
    const unsigned char east_asian_width;    /* UCDN_EAST_ASIAN_* */
    const unsigned char normalization_check; /* not read in this file */
    const unsigned char script;              /* UCDN_SCRIPT_* */
} UCDRecord;
||||||
|
|
||||||
|
/*
 * One entry of the bidi mirroring table: `from` is the lookup key and `to`
 * is the codepoint it mirrors to.  Both are 16-bit values.
 */
typedef struct {
    unsigned short from;
    unsigned short to;
} MirrorPair;
||||||
|
|
||||||
|
/*
 * Describes a run of consecutive codepoints for get_comp_index(): codepoints
 * `start` .. `start + count` (inclusive) map to `index` .. `index + count`.
 * An entry with start == 0 terminates a Reindex array.
 */
typedef struct {
    int start;
    short count;
    short index;
} Reindex;
||||||
|
|
||||||
|
#include "unicodedata_db.h" |
||||||
|
|
||||||
|
/*
 * Constants for arithmetic Hangul syllable (de)composition.
 *
 * xBASE is the first codepoint of a range and xCOUNT the number of values in
 * it: L = leading consonants, V = vowels, T = trailing consonants (TBASE is
 * one below the first real trailing consonant, so index 0 means "none"),
 * S = precomposed syllables.
 */
#define LBASE 0x1100
#define VBASE 0x1161
#define TBASE 0x11A7
#define SBASE 0xAC00
#define LCOUNT 19
#define VCOUNT 21
#define TCOUNT 28
#define NCOUNT (VCOUNT * TCOUNT)    /* 588 syllables per leading consonant */
#define SCOUNT (LCOUNT * NCOUNT)    /* 11172 precomposed syllables */
||||||
|
|
||||||
|
static UCDRecord *get_ucd_record(uint32_t code) |
||||||
|
{ |
||||||
|
int index, offset; |
||||||
|
|
||||||
|
if (code >= 0x110000) |
||||||
|
index = 0; |
||||||
|
else { |
||||||
|
index = index0[code >> (SHIFT1+SHIFT2)] << SHIFT1; |
||||||
|
offset = (code >> SHIFT2) & ((1<<SHIFT1) - 1); |
||||||
|
index = index1[index + offset] << SHIFT2; |
||||||
|
offset = code & ((1<<SHIFT2) - 1); |
||||||
|
index = index2[index + offset]; |
||||||
|
} |
||||||
|
|
||||||
|
return &ucd_records[index]; |
||||||
|
} |
||||||
|
|
||||||
|
static unsigned short *get_decomp_record(uint32_t code) |
||||||
|
{ |
||||||
|
int index, offset; |
||||||
|
|
||||||
|
if (code >= 0x110000) |
||||||
|
index = 0; |
||||||
|
else { |
||||||
|
index = decomp_index0[code >> (DECOMP_SHIFT1+DECOMP_SHIFT2)] |
||||||
|
<< DECOMP_SHIFT1; |
||||||
|
offset = (code >> DECOMP_SHIFT2) & ((1<<DECOMP_SHIFT1) - 1); |
||||||
|
index = decomp_index1[index + offset] << DECOMP_SHIFT2; |
||||||
|
offset = code & ((1<<DECOMP_SHIFT2) - 1); |
||||||
|
index = decomp_index2[index + offset]; |
||||||
|
} |
||||||
|
|
||||||
|
return &decomp_data[index]; |
||||||
|
} |
||||||
|
|
||||||
|
/*
 * Map a codepoint to its row/column number in the composition table.
 *
 * `idx` is an array of runs terminated by an entry whose start is 0.  The
 * scan stops as soon as a run starting above `code` is reached, so the
 * array is expected to be sorted by ascending start.
 *
 * Returns the compact index of `code`, or -1 if no run contains it.
 */
static int get_comp_index(uint32_t code, Reindex *idx)
{
    Reindex *run;

    for (run = idx; run->start; run++) {
        if (code < run->start)
            break;
        /* count is inclusive: the run covers start .. start + count */
        if (code <= run->start + run->count)
            return run->index + (code - run->start);
    }

    return -1;
}
||||||
|
|
||||||
|
static int compare_mp(const void *a, const void *b) |
||||||
|
{ |
||||||
|
MirrorPair *mpa = (MirrorPair *)a; |
||||||
|
MirrorPair *mpb = (MirrorPair *)b; |
||||||
|
return mpa->from - mpb->from; |
||||||
|
} |
||||||
|
|
||||||
|
static int hangul_pair_decompose(uint32_t code, uint32_t *a, uint32_t *b) |
||||||
|
{ |
||||||
|
int si = code - SBASE; |
||||||
|
|
||||||
|
if (si < 0 || si >= SCOUNT) |
||||||
|
return 0; |
||||||
|
|
||||||
|
if (si % TCOUNT) { |
||||||
|
/* LV,T */ |
||||||
|
*a = SBASE + (si / TCOUNT) * TCOUNT; |
||||||
|
*b = TBASE + (si % TCOUNT); |
||||||
|
return 3; |
||||||
|
} else { |
||||||
|
/* L,V */ |
||||||
|
*a = LBASE + (si / NCOUNT); |
||||||
|
*b = VBASE + (si % NCOUNT) / TCOUNT; |
||||||
|
return 2; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
static int hangul_pair_compose(uint32_t *code, uint32_t a, uint32_t b) |
||||||
|
{ |
||||||
|
if (b < VBASE || b >= (TBASE + TCOUNT)) |
||||||
|
return 0; |
||||||
|
|
||||||
|
if ((a < LBASE || a >= (LBASE + LCOUNT)) |
||||||
|
&& (a < SBASE || a >= (SBASE + SCOUNT))) |
||||||
|
return 0; |
||||||
|
|
||||||
|
if (a >= SBASE) { |
||||||
|
/* LV,T */ |
||||||
|
*code = a + (b - TBASE); |
||||||
|
return 3; |
||||||
|
} else { |
||||||
|
/* L,V */ |
||||||
|
int li = a - LBASE; |
||||||
|
int vi = b - VBASE; |
||||||
|
*code = SBASE + li * NCOUNT + vi * TCOUNT; |
||||||
|
return 2; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
/*
 * Decode one codepoint from a UTF-16 sequence and advance the cursor.
 *
 * @param code_ptr  in/out cursor; advanced by 1 unit for a BMP codepoint and
 *                  by 2 units for a surrogate pair
 * @return the decoded codepoint
 *
 * A unit is a high surrogate only if its top six bits are 110110
 * (0xD800..0xDBFF), hence the 0xFC00 mask.  Masking with 0xD800 instead
 * would also match every unit that merely has those bits set (for example
 * 0xF900 or 0xFB49) and decode it, together with the following unit, as a
 * bogus pair.
 *
 * The data is trusted: after a high surrogate the next unit is consumed
 * without checking that it is a low surrogate.
 */
static uint32_t decode_utf16(unsigned short **code_ptr)
{
    unsigned short *code = *code_ptr;

    if ((code[0] & 0xfc00) != 0xd800) {
        *code_ptr += 1;
        return (uint32_t)code[0];
    } else {
        *code_ptr += 2;
        return 0x10000 + ((uint32_t)code[1] - 0xdc00) +
            (((uint32_t)code[0] - 0xd800) << 10);
    }
}
||||||
|
|
||||||
|
const char *ucdn_get_unicode_version(void) |
||||||
|
{ |
||||||
|
return UNIDATA_VERSION; |
||||||
|
} |
||||||
|
|
||||||
|
int ucdn_get_combining_class(uint32_t code) |
||||||
|
{ |
||||||
|
return get_ucd_record(code)->combining; |
||||||
|
} |
||||||
|
|
||||||
|
int ucdn_get_east_asian_width(uint32_t code) |
||||||
|
{ |
||||||
|
return get_ucd_record(code)->east_asian_width; |
||||||
|
} |
||||||
|
|
||||||
|
int ucdn_get_general_category(uint32_t code) |
||||||
|
{ |
||||||
|
return get_ucd_record(code)->category; |
||||||
|
} |
||||||
|
|
||||||
|
int ucdn_get_bidi_class(uint32_t code) |
||||||
|
{ |
||||||
|
return get_ucd_record(code)->bidi_class; |
||||||
|
} |
||||||
|
|
||||||
|
int ucdn_get_mirrored(uint32_t code) |
||||||
|
{ |
||||||
|
return get_ucd_record(code)->mirrored; |
||||||
|
} |
||||||
|
|
||||||
|
int ucdn_get_script(uint32_t code) |
||||||
|
{ |
||||||
|
return get_ucd_record(code)->script; |
||||||
|
} |
||||||
|
|
||||||
|
uint32_t ucdn_mirror(uint32_t code) |
||||||
|
{ |
||||||
|
MirrorPair mp = {0}; |
||||||
|
MirrorPair *res; |
||||||
|
|
||||||
|
if (get_ucd_record(code)->mirrored == 0) |
||||||
|
return code; |
||||||
|
|
||||||
|
mp.from = code; |
||||||
|
res = bsearch(&mp, mirror_pairs, BIDI_MIRROR_LEN, sizeof(MirrorPair), |
||||||
|
compare_mp); |
||||||
|
|
||||||
|
if (res == NULL) |
||||||
|
return code; |
||||||
|
else |
||||||
|
return res->to; |
||||||
|
} |
||||||
|
|
||||||
|
/*
 * Pairwise canonical decomposition.
 *
 * Hangul syllables are handled arithmetically first.  For everything else
 * the decomposition record is consulted: its first word holds the length in
 * the high byte and a tag in the low byte.  Records with a non-zero tag
 * (presumably compatibility decompositions -- confirm against the database
 * generator) or zero length are rejected.
 *
 * On success *a receives the first codepoint and *b the second, or 0 when
 * the decomposition has a single codepoint.  Returns 1 on success, 0
 * otherwise.
 */
int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b)
{
    unsigned short *rec;
    unsigned short header;
    int len;

    if (hangul_pair_decompose(code, a, b))
        return 1;

    rec = get_decomp_record(code);
    header = rec[0];
    len = header >> 8;

    if ((header & 0xff) != 0 || len == 0)
        return 0;

    rec++;
    *a = decode_utf16(&rec);
    *b = (len > 1) ? decode_utf16(&rec) : 0;

    return 1;
}
||||||
|
|
||||||
|
int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b) |
||||||
|
{ |
||||||
|
int l, r, index, indexi, offset; |
||||||
|
|
||||||
|
if (hangul_pair_compose(code, a, b)) |
||||||
|
return 1; |
||||||
|
|
||||||
|
l = get_comp_index(a, nfc_first); |
||||||
|
r = get_comp_index(b, nfc_last); |
||||||
|
|
||||||
|
if (l < 0 || r < 0) |
||||||
|
return 0; |
||||||
|
|
||||||
|
indexi = l * TOTAL_LAST + r; |
||||||
|
index = comp_index0[indexi >> (COMP_SHIFT1+COMP_SHIFT2)] << COMP_SHIFT1; |
||||||
|
offset = (indexi >> COMP_SHIFT2) & ((1<<COMP_SHIFT1) - 1); |
||||||
|
index = comp_index1[index + offset] << COMP_SHIFT2; |
||||||
|
offset = indexi & ((1<<COMP_SHIFT2) - 1); |
||||||
|
*code = comp_data[index + offset]; |
||||||
|
|
||||||
|
return *code != 0; |
||||||
|
} |
||||||
|
|
||||||
|
/*
 * Full decomposition of a codepoint as stored in its decomposition record.
 *
 * The high byte of the record's first word is the number of codepoints;
 * that many codepoints are decoded from the UTF-16 data following it and
 * written to `decomposed`, which the caller must size accordingly (ucdn.h
 * documents 18 entries).
 *
 * Returns the number of codepoints written, or 0 if there is no
 * decomposition.
 */
int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed)
{
    unsigned short *rec = get_decomp_record(code);
    int len = rec[0] >> 8;
    int written = 0;

    if (len == 0)
        return 0;

    rec++;
    while (written < len) {
        decomposed[written] = decode_utf16(&rec);
        written++;
    }

    return len;
}
@ -0,0 +1,290 @@ |
|||||||
|
/*
|
||||||
|
* Copyright (C) 2012 Grigori Goronzy <greg@kinoho.net> |
||||||
|
* |
||||||
|
* Permission to use, copy, modify, and/or distribute this software for any |
||||||
|
* purpose with or without fee is hereby granted, provided that the above |
||||||
|
* copyright notice and this permission notice appear in all copies. |
||||||
|
* |
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
||||||
|
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
||||||
|
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
||||||
|
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
||||||
|
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
||||||
|
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
||||||
|
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
||||||
|
*/ |
||||||
|
|
||||||
|
#ifndef UCDN_H |
||||||
|
#define UCDN_H |
||||||
|
|
||||||
|
#include <stdint.h> |
||||||
|
|
||||||
|
#define UCDN_EAST_ASIAN_F 0 |
||||||
|
#define UCDN_EAST_ASIAN_H 1 |
||||||
|
#define UCDN_EAST_ASIAN_W 2 |
||||||
|
#define UCDN_EAST_ASIAN_NA 3 |
||||||
|
#define UCDN_EAST_ASIAN_A 4 |
||||||
|
#define UCDN_EAST_ASIAN_N 5 |
||||||
|
|
||||||
|
#define UCDN_SCRIPT_COMMON 0 |
||||||
|
#define UCDN_SCRIPT_LATIN 1 |
||||||
|
#define UCDN_SCRIPT_GREEK 2 |
||||||
|
#define UCDN_SCRIPT_CYRILLIC 3 |
||||||
|
#define UCDN_SCRIPT_ARMENIAN 4 |
||||||
|
#define UCDN_SCRIPT_HEBREW 5 |
||||||
|
#define UCDN_SCRIPT_ARABIC 6 |
||||||
|
#define UCDN_SCRIPT_SYRIAC 7 |
||||||
|
#define UCDN_SCRIPT_THAANA 8 |
||||||
|
#define UCDN_SCRIPT_DEVANAGARI 9 |
||||||
|
#define UCDN_SCRIPT_BENGALI 10 |
||||||
|
#define UCDN_SCRIPT_GURMUKHI 11 |
||||||
|
#define UCDN_SCRIPT_GUJARATI 12 |
||||||
|
#define UCDN_SCRIPT_ORIYA 13 |
||||||
|
#define UCDN_SCRIPT_TAMIL 14 |
||||||
|
#define UCDN_SCRIPT_TELUGU 15 |
||||||
|
#define UCDN_SCRIPT_KANNADA 16 |
||||||
|
#define UCDN_SCRIPT_MALAYALAM 17 |
||||||
|
#define UCDN_SCRIPT_SINHALA 18 |
||||||
|
#define UCDN_SCRIPT_THAI 19 |
||||||
|
#define UCDN_SCRIPT_LAO 20 |
||||||
|
#define UCDN_SCRIPT_TIBETAN 21 |
||||||
|
#define UCDN_SCRIPT_MYANMAR 22 |
||||||
|
#define UCDN_SCRIPT_GEORGIAN 23 |
||||||
|
#define UCDN_SCRIPT_HANGUL 24 |
||||||
|
#define UCDN_SCRIPT_ETHIOPIC 25 |
||||||
|
#define UCDN_SCRIPT_CHEROKEE 26 |
||||||
|
#define UCDN_SCRIPT_CANADIAN_ABORIGINAL 27 |
||||||
|
#define UCDN_SCRIPT_OGHAM 28 |
||||||
|
#define UCDN_SCRIPT_RUNIC 29 |
||||||
|
#define UCDN_SCRIPT_KHMER 30 |
||||||
|
#define UCDN_SCRIPT_MONGOLIAN 31 |
||||||
|
#define UCDN_SCRIPT_HIRAGANA 32 |
||||||
|
#define UCDN_SCRIPT_KATAKANA 33 |
||||||
|
#define UCDN_SCRIPT_BOPOMOFO 34 |
||||||
|
#define UCDN_SCRIPT_HAN 35 |
||||||
|
#define UCDN_SCRIPT_YI 36 |
||||||
|
#define UCDN_SCRIPT_OLD_ITALIC 37 |
||||||
|
#define UCDN_SCRIPT_GOTHIC 38 |
||||||
|
#define UCDN_SCRIPT_DESERET 39 |
||||||
|
#define UCDN_SCRIPT_INHERITED 40 |
||||||
|
#define UCDN_SCRIPT_TAGALOG 41 |
||||||
|
#define UCDN_SCRIPT_HANUNOO 42 |
||||||
|
#define UCDN_SCRIPT_BUHID 43 |
||||||
|
#define UCDN_SCRIPT_TAGBANWA 44 |
||||||
|
#define UCDN_SCRIPT_LIMBU 45 |
||||||
|
#define UCDN_SCRIPT_TAI_LE 46 |
||||||
|
#define UCDN_SCRIPT_LINEAR_B 47 |
||||||
|
#define UCDN_SCRIPT_UGARITIC 48 |
||||||
|
#define UCDN_SCRIPT_SHAVIAN 49 |
||||||
|
#define UCDN_SCRIPT_OSMANYA 50 |
||||||
|
#define UCDN_SCRIPT_CYPRIOT 51 |
||||||
|
#define UCDN_SCRIPT_BRAILLE 52 |
||||||
|
#define UCDN_SCRIPT_BUGINESE 53 |
||||||
|
#define UCDN_SCRIPT_COPTIC 54 |
||||||
|
#define UCDN_SCRIPT_NEW_TAI_LUE 55 |
||||||
|
#define UCDN_SCRIPT_GLAGOLITIC 56 |
||||||
|
#define UCDN_SCRIPT_TIFINAGH 57 |
||||||
|
#define UCDN_SCRIPT_SYLOTI_NAGRI 58 |
||||||
|
#define UCDN_SCRIPT_OLD_PERSIAN 59 |
||||||
|
#define UCDN_SCRIPT_KHAROSHTHI 60 |
||||||
|
#define UCDN_SCRIPT_BALINESE 61 |
||||||
|
#define UCDN_SCRIPT_CUNEIFORM 62 |
||||||
|
#define UCDN_SCRIPT_PHOENICIAN 63 |
||||||
|
#define UCDN_SCRIPT_PHAGS_PA 64 |
||||||
|
#define UCDN_SCRIPT_NKO 65 |
||||||
|
#define UCDN_SCRIPT_SUNDANESE 66 |
||||||
|
#define UCDN_SCRIPT_LEPCHA 67 |
||||||
|
#define UCDN_SCRIPT_OL_CHIKI 68 |
||||||
|
#define UCDN_SCRIPT_VAI 69 |
||||||
|
#define UCDN_SCRIPT_SAURASHTRA 70 |
||||||
|
#define UCDN_SCRIPT_KAYAH_LI 71 |
||||||
|
#define UCDN_SCRIPT_REJANG 72 |
||||||
|
#define UCDN_SCRIPT_LYCIAN 73 |
||||||
|
#define UCDN_SCRIPT_CARIAN 74 |
||||||
|
#define UCDN_SCRIPT_LYDIAN 75 |
||||||
|
#define UCDN_SCRIPT_CHAM 76 |
||||||
|
#define UCDN_SCRIPT_TAI_THAM 77 |
||||||
|
#define UCDN_SCRIPT_TAI_VIET 78 |
||||||
|
#define UCDN_SCRIPT_AVESTAN 79 |
||||||
|
#define UCDN_SCRIPT_EGYPTIAN_HIEROGLYPHS 80 |
||||||
|
#define UCDN_SCRIPT_SAMARITAN 81 |
||||||
|
#define UCDN_SCRIPT_LISU 82 |
||||||
|
#define UCDN_SCRIPT_BAMUM 83 |
||||||
|
#define UCDN_SCRIPT_JAVANESE 84 |
||||||
|
#define UCDN_SCRIPT_MEETEI_MAYEK 85 |
||||||
|
#define UCDN_SCRIPT_IMPERIAL_ARAMAIC 86 |
||||||
|
#define UCDN_SCRIPT_OLD_SOUTH_ARABIAN 87 |
||||||
|
#define UCDN_SCRIPT_INSCRIPTIONAL_PARTHIAN 88 |
||||||
|
#define UCDN_SCRIPT_INSCRIPTIONAL_PAHLAVI 89 |
||||||
|
#define UCDN_SCRIPT_OLD_TURKIC 90 |
||||||
|
#define UCDN_SCRIPT_KAITHI 91 |
||||||
|
#define UCDN_SCRIPT_BATAK 92 |
||||||
|
#define UCDN_SCRIPT_BRAHMI 93 |
||||||
|
#define UCDN_SCRIPT_MANDAIC 94 |
||||||
|
#define UCDN_SCRIPT_CHAKMA 95 |
||||||
|
#define UCDN_SCRIPT_MEROITIC_CURSIVE 96 |
||||||
|
#define UCDN_SCRIPT_MEROITIC_HIEROGLYPHS 97 |
||||||
|
#define UCDN_SCRIPT_MIAO 98 |
||||||
|
#define UCDN_SCRIPT_SHARADA 99 |
||||||
|
#define UCDN_SCRIPT_SORA_SOMPENG 100 |
||||||
|
#define UCDN_SCRIPT_TAKRI 101 |
||||||
|
#define UCDN_SCRIPT_UNKNOWN 102 |
||||||
|
|
||||||
|
#define UCDN_GENERAL_CATEGORY_CC 0 |
||||||
|
#define UCDN_GENERAL_CATEGORY_CF 1 |
||||||
|
#define UCDN_GENERAL_CATEGORY_CN 2 |
||||||
|
#define UCDN_GENERAL_CATEGORY_CO 3 |
||||||
|
#define UCDN_GENERAL_CATEGORY_CS 4 |
||||||
|
#define UCDN_GENERAL_CATEGORY_LL 5 |
||||||
|
#define UCDN_GENERAL_CATEGORY_LM 6 |
||||||
|
#define UCDN_GENERAL_CATEGORY_LO 7 |
||||||
|
#define UCDN_GENERAL_CATEGORY_LT 8 |
||||||
|
#define UCDN_GENERAL_CATEGORY_LU 9 |
||||||
|
#define UCDN_GENERAL_CATEGORY_MC 10 |
||||||
|
#define UCDN_GENERAL_CATEGORY_ME 11 |
||||||
|
#define UCDN_GENERAL_CATEGORY_MN 12 |
||||||
|
#define UCDN_GENERAL_CATEGORY_ND 13 |
||||||
|
#define UCDN_GENERAL_CATEGORY_NL 14 |
||||||
|
#define UCDN_GENERAL_CATEGORY_NO 15 |
||||||
|
#define UCDN_GENERAL_CATEGORY_PC 16 |
||||||
|
#define UCDN_GENERAL_CATEGORY_PD 17 |
||||||
|
#define UCDN_GENERAL_CATEGORY_PE 18 |
||||||
|
#define UCDN_GENERAL_CATEGORY_PF 19 |
||||||
|
#define UCDN_GENERAL_CATEGORY_PI 20 |
||||||
|
#define UCDN_GENERAL_CATEGORY_PO 21 |
||||||
|
#define UCDN_GENERAL_CATEGORY_PS 22 |
||||||
|
#define UCDN_GENERAL_CATEGORY_SC 23 |
||||||
|
#define UCDN_GENERAL_CATEGORY_SK 24 |
||||||
|
#define UCDN_GENERAL_CATEGORY_SM 25 |
||||||
|
#define UCDN_GENERAL_CATEGORY_SO 26 |
||||||
|
#define UCDN_GENERAL_CATEGORY_ZL 27 |
||||||
|
#define UCDN_GENERAL_CATEGORY_ZP 28 |
||||||
|
#define UCDN_GENERAL_CATEGORY_ZS 29 |
||||||
|
|
||||||
|
#define UCDN_BIDI_CLASS_L 0 |
||||||
|
#define UCDN_BIDI_CLASS_LRE 1 |
||||||
|
#define UCDN_BIDI_CLASS_LRO 2 |
||||||
|
#define UCDN_BIDI_CLASS_R 3 |
||||||
|
#define UCDN_BIDI_CLASS_AL 4 |
||||||
|
#define UCDN_BIDI_CLASS_RLE 5 |
||||||
|
#define UCDN_BIDI_CLASS_RLO 6 |
||||||
|
#define UCDN_BIDI_CLASS_PDF 7 |
||||||
|
#define UCDN_BIDI_CLASS_EN 8 |
||||||
|
#define UCDN_BIDI_CLASS_ES 9 |
||||||
|
#define UCDN_BIDI_CLASS_ET 10 |
||||||
|
#define UCDN_BIDI_CLASS_AN 11 |
||||||
|
#define UCDN_BIDI_CLASS_CS 12 |
||||||
|
#define UCDN_BIDI_CLASS_NSM 13 |
||||||
|
#define UCDN_BIDI_CLASS_BN 14 |
||||||
|
#define UCDN_BIDI_CLASS_B 15 |
||||||
|
#define UCDN_BIDI_CLASS_S 16 |
||||||
|
#define UCDN_BIDI_CLASS_WS 17 |
||||||
|
#define UCDN_BIDI_CLASS_ON 18 |
||||||
|
|
||||||
|
/**
|
||||||
|
* Return version of the Unicode database. |
||||||
|
* |
||||||
|
* @return Unicode database version |
||||||
|
*/ |
||||||
|
const char *ucdn_get_unicode_version(void); |
||||||
|
|
||||||
|
/**
|
||||||
|
* Get combining class of a codepoint. |
||||||
|
* |
||||||
|
* @param code Unicode codepoint |
||||||
|
* @return combining class value, as defined in UAX#44 |
||||||
|
*/ |
||||||
|
int ucdn_get_combining_class(uint32_t code); |
||||||
|
|
||||||
|
/**
|
||||||
|
* Get east-asian width of a codepoint. |
||||||
|
* |
||||||
|
* @param code Unicode codepoint |
||||||
|
* @return value according to UCDN_EAST_ASIAN_* and as defined in UAX#11. |
||||||
|
*/ |
||||||
|
int ucdn_get_east_asian_width(uint32_t code); |
||||||
|
|
||||||
|
/**
|
||||||
|
* Get general category of a codepoint. |
||||||
|
* |
||||||
|
* @param code Unicode codepoint |
||||||
|
* @return value according to UCDN_GENERAL_CATEGORY_* and as defined in |
||||||
|
* UAX#44. |
||||||
|
*/ |
||||||
|
int ucdn_get_general_category(uint32_t code); |
||||||
|
|
||||||
|
/**
|
||||||
|
* Get bidirectional class of a codepoint. |
||||||
|
* |
||||||
|
* @param code Unicode codepoint |
||||||
|
* @return value according to UCDN_BIDI_CLASS_* and as defined in UAX#44. |
||||||
|
*/ |
||||||
|
int ucdn_get_bidi_class(uint32_t code); |
||||||
|
|
||||||
|
/**
|
||||||
|
* Get script of a codepoint. |
||||||
|
* |
||||||
|
* @param code Unicode codepoint |
||||||
|
* @return value according to UCDN_SCRIPT_* and as defined in UAX#24. |
||||||
|
*/ |
||||||
|
int ucdn_get_script(uint32_t code); |
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if codepoint can be mirrored. |
||||||
|
* |
||||||
|
* @param code Unicode codepoint |
||||||
|
* @return 1 if mirrored character exists, otherwise 0 |
||||||
|
*/ |
||||||
|
int ucdn_get_mirrored(uint32_t code); |
||||||
|
|
||||||
|
/**
|
||||||
|
* Mirror a codepoint. |
||||||
|
* |
||||||
|
* @param code Unicode codepoint |
||||||
|
* @return mirrored codepoint or the original codepoint if no |
||||||
|
* mirrored character exists |
||||||
|
*/ |
||||||
|
uint32_t ucdn_mirror(uint32_t code); |
||||||
|
|
||||||
|
/**
|
||||||
|
* Pairwise canonical decomposition of a codepoint. This includes |
||||||
|
* Hangul Jamo decomposition (see chapter 3.12 of the Unicode core |
||||||
|
* specification). |
||||||
|
* |
||||||
|
* Hangul is decomposed into L and V jamos for LV forms, and an |
||||||
|
* LV precomposed syllable and a T jamo for LVT forms. |
||||||
|
* |
||||||
|
* @param code Unicode codepoint |
||||||
|
* @param a filled with first codepoint of decomposition |
||||||
|
* @param b filled with second codepoint of decomposition, or 0 |
||||||
|
 * @return 1 if a pairwise decomposition was written to a and b, otherwise 0
||||||
|
*/ |
||||||
|
int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b); |
||||||
|
|
||||||
|
/**
|
||||||
|
* Compatibility decomposition of a codepoint. |
||||||
|
* |
||||||
|
* @param code Unicode codepoint |
||||||
|
* @param decomposed filled with decomposition, must be able to hold 18 |
||||||
|
* characters |
||||||
|
* @return length of decomposition or 0 in case none exists |
||||||
|
*/ |
||||||
|
int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed); |
||||||
|
|
||||||
|
/**
|
||||||
|
* Pairwise canonical composition of two codepoints. This includes |
||||||
|
* Hangul Jamo composition (see chapter 3.12 of the Unicode core |
||||||
|
* specification). |
||||||
|
* |
||||||
|
* Hangul composition expects either L and V jamos, or an LV |
||||||
|
* precomposed syllable and a T jamo. This is exactly the inverse |
||||||
|
* of pairwise Hangul decomposition. |
||||||
|
* |
||||||
|
* @param code filled with composition |
||||||
|
* @param a first codepoint |
||||||
|
* @param b second codepoint |
||||||
|
 * @return 1 if a and b compose (result stored in code), otherwise 0
||||||
|
*/ |
||||||
|
int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b); |
||||||
|
|
||||||
|
#endif |
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue