parent
0e292eb2a2
commit
3f33f0d1f2
9 changed files with 5498 additions and 1 deletions
@ -0,0 +1,207 @@ |
||||
/*
|
||||
* Copyright (C) 2012 Grigori Goronzy <greg@kinoho.net> |
||||
* |
||||
* Permission to use, copy, modify, and/or distribute this software for any |
||||
* purpose with or without fee is hereby granted, provided that the above |
||||
* copyright notice and this permission notice appear in all copies. |
||||
* |
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
||||
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
||||
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
||||
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
||||
*/ |
||||
|
||||
#include "hb-private.hh" |
||||
|
||||
#include "hb-unicode-private.hh" |
||||
|
||||
HB_BEGIN_DECLS |
||||
#include "ucdn.h" |
||||
HB_END_DECLS |
||||
|
||||
/*
 * Translation table from UCDN script identifiers to HarfBuzz script tags.
 * It is indexed with the value returned by ucdn_get_script(), so the entry
 * at position N has to be the HarfBuzz equivalent of the UCDN script whose
 * numeric value is N.  The leading comment on each row gives the index of
 * the first entry in that row.
 */
static const hb_script_t ucdn_script_translate[] =
{
    /*   0 */ HB_SCRIPT_COMMON, HB_SCRIPT_LATIN, HB_SCRIPT_GREEK, HB_SCRIPT_CYRILLIC,
    /*   4 */ HB_SCRIPT_ARMENIAN, HB_SCRIPT_HEBREW, HB_SCRIPT_ARABIC, HB_SCRIPT_SYRIAC,
    /*   8 */ HB_SCRIPT_THAANA, HB_SCRIPT_DEVANAGARI, HB_SCRIPT_BENGALI, HB_SCRIPT_GURMUKHI,
    /*  12 */ HB_SCRIPT_GUJARATI, HB_SCRIPT_ORIYA, HB_SCRIPT_TAMIL, HB_SCRIPT_TELUGU,
    /*  16 */ HB_SCRIPT_KANNADA, HB_SCRIPT_MALAYALAM, HB_SCRIPT_SINHALA, HB_SCRIPT_THAI,
    /*  20 */ HB_SCRIPT_LAO, HB_SCRIPT_TIBETAN, HB_SCRIPT_MYANMAR, HB_SCRIPT_GEORGIAN,
    /*  24 */ HB_SCRIPT_HANGUL, HB_SCRIPT_ETHIOPIC, HB_SCRIPT_CHEROKEE, HB_SCRIPT_CANADIAN_ABORIGINAL,
    /*  28 */ HB_SCRIPT_OGHAM, HB_SCRIPT_RUNIC, HB_SCRIPT_KHMER, HB_SCRIPT_MONGOLIAN,
    /*  32 */ HB_SCRIPT_HIRAGANA, HB_SCRIPT_KATAKANA, HB_SCRIPT_BOPOMOFO, HB_SCRIPT_HAN,
    /*  36 */ HB_SCRIPT_YI, HB_SCRIPT_OLD_ITALIC, HB_SCRIPT_GOTHIC, HB_SCRIPT_DESERET,
    /*  40 */ HB_SCRIPT_INHERITED, HB_SCRIPT_TAGALOG, HB_SCRIPT_HANUNOO, HB_SCRIPT_BUHID,
    /*  44 */ HB_SCRIPT_TAGBANWA, HB_SCRIPT_LIMBU, HB_SCRIPT_TAI_LE, HB_SCRIPT_LINEAR_B,
    /*  48 */ HB_SCRIPT_UGARITIC, HB_SCRIPT_SHAVIAN, HB_SCRIPT_OSMANYA, HB_SCRIPT_CYPRIOT,
    /*  52 */ HB_SCRIPT_BRAILLE, HB_SCRIPT_BUGINESE, HB_SCRIPT_COPTIC, HB_SCRIPT_NEW_TAI_LUE,
    /*  56 */ HB_SCRIPT_GLAGOLITIC, HB_SCRIPT_TIFINAGH, HB_SCRIPT_SYLOTI_NAGRI, HB_SCRIPT_OLD_PERSIAN,
    /*  60 */ HB_SCRIPT_KHAROSHTHI, HB_SCRIPT_BALINESE, HB_SCRIPT_CUNEIFORM, HB_SCRIPT_PHOENICIAN,
    /*  64 */ HB_SCRIPT_PHAGS_PA, HB_SCRIPT_NKO, HB_SCRIPT_SUNDANESE, HB_SCRIPT_LEPCHA,
    /*  68 */ HB_SCRIPT_OL_CHIKI, HB_SCRIPT_VAI, HB_SCRIPT_SAURASHTRA, HB_SCRIPT_KAYAH_LI,
    /*  72 */ HB_SCRIPT_REJANG, HB_SCRIPT_LYCIAN, HB_SCRIPT_CARIAN, HB_SCRIPT_LYDIAN,
    /*  76 */ HB_SCRIPT_CHAM, HB_SCRIPT_TAI_THAM, HB_SCRIPT_TAI_VIET, HB_SCRIPT_AVESTAN,
    /*  80 */ HB_SCRIPT_EGYPTIAN_HIEROGLYPHS, HB_SCRIPT_SAMARITAN, HB_SCRIPT_LISU, HB_SCRIPT_BAMUM,
    /*  84 */ HB_SCRIPT_JAVANESE, HB_SCRIPT_MEETEI_MAYEK, HB_SCRIPT_IMPERIAL_ARAMAIC, HB_SCRIPT_OLD_SOUTH_ARABIAN,
    /*  88 */ HB_SCRIPT_INSCRIPTIONAL_PARTHIAN, HB_SCRIPT_INSCRIPTIONAL_PAHLAVI, HB_SCRIPT_OLD_TURKIC, HB_SCRIPT_KAITHI,
    /*  92 */ HB_SCRIPT_BATAK, HB_SCRIPT_BRAHMI, HB_SCRIPT_MANDAIC, HB_SCRIPT_CHAKMA,
    /*  96 */ HB_SCRIPT_MEROITIC_CURSIVE, HB_SCRIPT_MEROITIC_HIEROGLYPHS, HB_SCRIPT_MIAO, HB_SCRIPT_SHARADA,
    /* 100 */ HB_SCRIPT_SORA_SOMPENG, HB_SCRIPT_TAKRI, HB_SCRIPT_UNKNOWN,
};
||||
|
||||
/*
 * HarfBuzz callback wrapping ucdn_get_combining_class().  The integer
 * reported by UCDN is cast straight to hb_unicode_combining_class_t.
 * ufuncs and user_data are not used.
 */
static hb_unicode_combining_class_t
hb_ucdn_combining_class(hb_unicode_funcs_t *ufuncs, hb_codepoint_t unicode,
                        void *user_data)
{
    int ccc = ucdn_get_combining_class(unicode);

    return (hb_unicode_combining_class_t) ccc;
}
||||
|
||||
static unsigned int |
||||
hb_ucdn_eastasian_width(hb_unicode_funcs_t *ufuncs, hb_codepoint_t unicode, |
||||
void *user_data) |
||||
{ |
||||
int w = ucdn_get_east_asian_width(unicode); |
||||
return (w == UCDN_EAST_ASIAN_F || w == UCDN_EAST_ASIAN_W) ? 2 : 1; |
||||
} |
||||
|
||||
/*
 * HarfBuzz callback wrapping ucdn_get_general_category().  The UCDN value is
 * cast directly to hb_unicode_general_category_t, so the two enumerations
 * are relied upon to share the same numbering.  ufuncs and user_data are
 * not used.
 */
static hb_unicode_general_category_t
hb_ucdn_general_category(hb_unicode_funcs_t *ufuncs,
                         hb_codepoint_t unicode, void *user_data)
{
    int category = ucdn_get_general_category(unicode);

    return (hb_unicode_general_category_t) category;
}
||||
|
||||
/*
 * HarfBuzz callback wrapping ucdn_mirror(): yields the mirrored counterpart
 * of a codepoint, or whatever ucdn_mirror() returns when there is none.
 * ufuncs and user_data are not used.
 */
static hb_codepoint_t
hb_ucdn_mirroring(hb_unicode_funcs_t *ufuncs, hb_codepoint_t unicode,
                  void *user_data)
{
    hb_codepoint_t mirrored = ucdn_mirror(unicode);

    return mirrored;
}
||||
|
||||
/*
 * HarfBuzz callback returning the script of a codepoint.
 *
 * The UCDN script identifier is used as an index into ucdn_script_translate.
 * The identifier comes from the generated Unicode database, which can be
 * regenerated independently of this table, so it is range-checked first:
 * an identifier the table does not cover maps to HB_SCRIPT_UNKNOWN instead
 * of reading past the end of the array.  ufuncs and user_data are not used.
 */
static hb_script_t
hb_ucdn_script(hb_unicode_funcs_t *ufuncs, hb_codepoint_t unicode,
               void *user_data)
{
    const unsigned int table_len =
        sizeof (ucdn_script_translate) / sizeof (ucdn_script_translate[0]);
    int script = ucdn_get_script(unicode);

    if (script < 0 || (unsigned int) script >= table_len)
        return HB_SCRIPT_UNKNOWN;

    return ucdn_script_translate[script];
}
||||
|
||||
/*
 * HarfBuzz callback for pairwise composition.  Forwards to ucdn_compose(),
 * which takes the output pointer first, and returns its result unchanged.
 * ufuncs and user_data are not used.
 */
static hb_bool_t
hb_ucdn_compose(hb_unicode_funcs_t *ufuncs, hb_codepoint_t a,
                hb_codepoint_t b, hb_codepoint_t *ab, void *user_data)
{
    hb_bool_t composed = ucdn_compose(ab, a, b);

    return composed;
}
||||
|
||||
/*
 * HarfBuzz callback for pairwise decomposition.  Forwards to
 * ucdn_decompose() and returns its result unchanged.  ufuncs and user_data
 * are not used.
 */
static hb_bool_t
hb_ucdn_decompose(hb_unicode_funcs_t *ufuncs, hb_codepoint_t ab,
                  hb_codepoint_t *a, hb_codepoint_t *b, void *user_data)
{
    hb_bool_t decomposed = ucdn_decompose(ab, a, b);

    return decomposed;
}
||||
|
||||
static unsigned int |
||||
hb_ucdn_decompose_compatibility(hb_unicode_funcs_t *ufuncs, hb_codepoint_t u, |
||||
hb_codepoint_t *decomposed, void *user_data) |
||||
{ |
||||
return ucdn_compat_decompose(u, decomposed); |
||||
} |
||||
|
||||
/*
 * Returns the UCDN-backed Unicode function table.
 *
 * The table is a function-local static constant: it has no parent, is
 * flagged immutable, and its callback slots are filled with the hb_ucdn_*
 * wrappers by expanding HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS with
 * HB_UNICODE_FUNC_IMPLEMENT mapped to `hb_ucdn_<name>,`.  Constness is cast
 * away on return to match the return type.
 */
extern "C" HB_INTERNAL
hb_unicode_funcs_t *
hb_ucdn_get_unicode_funcs (void)
{
    static const hb_unicode_funcs_t ucdn_unicode_funcs = {
        HB_OBJECT_HEADER_STATIC,

        NULL, /* parent */
        true, /* immutable */
        {
#define HB_UNICODE_FUNC_IMPLEMENT(name) hb_ucdn_##name,
            HB_UNICODE_FUNCS_IMPLEMENT_CALLBACKS
#undef HB_UNICODE_FUNC_IMPLEMENT
        }
    };
    const hb_unicode_funcs_t *table = &ucdn_unicode_funcs;

    return const_cast<hb_unicode_funcs_t *> (table);
}
||||
|
@ -0,0 +1,18 @@ |
||||
## Process this file with automake to produce Makefile.in

noinst_LTLIBRARIES = libhb-ucdn.la

libhb_ucdn_la_SOURCES = \
	ucdn.h \
	ucdn.c \
	unicodedata_db.h
# Per-target variables must be prefixed with the canonicalized target name,
# which for libhb-ucdn.la is libhb_ucdn_la (not libhb_ucdn); otherwise
# automake does not apply these include paths to the library.
libhb_ucdn_la_CPPFLAGS = \
	-I$(top_srcdir) \
	-I$(top_srcdir)/src \
	-I$(top_builddir)/src
libhb_ucdn_la_LIBADD =

EXTRA_DIST = README

-include $(top_srcdir)/git.mk
@ -0,0 +1,33 @@ |
||||
UCDN - Unicode Database and Normalization |
||||
|
||||
UCDN is a Unicode support library. Currently, it provides access |
||||
to basic character properties contained in the Unicode Character |
||||
Database and low-level normalization functions (pairwise canonical |
||||
composition/decomposition and compatibility decomposition). More |
||||
functionality might be provided in the future, such as additional |
||||
properties, string normalization and encoding conversion. |
||||
|
||||
UCDN uses standard C89 with no particular dependencies or requirements |
||||
except for stdint.h, and can be easily integrated into existing |
||||
projects. However, it can also be used as a standalone library, |
||||
and a CMake build script is provided for this. The first motivation |
||||
behind UCDN development was to provide a standalone set of Unicode |
||||
functions for the HarfBuzz OpenType shaping library. For this purpose, |
||||
a HarfBuzz-specific wrapper is shipped along with it (hb-ucdn.h). |
||||
|
||||
UCDN is published under the ISC license, please see the license header |
||||
in the C source code for more information. The makeunicodedata.py script |
||||
required for parsing Unicode database files is licensed under the |
||||
PSF license, please see PYTHON-LICENSE for more information. |
||||
|
||||
UCDN was written by Grigori Goronzy <greg@kinoho.net>. |
||||
|
||||
How to Use |
||||
|
||||
Include ucdn.c, ucdn.h and unicodedata_db.h in your project. Now, |
||||
just use the functions as documented in ucdn.h. |
||||
|
||||
In some cases, it might be necessary to regenerate the Unicode |
||||
database file. The script makeunicodedata.py (Python 3.x required) |
||||
fetches the appropriate files and dumps the compressed database into |
||||
unicodedata_db.h. |
@ -0,0 +1,282 @@ |
||||
/*
|
||||
* Copyright (C) 2012 Grigori Goronzy <greg@kinoho.net> |
||||
* |
||||
* Permission to use, copy, modify, and/or distribute this software for any |
||||
* purpose with or without fee is hereby granted, provided that the above |
||||
* copyright notice and this permission notice appear in all copies. |
||||
* |
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
||||
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
||||
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
||||
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
||||
*/ |
||||
|
||||
#include <stdio.h> |
||||
#include <stdlib.h> |
||||
#include <stdint.h> |
||||
#include "ucdn.h" |
||||
|
||||
/*
 * Property record shared by all codepoints with identical properties.
 * Instances are looked up through get_ucd_record(); each field backs the
 * public accessor named in its comment.
 */
typedef struct {
    const unsigned char category;            /* ucdn_get_general_category() */
    const unsigned char combining;           /* ucdn_get_combining_class() */
    const unsigned char bidi_class;          /* ucdn_get_bidi_class() */
    const unsigned char mirrored;            /* ucdn_get_mirrored(); non-zero enables ucdn_mirror() lookup */
    const unsigned char east_asian_width;    /* ucdn_get_east_asian_width() */
    const unsigned char normalization_check; /* not read by any function in this file */
    const unsigned char script;              /* ucdn_get_script() */
} UCDRecord;
||||
|
||||
/*
 * One entry of the bidi mirroring table searched by ucdn_mirror():
 * `from` is the lookup key, `to` is the mirrored codepoint returned.
 */
typedef struct {
    unsigned short from;
    unsigned short to;
} MirrorPair;
||||
|
||||
/*
 * Range descriptor used by get_comp_index() to turn a codepoint into a
 * compact composition index.  A range covers the codepoints `start` through
 * `start + count` inclusive; `index` is the compact index assigned to
 * `start`.  An entry whose `start` is 0 terminates the array.
 */
typedef struct {
    int start;
    short count;
    short index;
} Reindex;
||||
|
||||
#include "unicodedata_db.h" |
||||
|
||||
/*
 * Constants required for algorithmic Hangul (de)composition.
 *
 * xBASE is the codepoint an index of that kind is added to, and xCOUNT is
 * the number of values of that kind.  TBASE is one below the first trailing
 * consonant, so a trailing index of 0 means "no trailing consonant".
 */
#define SBASE  0xAC00   /* precomposed syllables */
#define LBASE  0x1100   /* leading consonants (L) */
#define VBASE  0x1161   /* vowels (V) */
#define TBASE  0x11A7   /* trailing consonants (T), see note above */
#define LCOUNT 19
#define VCOUNT 21
#define TCOUNT 28
#define NCOUNT (VCOUNT * TCOUNT)    /* syllables per leading consonant: 588 */
#define SCOUNT (LCOUNT * NCOUNT)    /* total precomposed syllables: 11172 */
||||
|
||||
static UCDRecord *get_ucd_record(uint32_t code) |
||||
{ |
||||
int index, offset; |
||||
|
||||
if (code >= 0x110000) |
||||
index = 0; |
||||
else { |
||||
index = index0[code >> (SHIFT1+SHIFT2)] << SHIFT1; |
||||
offset = (code >> SHIFT2) & ((1<<SHIFT1) - 1); |
||||
index = index1[index + offset] << SHIFT2; |
||||
offset = code & ((1<<SHIFT2) - 1); |
||||
index = index2[index + offset]; |
||||
} |
||||
|
||||
return &ucd_records[index]; |
||||
} |
||||
|
||||
static unsigned short *get_decomp_record(uint32_t code) |
||||
{ |
||||
int index, offset; |
||||
|
||||
if (code >= 0x110000) |
||||
index = 0; |
||||
else { |
||||
index = decomp_index0[code >> (DECOMP_SHIFT1+DECOMP_SHIFT2)] |
||||
<< DECOMP_SHIFT1; |
||||
offset = (code >> DECOMP_SHIFT2) & ((1<<DECOMP_SHIFT1) - 1); |
||||
index = decomp_index1[index + offset] << DECOMP_SHIFT2; |
||||
offset = code & ((1<<DECOMP_SHIFT2) - 1); |
||||
index = decomp_index2[index + offset]; |
||||
} |
||||
|
||||
return &decomp_data[index]; |
||||
} |
||||
|
||||
/*
 * Map a codepoint to its compact composition index.
 *
 * `idx` is scanned in order until the entry whose start is 0.  The scan
 * gives up as soon as an entry starts above `code`, so the entries are
 * expected to be sorted by ascending start.
 *
 * Returns the compact index, or -1 when no range contains `code`.
 */
static int get_comp_index(uint32_t code, Reindex *idx)
{
    Reindex *range;

    for (range = idx; range->start; range++) {
        if (code < range->start)
            break;
        if (code <= range->start + range->count)
            return range->index + (code - range->start);
    }

    return -1;
}
||||
|
||||
static int compare_mp(const void *a, const void *b) |
||||
{ |
||||
MirrorPair *mpa = (MirrorPair *)a; |
||||
MirrorPair *mpb = (MirrorPair *)b; |
||||
return mpa->from - mpb->from; |
||||
} |
||||
|
||||
static int hangul_pair_decompose(uint32_t code, uint32_t *a, uint32_t *b) |
||||
{ |
||||
int si = code - SBASE; |
||||
|
||||
if (si < 0 || si >= SCOUNT) |
||||
return 0; |
||||
|
||||
if (si % TCOUNT) { |
||||
/* LV,T */ |
||||
*a = SBASE + (si / TCOUNT) * TCOUNT; |
||||
*b = TBASE + (si % TCOUNT); |
||||
return 3; |
||||
} else { |
||||
/* L,V */ |
||||
*a = LBASE + (si / NCOUNT); |
||||
*b = VBASE + (si % NCOUNT) / TCOUNT; |
||||
return 2; |
||||
} |
||||
} |
||||
|
||||
static int hangul_pair_compose(uint32_t *code, uint32_t a, uint32_t b) |
||||
{ |
||||
if (b < VBASE || b >= (TBASE + TCOUNT)) |
||||
return 0; |
||||
|
||||
if ((a < LBASE || a >= (LBASE + LCOUNT)) |
||||
&& (a < SBASE || a >= (SBASE + SCOUNT))) |
||||
return 0; |
||||
|
||||
if (a >= SBASE) { |
||||
/* LV,T */ |
||||
*code = a + (b - TBASE); |
||||
return 3; |
||||
} else { |
||||
/* L,V */ |
||||
int li = a - LBASE; |
||||
int vi = b - VBASE; |
||||
*code = SBASE + li * NCOUNT + vi * TCOUNT; |
||||
return 2; |
||||
} |
||||
} |
||||
|
||||
/*
 * Decode one codepoint from a UTF-16 sequence and advance the cursor.
 *
 * @param code_ptr in/out cursor into the UTF-16 data; advanced by one unit
 *                 for a BMP codepoint and by two units for a surrogate pair
 * @return the decoded codepoint
 *
 * A unit starts a surrogate pair only if it is a high surrogate, i.e. lies
 * in 0xD800..0xDBFF, which is what masking with 0xfc00 tests.  Testing with
 * the mask 0xd800 is not sufficient: it also matches 0xF800..0xFFFF and the
 * low surrogates, and would consume a second unit for those.  The unit
 * following a high surrogate is assumed to be a low surrogate and is not
 * validated.
 */
static uint32_t decode_utf16(unsigned short **code_ptr)
{
    unsigned short *code = *code_ptr;

    if ((code[0] & 0xfc00) != 0xd800) {
        *code_ptr += 1;
        return (uint32_t)code[0];
    } else {
        *code_ptr += 2;
        return 0x10000 + ((uint32_t)code[1] - 0xdc00) +
            (((uint32_t)code[0] - 0xd800) << 10);
    }
}
||||
|
||||
const char *ucdn_get_unicode_version(void) |
||||
{ |
||||
return UNIDATA_VERSION; |
||||
} |
||||
|
||||
int ucdn_get_combining_class(uint32_t code) |
||||
{ |
||||
return get_ucd_record(code)->combining; |
||||
} |
||||
|
||||
int ucdn_get_east_asian_width(uint32_t code) |
||||
{ |
||||
return get_ucd_record(code)->east_asian_width; |
||||
} |
||||
|
||||
int ucdn_get_general_category(uint32_t code) |
||||
{ |
||||
return get_ucd_record(code)->category; |
||||
} |
||||
|
||||
int ucdn_get_bidi_class(uint32_t code) |
||||
{ |
||||
return get_ucd_record(code)->bidi_class; |
||||
} |
||||
|
||||
int ucdn_get_mirrored(uint32_t code) |
||||
{ |
||||
return get_ucd_record(code)->mirrored; |
||||
} |
||||
|
||||
int ucdn_get_script(uint32_t code) |
||||
{ |
||||
return get_ucd_record(code)->script; |
||||
} |
||||
|
||||
uint32_t ucdn_mirror(uint32_t code) |
||||
{ |
||||
MirrorPair mp = {0}; |
||||
MirrorPair *res; |
||||
|
||||
if (get_ucd_record(code)->mirrored == 0) |
||||
return code; |
||||
|
||||
mp.from = code; |
||||
res = bsearch(&mp, mirror_pairs, BIDI_MIRROR_LEN, sizeof(MirrorPair), |
||||
compare_mp); |
||||
|
||||
if (res == NULL) |
||||
return code; |
||||
else |
||||
return res->to; |
||||
} |
||||
|
||||
/*
 * Pairwise canonical decomposition.
 *
 * Hangul syllables are handled algorithmically first.  Otherwise the
 * decomposition entry is consulted: the high byte of its header word is the
 * length and the low byte must be zero for the entry to be used here
 * (presumably a non-zero low byte marks a compatibility mapping -- confirm
 * against the database generator).
 *
 * On success *a receives the first codepoint and *b the second one, or 0 if
 * the decomposition has a single codepoint.  Returns 1 on success and 0 if
 * nothing was decomposed.
 */
int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b)
{
    unsigned short *rec;
    int len, tag;

    if (hangul_pair_decompose(code, a, b))
        return 1;

    rec = get_decomp_record(code);
    len = rec[0] >> 8;
    tag = rec[0] & 0xff;

    if (tag != 0 || len == 0)
        return 0;

    /* skip the header word; the UTF-16 encoded decomposition follows */
    rec++;
    *a = decode_utf16(&rec);
    *b = (len > 1) ? decode_utf16(&rec) : 0;

    return 1;
}
||||
|
||||
int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b) |
||||
{ |
||||
int l, r, index, indexi, offset; |
||||
|
||||
if (hangul_pair_compose(code, a, b)) |
||||
return 1; |
||||
|
||||
l = get_comp_index(a, nfc_first); |
||||
r = get_comp_index(b, nfc_last); |
||||
|
||||
if (l < 0 || r < 0) |
||||
return 0; |
||||
|
||||
indexi = l * TOTAL_LAST + r; |
||||
index = comp_index0[indexi >> (COMP_SHIFT1+COMP_SHIFT2)] << COMP_SHIFT1; |
||||
offset = (indexi >> COMP_SHIFT2) & ((1<<COMP_SHIFT1) - 1); |
||||
index = comp_index1[index + offset] << COMP_SHIFT2; |
||||
offset = indexi & ((1<<COMP_SHIFT2) - 1); |
||||
*code = comp_data[index + offset]; |
||||
|
||||
return *code != 0; |
||||
} |
||||
|
||||
/*
 * Full decomposition of a codepoint as stored in the decomposition table.
 *
 * The length is taken from the high byte of the entry's header word; the
 * UTF-16 encoded codepoints that follow are decoded into `decomposed`,
 * which must have room for that many elements.
 *
 * Returns the number of codepoints written, or 0 when the codepoint has no
 * decomposition entry.
 */
int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed)
{
    unsigned short *rec = get_decomp_record(code);
    int len = rec[0] >> 8;
    int written = 0;

    if (len == 0)
        return 0;

    rec++;
    while (written < len) {
        decomposed[written] = decode_utf16(&rec);
        written++;
    }

    return len;
}
@ -0,0 +1,290 @@ |
||||
/*
|
||||
* Copyright (C) 2012 Grigori Goronzy <greg@kinoho.net> |
||||
* |
||||
* Permission to use, copy, modify, and/or distribute this software for any |
||||
* purpose with or without fee is hereby granted, provided that the above |
||||
* copyright notice and this permission notice appear in all copies. |
||||
* |
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
||||
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
||||
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
||||
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
||||
*/ |
||||
|
||||
#ifndef UCDN_H |
||||
#define UCDN_H |
||||
|
||||
#include <stdint.h> |
||||
|
||||
/* East Asian width classes, as returned by ucdn_get_east_asian_width(). */
#define UCDN_EAST_ASIAN_F   0
#define UCDN_EAST_ASIAN_H   1
#define UCDN_EAST_ASIAN_W   2
#define UCDN_EAST_ASIAN_NA  3
#define UCDN_EAST_ASIAN_A   4
#define UCDN_EAST_ASIAN_N   5
||||
|
||||
/*
 * Script identifiers, as returned by ucdn_get_script().  The values are
 * consecutive starting at 0.
 */
#define UCDN_SCRIPT_COMMON                  0
#define UCDN_SCRIPT_LATIN                   1
#define UCDN_SCRIPT_GREEK                   2
#define UCDN_SCRIPT_CYRILLIC                3
#define UCDN_SCRIPT_ARMENIAN                4
#define UCDN_SCRIPT_HEBREW                  5
#define UCDN_SCRIPT_ARABIC                  6
#define UCDN_SCRIPT_SYRIAC                  7
#define UCDN_SCRIPT_THAANA                  8
#define UCDN_SCRIPT_DEVANAGARI              9
#define UCDN_SCRIPT_BENGALI                 10
#define UCDN_SCRIPT_GURMUKHI                11
#define UCDN_SCRIPT_GUJARATI                12
#define UCDN_SCRIPT_ORIYA                   13
#define UCDN_SCRIPT_TAMIL                   14
#define UCDN_SCRIPT_TELUGU                  15
#define UCDN_SCRIPT_KANNADA                 16
#define UCDN_SCRIPT_MALAYALAM               17
#define UCDN_SCRIPT_SINHALA                 18
#define UCDN_SCRIPT_THAI                    19
#define UCDN_SCRIPT_LAO                     20
#define UCDN_SCRIPT_TIBETAN                 21
#define UCDN_SCRIPT_MYANMAR                 22
#define UCDN_SCRIPT_GEORGIAN                23
#define UCDN_SCRIPT_HANGUL                  24
#define UCDN_SCRIPT_ETHIOPIC                25
#define UCDN_SCRIPT_CHEROKEE                26
#define UCDN_SCRIPT_CANADIAN_ABORIGINAL     27
#define UCDN_SCRIPT_OGHAM                   28
#define UCDN_SCRIPT_RUNIC                   29
#define UCDN_SCRIPT_KHMER                   30
#define UCDN_SCRIPT_MONGOLIAN               31
#define UCDN_SCRIPT_HIRAGANA                32
#define UCDN_SCRIPT_KATAKANA                33
#define UCDN_SCRIPT_BOPOMOFO                34
#define UCDN_SCRIPT_HAN                     35
#define UCDN_SCRIPT_YI                      36
#define UCDN_SCRIPT_OLD_ITALIC              37
#define UCDN_SCRIPT_GOTHIC                  38
#define UCDN_SCRIPT_DESERET                 39
#define UCDN_SCRIPT_INHERITED               40
#define UCDN_SCRIPT_TAGALOG                 41
#define UCDN_SCRIPT_HANUNOO                 42
#define UCDN_SCRIPT_BUHID                   43
#define UCDN_SCRIPT_TAGBANWA                44
#define UCDN_SCRIPT_LIMBU                   45
#define UCDN_SCRIPT_TAI_LE                  46
#define UCDN_SCRIPT_LINEAR_B                47
#define UCDN_SCRIPT_UGARITIC                48
#define UCDN_SCRIPT_SHAVIAN                 49
#define UCDN_SCRIPT_OSMANYA                 50
#define UCDN_SCRIPT_CYPRIOT                 51
#define UCDN_SCRIPT_BRAILLE                 52
#define UCDN_SCRIPT_BUGINESE                53
#define UCDN_SCRIPT_COPTIC                  54
#define UCDN_SCRIPT_NEW_TAI_LUE             55
#define UCDN_SCRIPT_GLAGOLITIC              56
#define UCDN_SCRIPT_TIFINAGH                57
#define UCDN_SCRIPT_SYLOTI_NAGRI            58
#define UCDN_SCRIPT_OLD_PERSIAN             59
#define UCDN_SCRIPT_KHAROSHTHI              60
#define UCDN_SCRIPT_BALINESE                61
#define UCDN_SCRIPT_CUNEIFORM               62
#define UCDN_SCRIPT_PHOENICIAN              63
#define UCDN_SCRIPT_PHAGS_PA                64
#define UCDN_SCRIPT_NKO                     65
#define UCDN_SCRIPT_SUNDANESE               66
#define UCDN_SCRIPT_LEPCHA                  67
#define UCDN_SCRIPT_OL_CHIKI                68
#define UCDN_SCRIPT_VAI                     69
#define UCDN_SCRIPT_SAURASHTRA              70
#define UCDN_SCRIPT_KAYAH_LI                71
#define UCDN_SCRIPT_REJANG                  72
#define UCDN_SCRIPT_LYCIAN                  73
#define UCDN_SCRIPT_CARIAN                  74
#define UCDN_SCRIPT_LYDIAN                  75
#define UCDN_SCRIPT_CHAM                    76
#define UCDN_SCRIPT_TAI_THAM                77
#define UCDN_SCRIPT_TAI_VIET                78
#define UCDN_SCRIPT_AVESTAN                 79
#define UCDN_SCRIPT_EGYPTIAN_HIEROGLYPHS    80
#define UCDN_SCRIPT_SAMARITAN               81
#define UCDN_SCRIPT_LISU                    82
#define UCDN_SCRIPT_BAMUM                   83
#define UCDN_SCRIPT_JAVANESE                84
#define UCDN_SCRIPT_MEETEI_MAYEK            85
#define UCDN_SCRIPT_IMPERIAL_ARAMAIC        86
#define UCDN_SCRIPT_OLD_SOUTH_ARABIAN       87
#define UCDN_SCRIPT_INSCRIPTIONAL_PARTHIAN  88
#define UCDN_SCRIPT_INSCRIPTIONAL_PAHLAVI   89
#define UCDN_SCRIPT_OLD_TURKIC              90
#define UCDN_SCRIPT_KAITHI                  91
#define UCDN_SCRIPT_BATAK                   92
#define UCDN_SCRIPT_BRAHMI                  93
#define UCDN_SCRIPT_MANDAIC                 94
#define UCDN_SCRIPT_CHAKMA                  95
#define UCDN_SCRIPT_MEROITIC_CURSIVE        96
#define UCDN_SCRIPT_MEROITIC_HIEROGLYPHS    97
#define UCDN_SCRIPT_MIAO                    98
#define UCDN_SCRIPT_SHARADA                 99
#define UCDN_SCRIPT_SORA_SOMPENG            100
#define UCDN_SCRIPT_TAKRI                   101
#define UCDN_SCRIPT_UNKNOWN                 102
||||
|
||||
/*
 * General categories, as returned by ucdn_get_general_category().  The
 * suffix is the two-letter category abbreviation.
 */
#define UCDN_GENERAL_CATEGORY_CC  0
#define UCDN_GENERAL_CATEGORY_CF  1
#define UCDN_GENERAL_CATEGORY_CN  2
#define UCDN_GENERAL_CATEGORY_CO  3
#define UCDN_GENERAL_CATEGORY_CS  4
#define UCDN_GENERAL_CATEGORY_LL  5
#define UCDN_GENERAL_CATEGORY_LM  6
#define UCDN_GENERAL_CATEGORY_LO  7
#define UCDN_GENERAL_CATEGORY_LT  8
#define UCDN_GENERAL_CATEGORY_LU  9
#define UCDN_GENERAL_CATEGORY_MC  10
#define UCDN_GENERAL_CATEGORY_ME  11
#define UCDN_GENERAL_CATEGORY_MN  12
#define UCDN_GENERAL_CATEGORY_ND  13
#define UCDN_GENERAL_CATEGORY_NL  14
#define UCDN_GENERAL_CATEGORY_NO  15
#define UCDN_GENERAL_CATEGORY_PC  16
#define UCDN_GENERAL_CATEGORY_PD  17
#define UCDN_GENERAL_CATEGORY_PE  18
#define UCDN_GENERAL_CATEGORY_PF  19
#define UCDN_GENERAL_CATEGORY_PI  20
#define UCDN_GENERAL_CATEGORY_PO  21
#define UCDN_GENERAL_CATEGORY_PS  22
#define UCDN_GENERAL_CATEGORY_SC  23
#define UCDN_GENERAL_CATEGORY_SK  24
#define UCDN_GENERAL_CATEGORY_SM  25
#define UCDN_GENERAL_CATEGORY_SO  26
#define UCDN_GENERAL_CATEGORY_ZL  27
#define UCDN_GENERAL_CATEGORY_ZP  28
#define UCDN_GENERAL_CATEGORY_ZS  29
||||
|
||||
/* Bidirectional classes, as returned by ucdn_get_bidi_class(). */
#define UCDN_BIDI_CLASS_L    0
#define UCDN_BIDI_CLASS_LRE  1
#define UCDN_BIDI_CLASS_LRO  2
#define UCDN_BIDI_CLASS_R    3
#define UCDN_BIDI_CLASS_AL   4
#define UCDN_BIDI_CLASS_RLE  5
#define UCDN_BIDI_CLASS_RLO  6
#define UCDN_BIDI_CLASS_PDF  7
#define UCDN_BIDI_CLASS_EN   8
#define UCDN_BIDI_CLASS_ES   9
#define UCDN_BIDI_CLASS_ET   10
#define UCDN_BIDI_CLASS_AN   11
#define UCDN_BIDI_CLASS_CS   12
#define UCDN_BIDI_CLASS_NSM  13
#define UCDN_BIDI_CLASS_BN   14
#define UCDN_BIDI_CLASS_B    15
#define UCDN_BIDI_CLASS_S    16
#define UCDN_BIDI_CLASS_WS   17
#define UCDN_BIDI_CLASS_ON   18
||||
|
||||
/**
|
||||
* Return version of the Unicode database. |
||||
* |
||||
* @return Unicode database version |
||||
*/ |
||||
const char *ucdn_get_unicode_version(void); |
||||
|
||||
/**
|
||||
* Get combining class of a codepoint. |
||||
* |
||||
* @param code Unicode codepoint |
||||
* @return combining class value, as defined in UAX#44 |
||||
*/ |
||||
int ucdn_get_combining_class(uint32_t code); |
||||
|
||||
/**
|
||||
* Get east-asian width of a codepoint. |
||||
* |
||||
* @param code Unicode codepoint |
||||
* @return value according to UCDN_EAST_ASIAN_* and as defined in UAX#11. |
||||
*/ |
||||
int ucdn_get_east_asian_width(uint32_t code); |
||||
|
||||
/**
|
||||
* Get general category of a codepoint. |
||||
* |
||||
* @param code Unicode codepoint |
||||
* @return value according to UCDN_GENERAL_CATEGORY_* and as defined in |
||||
* UAX#44. |
||||
*/ |
||||
int ucdn_get_general_category(uint32_t code); |
||||
|
||||
/**
|
||||
* Get bidirectional class of a codepoint. |
||||
* |
||||
* @param code Unicode codepoint |
||||
* @return value according to UCDN_BIDI_CLASS_* and as defined in UAX#44. |
||||
*/ |
||||
int ucdn_get_bidi_class(uint32_t code); |
||||
|
||||
/**
|
||||
* Get script of a codepoint. |
||||
* |
||||
* @param code Unicode codepoint |
||||
* @return value according to UCDN_SCRIPT_* and as defined in UAX#24. |
||||
*/ |
||||
int ucdn_get_script(uint32_t code); |
||||
|
||||
/**
|
||||
* Check if codepoint can be mirrored. |
||||
* |
||||
* @param code Unicode codepoint |
||||
* @return 1 if mirrored character exists, otherwise 0 |
||||
*/ |
||||
int ucdn_get_mirrored(uint32_t code); |
||||
|
||||
/**
|
||||
* Mirror a codepoint. |
||||
* |
||||
* @param code Unicode codepoint |
||||
* @return mirrored codepoint or the original codepoint if no |
||||
* mirrored character exists |
||||
*/ |
||||
uint32_t ucdn_mirror(uint32_t code); |
||||
|
||||
/**
|
||||
* Pairwise canonical decomposition of a codepoint. This includes |
||||
* Hangul Jamo decomposition (see chapter 3.12 of the Unicode core |
||||
* specification). |
||||
* |
||||
* Hangul is decomposed into L and V jamos for LV forms, and an |
||||
* LV precomposed syllable and a T jamo for LVT forms. |
||||
* |
||||
* @param code Unicode codepoint |
||||
* @param a filled with first codepoint of decomposition |
||||
* @param b filled with second codepoint of decomposition, or 0 |
||||
* @return 1 if the codepoint was decomposed, 0 if it has no canonical decomposition |
||||
*/ |
||||
int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b); |
||||
|
||||
/**
|
||||
* Compatibility decomposition of a codepoint. |
||||
* |
||||
* @param code Unicode codepoint |
||||
* @param decomposed filled with decomposition, must be able to hold 18 |
||||
* characters |
||||
* @return length of decomposition or 0 in case none exists |
||||
*/ |
||||
int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed); |
||||
|
||||
/**
|
||||
* Pairwise canonical composition of two codepoints. This includes |
||||
* Hangul Jamo composition (see chapter 3.12 of the Unicode core |
||||
* specification). |
||||
* |
||||
* Hangul composition expects either L and V jamos, or an LV |
||||
* precomposed syllable and a T jamo. This is exactly the inverse |
||||
* of pairwise Hangul decomposition. |
||||
* |
||||
* @param code filled with composition |
||||
* @param a first codepoint |
||||
* @param b second codepoint |
||||
* @return 1 if the two codepoints compose, otherwise 0 |
||||
*/ |
||||
int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b); |
||||
|
||||
#endif |
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue