Improve unicode table handling (#286)

- Document table and index formats
- Add size statistics
- Fix UBSAN issue in `get_le24()`

Fixes #285
This commit is contained in:
Charlie Gordon 2024-05-05 12:10:24 +02:00 committed by GitHub
parent 3b45d155c7
commit 1402478d8d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 233 additions and 78 deletions

View file

@ -189,9 +189,13 @@ static const uint8_t unicode_prop_Cased1_table[196] = {
}; };
static const uint8_t unicode_prop_Cased1_index[21] = { static const uint8_t unicode_prop_Cased1_index[21] = {
0xb9, 0x02, 0xe0, 0xc0, 0x1d, 0x20, 0xe5, 0x2c, 0xb9, 0x02, 0xe0, // 002B9 at 39
0x20, 0xb1, 0x07, 0x21, 0xc1, 0xd6, 0x21, 0x4a, 0xc0, 0x1d, 0x20, // 01DC0 at 65
0xf1, 0x01, 0x8a, 0xf1, 0x01, 0xe5, 0x2c, 0x20, // 02CE5 at 97
0xb1, 0x07, 0x21, // 107B1 at 129
0xc1, 0xd6, 0x21, // 1D6C1 at 161
0x4a, 0xf1, 0x01, // 1F14A at 192
0x8a, 0xf1, 0x01, // 1F18A at 224 (upper bound)
}; };
static const uint8_t unicode_prop_Case_Ignorable_table[737] = { static const uint8_t unicode_prop_Case_Ignorable_table[737] = {
@ -291,15 +295,29 @@ static const uint8_t unicode_prop_Case_Ignorable_table[737] = {
}; };
static const uint8_t unicode_prop_Case_Ignorable_index[69] = { static const uint8_t unicode_prop_Case_Ignorable_index[69] = {
0xbe, 0x05, 0x00, 0xfe, 0x07, 0x00, 0x52, 0x0a, 0xbe, 0x05, 0x00, // 005BE at 32
0xa0, 0xc1, 0x0b, 0x00, 0x82, 0x0d, 0x00, 0x3f, 0xfe, 0x07, 0x00, // 007FE at 64
0x10, 0x80, 0xd4, 0x17, 0x40, 0xcf, 0x1a, 0x20, 0x52, 0x0a, 0xa0, // 00A52 at 101
0xf5, 0x1c, 0x00, 0x80, 0x20, 0x00, 0x16, 0xa0, 0xc1, 0x0b, 0x00, // 00BC1 at 128
0x00, 0xc6, 0xa8, 0x00, 0xc2, 0xaa, 0x60, 0x56, 0x82, 0x0d, 0x00, // 00D82 at 160
0xfe, 0x20, 0xb1, 0x07, 0x01, 0x75, 0x10, 0x01, 0x3f, 0x10, 0x80, // 0103F at 196
0xeb, 0x12, 0x21, 0x41, 0x16, 0x01, 0x5c, 0x1a, 0xd4, 0x17, 0x40, // 017D4 at 226
0x01, 0x43, 0x1f, 0x01, 0x2e, 0xcf, 0x41, 0x25, 0xcf, 0x1a, 0x20, // 01ACF at 257
0xe0, 0x01, 0xf0, 0x01, 0x0e, 0xf5, 0x1c, 0x00, // 01CF5 at 288
0x80, 0x20, 0x00, // 02080 at 320
0x16, 0xa0, 0x00, // 0A016 at 352
0xc6, 0xa8, 0x00, // 0A8C6 at 384
0xc2, 0xaa, 0x60, // 0AAC2 at 419
0x56, 0xfe, 0x20, // 0FE56 at 449
0xb1, 0x07, 0x01, // 107B1 at 480
0x75, 0x10, 0x01, // 11075 at 512
0xeb, 0x12, 0x21, // 112EB at 545
0x41, 0x16, 0x01, // 11641 at 576
0x5c, 0x1a, 0x01, // 11A5C at 608
0x43, 0x1f, 0x01, // 11F43 at 640
0x2e, 0xcf, 0x41, // 1CF2E at 674
0x25, 0xe0, 0x01, // 1E025 at 704
0xf0, 0x01, 0x0e, // E01F0 at 736 (upper bound)
}; };
static const uint8_t unicode_prop_ID_Start_table[1100] = { static const uint8_t unicode_prop_ID_Start_table[1100] = {
@ -444,20 +462,41 @@ static const uint8_t unicode_prop_ID_Start_table[1100] = {
}; };
static const uint8_t unicode_prop_ID_Start_index[105] = { static const uint8_t unicode_prop_ID_Start_index[105] = {
0xf6, 0x03, 0x20, 0xa6, 0x07, 0x00, 0xa9, 0x09, 0xf6, 0x03, 0x20, // 003F6 at 33
0x20, 0xb1, 0x0a, 0x00, 0xba, 0x0b, 0x20, 0x3b, 0xa6, 0x07, 0x00, // 007A6 at 64
0x0d, 0x20, 0xc7, 0x0e, 0x20, 0x49, 0x12, 0x00, 0xa9, 0x09, 0x20, // 009A9 at 97
0x9b, 0x16, 0x00, 0xac, 0x19, 0x00, 0xc0, 0x1d, 0xb1, 0x0a, 0x00, // 00AB1 at 128
0x80, 0x80, 0x20, 0x20, 0x70, 0x2d, 0x00, 0x00, 0xba, 0x0b, 0x20, // 00BBA at 161
0x32, 0x00, 0xda, 0xa7, 0x00, 0x4c, 0xaa, 0x20, 0x3b, 0x0d, 0x20, // 00D3B at 193
0xc7, 0xd7, 0x20, 0xfc, 0xfd, 0x20, 0x9d, 0x02, 0xc7, 0x0e, 0x20, // 00EC7 at 225
0x21, 0x96, 0x05, 0x01, 0xf3, 0x08, 0x01, 0xb3, 0x49, 0x12, 0x00, // 01249 at 256
0x0c, 0x21, 0x73, 0x11, 0x61, 0x34, 0x13, 0x01, 0x9b, 0x16, 0x00, // 0169B at 288
0x1b, 0x17, 0x21, 0x8a, 0x1a, 0x01, 0x34, 0x1f, 0xac, 0x19, 0x00, // 019AC at 320
0x21, 0xbf, 0x6a, 0x01, 0x23, 0xb1, 0xa1, 0xad, 0xc0, 0x1d, 0x80, // 01DC0 at 356
0xd4, 0x01, 0x6f, 0xd7, 0x01, 0xff, 0xe7, 0x61, 0x80, 0x20, 0x20, // 02080 at 385
0x5e, 0xee, 0x01, 0xe1, 0xeb, 0x22, 0xb0, 0x23, 0x70, 0x2d, 0x00, // 02D70 at 416
0x03, 0x00, 0x32, 0x00, // 03200 at 448
0xda, 0xa7, 0x00, // 0A7DA at 480
0x4c, 0xaa, 0x20, // 0AA4C at 513
0xc7, 0xd7, 0x20, // 0D7C7 at 545
0xfc, 0xfd, 0x20, // 0FDFC at 577
0x9d, 0x02, 0x21, // 1029D at 609
0x96, 0x05, 0x01, // 10596 at 640
0xf3, 0x08, 0x01, // 108F3 at 672
0xb3, 0x0c, 0x21, // 10CB3 at 705
0x73, 0x11, 0x61, // 11173 at 739
0x34, 0x13, 0x01, // 11334 at 768
0x1b, 0x17, 0x21, // 1171B at 801
0x8a, 0x1a, 0x01, // 11A8A at 832
0x34, 0x1f, 0x21, // 11F34 at 865
0xbf, 0x6a, 0x01, // 16ABF at 896
0x23, 0xb1, 0xa1, // 1B123 at 933
0xad, 0xd4, 0x01, // 1D4AD at 960
0x6f, 0xd7, 0x01, // 1D76F at 992
0xff, 0xe7, 0x61, // 1E7FF at 1027
0x5e, 0xee, 0x01, // 1EE5E at 1056
0xe1, 0xeb, 0x22, // 2EBE1 at 1089
0xb0, 0x23, 0x03, // 323B0 at 1120 (upper bound)
}; };
static const uint8_t unicode_prop_ID_Continue1_table[660] = { static const uint8_t unicode_prop_ID_Continue1_table[660] = {
@ -547,14 +586,27 @@ static const uint8_t unicode_prop_ID_Continue1_table[660] = {
}; };
static const uint8_t unicode_prop_ID_Continue1_index[63] = { static const uint8_t unicode_prop_ID_Continue1_index[63] = {
0xfa, 0x06, 0x00, 0x70, 0x09, 0x00, 0xf0, 0x0a, 0xfa, 0x06, 0x00, // 006FA at 32
0x40, 0x57, 0x0c, 0x00, 0xf0, 0x0d, 0x60, 0xc7, 0x70, 0x09, 0x00, // 00970 at 64
0x0f, 0x20, 0xea, 0x17, 0x40, 0x05, 0x1b, 0x00, 0xf0, 0x0a, 0x40, // 00AF0 at 98
0x41, 0x20, 0x00, 0x0c, 0xa8, 0x80, 0x37, 0xaa, 0x57, 0x0c, 0x00, // 00C57 at 128
0x20, 0x50, 0xfe, 0x20, 0x3a, 0x0d, 0x21, 0x74, 0xf0, 0x0d, 0x60, // 00DF0 at 163
0x11, 0x01, 0x5a, 0x14, 0x21, 0x44, 0x19, 0x81, 0xc7, 0x0f, 0x20, // 00FC7 at 193
0x5a, 0x1d, 0xa1, 0xf5, 0x6a, 0x21, 0x45, 0xd2, 0xea, 0x17, 0x40, // 017EA at 226
0x41, 0xaf, 0xe2, 0x21, 0xf0, 0x01, 0x0e, 0x05, 0x1b, 0x00, // 01B05 at 256
0x41, 0x20, 0x00, // 02041 at 288
0x0c, 0xa8, 0x80, // 0A80C at 324
0x37, 0xaa, 0x20, // 0AA37 at 353
0x50, 0xfe, 0x20, // 0FE50 at 385
0x3a, 0x0d, 0x21, // 10D3A at 417
0x74, 0x11, 0x01, // 11174 at 448
0x5a, 0x14, 0x21, // 1145A at 481
0x44, 0x19, 0x81, // 11944 at 516
0x5a, 0x1d, 0xa1, // 11D5A at 549
0xf5, 0x6a, 0x21, // 16AF5 at 577
0x45, 0xd2, 0x41, // 1D245 at 610
0xaf, 0xe2, 0x21, // 1E2AF at 641
0xf0, 0x01, 0x0e, // E01F0 at 672 (upper bound)
}; };
#ifdef CONFIG_ALL_UNICODE #ifdef CONFIG_ALL_UNICODE
@ -676,17 +728,35 @@ static const uint8_t unicode_cc_table[899] = {
}; };
static const uint8_t unicode_cc_index[87] = { static const uint8_t unicode_cc_index[87] = {
0x4d, 0x03, 0x00, 0x97, 0x05, 0x20, 0xc6, 0x05, 0x4d, 0x03, 0x00, // 0034D at 32
0x00, 0xe7, 0x06, 0x00, 0x45, 0x07, 0x00, 0x9c, 0x97, 0x05, 0x20, // 00597 at 65
0x08, 0x00, 0x4d, 0x09, 0x00, 0x3c, 0x0b, 0x00, 0xc6, 0x05, 0x00, // 005C6 at 96
0x3d, 0x0d, 0x00, 0x36, 0x0f, 0x00, 0x38, 0x10, 0xe7, 0x06, 0x00, // 006E7 at 128
0x20, 0x3a, 0x19, 0x00, 0xcb, 0x1a, 0x20, 0xd3, 0x45, 0x07, 0x00, // 00745 at 160
0x1c, 0x00, 0xcf, 0x1d, 0x00, 0xe2, 0x20, 0x00, 0x9c, 0x08, 0x00, // 0089C at 192
0x2e, 0x30, 0x20, 0x2b, 0xa9, 0x20, 0xed, 0xab, 0x4d, 0x09, 0x00, // 0094D at 224
0x00, 0x39, 0x0a, 0x01, 0x51, 0x0f, 0x01, 0x73, 0x3c, 0x0b, 0x00, // 00B3C at 256
0x11, 0x01, 0x75, 0x13, 0x01, 0x2b, 0x17, 0x21, 0x3d, 0x0d, 0x00, // 00D3D at 288
0x3f, 0x1c, 0x21, 0x9e, 0xbc, 0x21, 0x08, 0xe0, 0x36, 0x0f, 0x00, // 00F36 at 320
0x01, 0x44, 0xe9, 0x01, 0x4b, 0xe9, 0x01, 0x38, 0x10, 0x20, // 01038 at 353
0x3a, 0x19, 0x00, // 0193A at 384
0xcb, 0x1a, 0x20, // 01ACB at 417
0xd3, 0x1c, 0x00, // 01CD3 at 448
0xcf, 0x1d, 0x00, // 01DCF at 480
0xe2, 0x20, 0x00, // 020E2 at 512
0x2e, 0x30, 0x20, // 0302E at 545
0x2b, 0xa9, 0x20, // 0A92B at 577
0xed, 0xab, 0x00, // 0ABED at 608
0x39, 0x0a, 0x01, // 10A39 at 640
0x51, 0x0f, 0x01, // 10F51 at 672
0x73, 0x11, 0x01, // 11173 at 704
0x75, 0x13, 0x01, // 11375 at 736
0x2b, 0x17, 0x21, // 1172B at 769
0x3f, 0x1c, 0x21, // 11C3F at 801
0x9e, 0xbc, 0x21, // 1BC9E at 833
0x08, 0xe0, 0x01, // 1E008 at 864
0x44, 0xe9, 0x01, // 1E944 at 896
0x4b, 0xe9, 0x01, // 1E94B at 928 (upper bound)
}; };
static const uint32_t unicode_decomp_table1[699] = { static const uint32_t unicode_decomp_table1[699] = {
@ -4484,3 +4554,4 @@ static const uint16_t unicode_prop_len_table[] = {
}; };
#endif /* CONFIG_ALL_UNICODE */ #endif /* CONFIG_ALL_UNICODE */
/* 62 tables / 32261 bytes, 5 index / 345 bytes */

View file

@ -262,11 +262,7 @@ int lre_canonicalize(uint32_t c, BOOL is_unicode)
static uint32_t get_le24(const uint8_t *ptr) static uint32_t get_le24(const uint8_t *ptr)
{ {
#if defined(__x86__) || defined(__x86_64__)
return *(uint16_t *)ptr | (ptr[2] << 16);
#else
return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16); return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16);
#endif
} }
#define UNICODE_INDEX_BLOCK_LEN 32 #define UNICODE_INDEX_BLOCK_LEN 32
@ -317,6 +313,14 @@ static BOOL lre_is_in_table(uint32_t c, const uint8_t *table,
return FALSE; /* outside the table */ return FALSE; /* outside the table */
p = table + pos; p = table + pos;
bit = 0; bit = 0;
/* Compressed run length encoding:
00..3F: 2 packed lengths: 3-bit + 3-bit
40..5F: 5-bits plus extra byte for length
60..7F: 5-bits plus 2 extra bytes for length
80..FF: 7-bit length
lengths must be incremented to get character count
Ranges alternate between false and true return value.
*/
for(;;) { for(;;) {
b = *p++; b = *p++;
if (b < 64) { if (b < 64) {
@ -833,6 +837,13 @@ static int unicode_get_cc(uint32_t c)
if (pos < 0) if (pos < 0)
return 0; return 0;
p = unicode_cc_table + pos; p = unicode_cc_table + pos;
/* Compressed run length encoding:
- 2 high order bits are combining class type
- 0:0, 1:230, 2:extra byte linear progression, 3:extra byte
- 00..2F: range length (add 1)
- 30..37: 3-bit range-length + 1 extra byte
- 38..3F: 3-bit range-length + 2 extra byte
*/
for(;;) { for(;;) {
b = *p++; b = *p++;
type = b >> 6; type = b >> 6;
@ -1185,6 +1196,15 @@ static int unicode_general_category1(CharRange *cr, uint32_t gc_mask)
p = unicode_gc_table; p = unicode_gc_table;
p_end = unicode_gc_table + countof(unicode_gc_table); p_end = unicode_gc_table + countof(unicode_gc_table);
c = 0; c = 0;
/* Compressed range encoding:
initial byte:
bits 0..4: category number (special case 31)
bits 5..7: range length (add 1)
special case bits 5..7 == 7: read an extra byte
- 00..7F: range length (add 7 + 1)
- 80..BF: 6-bits plus extra byte for range length (add 7 + 128)
- C0..FF: 6-bits plus 2 extra bytes for range length (add 7 + 128 + 16384)
*/
while (p < p_end) { while (p < p_end) {
b = *p++; b = *p++;
n = b >> 5; n = b >> 5;
@ -1238,6 +1258,14 @@ static int unicode_prop1(CharRange *cr, int prop_idx)
p_end = p + unicode_prop_len_table[prop_idx]; p_end = p + unicode_prop_len_table[prop_idx];
c = 0; c = 0;
bit = 0; bit = 0;
/* Compressed range encoding:
00..3F: 2 packed lengths: 3-bit + 3-bit
40..5F: 5-bits plus extra byte for length
60..7F: 5-bits plus 2 extra bytes for length
80..FF: 7-bit length
lengths must be incremented to get character count
Ranges alternate between false and true return value.
*/
while (p < p_end) { while (p < p_end) {
c0 = c; c0 = c;
b = *p++; b = *p++;

View file

@ -33,6 +33,11 @@
#include "cutils.h" #include "cutils.h"
/* Size statistics accumulated while the generated tables are dumped;
   main() reports them in a trailing comment of the output file. */
uint32_t total_tables;       /* number of data tables emitted */
uint32_t total_table_bytes;  /* combined size in bytes of the data tables */
uint32_t total_index;        /* number of index tables emitted */
uint32_t total_index_bytes;  /* combined size in bytes of the index tables */
/* define it to be able to test unicode.c */ /* define it to be able to test unicode.c */
//#define USE_TEST //#define USE_TEST
/* profile tests */ /* profile tests */
@ -1328,7 +1333,9 @@ void dump_case_conv_table(FILE *f)
uint32_t v; uint32_t v;
const TableEntry *te; const TableEntry *te;
fprintf(f, "static const uint32_t case_conv_table1[%u] = {", conv_table_len); total_tables++;
total_table_bytes += conv_table_len * sizeof(uint32_t);
fprintf(f, "static const uint32_t case_conv_table1[%d] = {", conv_table_len);
for(i = 0; i < conv_table_len; i++) { for(i = 0; i < conv_table_len; i++) {
if (i % 4 == 0) if (i % 4 == 0)
fprintf(f, "\n "); fprintf(f, "\n ");
@ -1341,7 +1348,9 @@ void dump_case_conv_table(FILE *f)
} }
fprintf(f, "\n};\n\n"); fprintf(f, "\n};\n\n");
fprintf(f, "static const uint8_t case_conv_table2[%u] = {", conv_table_len); total_tables++;
total_table_bytes += conv_table_len;
fprintf(f, "static const uint8_t case_conv_table2[%d] = {", conv_table_len);
for(i = 0; i < conv_table_len; i++) { for(i = 0; i < conv_table_len; i++) {
if (i % 8 == 0) if (i % 8 == 0)
fprintf(f, "\n "); fprintf(f, "\n ");
@ -1350,7 +1359,9 @@ void dump_case_conv_table(FILE *f)
} }
fprintf(f, "\n};\n\n"); fprintf(f, "\n};\n\n");
fprintf(f, "static const uint16_t case_conv_ext[%u] = {", ext_data_len); total_tables++;
total_table_bytes += ext_data_len * sizeof(uint16_t);
fprintf(f, "static const uint16_t case_conv_ext[%d] = {", ext_data_len);
for(i = 0; i < ext_data_len; i++) { for(i = 0; i < ext_data_len; i++) {
if (i % 8 == 0) if (i % 8 == 0)
fprintf(f, "\n "); fprintf(f, "\n ");
@ -1470,6 +1481,9 @@ void compute_internal_props(void)
void dump_byte_table(FILE *f, const char *cname, const uint8_t *tab, int len) void dump_byte_table(FILE *f, const char *cname, const uint8_t *tab, int len)
{ {
int i; int i;
total_tables++;
total_table_bytes += len;
fprintf(f, "static const uint8_t %s[%d] = {", cname, len); fprintf(f, "static const uint8_t %s[%d] = {", cname, len);
for(i = 0; i < len; i++) { for(i = 0; i < len; i++) {
if (i % 8 == 0) if (i % 8 == 0)
@ -1479,9 +1493,26 @@ void dump_byte_table(FILE *f, const char *cname, const uint8_t *tab, int len)
fprintf(f, "\n};\n\n"); fprintf(f, "\n};\n\n");
} }
/* Dump an index table as a `static const uint8_t` C array, one 3-byte
   entry per line, each followed by a comment decoding the entry.

   Entry layout (little endian):
     byte 0, byte 1      : low 16 bits of the code point
     byte 2, bits 0..4   : high 5 bits of the code point
     byte 2, bits 5..7   : small delta added to the block offset

   f      output stream receiving the C source
   cname  identifier used for the emitted array
   tab    index bytes to dump
   len    number of bytes in tab; expected to be a multiple of 3

   Also updates the total_index / total_index_bytes statistics.

   Only complete 3-byte entries are decoded. If len is not a multiple
   of 3, the trailing bytes are emitted without annotation rather than
   reading past the end of tab, so the number of initializers still
   matches the declared array size. */
void dump_index_table(FILE *f, const char *cname, const uint8_t *tab, int len)
{
    enum { ENTRY_LEN = 3 };
    int i, offset;
    unsigned int code;

    total_index++;
    total_index_bytes += len;
    fprintf(f, "static const uint8_t %s[%d] = {\n", cname, len);
    for(i = 0; i + ENTRY_LEN <= len; i += ENTRY_LEN) {
        code = tab[i] + (tab[i+1] << 8) + ((tab[i+2] & 0x1f) << 16);
        /* entry k is reported at (k + 1) * 32 plus the 3-bit delta;
           32 presumably matches the index block length of the
           companion table -- confirm against the table builder */
        offset = ((i / ENTRY_LEN) + 1) * 32 + (tab[i+2] >> 5);
        fprintf(f, " 0x%02x, 0x%02x, 0x%02x,", tab[i], tab[i+1], tab[i+2]);
        fprintf(f, " // %6.5X at %d%s\n", code, offset,
                i == len - ENTRY_LEN ? " (upper bound)" : "");
    }
    if (i < len) {
        /* incomplete trailing entry: dump the raw bytes only */
        for(; i < len; i++)
            fprintf(f, " 0x%02x,", tab[i]);
        fprintf(f, "\n");
    }
    fprintf(f, "};\n\n");
}
#define PROP_BLOCK_LEN 32 #define PROP_BLOCK_LEN 32
void build_prop_table(FILE *f, int prop_index, BOOL add_index) void build_prop_table(FILE *f, const char *name, int prop_index, BOOL add_index)
{ {
int i, j, n, v, offset, code; int i, j, n, v, offset, code;
DynBuf dbuf_s, *dbuf = &dbuf_s; DynBuf dbuf_s, *dbuf = &dbuf_s;
@ -1533,6 +1564,14 @@ void build_prop_table(FILE *f, int prop_index, BOOL add_index)
block_end_pos += PROP_BLOCK_LEN; block_end_pos += PROP_BLOCK_LEN;
} }
/* Compressed byte encoding:
00..3F: 2 packed lengths: 3-bit + 3-bit
40..5F: 5-bits plus extra byte for length
60..7F: 5-bits plus 2 extra bytes for length
80..FF: 7-bit length
lengths must be incremented to get character count
Ranges alternate between false and true return value.
*/
v = buf[i]; v = buf[i];
code += v + 1; code += v + 1;
bit ^= 1; bit ^= 1;
@ -1573,7 +1612,7 @@ void build_prop_table(FILE *f, int prop_index, BOOL add_index)
dump_byte_table(f, cname, dbuf->buf, dbuf->size); dump_byte_table(f, cname, dbuf->buf, dbuf->size);
if (add_index) { if (add_index) {
snprintf(cname, sizeof(cname), "unicode_prop_%s_index", unicode_prop_name[prop_index]); snprintf(cname, sizeof(cname), "unicode_prop_%s_index", unicode_prop_name[prop_index]);
dump_byte_table(f, cname, dbuf2->buf, dbuf2->size); dump_index_table(f, cname, dbuf2->buf, dbuf2->size);
} }
dbuf_free(dbuf); dbuf_free(dbuf);
@ -1583,10 +1622,10 @@ void build_prop_table(FILE *f, int prop_index, BOOL add_index)
void build_flags_tables(FILE *f) void build_flags_tables(FILE *f)
{ {
build_prop_table(f, PROP_Cased1, TRUE); build_prop_table(f, "Cased1", PROP_Cased1, TRUE);
build_prop_table(f, PROP_Case_Ignorable, TRUE); build_prop_table(f, "Case_Ignorable", PROP_Case_Ignorable, TRUE);
build_prop_table(f, PROP_ID_Start, TRUE); build_prop_table(f, "ID_Start", PROP_ID_Start, TRUE);
build_prop_table(f, PROP_ID_Continue1, TRUE); build_prop_table(f, "ID_Continue1", PROP_ID_Continue1, TRUE);
} }
void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len, void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len,
@ -1845,7 +1884,7 @@ void build_prop_list_table(FILE *f)
i == PROP_ID_Continue1) { i == PROP_ID_Continue1) {
/* already generated */ /* already generated */
} else { } else {
build_prop_table(f, i, FALSE); build_prop_table(f, unicode_prop_name[i], i, FALSE);
} }
} }
@ -1997,6 +2036,8 @@ void check_flags(void)
void build_cc_table(FILE *f) void build_cc_table(FILE *f)
{ {
// Compress combining class table
// see: https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values
int i, cc, n, type, n1, block_end_pos; int i, cc, n, type, n1, block_end_pos;
DynBuf dbuf_s, *dbuf = &dbuf_s; DynBuf dbuf_s, *dbuf = &dbuf_s;
DynBuf dbuf1_s, *dbuf1 = &dbuf1_s; DynBuf dbuf1_s, *dbuf1 = &dbuf1_s;
@ -2055,6 +2096,13 @@ void build_cc_table(FILE *f)
#if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE) #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
cw_start = dbuf->size; cw_start = dbuf->size;
#endif #endif
/* Compressed run length encoding:
- 2 high order bits are combining class type
- 0:0, 1:230, 2:extra byte linear progression, 3:extra byte
- 00..2F: range length (add 1)
- 30..37: 3-bit range-length + 1 extra byte
- 38..3F: 3-bit range-length + 2 extra byte
*/
if (n1 < 48) { if (n1 < 48) {
dbuf_putc(dbuf, n1 | (type << 6)); dbuf_putc(dbuf, n1 | (type << 6));
} else if (n1 < 48 + (1 << 11)) { } else if (n1 < 48 + (1 << 11)) {
@ -2084,7 +2132,7 @@ void build_cc_table(FILE *f)
dbuf_putc(dbuf1, v >> 16); dbuf_putc(dbuf1, v >> 16);
dump_byte_table(f, "unicode_cc_table", dbuf->buf, dbuf->size); dump_byte_table(f, "unicode_cc_table", dbuf->buf, dbuf->size);
dump_byte_table(f, "unicode_cc_index", dbuf1->buf, dbuf1->size); dump_index_table(f, "unicode_cc_index", dbuf1->buf, dbuf1->size);
#if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE) #if defined(DUMP_CC_TABLE) || defined(DUMP_TABLE_SIZE)
printf("CC table: size=%d (%d entries) [", printf("CC table: size=%d (%d entries) [",
@ -2765,8 +2813,9 @@ void build_decompose_table(FILE *f)
} }
#endif #endif
fprintf(f, "static const uint32_t unicode_decomp_table1[%u] = {", total_tables++;
array_len); total_table_bytes += array_len * sizeof(uint32_t);
fprintf(f, "static const uint32_t unicode_decomp_table1[%d] = {", array_len);
count = 0; count = 0;
for(i = 0; i <= code_max; i++) { for(i = 0; i <= code_max; i++) {
de = &tab_de[i]; de = &tab_de[i];
@ -2784,8 +2833,9 @@ void build_decompose_table(FILE *f)
} }
fprintf(f, "\n};\n\n"); fprintf(f, "\n};\n\n");
fprintf(f, "static const uint16_t unicode_decomp_table2[%u] = {", total_tables++;
array_len); total_table_bytes += array_len * sizeof(uint16_t);
fprintf(f, "static const uint16_t unicode_decomp_table2[%d] = {", array_len);
count = 0; count = 0;
for(i = 0; i <= code_max; i++) { for(i = 0; i <= code_max; i++) {
de = &tab_de[i]; de = &tab_de[i];
@ -2798,8 +2848,9 @@ void build_decompose_table(FILE *f)
} }
fprintf(f, "\n};\n\n"); fprintf(f, "\n};\n\n");
fprintf(f, "static const uint8_t unicode_decomp_data[%u] = {", total_tables++;
data_len); total_table_bytes += data_len;
fprintf(f, "static const uint8_t unicode_decomp_data[%d] = {", data_len);
for(i = 0; i < data_len; i++) { for(i = 0; i < data_len; i++) {
if (i % 8 == 0) if (i % 8 == 0)
fprintf(f, "\n "); fprintf(f, "\n ");
@ -2890,8 +2941,9 @@ void build_compose_table(FILE *f, const DecompEntry *tab_de)
} }
#endif #endif
fprintf(f, "static const uint16_t unicode_comp_table[%u] = {", total_tables++;
tab_ce_len); total_table_bytes += tab_ce_len * sizeof(uint16_t);
fprintf(f, "static const uint16_t unicode_comp_table[%u] = {", tab_ce_len);
for(i = 0; i < tab_ce_len; i++) { for(i = 0; i < tab_ce_len; i++) {
if (i % 8 == 0) if (i % 8 == 0)
fprintf(f, "\n "); fprintf(f, "\n ");
@ -3066,22 +3118,24 @@ void normalization_test(const char *filename)
} }
#endif #endif
int main(int argc, char **argv) int main(int argc, char *argv[])
{ {
const char *unicode_db_path, *outfilename; const char *unicode_db_path, *outfilename;
char filename[1024]; char filename[1024];
int arg = 1;
if (argc < 2) { if (arg >= argc || (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "--help"))) {
printf("usage: %s unicode_db_path [output_file]\n" printf("usage: %s PATH [OUTPUT]\n"
"\n" " PATH path to the Unicode database directory\n"
"If no output_file is given, a self test is done using the current unicode library\n", " OUTPUT name of the output file. If omitted, a self test is performed\n"
argv[0]); " using the files from the Unicode library\n"
exit(1); , argv[0]);
return 1;
} }
unicode_db_path = argv[1]; unicode_db_path = argv[arg++];
outfilename = NULL; outfilename = NULL;
if (argc >= 3) if (arg < argc)
outfilename = argv[2]; outfilename = argv[arg++];
unicode_db = mallocz(sizeof(unicode_db[0]) * (CHARCODE_MAX + 1)); unicode_db = mallocz(sizeof(unicode_db[0]) * (CHARCODE_MAX + 1));
@ -3163,6 +3217,8 @@ int main(int argc, char **argv)
build_script_ext_table(fo); build_script_ext_table(fo);
build_prop_list_table(fo); build_prop_list_table(fo);
fprintf(fo, "#endif /* CONFIG_ALL_UNICODE */\n"); fprintf(fo, "#endif /* CONFIG_ALL_UNICODE */\n");
fprintf(fo, "/* %u tables / %u bytes, %u index / %u bytes */\n",
total_tables, total_table_bytes, total_index, total_index_bytes);
fclose(fo); fclose(fo);
} }
return 0; return 0;