Fix regexp case insensitive flag (#531)

Ref: af308614a8
This commit is contained in:
Saúl Ibarra Corretgé 2024-09-13 23:50:44 +02:00 committed by GitHub
parent ac958f1d2f
commit f5c388d693
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 516 additions and 253 deletions

View file

@ -34,9 +34,6 @@
/* /*
TODO: TODO:
- Add full unicode canonicalize rules for character ranges (not
really useful but needed for exact "ignorecase" compatibility).
- Add a lock step execution mode (=linear time execution guaranteed) - Add a lock step execution mode (=linear time execution guaranteed)
when the regular expression is "simple" i.e. no backreference nor when the regular expression is "simple" i.e. no backreference nor
complicated lookahead. The opcodes are designed for this execution complicated lookahead. The opcodes are designed for this execution
@ -123,33 +120,6 @@ static int dbuf_insert(DynBuf *s, int pos, int len)
return 0; return 0;
} }
/* canonicalize with the specific JS regexp rules */
static uint32_t lre_canonicalize(uint32_t c, BOOL is_unicode)
{
uint32_t res[LRE_CC_RES_LEN_MAX];
int len;
if (is_unicode) {
if (likely(c < 128)) {
if (c >= 'A' && c <= 'Z')
c = c - 'A' + 'a';
} else {
lre_case_conv(res, c, 2);
c = res[0];
}
} else {
if (likely(c < 128)) {
if (c >= 'a' && c <= 'z')
c = c - 'a' + 'A';
} else {
/* legacy regexp: to upper case if single char >= 128 */
len = lre_case_conv(res, c, FALSE);
if (len == 1 && res[0] >= 128)
c = res[0];
}
}
return c;
}
static const uint16_t char_range_d[] = { static const uint16_t char_range_d[] = {
1, 1,
0x0030, 0x0039 + 1, 0x0030, 0x0039 + 1,
@ -248,31 +218,6 @@ static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
return -1; return -1;
} }
static int cr_canonicalize(CharRange *cr)
{
CharRange a;
uint32_t pt[2];
int i, ret;
cr_init(&a, cr->mem_opaque, lre_realloc);
pt[0] = 'a';
pt[1] = 'z' + 1;
ret = cr_op(&a, cr->points, cr->len, pt, 2, CR_OP_INTER);
if (ret)
goto fail;
/* convert to upper case */
/* XXX: the generic unicode case would be much more complicated
and not really useful */
for(i = 0; i < a.len; i++) {
a.points[i] += 'A' - 'a';
}
/* Note: for simplicity we keep the lower case ranges */
ret = cr_union1(cr, a.points, a.len);
fail:
cr_free(&a);
return ret;
}
#ifdef DUMP_REOP #ifdef DUMP_REOP
static __maybe_unused void lre_dump_bytecode(const uint8_t *buf, static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
int buf_len) int buf_len)
@ -955,7 +900,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
} }
} }
if (s->ignore_case) { if (s->ignore_case) {
if (cr_canonicalize(cr)) if (cr_regexp_canonicalize(cr, s->is_unicode))
goto memory_error; goto memory_error;
} }
if (invert) { if (invert) {

View file

@ -3777,72 +3777,70 @@ static const uint8_t unicode_prop_Changes_When_Titlecased1_table[22] = {
0x8b, 0x80, 0x8e, 0x80, 0xae, 0x80, 0x8b, 0x80, 0x8e, 0x80, 0xae, 0x80,
}; };
static const uint8_t unicode_prop_Changes_When_Casefolded1_table[33] = { static const uint8_t unicode_prop_Changes_When_Casefolded1_table[29] = {
0x40, 0xde, 0x80, 0xcf, 0x80, 0x97, 0x80, 0x44, 0x41, 0xef, 0x80, 0x41, 0x9e, 0x80, 0x9e, 0x80,
0x3c, 0x80, 0x59, 0x11, 0x80, 0x40, 0xe4, 0x3f, 0x5a, 0xe4, 0x83, 0x40, 0xb5, 0x00, 0x00, 0x00,
0x3f, 0x87, 0x89, 0x11, 0x05, 0x02, 0x11, 0x80, 0x80, 0xde, 0x06, 0x06, 0x80, 0x8a, 0x09, 0x81,
0xa9, 0x11, 0x80, 0x60, 0xdb, 0x07, 0x86, 0x8b, 0x89, 0x10, 0x81, 0x8d, 0x80,
0x84,
}; };
static const uint8_t unicode_prop_Changes_When_NFKC_Casefolded1_table[451] = { static const uint8_t unicode_prop_Changes_When_NFKC_Casefolded1_table[447] = {
0x40, 0x9f, 0x06, 0x00, 0x01, 0x00, 0x01, 0x12, 0x40, 0x9f, 0x06, 0x00, 0x01, 0x00, 0x01, 0x12,
0x10, 0x82, 0x9f, 0x80, 0xcf, 0x01, 0x80, 0x8b, 0x10, 0x82, 0xf3, 0x80, 0x8b, 0x80, 0x40, 0x84,
0x07, 0x80, 0xfb, 0x01, 0x01, 0x80, 0xa5, 0x80, 0x01, 0x01, 0x80, 0xa2, 0x01, 0x80, 0x40, 0xbb,
0x40, 0xbb, 0x88, 0x9e, 0x29, 0x84, 0xda, 0x08, 0x88, 0x9e, 0x29, 0x84, 0xda, 0x08, 0x81, 0x89,
0x81, 0x89, 0x80, 0xa3, 0x04, 0x02, 0x04, 0x08, 0x80, 0xa3, 0x04, 0x02, 0x04, 0x08, 0x07, 0x80,
0x80, 0xc9, 0x82, 0x9c, 0x80, 0x41, 0x93, 0x80, 0x9e, 0x80, 0xa0, 0x82, 0x9c, 0x80, 0x42, 0x28,
0x40, 0x93, 0x80, 0xd7, 0x83, 0x42, 0xde, 0x87, 0x80, 0xd7, 0x83, 0x42, 0xde, 0x87, 0xfb, 0x08,
0xfb, 0x08, 0x80, 0xd2, 0x01, 0x80, 0xa1, 0x11, 0x80, 0xd2, 0x01, 0x80, 0xa1, 0x11, 0x80, 0x40,
0x80, 0x40, 0xfc, 0x81, 0x42, 0xd4, 0x80, 0xfe, 0xfc, 0x81, 0x42, 0xd4, 0x80, 0xfe, 0x80, 0xa7,
0x80, 0xa7, 0x81, 0xad, 0x80, 0xb5, 0x80, 0x88, 0x81, 0xad, 0x80, 0xb5, 0x80, 0x88, 0x03, 0x03,
0x03, 0x03, 0x03, 0x80, 0x8b, 0x80, 0x88, 0x00, 0x03, 0x80, 0x8b, 0x80, 0x88, 0x00, 0x26, 0x80,
0x26, 0x80, 0x90, 0x80, 0x88, 0x03, 0x03, 0x03, 0x90, 0x80, 0x88, 0x03, 0x03, 0x03, 0x80, 0x8b,
0x80, 0x8b, 0x80, 0x41, 0x41, 0x80, 0xe1, 0x81, 0x80, 0x41, 0x41, 0x80, 0xe1, 0x81, 0x46, 0x52,
0x46, 0x52, 0x81, 0xd4, 0x84, 0x45, 0x1b, 0x10, 0x81, 0xd4, 0x84, 0x45, 0x1b, 0x10, 0x8a, 0x80,
0x8a, 0x80, 0x91, 0x80, 0x9b, 0x8c, 0x80, 0xa1, 0x91, 0x80, 0x9b, 0x8c, 0x80, 0xa1, 0xa4, 0x40,
0xa4, 0x40, 0xd9, 0x80, 0x40, 0xd5, 0x00, 0x00, 0xd5, 0x83, 0x40, 0xb5, 0x00, 0x00, 0x00, 0x80,
0x00, 0x00, 0x00, 0x00, 0x01, 0x3f, 0x3f, 0x87, 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
0x89, 0x11, 0x04, 0x00, 0x29, 0x04, 0x12, 0x80, 0xb7, 0x05, 0x00, 0x13, 0x05, 0x11, 0x02, 0x0c,
0x88, 0x12, 0x80, 0x88, 0x11, 0x11, 0x04, 0x08, 0x11, 0x00, 0x00, 0x0c, 0x15, 0x05, 0x08, 0x8f,
0x8f, 0x00, 0x20, 0x8b, 0x12, 0x2a, 0x08, 0x0b, 0x00, 0x20, 0x8b, 0x12, 0x2a, 0x08, 0x0b, 0x00,
0x00, 0x07, 0x82, 0x8c, 0x06, 0x92, 0x81, 0x9a, 0x07, 0x82, 0x8c, 0x06, 0x92, 0x81, 0x9a, 0x80,
0x80, 0x8c, 0x8a, 0x80, 0xd6, 0x18, 0x10, 0x8a, 0x8c, 0x8a, 0x80, 0xd6, 0x18, 0x10, 0x8a, 0x01,
0x01, 0x0c, 0x0a, 0x00, 0x10, 0x11, 0x02, 0x06, 0x0c, 0x0a, 0x00, 0x10, 0x11, 0x02, 0x06, 0x05,
0x05, 0x1c, 0x85, 0x8f, 0x8f, 0x8f, 0x88, 0x80, 0x1c, 0x85, 0x8f, 0x8f, 0x8f, 0x88, 0x80, 0x40,
0x40, 0xa1, 0x08, 0x81, 0x40, 0xf7, 0x81, 0x41, 0xa1, 0x08, 0x81, 0x40, 0xf7, 0x81, 0x41, 0x34,
0x34, 0xd5, 0x99, 0x9a, 0x45, 0x20, 0x80, 0xe6, 0xd5, 0x99, 0x9a, 0x45, 0x20, 0x80, 0xe6, 0x82,
0x82, 0xe4, 0x80, 0x41, 0x9e, 0x81, 0x40, 0xf0, 0xe4, 0x80, 0x41, 0x9e, 0x81, 0x40, 0xf0, 0x80,
0x80, 0x41, 0x2e, 0x80, 0xd2, 0x80, 0x8b, 0x40, 0x41, 0x2e, 0x80, 0xd2, 0x80, 0x8b, 0x40, 0xd5,
0xd5, 0xa9, 0x80, 0xb4, 0x00, 0x82, 0xdf, 0x09, 0xa9, 0x80, 0xb4, 0x00, 0x82, 0xdf, 0x09, 0x80,
0x80, 0xde, 0x80, 0xb0, 0xdd, 0x82, 0x8d, 0xdf, 0xde, 0x80, 0xb0, 0xdd, 0x82, 0x8d, 0xdf, 0x9e,
0x9e, 0x80, 0xa7, 0x87, 0xae, 0x80, 0x41, 0x7f, 0x80, 0xa7, 0x87, 0xae, 0x80, 0x41, 0x7f, 0x60,
0x60, 0x72, 0x9b, 0x81, 0x40, 0xd1, 0x80, 0x40, 0x72, 0x9b, 0x81, 0x40, 0xd1, 0x80, 0x40, 0x80,
0x80, 0x12, 0x81, 0x43, 0x61, 0x83, 0x88, 0x80, 0x12, 0x81, 0x43, 0x61, 0x83, 0x88, 0x80, 0x60,
0x60, 0x4d, 0x95, 0x41, 0x0d, 0x08, 0x00, 0x81, 0x4d, 0x95, 0x41, 0x0d, 0x08, 0x00, 0x81, 0x89,
0x89, 0x00, 0x00, 0x09, 0x82, 0xc3, 0x81, 0xe9, 0x00, 0x00, 0x09, 0x82, 0xc3, 0x81, 0xe9, 0xc2,
0xa5, 0x86, 0x8b, 0x24, 0x00, 0x97, 0x04, 0x00, 0x00, 0x97, 0x04, 0x00, 0x01, 0x01, 0x80, 0xeb,
0x01, 0x01, 0x80, 0xeb, 0xa0, 0x41, 0x6a, 0x91, 0xa0, 0x41, 0x6a, 0x91, 0xbf, 0x81, 0xb5, 0xa7,
0xbf, 0x81, 0xb5, 0xa7, 0x8c, 0x82, 0x99, 0x95, 0x8c, 0x82, 0x99, 0x95, 0x94, 0x81, 0x8b, 0x80,
0x94, 0x81, 0x8b, 0x80, 0x92, 0x03, 0x1a, 0x00, 0x92, 0x03, 0x1a, 0x00, 0x80, 0x40, 0x86, 0x08,
0x80, 0x40, 0x86, 0x08, 0x80, 0x9f, 0x99, 0x40, 0x80, 0x9f, 0x99, 0x40, 0x83, 0x15, 0x0d, 0x0d,
0x83, 0x15, 0x0d, 0x0d, 0x0a, 0x16, 0x06, 0x80, 0x0a, 0x16, 0x06, 0x80, 0x88, 0x47, 0x87, 0x20,
0x88, 0x47, 0x87, 0x20, 0xa9, 0x80, 0x88, 0x60, 0xa9, 0x80, 0x88, 0x60, 0xb4, 0xe4, 0x83, 0x54,
0xb4, 0xe4, 0x83, 0x54, 0xb9, 0x86, 0x8d, 0x87, 0xb9, 0x86, 0x8d, 0x87, 0xbf, 0x85, 0x42, 0x3e,
0xbf, 0x85, 0x42, 0x3e, 0xd4, 0x80, 0xc6, 0x01, 0xd4, 0x80, 0xc6, 0x01, 0x08, 0x09, 0x0b, 0x80,
0x08, 0x09, 0x0b, 0x80, 0x8b, 0x00, 0x06, 0x80, 0x8b, 0x00, 0x06, 0x80, 0xc0, 0x03, 0x0f, 0x06,
0xc0, 0x03, 0x0f, 0x06, 0x80, 0x9b, 0x03, 0x04, 0x80, 0x9b, 0x03, 0x04, 0x00, 0x16, 0x80, 0x41,
0x00, 0x16, 0x80, 0x41, 0x53, 0x81, 0x41, 0x23, 0x53, 0x81, 0x41, 0x23, 0x81, 0xb1, 0x48, 0x2f,
0x81, 0xb1, 0x48, 0x2f, 0xbd, 0x4d, 0x91, 0x18, 0xbd, 0x4d, 0x91, 0x18, 0x9a, 0x01, 0x00, 0x08,
0x9a, 0x01, 0x00, 0x08, 0x80, 0x89, 0x03, 0x00, 0x80, 0x89, 0x03, 0x00, 0x00, 0x28, 0x18, 0x00,
0x00, 0x28, 0x18, 0x00, 0x00, 0x02, 0x01, 0x00, 0x00, 0x02, 0x01, 0x00, 0x08, 0x00, 0x00, 0x00,
0x08, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x01, 0x00, 0x0b, 0x06, 0x03, 0x03, 0x00,
0x06, 0x03, 0x03, 0x00, 0x80, 0x89, 0x80, 0x90, 0x80, 0x89, 0x80, 0x90, 0x22, 0x04, 0x80, 0x90,
0x22, 0x04, 0x80, 0x90, 0x42, 0x43, 0x8a, 0x84, 0x42, 0x43, 0x8a, 0x84, 0x9e, 0x80, 0x9f, 0x99,
0x9e, 0x80, 0x9f, 0x99, 0x82, 0xa2, 0x80, 0xee, 0x82, 0xa2, 0x80, 0xee, 0x82, 0x8c, 0xab, 0x83,
0x82, 0x8c, 0xab, 0x83, 0x88, 0x31, 0x49, 0x9d, 0x88, 0x31, 0x49, 0x9d, 0x89, 0x60, 0xfc, 0x05,
0x89, 0x60, 0xfc, 0x05, 0x42, 0x1d, 0x6b, 0x05, 0x42, 0x1d, 0x6b, 0x05, 0xe1, 0x4f, 0xff,
0xe1, 0x4f, 0xff,
}; };
static const uint8_t unicode_prop_ASCII_Hex_Digit_table[5] = { static const uint8_t unicode_prop_ASCII_Hex_Digit_table[5] = {

View file

@ -43,47 +43,26 @@ enum {
RUN_TYPE_UF_D1_EXT, RUN_TYPE_UF_D1_EXT,
RUN_TYPE_U_EXT, RUN_TYPE_U_EXT,
RUN_TYPE_LF_EXT, RUN_TYPE_LF_EXT,
RUN_TYPE_U_EXT2, RUN_TYPE_UF_EXT2,
RUN_TYPE_L_EXT2, RUN_TYPE_LF_EXT2,
RUN_TYPE_U_EXT3, RUN_TYPE_UF_EXT3,
}; };
/* conv_type: static int lre_case_conv1(uint32_t c, int conv_type)
0 = to upper
1 = to lower
2 = case folding (= to lower with modifications)
*/
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
{ {
if (c < 128) { uint32_t res[LRE_CC_RES_LEN_MAX];
if (conv_type) { lre_case_conv(res, c, conv_type);
if (c >= 'A' && c <= 'Z') { return res[0];
c = c - 'A' + 'a'; }
}
} else {
if (c >= 'a' && c <= 'z') {
c = c - 'a' + 'A';
}
}
} else {
uint32_t v, code, data, type, len, a, is_lower;
int idx, idx_min, idx_max;
/* case conversion using the table entry 'idx' with value 'v' */
static int lre_case_conv_entry(uint32_t *res, uint32_t c, int conv_type, uint32_t idx, uint32_t v)
{
uint32_t code, data, type, a, is_lower;
is_lower = (conv_type != 0); is_lower = (conv_type != 0);
idx_min = 0;
idx_max = countof(case_conv_table1) - 1;
while (idx_min <= idx_max) {
idx = (unsigned)(idx_max + idx_min) / 2;
v = case_conv_table1[idx];
code = v >> (32 - 17);
len = (v >> (32 - 17 - 7)) & 0x7f;
if (c < code) {
idx_max = idx - 1;
} else if (c >= code + len) {
idx_min = idx + 1;
} else {
type = (v >> (32 - 17 - 7 - 4)) & 0xf; type = (v >> (32 - 17 - 7 - 4)) & 0xf;
data = ((v & 0xf) << 8) | case_conv_table2[idx]; data = ((v & 0xf) << 8) | case_conv_table2[idx];
code = v >> (32 - 17);
switch(type) { switch(type) {
case RUN_TYPE_U: case RUN_TYPE_U:
case RUN_TYPE_L: case RUN_TYPE_L:
@ -133,23 +112,76 @@ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
break; break;
c = case_conv_ext[data]; c = case_conv_ext[data];
break; break;
case RUN_TYPE_U_EXT2: case RUN_TYPE_LF_EXT2:
case RUN_TYPE_L_EXT2: if (!is_lower)
if (conv_type != (type - RUN_TYPE_U_EXT2))
break; break;
res[0] = c - code + case_conv_ext[data >> 6]; res[0] = c - code + case_conv_ext[data >> 6];
res[1] = case_conv_ext[data & 0x3f]; res[1] = case_conv_ext[data & 0x3f];
return 2; return 2;
case RUN_TYPE_UF_EXT2:
if (conv_type == 1)
break;
res[0] = c - code + case_conv_ext[data >> 6];
res[1] = case_conv_ext[data & 0x3f];
if (conv_type == 2) {
/* convert to lower */
res[0] = lre_case_conv1(res[0], 1);
res[1] = lre_case_conv1(res[1], 1);
}
return 2;
default: default:
case RUN_TYPE_U_EXT3: case RUN_TYPE_UF_EXT3:
if (conv_type != 0) if (conv_type == 1)
break; break;
res[0] = case_conv_ext[data >> 8]; res[0] = case_conv_ext[data >> 8];
res[1] = case_conv_ext[(data >> 4) & 0xf]; res[1] = case_conv_ext[(data >> 4) & 0xf];
res[2] = case_conv_ext[data & 0xf]; res[2] = case_conv_ext[data & 0xf];
if (conv_type == 2) {
/* convert to lower */
res[0] = lre_case_conv1(res[0], 1);
res[1] = lre_case_conv1(res[1], 1);
res[2] = lre_case_conv1(res[2], 1);
}
return 3; return 3;
} }
break; res[0] = c;
return 1;
}
/* conv_type:
0 = to upper
1 = to lower
2 = case folding (= to lower with modifications)
*/
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
{
if (c < 128) {
if (conv_type) {
if (c >= 'A' && c <= 'Z') {
c = c - 'A' + 'a';
}
} else {
if (c >= 'a' && c <= 'z') {
c = c - 'a' + 'A';
}
}
} else {
uint32_t v, code, len;
int idx, idx_min, idx_max;
idx_min = 0;
idx_max = countof(case_conv_table1) - 1;
while (idx_min <= idx_max) {
idx = (unsigned)(idx_max + idx_min) / 2;
v = case_conv_table1[idx];
code = v >> (32 - 17);
len = (v >> (32 - 17 - 7)) & 0x7f;
if (c < code) {
idx_max = idx - 1;
} else if (c >= code + len) {
idx_min = idx + 1;
} else {
return lre_case_conv_entry(res, c, conv_type, idx, v);
} }
} }
} }
@ -157,6 +189,77 @@ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type)
return 1; return 1;
} }
static int lre_case_folding_entry(uint32_t c, uint32_t idx, uint32_t v, BOOL is_unicode)
{
uint32_t res[LRE_CC_RES_LEN_MAX];
int len;
if (is_unicode) {
len = lre_case_conv_entry(res, c, 2, idx, v);
if (len == 1) {
c = res[0];
} else {
/* handle the few specific multi-character cases (see
unicode_gen.c:dump_case_folding_special_cases()) */
if (c == 0xfb06) {
c = 0xfb05;
} else if (c == 0x01fd3) {
c = 0x390;
} else if (c == 0x01fe3) {
c = 0x3b0;
}
}
} else {
if (likely(c < 128)) {
if (c >= 'a' && c <= 'z')
c = c - 'a' + 'A';
} else {
/* legacy regexp: to upper case if single char >= 128 */
len = lre_case_conv_entry(res, c, FALSE, idx, v);
if (len == 1 && res[0] >= 128)
c = res[0];
}
}
return c;
}
/* JS regexp specific rules for case folding */
int lre_canonicalize(uint32_t c, BOOL is_unicode)
{
if (c < 128) {
/* fast case */
if (is_unicode) {
if (c >= 'A' && c <= 'Z') {
c = c - 'A' + 'a';
}
} else {
if (c >= 'a' && c <= 'z') {
c = c - 'a' + 'A';
}
}
} else {
uint32_t v, code, len;
int idx, idx_min, idx_max;
idx_min = 0;
idx_max = countof(case_conv_table1) - 1;
while (idx_min <= idx_max) {
idx = (unsigned)(idx_max + idx_min) / 2;
v = case_conv_table1[idx];
code = v >> (32 - 17);
len = (v >> (32 - 17 - 7)) & 0x7f;
if (c < code) {
idx_max = idx - 1;
} else if (c >= code + len) {
idx_min = idx + 1;
} else {
return lre_case_folding_entry(c, idx, v, is_unicode);
}
}
}
return c;
}
static uint32_t get_le24(const uint8_t *ptr) static uint32_t get_le24(const uint8_t *ptr)
{ {
return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16); return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16);
@ -1166,11 +1269,11 @@ static int unicode_case1(CharRange *cr, int case_mask)
#define MR(x) (1 << RUN_TYPE_ ## x) #define MR(x) (1 << RUN_TYPE_ ## x)
const uint32_t tab_run_mask[3] = { const uint32_t tab_run_mask[3] = {
MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) | MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) |
MR(UF_D1_EXT) | MR(U_EXT) | MR(U_EXT2) | MR(U_EXT3), MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(L_EXT2), MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2),
MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT), MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT) | MR(UF_EXT2) | MR(UF_EXT3),
}; };
#undef MR #undef MR
uint32_t mask, v, code, type, len, i, idx; uint32_t mask, v, code, type, len, i, idx;
@ -1224,6 +1327,135 @@ static int unicode_case1(CharRange *cr, int case_mask)
return 0; return 0;
} }
static int point_cmp(const void *p1, const void *p2, void *arg)
{
uint32_t v1 = *(uint32_t *)p1;
uint32_t v2 = *(uint32_t *)p2;
return (v1 > v2) - (v1 < v2);
}
static void cr_sort_and_remove_overlap(CharRange *cr)
{
uint32_t start, end, start1, end1, i, j;
/* the resulting ranges are not necessarily sorted and may overlap */
rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL);
j = 0;
for(i = 0; i < cr->len; ) {
start = cr->points[i];
end = cr->points[i + 1];
i += 2;
while (i < cr->len) {
start1 = cr->points[i];
end1 = cr->points[i + 1];
if (start1 > end) {
/* |------|
* |-------| */
break;
} else if (end1 <= end) {
/* |------|
* |--| */
i += 2;
} else {
/* |------|
* |-------| */
end = end1;
i += 2;
}
}
cr->points[j] = start;
cr->points[j + 1] = end;
j += 2;
}
cr->len = j;
}
/* canonicalize a character set using the JS regex case folding rules
(see lre_canonicalize()) */
int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode)
{
CharRange cr_inter, cr_mask, cr_result, cr_sub;
uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d;
cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func);
cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func);
cr_init(&cr_result, cr->mem_opaque, cr->realloc_func);
cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func);
if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U))
goto fail;
if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
goto fail;
if (cr_invert(&cr_mask))
goto fail;
if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER))
goto fail;
/* cr_inter = cr & cr_mask */
/* cr_sub = cr & ~cr_mask */
/* use the case conversion table to compute the result */
d_start = -1;
d_end = -1;
idx = 0;
v = case_conv_table1[idx];
code = v >> (32 - 17);
len = (v >> (32 - 17 - 7)) & 0x7f;
for(i = 0; i < cr_inter.len; i += 2) {
start = cr_inter.points[i];
end = cr_inter.points[i + 1];
for(c = start; c < end; c++) {
for(;;) {
if (c >= code && c < code + len)
break;
idx++;
assert(idx < countof(case_conv_table1));
v = case_conv_table1[idx];
code = v >> (32 - 17);
len = (v >> (32 - 17 - 7)) & 0x7f;
}
d = lre_case_folding_entry(c, idx, v, is_unicode);
/* try to merge with the current interval */
if (d_start == -1) {
d_start = d;
d_end = d + 1;
} else if (d_end == d) {
d_end++;
} else {
cr_add_interval(&cr_result, d_start, d_end);
d_start = d;
d_end = d + 1;
}
}
}
if (d_start != -1) {
if (cr_add_interval(&cr_result, d_start, d_end))
goto fail;
}
/* the resulting ranges are not necessarily sorted and may overlap */
cr_sort_and_remove_overlap(&cr_result);
/* or with the character not affected by the case folding */
cr->len = 0;
if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION))
goto fail;
cr_free(&cr_inter);
cr_free(&cr_mask);
cr_free(&cr_result);
cr_free(&cr_sub);
return 0;
fail:
cr_free(&cr_inter);
cr_free(&cr_mask);
cr_free(&cr_result);
cr_free(&cr_sub);
return -1;
}
typedef enum { typedef enum {
POP_GC, POP_GC,
POP_PROP, POP_PROP,

View file

@ -43,6 +43,7 @@ typedef enum {
} UnicodeNormalizationEnum; } UnicodeNormalizationEnum;
int lre_case_conv(uint32_t *res, uint32_t c, int conv_type); int lre_case_conv(uint32_t *res, uint32_t c, int conv_type);
int lre_canonicalize(uint32_t c, BOOL is_unicode);
LRE_BOOL lre_is_cased(uint32_t c); LRE_BOOL lre_is_cased(uint32_t c);
LRE_BOOL lre_is_case_ignorable(uint32_t c); LRE_BOOL lre_is_case_ignorable(uint32_t c);
@ -102,6 +103,7 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len,
const uint32_t *b_pt, int b_len, int op); const uint32_t *b_pt, int b_len, int op);
int cr_invert(CharRange *cr); int cr_invert(CharRange *cr);
int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode);
LRE_BOOL lre_is_id_start(uint32_t c); LRE_BOOL lre_is_id_start(uint32_t c);
LRE_BOOL lre_is_id_continue(uint32_t c); LRE_BOOL lre_is_id_continue(uint32_t c);

View file

@ -70,8 +70,6 @@ test262/test/built-ins/RegExp/property-escapes/generated/XID_Continue.js:16: Tes
test262/test/built-ins/RegExp/property-escapes/generated/XID_Continue.js:16: strict mode: Test262Error: `\p{XID_Continue}` should match U+00200C (``) test262/test/built-ins/RegExp/property-escapes/generated/XID_Continue.js:16: strict mode: Test262Error: `\p{XID_Continue}` should match U+00200C (``)
test262/test/built-ins/RegExp/property-escapes/generated/XID_Start.js:16: Test262Error: `\p{XID_Start}` should match U+02EBF0 (`𮯰`) test262/test/built-ins/RegExp/property-escapes/generated/XID_Start.js:16: Test262Error: `\p{XID_Start}` should match U+02EBF0 (`𮯰`)
test262/test/built-ins/RegExp/property-escapes/generated/XID_Start.js:16: strict mode: Test262Error: `\p{XID_Start}` should match U+02EBF0 (`𮯰`) test262/test/built-ins/RegExp/property-escapes/generated/XID_Start.js:16: strict mode: Test262Error: `\p{XID_Start}` should match U+02EBF0 (`𮯰`)
test262/test/built-ins/RegExp/unicode_full_case_folding.js:20: Test262Error: \u0390 does not match \u1fd3
test262/test/built-ins/RegExp/unicode_full_case_folding.js:20: strict mode: Test262Error: \u0390 does not match \u1fd3
test262/test/built-ins/Set/prototype/isSupersetOf/allows-set-like-class.js:29: TypeError: not a function test262/test/built-ins/Set/prototype/isSupersetOf/allows-set-like-class.js:29: TypeError: not a function
test262/test/built-ins/Set/prototype/isSupersetOf/allows-set-like-class.js:29: strict mode: TypeError: not a function test262/test/built-ins/Set/prototype/isSupersetOf/allows-set-like-class.js:29: strict mode: TypeError: not a function
test262/test/built-ins/Set/prototype/isSupersetOf/allows-set-like-object.js:27: TypeError: not a function test262/test/built-ins/Set/prototype/isSupersetOf/allows-set-like-object.js:27: TypeError: not a function

View file

@ -42,6 +42,7 @@
//#define DUMP_TABLE_SIZE //#define DUMP_TABLE_SIZE
//#define DUMP_CC_TABLE //#define DUMP_CC_TABLE
//#define DUMP_DECOMP_TABLE //#define DUMP_DECOMP_TABLE
//#define DUMP_CASE_FOLDING_SPECIAL_CASES
/* Ideas: /* Ideas:
- Generalize run length encoding + index for all tables - Generalize run length encoding + index for all tables
@ -223,9 +224,10 @@ typedef struct {
/* case conv */ /* case conv */
uint8_t u_len; uint8_t u_len;
uint8_t l_len; uint8_t l_len;
int u_data[CC_LEN_MAX]; uint8_t f_len;
int l_data[CC_LEN_MAX]; int u_data[CC_LEN_MAX]; /* to upper case */
int f_code; int l_data[CC_LEN_MAX]; /* to lower case */
int f_data[CC_LEN_MAX]; /* to case folding */
uint8_t combining_class; uint8_t combining_class;
uint8_t is_compat:1; uint8_t is_compat:1;
@ -485,7 +487,7 @@ void parse_case_folding(CCInfo *tab, const char *filename)
FILE *f; FILE *f;
char line[1024]; char line[1024];
const char *p; const char *p;
int code; int code, status;
CCInfo *ci; CCInfo *ci;
f = fopen(filename, "rb"); f = fopen(filename, "rb");
@ -516,14 +518,28 @@ void parse_case_folding(CCInfo *tab, const char *filename)
/* locale dependent casing */ /* locale dependent casing */
while (isspace(*p)) while (isspace(*p))
p++; p++;
if (*p != 'C' && *p != 'S') status = *p;
if (status != 'C' && status != 'S' && status != 'F')
continue; continue;
p = get_field(line, 2); p = get_field(line, 2);
assert(p != 0); assert(p != NULL);
assert(ci->f_code == 0); if (status == 'S') {
ci->f_code = strtoul(p, NULL, 16); /* we always select the simple case folding and assume it
assert(ci->f_code != 0 && ci->f_code != code); * comes after the full case folding case */
assert(ci->f_len >= 2);
ci->f_len = 0;
} else {
assert(ci->f_len == 0);
}
for(;;) {
while (isspace(*p))
p++;
if (*p == ';')
break;
assert(ci->l_len < CC_LEN_MAX);
ci->f_data[ci->f_len++] = strtoul(p, (char **)&p, 16);
}
} }
fclose(f); fclose(f);
@ -850,19 +866,21 @@ void dump_cc_info(CCInfo *ci, int i)
for(j = 0; j < ci->l_len; j++) for(j = 0; j < ci->l_len; j++)
printf(" %05x", ci->l_data[j]); printf(" %05x", ci->l_data[j]);
} }
if (ci->f_code != 0) { if (ci->f_len != 0) {
printf(" F: %05x", ci->f_code); printf(" F:");
for(j = 0; j < ci->f_len; j++)
printf(" %05x", ci->f_data[j]);
} }
printf("\n"); printf("\n");
} }
void dump_data(CCInfo *tab) void dump_unicode_data(CCInfo *tab)
{ {
int i; int i;
CCInfo *ci; CCInfo *ci;
for(i = 0; i <= CHARCODE_MAX; i++) { for(i = 0; i <= CHARCODE_MAX; i++) {
ci = &tab[i]; ci = &tab[i];
if (ci->u_len != 0 || ci->l_len != 0 || ci->f_code != 0) { if (ci->u_len != 0 || ci->l_len != 0 || ci->f_len != 0) {
dump_cc_info(ci, i); dump_cc_info(ci, i);
} }
} }
@ -872,8 +890,8 @@ BOOL is_complicated_case(const CCInfo *ci)
{ {
return (ci->u_len > 1 || ci->l_len > 1 || return (ci->u_len > 1 || ci->l_len > 1 ||
(ci->u_len > 0 && ci->l_len > 0) || (ci->u_len > 0 && ci->l_len > 0) ||
(ci->f_code != 0) != ci->l_len || (ci->f_len != ci->l_len) ||
(ci->f_code != 0 && ci->l_data[0] != ci->f_code)); (memcmp(ci->f_data, ci->l_data, ci->f_len * sizeof(ci->f_data[0])) != 0));
} }
#ifndef USE_TEST #ifndef USE_TEST
@ -889,9 +907,9 @@ enum {
RUN_TYPE_UF_D1_EXT, RUN_TYPE_UF_D1_EXT,
RUN_TYPE_U_EXT, RUN_TYPE_U_EXT,
RUN_TYPE_LF_EXT, RUN_TYPE_LF_EXT,
RUN_TYPE_U_EXT2, RUN_TYPE_UF_EXT2,
RUN_TYPE_L_EXT2, RUN_TYPE_LF_EXT2,
RUN_TYPE_U_EXT3, RUN_TYPE_UF_EXT3,
}; };
#endif #endif
@ -907,9 +925,9 @@ const char *run_type_str[] = {
"UF_D1_EXT", "UF_D1_EXT",
"U_EXT", "U_EXT",
"LF_EXT", "LF_EXT",
"U_EXT2", "UF_EXT2",
"L_EXT2", "LF_EXT2",
"U_EXT3", "UF_EXT3",
}; };
typedef struct { typedef struct {
@ -922,6 +940,13 @@ typedef struct {
int data_index; /* 'data' coming from the table */ int data_index; /* 'data' coming from the table */
} TableEntry; } TableEntry;
static int simple_to_lower(CCInfo *tab, int c)
{
if (tab[c].l_len != 1)
return c;
return tab[c].l_data[0];
}
/* code (17), len (7), type (4) */ /* code (17), len (7), type (4) */
void find_run_type(TableEntry *te, CCInfo *tab, int code) void find_run_type(TableEntry *te, CCInfo *tab, int code)
@ -935,15 +960,15 @@ void find_run_type(TableEntry *te, CCInfo *tab, int code)
te->code = code; te->code = code;
if (ci->l_len == 1 && ci->l_data[0] == code + 2 && if (ci->l_len == 1 && ci->l_data[0] == code + 2 &&
ci->f_code == ci->l_data[0] && ci->f_len == 1 && ci->f_data[0] == ci->l_data[0] &&
ci->u_len == 0 && ci->u_len == 0 &&
ci1->l_len == 1 && ci1->l_data[0] == code + 2 && ci1->l_len == 1 && ci1->l_data[0] == code + 2 &&
ci1->f_code == ci1->l_data[0] && ci1->f_len == 1 && ci1->f_data[0] == ci1->l_data[0] &&
ci1->u_len == 1 && ci1->u_data[0] == code && ci1->u_len == 1 && ci1->u_data[0] == code &&
ci2->l_len == 0 && ci2->l_len == 0 &&
ci2->f_code == 0 && ci2->f_len == 0 &&
ci2->u_len == 1 && ci2->u_data[0] == code) { ci2->u_len == 1 && ci2->u_data[0] == code) {
te->len = 3; te->len = 3;
te->data = 0; te->data = 0;
@ -958,7 +983,7 @@ void find_run_type(TableEntry *te, CCInfo *tab, int code)
if (ci1->u_len != 1 || if (ci1->u_len != 1 ||
ci1->u_data[0] != ci->u_data[0] + len || ci1->u_data[0] != ci->u_data[0] + len ||
ci1->l_len != 0 || ci1->l_len != 0 ||
ci1->f_code != ci1->u_data[0]) ci1->f_len != 1 || ci1->f_data[0] != ci1->u_data[0])
break; break;
len++; len++;
} }
@ -969,21 +994,25 @@ void find_run_type(TableEntry *te, CCInfo *tab, int code)
return; return;
} }
if (ci->u_len == 2 && ci->u_data[1] == 0x399 && if (ci->l_len == 0 &&
ci->f_code == 0 && ci->l_len == 0) { ci->u_len == 2 && ci->u_data[1] == 0x399 &&
ci->f_len == 2 && ci->f_data[1] == 0x3B9 &&
ci->f_data[0] == simple_to_lower(tab, ci->u_data[0])) {
len = 1; len = 1;
while (code + len <= CHARCODE_MAX) { while (code + len <= CHARCODE_MAX) {
ci1 = &tab[code + len]; ci1 = &tab[code + len];
if (!(ci1->u_len == 2 && if (!(ci1->u_len == 2 &&
ci1->u_data[1] == 0x399 && ci1->u_data[1] == ci->u_data[1] &&
ci1->u_data[0] == ci->u_data[0] + len && ci1->u_data[0] == ci->u_data[0] + len &&
ci1->f_code == 0 && ci1->f_len == 2 &&
ci1->f_data[1] == ci->f_data[1] &&
ci1->f_data[0] == ci->f_data[0] + len &&
ci1->l_len == 0)) ci1->l_len == 0))
break; break;
len++; len++;
} }
te->len = len; te->len = len;
te->type = RUN_TYPE_U_EXT2; te->type = RUN_TYPE_UF_EXT2;
te->ext_data[0] = ci->u_data[0]; te->ext_data[0] = ci->u_data[0];
te->ext_data[1] = ci->u_data[1]; te->ext_data[1] = ci->u_data[1];
te->ext_len = 2; te->ext_len = 2;
@ -991,7 +1020,8 @@ void find_run_type(TableEntry *te, CCInfo *tab, int code)
} }
if (ci->u_len == 2 && ci->u_data[1] == 0x399 && if (ci->u_len == 2 && ci->u_data[1] == 0x399 &&
ci->l_len == 1 && ci->f_code == ci->l_data[0]) { ci->l_len == 1 &&
ci->f_len == 1 && ci->f_data[0] == ci->l_data[0]) {
len = 1; len = 1;
while (code + len <= CHARCODE_MAX) { while (code + len <= CHARCODE_MAX) {
ci1 = &tab[code + len]; ci1 = &tab[code + len];
@ -1000,7 +1030,7 @@ void find_run_type(TableEntry *te, CCInfo *tab, int code)
ci1->u_data[0] == ci->u_data[0] + len && ci1->u_data[0] == ci->u_data[0] + len &&
ci1->l_len == 1 && ci1->l_len == 1 &&
ci1->l_data[0] == ci->l_data[0] + len && ci1->l_data[0] == ci->l_data[0] + len &&
ci1->f_code == ci1->l_data[0])) ci1->f_len == 1 && ci1->f_data[0] == ci1->l_data[0]))
break; break;
len++; len++;
} }
@ -1012,13 +1042,13 @@ void find_run_type(TableEntry *te, CCInfo *tab, int code)
return; return;
} }
if (ci->l_len == 1 && ci->u_len == 0 && ci->f_code == 0) { if (ci->l_len == 1 && ci->u_len == 0 && ci->f_len == 0) {
len = 1; len = 1;
while (code + len <= CHARCODE_MAX) { while (code + len <= CHARCODE_MAX) {
ci1 = &tab[code + len]; ci1 = &tab[code + len];
if (!(ci1->l_len == 1 && if (!(ci1->l_len == 1 &&
ci1->l_data[0] == ci->l_data[0] + len && ci1->l_data[0] == ci->l_data[0] + len &&
ci1->u_len == 0 && ci1->f_code == 0)) ci1->u_len == 0 && ci1->f_len == 0))
break; break;
len++; len++;
} }
@ -1031,32 +1061,39 @@ void find_run_type(TableEntry *te, CCInfo *tab, int code)
if (ci->l_len == 0 && if (ci->l_len == 0 &&
ci->u_len == 1 && ci->u_len == 1 &&
ci->u_data[0] < 0x1000 && ci->u_data[0] < 0x1000 &&
ci->f_code == ci->u_data[0] + 0x20) { ci->f_len == 1 && ci->f_data[0] == ci->u_data[0] + 0x20) {
te->len = 1; te->len = 1;
te->type = RUN_TYPE_UF_D20; te->type = RUN_TYPE_UF_D20;
te->data = ci->u_data[0]; te->data = ci->u_data[0];
} else if (ci->l_len == 0 && } else if (ci->l_len == 0 &&
ci->u_len == 1 && ci->u_len == 1 &&
ci->f_code == ci->u_data[0] + 1) { ci->f_len == 1 && ci->f_data[0] == ci->u_data[0] + 1) {
te->len = 1; te->len = 1;
te->type = RUN_TYPE_UF_D1_EXT; te->type = RUN_TYPE_UF_D1_EXT;
te->ext_data[0] = ci->u_data[0]; te->ext_data[0] = ci->u_data[0];
te->ext_len = 1; te->ext_len = 1;
} else if (ci->l_len == 2 && ci->u_len == 0 && ci->f_code == 0) { } else if (ci->l_len == 2 && ci->u_len == 0 && ci->f_len == 2 &&
ci->l_data[0] == ci->f_data[0] &&
ci->l_data[1] == ci->f_data[1]) {
te->len = 1; te->len = 1;
te->type = RUN_TYPE_L_EXT2; te->type = RUN_TYPE_LF_EXT2;
te->ext_data[0] = ci->l_data[0]; te->ext_data[0] = ci->l_data[0];
te->ext_data[1] = ci->l_data[1]; te->ext_data[1] = ci->l_data[1];
te->ext_len = 2; te->ext_len = 2;
} else if (ci->u_len == 2 && ci->l_len == 0 && ci->f_code == 0) { } else if (ci->u_len == 2 && ci->l_len == 0 && ci->f_len == 2 &&
ci->f_data[0] == simple_to_lower(tab, ci->u_data[0]) &&
ci->f_data[1] == simple_to_lower(tab, ci->u_data[1])) {
te->len = 1; te->len = 1;
te->type = RUN_TYPE_U_EXT2; te->type = RUN_TYPE_UF_EXT2;
te->ext_data[0] = ci->u_data[0]; te->ext_data[0] = ci->u_data[0];
te->ext_data[1] = ci->u_data[1]; te->ext_data[1] = ci->u_data[1];
te->ext_len = 2; te->ext_len = 2;
} else if (ci->u_len == 3 && ci->l_len == 0 && ci->f_code == 0) { } else if (ci->u_len == 3 && ci->l_len == 0 && ci->f_len == 3 &&
ci->f_data[0] == simple_to_lower(tab, ci->u_data[0]) &&
ci->f_data[1] == simple_to_lower(tab, ci->u_data[1]) &&
ci->f_data[2] == simple_to_lower(tab, ci->u_data[2])) {
te->len = 1; te->len = 1;
te->type = RUN_TYPE_U_EXT3; te->type = RUN_TYPE_UF_EXT3;
te->ext_data[0] = ci->u_data[0]; te->ext_data[0] = ci->u_data[0];
te->ext_data[1] = ci->u_data[1]; te->ext_data[1] = ci->u_data[1];
te->ext_data[2] = ci->u_data[2]; te->ext_data[2] = ci->u_data[2];
@ -1174,7 +1211,7 @@ void build_conv_table(CCInfo *tab)
te = conv_table; te = conv_table;
for(code = 0; code <= CHARCODE_MAX; code++) { for(code = 0; code <= CHARCODE_MAX; code++) {
ci = &tab[code]; ci = &tab[code];
if (ci->u_len == 0 && ci->l_len == 0 && ci->f_code == 0) if (ci->u_len == 0 && ci->l_len == 0 && ci->f_len == 0)
continue; continue;
assert(te - conv_table < countof(conv_table)); assert(te - conv_table < countof(conv_table));
find_run_type(te, tab, code); find_run_type(te, tab, code);
@ -1224,7 +1261,7 @@ void build_conv_table(CCInfo *tab)
/* find the data index for ext_data */ /* find the data index for ext_data */
for(i = 0; i < conv_table_len; i++) { for(i = 0; i < conv_table_len; i++) {
te = &conv_table[i]; te = &conv_table[i];
if (te->type == RUN_TYPE_U_EXT3) { if (te->type == RUN_TYPE_UF_EXT3) {
int p, v; int p, v;
v = 0; v = 0;
for(j = 0; j < 3; j++) { for(j = 0; j < 3; j++) {
@ -1238,8 +1275,8 @@ void build_conv_table(CCInfo *tab)
for(i = 0; i < conv_table_len; i++) { for(i = 0; i < conv_table_len; i++) {
te = &conv_table[i]; te = &conv_table[i];
if (te->type == RUN_TYPE_L_EXT2 || if (te->type == RUN_TYPE_LF_EXT2 ||
te->type == RUN_TYPE_U_EXT2 || te->type == RUN_TYPE_UF_EXT2 ||
te->type == RUN_TYPE_U2L_399_EXT2) { te->type == RUN_TYPE_U2L_399_EXT2) {
int p, v; int p, v;
v = 0; v = 0;
@ -1302,6 +1339,54 @@ void dump_case_conv_table(FILE *f)
fprintf(f, "\n};\n\n"); fprintf(f, "\n};\n\n");
} }
static CCInfo *global_tab;
static int sp_cc_cmp(const void *p1, const void *p2)
{
CCInfo *c1 = &global_tab[*(const int *)p1];
CCInfo *c2 = &global_tab[*(const int *)p2];
if (c1->f_len < c2->f_len) {
return -1;
} else if (c2->f_len < c1->f_len) {
return 1;
} else {
return memcmp(c1->f_data, c2->f_data, sizeof(c1->f_data[0]) * c1->f_len);
}
}
/* dump the case special cases (multi character results which are
identical and need specific handling in lre_canonicalize() */
void dump_case_folding_special_cases(CCInfo *tab)
{
int i, len, j;
int *perm;
perm = malloc(sizeof(perm[0]) * (CHARCODE_MAX + 1));
for(i = 0; i <= CHARCODE_MAX; i++)
perm[i] = i;
global_tab = tab;
qsort(perm, CHARCODE_MAX + 1, sizeof(perm[0]), sp_cc_cmp);
for(i = 0; i <= CHARCODE_MAX;) {
if (tab[perm[i]].f_len <= 1) {
i++;
} else {
len = 1;
while ((i + len) <= CHARCODE_MAX && !sp_cc_cmp(&perm[i], &perm[i + len]))
len++;
if (len > 1) {
for(j = i; j < i + len; j++)
dump_cc_info(&tab[perm[j]], perm[j]);
}
i += len;
}
}
free(perm);
global_tab = NULL;
}
int tabcmp(const int *tab1, const int *tab2, int n) int tabcmp(const int *tab1, const int *tab2, int n)
{ {
int i; int i;
@ -1328,7 +1413,7 @@ void compute_internal_props(void)
for(i = 0; i <= CHARCODE_MAX; i++) { for(i = 0; i <= CHARCODE_MAX; i++) {
CCInfo *ci = &unicode_db[i]; CCInfo *ci = &unicode_db[i];
has_ul = (ci->u_len != 0 || ci->l_len != 0 || ci->f_code != 0); has_ul = (ci->u_len != 0 || ci->l_len != 0 || ci->f_len != 0);
if (has_ul) { if (has_ul) {
assert(get_prop(i, PROP_Cased)); assert(get_prop(i, PROP_Cased));
} else { } else {
@ -1343,10 +1428,10 @@ void compute_internal_props(void)
set_prop(i, PROP_Changes_When_Titlecased1, set_prop(i, PROP_Changes_When_Titlecased1,
get_prop(i, PROP_Changes_When_Titlecased) ^ (ci->u_len != 0)); get_prop(i, PROP_Changes_When_Titlecased) ^ (ci->u_len != 0));
set_prop(i, PROP_Changes_When_Casefolded1, set_prop(i, PROP_Changes_When_Casefolded1,
get_prop(i, PROP_Changes_When_Casefolded) ^ (ci->f_code != 0)); get_prop(i, PROP_Changes_When_Casefolded) ^ (ci->f_len != 0));
/* XXX: reduce table size (438 bytes) */ /* XXX: reduce table size (438 bytes) */
set_prop(i, PROP_Changes_When_NFKC_Casefolded1, set_prop(i, PROP_Changes_When_NFKC_Casefolded1,
get_prop(i, PROP_Changes_When_NFKC_Casefolded) ^ (ci->f_code != 0)); get_prop(i, PROP_Changes_When_NFKC_Casefolded) ^ (ci->f_len != 0));
} }
} }
@ -1765,8 +1850,10 @@ void check_case_conv(void)
ci->u_len = 1; ci->u_len = 1;
ci->u_data[0] = code; ci->u_data[0] = code;
} }
if (ci->f_code == 0) if (ci->f_len == 0) {
ci->f_code = code; ci->f_len = 1;
ci->f_data[0] = code;
}
error = 0; error = 0;
l = check_conv(res, code, 0); l = check_conv(res, code, 0);
@ -1780,7 +1867,7 @@ void check_case_conv(void)
error++; error++;
} }
l = check_conv(res, code, 2); l = check_conv(res, code, 2);
if (l != 1 || res[0] != ci->f_code) { if (l != ci->f_len || tabcmp((int *)res, ci->f_data, l)) {
printf("ERROR: F\n"); printf("ERROR: F\n");
error++; error++;
} }
@ -2951,11 +3038,12 @@ int main(int argc, char **argv)
unicode_db_path); unicode_db_path);
parse_prop_list(filename); parse_prop_list(filename);
// dump_data(unicode_db); // dump_unicode_data(unicode_db);
build_conv_table(unicode_db); build_conv_table(unicode_db);
// dump_table(); #ifdef DUMP_CASE_FOLDING_SPECIAL_CASES
dump_case_folding_special_cases(unicode_db);
#endif
if (!outfilename) { if (!outfilename) {
#ifdef USE_TEST #ifdef USE_TEST