diff --git a/libregexp.c b/libregexp.c index 28af7a3..165158a 100644 --- a/libregexp.c +++ b/libregexp.c @@ -34,9 +34,6 @@ /* TODO: - - Add full unicode canonicalize rules for character ranges (not - really useful but needed for exact "ignorecase" compatibility). - - Add a lock step execution mode (=linear time execution guaranteed) when the regular expression is "simple" i.e. no backreference nor complicated lookahead. The opcodes are designed for this execution @@ -123,33 +120,6 @@ static int dbuf_insert(DynBuf *s, int pos, int len) return 0; } -/* canonicalize with the specific JS regexp rules */ -static uint32_t lre_canonicalize(uint32_t c, BOOL is_unicode) -{ - uint32_t res[LRE_CC_RES_LEN_MAX]; - int len; - if (is_unicode) { - if (likely(c < 128)) { - if (c >= 'A' && c <= 'Z') - c = c - 'A' + 'a'; - } else { - lre_case_conv(res, c, 2); - c = res[0]; - } - } else { - if (likely(c < 128)) { - if (c >= 'a' && c <= 'z') - c = c - 'a' + 'A'; - } else { - /* legacy regexp: to upper case if single char >= 128 */ - len = lre_case_conv(res, c, FALSE); - if (len == 1 && res[0] >= 128) - c = res[0]; - } - } - return c; -} - static const uint16_t char_range_d[] = { 1, 0x0030, 0x0039 + 1, @@ -248,31 +218,6 @@ static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c) return -1; } -static int cr_canonicalize(CharRange *cr) -{ - CharRange a; - uint32_t pt[2]; - int i, ret; - - cr_init(&a, cr->mem_opaque, lre_realloc); - pt[0] = 'a'; - pt[1] = 'z' + 1; - ret = cr_op(&a, cr->points, cr->len, pt, 2, CR_OP_INTER); - if (ret) - goto fail; - /* convert to upper case */ - /* XXX: the generic unicode case would be much more complicated - and not really useful */ - for(i = 0; i < a.len; i++) { - a.points[i] += 'A' - 'a'; - } - /* Note: for simplicity we keep the lower case ranges */ - ret = cr_union1(cr, a.points, a.len); - fail: - cr_free(&a); - return ret; -} - #ifdef DUMP_REOP static __maybe_unused void lre_dump_bytecode(const uint8_t *buf, int buf_len) @@ -955,7 +900,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp) } } if (s->ignore_case) { - if (cr_canonicalize(cr)) + if (cr_regexp_canonicalize(cr, s->is_unicode)) goto memory_error; } if (invert) { diff --git a/libunicode-table.h b/libunicode-table.h index a7b1e9d..7a0366a 100644 --- a/libunicode-table.h +++ b/libunicode-table.h @@ -3777,72 +3777,70 @@ static const uint8_t unicode_prop_Changes_When_Titlecased1_table[22] = { 0x8b, 0x80, 0x8e, 0x80, 0xae, 0x80, }; -static const uint8_t unicode_prop_Changes_When_Casefolded1_table[33] = { - 0x40, 0xde, 0x80, 0xcf, 0x80, 0x97, 0x80, 0x44, - 0x3c, 0x80, 0x59, 0x11, 0x80, 0x40, 0xe4, 0x3f, - 0x3f, 0x87, 0x89, 0x11, 0x05, 0x02, 0x11, 0x80, - 0xa9, 0x11, 0x80, 0x60, 0xdb, 0x07, 0x86, 0x8b, - 0x84, +static const uint8_t unicode_prop_Changes_When_Casefolded1_table[29] = { + 0x41, 0xef, 0x80, 0x41, 0x9e, 0x80, 0x9e, 0x80, + 0x5a, 0xe4, 0x83, 0x40, 0xb5, 0x00, 0x00, 0x00, + 0x80, 0xde, 0x06, 0x06, 0x80, 0x8a, 0x09, 0x81, + 0x89, 0x10, 0x81, 0x8d, 0x80, }; -static const uint8_t unicode_prop_Changes_When_NFKC_Casefolded1_table[451] = { +static const uint8_t unicode_prop_Changes_When_NFKC_Casefolded1_table[447] = { 0x40, 0x9f, 0x06, 0x00, 0x01, 0x00, 0x01, 0x12, - 0x10, 0x82, 0x9f, 0x80, 0xcf, 0x01, 0x80, 0x8b, - 0x07, 0x80, 0xfb, 0x01, 0x01, 0x80, 0xa5, 0x80, - 0x40, 0xbb, 0x88, 0x9e, 0x29, 0x84, 0xda, 0x08, - 0x81, 0x89, 0x80, 0xa3, 0x04, 0x02, 0x04, 0x08, - 0x80, 0xc9, 0x82, 0x9c, 0x80, 0x41, 0x93, 0x80, - 0x40, 0x93, 0x80, 0xd7, 0x83, 0x42, 0xde, 0x87, - 0xfb, 0x08, 0x80, 0xd2, 0x01, 0x80, 0xa1, 0x11, - 0x80, 0x40, 0xfc, 0x81, 0x42, 0xd4, 0x80, 0xfe, - 0x80, 0xa7, 0x81, 0xad, 0x80, 0xb5, 0x80, 0x88, - 0x03, 0x03, 0x03, 0x80, 0x8b, 0x80, 0x88, 0x00, - 0x26, 0x80, 0x90, 0x80, 0x88, 0x03, 0x03, 0x03, - 0x80, 0x8b, 0x80, 0x41, 0x41, 0x80, 0xe1, 0x81, - 0x46, 0x52, 0x81, 0xd4, 0x84, 0x45, 0x1b, 0x10, - 0x8a, 0x80, 0x91, 0x80, 0x9b, 0x8c, 0x80, 0xa1, - 0xa4, 0x40, 0xd9, 0x80, 0x40, 0xd5, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x01, 0x3f, 0x3f, 0x87, - 0x89, 0x11, 0x04, 0x00, 0x29, 0x04, 0x12, 0x80, - 0x88, 0x12, 0x80, 0x88, 0x11, 0x11, 0x04, 0x08, - 0x8f, 0x00, 0x20, 0x8b, 0x12, 0x2a, 0x08, 0x0b, - 0x00, 0x07, 0x82, 0x8c, 0x06, 0x92, 0x81, 0x9a, - 0x80, 0x8c, 0x8a, 0x80, 0xd6, 0x18, 0x10, 0x8a, - 0x01, 0x0c, 0x0a, 0x00, 0x10, 0x11, 0x02, 0x06, - 0x05, 0x1c, 0x85, 0x8f, 0x8f, 0x8f, 0x88, 0x80, - 0x40, 0xa1, 0x08, 0x81, 0x40, 0xf7, 0x81, 0x41, - 0x34, 0xd5, 0x99, 0x9a, 0x45, 0x20, 0x80, 0xe6, - 0x82, 0xe4, 0x80, 0x41, 0x9e, 0x81, 0x40, 0xf0, - 0x80, 0x41, 0x2e, 0x80, 0xd2, 0x80, 0x8b, 0x40, - 0xd5, 0xa9, 0x80, 0xb4, 0x00, 0x82, 0xdf, 0x09, - 0x80, 0xde, 0x80, 0xb0, 0xdd, 0x82, 0x8d, 0xdf, - 0x9e, 0x80, 0xa7, 0x87, 0xae, 0x80, 0x41, 0x7f, - 0x60, 0x72, 0x9b, 0x81, 0x40, 0xd1, 0x80, 0x40, - 0x80, 0x12, 0x81, 0x43, 0x61, 0x83, 0x88, 0x80, - 0x60, 0x4d, 0x95, 0x41, 0x0d, 0x08, 0x00, 0x81, - 0x89, 0x00, 0x00, 0x09, 0x82, 0xc3, 0x81, 0xe9, - 0xa5, 0x86, 0x8b, 0x24, 0x00, 0x97, 0x04, 0x00, - 0x01, 0x01, 0x80, 0xeb, 0xa0, 0x41, 0x6a, 0x91, - 0xbf, 0x81, 0xb5, 0xa7, 0x8c, 0x82, 0x99, 0x95, - 0x94, 0x81, 0x8b, 0x80, 0x92, 0x03, 0x1a, 0x00, - 0x80, 0x40, 0x86, 0x08, 0x80, 0x9f, 0x99, 0x40, - 0x83, 0x15, 0x0d, 0x0d, 0x0a, 0x16, 0x06, 0x80, - 0x88, 0x47, 0x87, 0x20, 0xa9, 0x80, 0x88, 0x60, - 0xb4, 0xe4, 0x83, 0x54, 0xb9, 0x86, 0x8d, 0x87, - 0xbf, 0x85, 0x42, 0x3e, 0xd4, 0x80, 0xc6, 0x01, - 0x08, 0x09, 0x0b, 0x80, 0x8b, 0x00, 0x06, 0x80, - 0xc0, 0x03, 0x0f, 0x06, 0x80, 0x9b, 0x03, 0x04, - 0x00, 0x16, 0x80, 0x41, 0x53, 0x81, 0x41, 0x23, - 0x81, 0xb1, 0x48, 0x2f, 0xbd, 0x4d, 0x91, 0x18, - 0x9a, 0x01, 0x00, 0x08, 0x80, 0x89, 0x03, 0x00, - 0x00, 0x28, 0x18, 0x00, 0x00, 0x02, 0x01, 0x00, - 0x08, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x0b, - 0x06, 0x03, 0x03, 0x00, 0x80, 0x89, 0x80, 0x90, - 0x22, 0x04, 0x80, 0x90, 0x42, 0x43, 0x8a, 0x84, - 0x9e, 0x80, 0x9f, 0x99, 0x82, 0xa2, 0x80, 0xee, - 0x82, 0x8c, 0xab, 0x83, 0x88, 0x31, 0x49, 0x9d, - 0x89, 0x60, 0xfc, 0x05, 0x42, 0x1d, 0x6b, 0x05, - 0xe1, 0x4f, 0xff, + 0x10, 0x82, 0xf3, 0x80, 0x8b, 0x80, 0x40, 0x84, + 0x01, 0x01, 0x80, 0xa2, 0x01, 0x80, 0x40, 0xbb, + 0x88, 0x9e, 0x29, 0x84, 0xda, 0x08, 0x81, 0x89, + 0x80, 0xa3, 0x04, 0x02, 0x04, 0x08, 0x07, 0x80, + 0x9e, 0x80, 0xa0, 0x82, 0x9c, 0x80, 0x42, 0x28, + 0x80, 0xd7, 0x83, 0x42, 0xde, 0x87, 0xfb, 0x08, + 0x80, 0xd2, 0x01, 0x80, 0xa1, 0x11, 0x80, 0x40, + 0xfc, 0x81, 0x42, 0xd4, 0x80, 0xfe, 0x80, 0xa7, + 0x81, 0xad, 0x80, 0xb5, 0x80, 0x88, 0x03, 0x03, + 0x03, 0x80, 0x8b, 0x80, 0x88, 0x00, 0x26, 0x80, + 0x90, 0x80, 0x88, 0x03, 0x03, 0x03, 0x80, 0x8b, + 0x80, 0x41, 0x41, 0x80, 0xe1, 0x81, 0x46, 0x52, + 0x81, 0xd4, 0x84, 0x45, 0x1b, 0x10, 0x8a, 0x80, + 0x91, 0x80, 0x9b, 0x8c, 0x80, 0xa1, 0xa4, 0x40, + 0xd5, 0x83, 0x40, 0xb5, 0x00, 0x00, 0x00, 0x80, + 0x99, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, + 0xb7, 0x05, 0x00, 0x13, 0x05, 0x11, 0x02, 0x0c, + 0x11, 0x00, 0x00, 0x0c, 0x15, 0x05, 0x08, 0x8f, + 0x00, 0x20, 0x8b, 0x12, 0x2a, 0x08, 0x0b, 0x00, + 0x07, 0x82, 0x8c, 0x06, 0x92, 0x81, 0x9a, 0x80, + 0x8c, 0x8a, 0x80, 0xd6, 0x18, 0x10, 0x8a, 0x01, + 0x0c, 0x0a, 0x00, 0x10, 0x11, 0x02, 0x06, 0x05, + 0x1c, 0x85, 0x8f, 0x8f, 0x8f, 0x88, 0x80, 0x40, + 0xa1, 0x08, 0x81, 0x40, 0xf7, 0x81, 0x41, 0x34, + 0xd5, 0x99, 0x9a, 0x45, 0x20, 0x80, 0xe6, 0x82, + 0xe4, 0x80, 0x41, 0x9e, 0x81, 0x40, 0xf0, 0x80, + 0x41, 0x2e, 0x80, 0xd2, 0x80, 0x8b, 0x40, 0xd5, + 0xa9, 0x80, 0xb4, 0x00, 0x82, 0xdf, 0x09, 0x80, + 0xde, 0x80, 0xb0, 0xdd, 0x82, 0x8d, 0xdf, 0x9e, + 0x80, 0xa7, 0x87, 0xae, 0x80, 0x41, 0x7f, 0x60, + 0x72, 0x9b, 0x81, 0x40, 0xd1, 0x80, 0x40, 0x80, + 0x12, 0x81, 0x43, 0x61, 0x83, 0x88, 0x80, 0x60, + 0x4d, 0x95, 0x41, 0x0d, 0x08, 0x00, 0x81, 0x89, + 0x00, 0x00, 0x09, 0x82, 0xc3, 0x81, 0xe9, 0xc2, + 0x00, 0x97, 0x04, 0x00, 0x01, 0x01, 0x80, 0xeb, + 0xa0, 0x41, 0x6a, 0x91, 0xbf, 0x81, 0xb5, 0xa7, + 0x8c, 0x82, 0x99, 0x95, 0x94, 0x81, 0x8b, 0x80, + 0x92, 0x03, 0x1a, 0x00, 0x80, 0x40, 0x86, 0x08, + 0x80, 0x9f, 0x99, 0x40, 0x83, 0x15, 0x0d, 0x0d, + 0x0a, 0x16, 0x06, 0x80, 0x88, 0x47, 0x87, 0x20, + 0xa9, 0x80, 0x88, 0x60, 0xb4, 0xe4, 0x83, 0x54, + 0xb9, 0x86, 0x8d, 0x87, 0xbf, 0x85, 0x42, 0x3e, + 0xd4, 0x80, 0xc6, 0x01, 0x08, 0x09, 0x0b, 0x80, + 0x8b, 0x00, 0x06, 0x80, 0xc0, 0x03, 0x0f, 0x06, + 0x80, 0x9b, 0x03, 0x04, 0x00, 0x16, 0x80, 0x41, + 0x53, 0x81, 0x41, 0x23, 0x81, 0xb1, 0x48, 0x2f, + 0xbd, 0x4d, 0x91, 0x18, 0x9a, 0x01, 0x00, 0x08, + 0x80, 0x89, 0x03, 0x00, 0x00, 0x28, 0x18, 0x00, + 0x00, 0x02, 0x01, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x0b, 0x06, 0x03, 0x03, 0x00, + 0x80, 0x89, 0x80, 0x90, 0x22, 0x04, 0x80, 0x90, + 0x42, 0x43, 0x8a, 0x84, 0x9e, 0x80, 0x9f, 0x99, + 0x82, 0xa2, 0x80, 0xee, 0x82, 0x8c, 0xab, 0x83, + 0x88, 0x31, 0x49, 0x9d, 0x89, 0x60, 0xfc, 0x05, + 0x42, 0x1d, 0x6b, 0x05, 0xe1, 0x4f, 0xff, }; static const uint8_t unicode_prop_ASCII_Hex_Digit_table[5] = { diff --git a/libunicode.c b/libunicode.c index f0b5146..90ce4c2 100644 --- a/libunicode.c +++ b/libunicode.c @@ -43,11 +43,111 @@ enum { RUN_TYPE_UF_D1_EXT, RUN_TYPE_U_EXT, RUN_TYPE_LF_EXT, - RUN_TYPE_U_EXT2, - RUN_TYPE_L_EXT2, - RUN_TYPE_U_EXT3, + RUN_TYPE_UF_EXT2, + RUN_TYPE_LF_EXT2, + RUN_TYPE_UF_EXT3, }; +static int lre_case_conv1(uint32_t c, int conv_type) +{ + uint32_t res[LRE_CC_RES_LEN_MAX]; + lre_case_conv(res, c, conv_type); + return res[0]; +} + +/* case conversion using the table entry 'idx' with value 'v' */ +static int lre_case_conv_entry(uint32_t *res, uint32_t c, int conv_type, uint32_t idx, uint32_t v) +{ + uint32_t code, data, type, a, is_lower; + is_lower = (conv_type != 0); + type = (v >> (32 - 17 - 7 - 4)) & 0xf; + data = ((v & 0xf) << 8) | case_conv_table2[idx]; + code = v >> (32 - 17); + switch(type) { + case RUN_TYPE_U: + case RUN_TYPE_L: + case RUN_TYPE_UF: + case RUN_TYPE_LF: + if (conv_type == (type & 1) || + (type >= RUN_TYPE_UF && conv_type == 2)) { + c = c - code + (case_conv_table1[data] >> (32 - 17)); + } + break; + case RUN_TYPE_UL: + a = c - code; + if ((a & 1) != (1 - is_lower)) + break; + c = (a ^ 1) + code; + break; + case RUN_TYPE_LSU: + a = c - code; + if (a == 1) { + c += 2 * is_lower - 1; + } else if (a == (1 - is_lower) * 2) { + c += (2 * is_lower - 1) * 2; + } + break; + case RUN_TYPE_U2L_399_EXT2: + if (!is_lower) { + res[0] = c - code + case_conv_ext[data >> 6]; + res[1] = 0x399; + return 2; + } else { + c = c - code + case_conv_ext[data & 0x3f]; + } + break; + case RUN_TYPE_UF_D20: + if (conv_type == 1) + break; + c = data + (conv_type == 2) * 0x20; + break; + case RUN_TYPE_UF_D1_EXT: + if (conv_type == 1) + break; + c = case_conv_ext[data] + (conv_type == 2); + break; + case RUN_TYPE_U_EXT: + case RUN_TYPE_LF_EXT: + if (is_lower != (type - RUN_TYPE_U_EXT)) + break; + c = case_conv_ext[data]; + break; + case RUN_TYPE_LF_EXT2: + if (!is_lower) + break; + res[0] = c - code + case_conv_ext[data >> 6]; + res[1] = case_conv_ext[data & 0x3f]; + return 2; + case RUN_TYPE_UF_EXT2: + if (conv_type == 1) + break; + res[0] = c - code + case_conv_ext[data >> 6]; + res[1] = case_conv_ext[data & 0x3f]; + if (conv_type == 2) { + /* convert to lower */ + res[0] = lre_case_conv1(res[0], 1); + res[1] = lre_case_conv1(res[1], 1); + } + return 2; + default: + case RUN_TYPE_UF_EXT3: + if (conv_type == 1) + break; + res[0] = case_conv_ext[data >> 8]; + res[1] = case_conv_ext[(data >> 4) & 0xf]; + res[2] = case_conv_ext[data & 0xf]; + if (conv_type == 2) { + /* convert to lower */ + res[0] = lre_case_conv1(res[0], 1); + res[1] = lre_case_conv1(res[1], 1); + res[2] = lre_case_conv1(res[2], 1); + } + return 3; + } + res[0] = c; + return 1; +} + /* conv_type: 0 = to upper 1 = to lower @@ -66,10 +166,9 @@ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type) } } } else { - uint32_t v, code, data, type, len, a, is_lower; + uint32_t v, code, len; int idx, idx_min, idx_max; - is_lower = (conv_type != 0); idx_min = 0; idx_max = countof(case_conv_table1) - 1; while (idx_min <= idx_max) { @@ -82,74 +181,7 @@ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type) } else if (c >= code + len) { idx_min = idx + 1; } else { - type = (v >> (32 - 17 - 7 - 4)) & 0xf; - data = ((v & 0xf) << 8) | case_conv_table2[idx]; - switch(type) { - case RUN_TYPE_U: - case RUN_TYPE_L: - case RUN_TYPE_UF: - case RUN_TYPE_LF: - if (conv_type == (type & 1) || - (type >= RUN_TYPE_UF && conv_type == 2)) { - c = c - code + (case_conv_table1[data] >> (32 - 17)); - } - break; - case RUN_TYPE_UL: - a = c - code; - if ((a & 1) != (1 - is_lower)) - break; - c = (a ^ 1) + code; - break; - case RUN_TYPE_LSU: - a = c - code; - if (a == 1) { - c += 2 * is_lower - 1; - } else if (a == (1 - is_lower) * 2) { - c += (2 * is_lower - 1) * 2; - } - break; - case RUN_TYPE_U2L_399_EXT2: - if (!is_lower) { - res[0] = c - code + case_conv_ext[data >> 6]; - res[1] = 0x399; - return 2; - } else { - c = c - code + case_conv_ext[data & 0x3f]; - } - break; - case RUN_TYPE_UF_D20: - if (conv_type == 1) - break; - c = data + (conv_type == 2) * 0x20; - break; - case RUN_TYPE_UF_D1_EXT: - if (conv_type == 1) - break; - c = case_conv_ext[data] + (conv_type == 2); - break; - case RUN_TYPE_U_EXT: - case RUN_TYPE_LF_EXT: - if (is_lower != (type - RUN_TYPE_U_EXT)) - break; - c = case_conv_ext[data]; - break; - case RUN_TYPE_U_EXT2: - case RUN_TYPE_L_EXT2: - if (conv_type != (type - RUN_TYPE_U_EXT2)) - break; - res[0] = c - code + case_conv_ext[data >> 6]; - res[1] = case_conv_ext[data & 0x3f]; - return 2; - default: - case RUN_TYPE_U_EXT3: - if (conv_type != 0) - break; - res[0] = case_conv_ext[data >> 8]; - res[1] = case_conv_ext[(data >> 4) & 0xf]; - res[2] = case_conv_ext[data & 0xf]; - return 3; - } - break; + return lre_case_conv_entry(res, c, conv_type, idx, v); } } } @@ -157,6 +189,77 @@ int lre_case_conv(uint32_t *res, uint32_t c, int conv_type) return 1; } +static int lre_case_folding_entry(uint32_t c, uint32_t idx, uint32_t v, BOOL is_unicode) +{ + uint32_t res[LRE_CC_RES_LEN_MAX]; + int len; + + if (is_unicode) { + len = lre_case_conv_entry(res, c, 2, idx, v); + if (len == 1) { + c = res[0]; + } else { + /* handle the few specific multi-character cases (see + unicode_gen.c:dump_case_folding_special_cases()) */ + if (c == 0xfb06) { + c = 0xfb05; + } else if (c == 0x01fd3) { + c = 0x390; + } else if (c == 0x01fe3) { + c = 0x3b0; + } + } + } else { + if (likely(c < 128)) { + if (c >= 'a' && c <= 'z') + c = c - 'a' + 'A'; + } else { + /* legacy regexp: to upper case if single char >= 128 */ + len = lre_case_conv_entry(res, c, FALSE, idx, v); + if (len == 1 && res[0] >= 128) + c = res[0]; + } + } + return c; +} + +/* JS regexp specific rules for case folding */ +int lre_canonicalize(uint32_t c, BOOL is_unicode) +{ + if (c < 128) { + /* fast case */ + if (is_unicode) { + if (c >= 'A' && c <= 'Z') { + c = c - 'A' + 'a'; + } + } else { + if (c >= 'a' && c <= 'z') { + c = c - 'a' + 'A'; + } + } + } else { + uint32_t v, code, len; + int idx, idx_min, idx_max; + + idx_min = 0; + idx_max = countof(case_conv_table1) - 1; + while (idx_min <= idx_max) { + idx = (unsigned)(idx_max + idx_min) / 2; + v = case_conv_table1[idx]; + code = v >> (32 - 17); + len = (v >> (32 - 17 - 7)) & 0x7f; + if (c < code) { + idx_max = idx - 1; + } else if (c >= code + len) { + idx_min = idx + 1; + } else { + return lre_case_folding_entry(c, idx, v, is_unicode); + } + } + } + return c; +} + static uint32_t get_le24(const uint8_t *ptr) { return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16); @@ -1166,11 +1269,11 @@ static int unicode_case1(CharRange *cr, int case_mask) #define MR(x) (1 << RUN_TYPE_ ## x) const uint32_t tab_run_mask[3] = { MR(U) | MR(UF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(UF_D20) | - MR(UF_D1_EXT) | MR(U_EXT) | MR(U_EXT2) | MR(U_EXT3), + MR(UF_D1_EXT) | MR(U_EXT) | MR(UF_EXT2) | MR(UF_EXT3), - MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(L_EXT2), + MR(L) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2), - MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT), + MR(UF) | MR(LF) | MR(UL) | MR(LSU) | MR(U2L_399_EXT2) | MR(LF_EXT) | MR(LF_EXT2) | MR(UF_D20) | MR(UF_D1_EXT) | MR(LF_EXT) | MR(UF_EXT2) | MR(UF_EXT3), }; #undef MR uint32_t mask, v, code, type, len, i, idx; @@ -1224,6 +1327,135 @@ static int unicode_case1(CharRange *cr, int case_mask) return 0; } +static int point_cmp(const void *p1, const void *p2, void *arg) +{ + uint32_t v1 = *(uint32_t *)p1; + uint32_t v2 = *(uint32_t *)p2; + return (v1 > v2) - (v1 < v2); +} + +static void cr_sort_and_remove_overlap(CharRange *cr) +{ + uint32_t start, end, start1, end1, i, j; + + /* the resulting ranges are not necessarily sorted and may overlap */ + rqsort(cr->points, cr->len / 2, sizeof(cr->points[0]) * 2, point_cmp, NULL); + j = 0; + for(i = 0; i < cr->len; ) { + start = cr->points[i]; + end = cr->points[i + 1]; + i += 2; + while (i < cr->len) { + start1 = cr->points[i]; + end1 = cr->points[i + 1]; + if (start1 > end) { + /* |------| + * |-------| */ + break; + } else if (end1 <= end) { + /* |------| + * |--| */ + i += 2; + } else { + /* |------| + * |-------| */ + end = end1; + i += 2; + } + } + cr->points[j] = start; + cr->points[j + 1] = end; + j += 2; + } + cr->len = j; +} + +/* canonicalize a character set using the JS regex case folding rules + (see lre_canonicalize()) */ +int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode) +{ + CharRange cr_inter, cr_mask, cr_result, cr_sub; + uint32_t v, code, len, i, idx, start, end, c, d_start, d_end, d; + + cr_init(&cr_mask, cr->mem_opaque, cr->realloc_func); + cr_init(&cr_inter, cr->mem_opaque, cr->realloc_func); + cr_init(&cr_result, cr->mem_opaque, cr->realloc_func); + cr_init(&cr_sub, cr->mem_opaque, cr->realloc_func); + + if (unicode_case1(&cr_mask, is_unicode ? CASE_F : CASE_U)) + goto fail; + if (cr_op(&cr_inter, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER)) + goto fail; + + if (cr_invert(&cr_mask)) + goto fail; + if (cr_op(&cr_sub, cr_mask.points, cr_mask.len, cr->points, cr->len, CR_OP_INTER)) + goto fail; + + /* cr_inter = cr & cr_mask */ + /* cr_sub = cr & ~cr_mask */ + + /* use the case conversion table to compute the result */ + d_start = -1; + d_end = -1; + idx = 0; + v = case_conv_table1[idx]; + code = v >> (32 - 17); + len = (v >> (32 - 17 - 7)) & 0x7f; + for(i = 0; i < cr_inter.len; i += 2) { + start = cr_inter.points[i]; + end = cr_inter.points[i + 1]; + + for(c = start; c < end; c++) { + for(;;) { + if (c >= code && c < code + len) + break; + idx++; + assert(idx < countof(case_conv_table1)); + v = case_conv_table1[idx]; + code = v >> (32 - 17); + len = (v >> (32 - 17 - 7)) & 0x7f; + } + d = lre_case_folding_entry(c, idx, v, is_unicode); + /* try to merge with the current interval */ + if (d_start == -1) { + d_start = d; + d_end = d + 1; + } else if (d_end == d) { + d_end++; + } else { + cr_add_interval(&cr_result, d_start, d_end); + d_start = d; + d_end = d + 1; + } + } + } + if (d_start != -1) { + if (cr_add_interval(&cr_result, d_start, d_end)) + goto fail; + } + + /* the resulting ranges are not necessarily sorted and may overlap */ + cr_sort_and_remove_overlap(&cr_result); + + /* or with the character not affected by the case folding */ + cr->len = 0; + if (cr_op(cr, cr_result.points, cr_result.len, cr_sub.points, cr_sub.len, CR_OP_UNION)) + goto fail; + + cr_free(&cr_inter); + cr_free(&cr_mask); + cr_free(&cr_result); + cr_free(&cr_sub); + return 0; + fail: + cr_free(&cr_inter); + cr_free(&cr_mask); + cr_free(&cr_result); + cr_free(&cr_sub); + return -1; +} + typedef enum { POP_GC, POP_PROP, diff --git a/libunicode.h b/libunicode.h index 337ffdc..d7d6a49 100644 --- a/libunicode.h +++ b/libunicode.h @@ -43,6 +43,7 @@ typedef enum { } UnicodeNormalizationEnum; int lre_case_conv(uint32_t *res, uint32_t c, int conv_type); +int lre_canonicalize(uint32_t c, BOOL is_unicode); LRE_BOOL lre_is_cased(uint32_t c); LRE_BOOL lre_is_case_ignorable(uint32_t c); @@ -102,6 +103,7 @@ int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len, const uint32_t *b_pt, int b_len, int op); int cr_invert(CharRange *cr); +int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode); LRE_BOOL lre_is_id_start(uint32_t c); LRE_BOOL lre_is_id_continue(uint32_t c); diff --git a/test262_errors.txt b/test262_errors.txt index b775a35..91ef5e4 100644 --- a/test262_errors.txt +++ b/test262_errors.txt @@ -70,8 +70,6 @@ test262/test/built-ins/RegExp/property-escapes/generated/XID_Continue.js:16: Tes test262/test/built-ins/RegExp/property-escapes/generated/XID_Continue.js:16: strict mode: Test262Error: `\p{XID_Continue}` should match U+00200C (`‌`) test262/test/built-ins/RegExp/property-escapes/generated/XID_Start.js:16: Test262Error: `\p{XID_Start}` should match U+02EBF0 (`𮯰`) test262/test/built-ins/RegExp/property-escapes/generated/XID_Start.js:16: strict mode: Test262Error: `\p{XID_Start}` should match U+02EBF0 (`𮯰`) -test262/test/built-ins/RegExp/unicode_full_case_folding.js:20: Test262Error: \u0390 does not match \u1fd3 -test262/test/built-ins/RegExp/unicode_full_case_folding.js:20: strict mode: Test262Error: \u0390 does not match \u1fd3 test262/test/built-ins/Set/prototype/isSupersetOf/allows-set-like-class.js:29: TypeError: not a function test262/test/built-ins/Set/prototype/isSupersetOf/allows-set-like-class.js:29: strict mode: TypeError: not a function test262/test/built-ins/Set/prototype/isSupersetOf/allows-set-like-object.js:27: TypeError: not a function diff --git a/unicode_gen.c b/unicode_gen.c index d94ec64..ba455cb 100644 --- a/unicode_gen.c +++ b/unicode_gen.c @@ -42,6 +42,7 @@ //#define DUMP_TABLE_SIZE //#define DUMP_CC_TABLE //#define DUMP_DECOMP_TABLE +//#define DUMP_CASE_FOLDING_SPECIAL_CASES /* Ideas: - Generalize run length encoding + index for all tables @@ -223,9 +224,10 @@ typedef struct { /* case conv */ uint8_t u_len; uint8_t l_len; - int u_data[CC_LEN_MAX]; - int l_data[CC_LEN_MAX]; - int f_code; + uint8_t f_len; + int u_data[CC_LEN_MAX]; /* to upper case */ + int l_data[CC_LEN_MAX]; /* to lower case */ + int f_data[CC_LEN_MAX]; /* to case folding */ uint8_t combining_class; uint8_t is_compat:1; @@ -485,7 +487,7 @@ void parse_case_folding(CCInfo *tab, const char *filename) FILE *f; char line[1024]; const char *p; - int code; + int code, status; CCInfo *ci; f = fopen(filename, "rb"); @@ -516,14 +518,28 @@ void parse_case_folding(CCInfo *tab, const char *filename) /* locale dependent casing */ while (isspace(*p)) p++; - if (*p != 'C' && *p != 'S') + status = *p; + if (status != 'C' && status != 'S' && status != 'F') continue; p = get_field(line, 2); - assert(p != 0); - assert(ci->f_code == 0); - ci->f_code = strtoul(p, NULL, 16); - assert(ci->f_code != 0 && ci->f_code != code); + assert(p != NULL); + if (status == 'S') { + /* we always select the simple case folding and assume it + * comes after the full case folding case */ + assert(ci->f_len >= 2); + ci->f_len = 0; + } else { + assert(ci->f_len == 0); + } + for(;;) { + while (isspace(*p)) + p++; + if (*p == ';') + break; + assert(ci->l_len < CC_LEN_MAX); + ci->f_data[ci->f_len++] = strtoul(p, (char **)&p, 16); + } } fclose(f); @@ -850,19 +866,21 @@ void dump_cc_info(CCInfo *ci, int i) for(j = 0; j < ci->l_len; j++) printf(" %05x", ci->l_data[j]); } - if (ci->f_code != 0) { - printf(" F: %05x", ci->f_code); + if (ci->f_len != 0) { + printf(" F:"); + for(j = 0; j < ci->f_len; j++) + printf(" %05x", ci->f_data[j]); } printf("\n"); } -void dump_data(CCInfo *tab) +void dump_unicode_data(CCInfo *tab) { int i; CCInfo *ci; for(i = 0; i <= CHARCODE_MAX; i++) { ci = &tab[i]; - if (ci->u_len != 0 || ci->l_len != 0 || ci->f_code != 0) { + if (ci->u_len != 0 || ci->l_len != 0 || ci->f_len != 0) { dump_cc_info(ci, i); } } @@ -872,8 +890,8 @@ BOOL is_complicated_case(const CCInfo *ci) { return (ci->u_len > 1 || ci->l_len > 1 || (ci->u_len > 0 && ci->l_len > 0) || - (ci->f_code != 0) != ci->l_len || - (ci->f_code != 0 && ci->l_data[0] != ci->f_code)); + (ci->f_len != ci->l_len) || + (memcmp(ci->f_data, ci->l_data, ci->f_len * sizeof(ci->f_data[0])) != 0)); } #ifndef USE_TEST @@ -889,9 +907,9 @@ enum { RUN_TYPE_UF_D1_EXT, RUN_TYPE_U_EXT, RUN_TYPE_LF_EXT, - RUN_TYPE_U_EXT2, - RUN_TYPE_L_EXT2, - RUN_TYPE_U_EXT3, + RUN_TYPE_UF_EXT2, + RUN_TYPE_LF_EXT2, + RUN_TYPE_UF_EXT3, }; #endif @@ -907,9 +925,9 @@ const char *run_type_str[] = { "UF_D1_EXT", "U_EXT", "LF_EXT", - "U_EXT2", - "L_EXT2", - "U_EXT3", + "UF_EXT2", + "LF_EXT2", + "UF_EXT3", }; typedef struct { @@ -922,6 +940,13 @@ typedef struct { int data_index; /* 'data' coming from the table */ } TableEntry; +static int simple_to_lower(CCInfo *tab, int c) +{ + if (tab[c].l_len != 1) + return c; + return tab[c].l_data[0]; +} + /* code (17), len (7), type (4) */ void find_run_type(TableEntry *te, CCInfo *tab, int code) @@ -935,15 +960,15 @@ void find_run_type(TableEntry *te, CCInfo *tab, int code) te->code = code; if (ci->l_len == 1 && ci->l_data[0] == code + 2 && - ci->f_code == ci->l_data[0] && + ci->f_len == 1 && ci->f_data[0] == ci->l_data[0] && ci->u_len == 0 && ci1->l_len == 1 && ci1->l_data[0] == code + 2 && - ci1->f_code == ci1->l_data[0] && + ci1->f_len == 1 && ci1->f_data[0] == ci1->l_data[0] && ci1->u_len == 1 && ci1->u_data[0] == code && ci2->l_len == 0 && - ci2->f_code == 0 && + ci2->f_len == 0 && ci2->u_len == 1 && ci2->u_data[0] == code) { te->len = 3; te->data = 0; @@ -958,7 +983,7 @@ void find_run_type(TableEntry *te, CCInfo *tab, int code) if (ci1->u_len != 1 || ci1->u_data[0] != ci->u_data[0] + len || ci1->l_len != 0 || - ci1->f_code != ci1->u_data[0]) + ci1->f_len != 1 || ci1->f_data[0] != ci1->u_data[0]) break; len++; } @@ -969,21 +994,25 @@ void find_run_type(TableEntry *te, CCInfo *tab, int code) return; } - if (ci->u_len == 2 && ci->u_data[1] == 0x399 && - ci->f_code == 0 && ci->l_len == 0) { + if (ci->l_len == 0 && + ci->u_len == 2 && ci->u_data[1] == 0x399 && + ci->f_len == 2 && ci->f_data[1] == 0x3B9 && + ci->f_data[0] == simple_to_lower(tab, ci->u_data[0])) { len = 1; while (code + len <= CHARCODE_MAX) { ci1 = &tab[code + len]; if (!(ci1->u_len == 2 && - ci1->u_data[1] == 0x399 && + ci1->u_data[1] == ci->u_data[1] && ci1->u_data[0] == ci->u_data[0] + len && - ci1->f_code == 0 && + ci1->f_len == 2 && + ci1->f_data[1] == ci->f_data[1] && + ci1->f_data[0] == ci->f_data[0] + len && ci1->l_len == 0)) break; len++; } te->len = len; - te->type = RUN_TYPE_U_EXT2; + te->type = RUN_TYPE_UF_EXT2; te->ext_data[0] = ci->u_data[0]; te->ext_data[1] = ci->u_data[1]; te->ext_len = 2; @@ -991,7 +1020,8 @@ void find_run_type(TableEntry *te, CCInfo *tab, int code) } if (ci->u_len == 2 && ci->u_data[1] == 0x399 && - ci->l_len == 1 && ci->f_code == ci->l_data[0]) { + ci->l_len == 1 && + ci->f_len == 1 && ci->f_data[0] == ci->l_data[0]) { len = 1; while (code + len <= CHARCODE_MAX) { ci1 = &tab[code + len]; @@ -1000,7 +1030,7 @@ void find_run_type(TableEntry *te, CCInfo *tab, int code) ci1->u_data[0] == ci->u_data[0] + len && ci1->l_len == 1 && ci1->l_data[0] == ci->l_data[0] + len && - ci1->f_code == ci1->l_data[0])) + ci1->f_len == 1 && ci1->f_data[0] == ci1->l_data[0])) break; len++; } @@ -1012,13 +1042,13 @@ void find_run_type(TableEntry *te, CCInfo *tab, int code) return; } - if (ci->l_len == 1 && ci->u_len == 0 && ci->f_code == 0) { + if (ci->l_len == 1 && ci->u_len == 0 && ci->f_len == 0) { len = 1; while (code + len <= CHARCODE_MAX) { ci1 = &tab[code + len]; if (!(ci1->l_len == 1 && ci1->l_data[0] == ci->l_data[0] + len && - ci1->u_len == 0 && ci1->f_code == 0)) + ci1->u_len == 0 && ci1->f_len == 0)) break; len++; } @@ -1031,32 +1061,39 @@ void find_run_type(TableEntry *te, CCInfo *tab, int code) if (ci->l_len == 0 && ci->u_len == 1 && ci->u_data[0] < 0x1000 && - ci->f_code == ci->u_data[0] + 0x20) { + ci->f_len == 1 && ci->f_data[0] == ci->u_data[0] + 0x20) { te->len = 1; te->type = RUN_TYPE_UF_D20; te->data = ci->u_data[0]; } else if (ci->l_len == 0 && - ci->u_len == 1 && - ci->f_code == ci->u_data[0] + 1) { + ci->u_len == 1 && + ci->f_len == 1 && ci->f_data[0] == ci->u_data[0] + 1) { te->len = 1; te->type = RUN_TYPE_UF_D1_EXT; te->ext_data[0] = ci->u_data[0]; te->ext_len = 1; - } else if (ci->l_len == 2 && ci->u_len == 0 && ci->f_code == 0) { + } else if (ci->l_len == 2 && ci->u_len == 0 && ci->f_len == 2 && + ci->l_data[0] == ci->f_data[0] && + ci->l_data[1] == ci->f_data[1]) { te->len = 1; - te->type = RUN_TYPE_L_EXT2; + te->type = RUN_TYPE_LF_EXT2; te->ext_data[0] = ci->l_data[0]; te->ext_data[1] = ci->l_data[1]; te->ext_len = 2; - } else if (ci->u_len == 2 && ci->l_len == 0 && ci->f_code == 0) { + } else if (ci->u_len == 2 && ci->l_len == 0 && ci->f_len == 2 && + ci->f_data[0] == simple_to_lower(tab, ci->u_data[0]) && + ci->f_data[1] == simple_to_lower(tab, ci->u_data[1])) { te->len = 1; - te->type = RUN_TYPE_U_EXT2; + te->type = RUN_TYPE_UF_EXT2; te->ext_data[0] = ci->u_data[0]; te->ext_data[1] = ci->u_data[1]; te->ext_len = 2; - } else if (ci->u_len == 3 && ci->l_len == 0 && ci->f_code == 0) { + } else if (ci->u_len == 3 && ci->l_len == 0 && ci->f_len == 3 && + ci->f_data[0] == simple_to_lower(tab, ci->u_data[0]) && + ci->f_data[1] == simple_to_lower(tab, ci->u_data[1]) && + ci->f_data[2] == simple_to_lower(tab, ci->u_data[2])) { te->len = 1; - te->type = RUN_TYPE_U_EXT3; + te->type = RUN_TYPE_UF_EXT3; te->ext_data[0] = ci->u_data[0]; te->ext_data[1] = ci->u_data[1]; te->ext_data[2] = ci->u_data[2]; @@ -1174,7 +1211,7 @@ void build_conv_table(CCInfo *tab) te = conv_table; for(code = 0; code <= CHARCODE_MAX; code++) { ci = &tab[code]; - if (ci->u_len == 0 && ci->l_len == 0 && ci->f_code == 0) + if (ci->u_len == 0 && ci->l_len == 0 && ci->f_len == 0) continue; assert(te - conv_table < countof(conv_table)); find_run_type(te, tab, code); @@ -1224,7 +1261,7 @@ void build_conv_table(CCInfo *tab) /* find the data index for ext_data */ for(i = 0; i < conv_table_len; i++) { te = &conv_table[i]; - if (te->type == RUN_TYPE_U_EXT3) { + if (te->type == RUN_TYPE_UF_EXT3) { int p, v; v = 0; for(j = 0; j < 3; j++) { @@ -1238,8 +1275,8 @@ void build_conv_table(CCInfo *tab) for(i = 0; i < conv_table_len; i++) { te = &conv_table[i]; - if (te->type == RUN_TYPE_L_EXT2 || - te->type == RUN_TYPE_U_EXT2 || + if (te->type == RUN_TYPE_LF_EXT2 || + te->type == RUN_TYPE_UF_EXT2 || te->type == RUN_TYPE_U2L_399_EXT2) { int p, v; v = 0; @@ -1302,6 +1339,54 @@ void dump_case_conv_table(FILE *f) fprintf(f, "\n};\n\n"); } + +static CCInfo *global_tab; + +static int sp_cc_cmp(const void *p1, const void *p2) +{ + CCInfo *c1 = &global_tab[*(const int *)p1]; + CCInfo *c2 = &global_tab[*(const int *)p2]; + if (c1->f_len < c2->f_len) { + return -1; + } else if (c2->f_len < c1->f_len) { + return 1; + } else { + return memcmp(c1->f_data, c2->f_data, sizeof(c1->f_data[0]) * c1->f_len); + } +} + +/* dump the case special cases (multi character results which are + identical and need specific handling in lre_canonicalize() */ +void dump_case_folding_special_cases(CCInfo *tab) +{ + int i, len, j; + int *perm; + + perm = malloc(sizeof(perm[0]) * (CHARCODE_MAX + 1)); + for(i = 0; i <= CHARCODE_MAX; i++) + perm[i] = i; + global_tab = tab; + qsort(perm, CHARCODE_MAX + 1, sizeof(perm[0]), sp_cc_cmp); + for(i = 0; i <= CHARCODE_MAX;) { + if (tab[perm[i]].f_len <= 1) { + i++; + } else { + len = 1; + while ((i + len) <= CHARCODE_MAX && !sp_cc_cmp(&perm[i], &perm[i + len])) + len++; + + if (len > 1) { + for(j = i; j < i + len; j++) + dump_cc_info(&tab[perm[j]], perm[j]); + } + i += len; + } + } + free(perm); + global_tab = NULL; +} + + int tabcmp(const int *tab1, const int *tab2, int n) { int i; @@ -1328,7 +1413,7 @@ void compute_internal_props(void) for(i = 0; i <= CHARCODE_MAX; i++) { CCInfo *ci = &unicode_db[i]; - has_ul = (ci->u_len != 0 || ci->l_len != 0 || ci->f_code != 0); + has_ul = (ci->u_len != 0 || ci->l_len != 0 || ci->f_len != 0); if (has_ul) { assert(get_prop(i, PROP_Cased)); } else { @@ -1343,10 +1428,10 @@ void compute_internal_props(void) set_prop(i, PROP_Changes_When_Titlecased1, get_prop(i, PROP_Changes_When_Titlecased) ^ (ci->u_len != 0)); set_prop(i, PROP_Changes_When_Casefolded1, - get_prop(i, PROP_Changes_When_Casefolded) ^ (ci->f_code != 0)); + get_prop(i, PROP_Changes_When_Casefolded) ^ (ci->f_len != 0)); /* XXX: reduce table size (438 bytes) */ set_prop(i, PROP_Changes_When_NFKC_Casefolded1, - get_prop(i, PROP_Changes_When_NFKC_Casefolded) ^ (ci->f_code != 0)); + get_prop(i, PROP_Changes_When_NFKC_Casefolded) ^ (ci->f_len != 0)); } } @@ -1765,8 +1850,10 @@ void check_case_conv(void) ci->u_len = 1; ci->u_data[0] = code; } - if (ci->f_code == 0) - ci->f_code = code; + if (ci->f_len == 0) { + ci->f_len = 1; + ci->f_data[0] = code; + } error = 0; l = check_conv(res, code, 0); @@ -1780,7 +1867,7 @@ void check_case_conv(void) error++; } l = check_conv(res, code, 2); - if (l != 1 || res[0] != ci->f_code) { + if (l != ci->f_len || tabcmp((int *)res, ci->f_data, l)) { printf("ERROR: F\n"); error++; } @@ -2951,11 +3038,12 @@ int main(int argc, char **argv) unicode_db_path); parse_prop_list(filename); - // dump_data(unicode_db); - + // dump_unicode_data(unicode_db); build_conv_table(unicode_db); - // dump_table(); +#ifdef DUMP_CASE_FOLDING_SPECIAL_CASES + dump_case_folding_special_cases(unicode_db); +#endif if (!outfilename) { #ifdef USE_TEST