From 554907e88f9edf9ff1b9ccb3d291a2549b5cc847 Mon Sep 17 00:00:00 2001
From: Ben Noordhuis
Date: Fri, 15 Nov 2024 12:17:38 +0100
Subject: [PATCH] Add RegExp.escape (#687)

---
 libunicode-table.h | 16 ++++++++++------
 libunicode.c       |  7 +++++++
 libunicode.h       |  1 +
 quickjs.c          | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 test262.conf       |  2 +-
 unicode_gen.c      |  4 +++-
 6 files changed, 70 insertions(+), 8 deletions(-)

diff --git a/libunicode-table.h b/libunicode-table.h
index bb5d496..b48a3a7 100644
--- a/libunicode-table.h
+++ b/libunicode-table.h
@@ -572,6 +572,16 @@ static const uint8_t unicode_prop_ID_Continue1_index[66] = {
     0x01, 0x0e,
 };
 
+static const uint8_t unicode_prop_White_Space_table[22] = {
+    0x88, 0x84, 0x91, 0x80, 0xe3, 0x80, 0x99, 0x80,
+    0x55, 0xde, 0x80, 0x49, 0x7e, 0x8a, 0x9c, 0x0c,
+    0x80, 0xae, 0x80, 0x4f, 0x9f, 0x80,
+};
+
+static const uint8_t unicode_prop_White_Space_index[3] = {
+    0x01, 0x30, 0x00,
+};
+
 static const uint8_t unicode_cc_table[916] = {
     0xb2, 0xcf, 0xd4, 0x00, 0xe8, 0x03, 0xdc, 0x00,
     0xe8, 0x00, 0xd8, 0x04, 0xdc, 0x01, 0xca, 0x03,
@@ -4262,12 +4272,6 @@ static const uint8_t unicode_prop_Variation_Selector_table[13] = {
     0x6d, 0x02, 0xef, 0x40, 0xef,
 };
 
-static const uint8_t unicode_prop_White_Space_table[22] = {
-    0x88, 0x84, 0x91, 0x80, 0xe3, 0x80, 0x99, 0x80,
-    0x55, 0xde, 0x80, 0x49, 0x7e, 0x8a, 0x9c, 0x0c,
-    0x80, 0xae, 0x80, 0x4f, 0x9f, 0x80,
-};
-
 static const uint8_t unicode_prop_Bidi_Mirrored_table[173] = {
     0xa7, 0x81, 0x91, 0x00, 0x80, 0x9b, 0x00, 0x80,
     0x9c, 0x00, 0x80, 0xac, 0x80, 0x8e, 0x80, 0x4e,
diff --git a/libunicode.c b/libunicode.c
index 1b4a098..e68b0ca 100644
--- a/libunicode.c
+++ b/libunicode.c
@@ -545,6 +545,13 @@ BOOL lre_is_id_continue(uint32_t c)
                            sizeof(unicode_prop_ID_Continue1_index) / 3);
 }
 
+BOOL lre_is_white_space(uint32_t c) /* looks c up in the generated White_Space property table */
+{
+    return lre_is_in_table(c, unicode_prop_White_Space_table,
+                           unicode_prop_White_Space_index,
+                           sizeof(unicode_prop_White_Space_index) / 3); /* index size in bytes / 3, same convention as lre_is_id_continue() */
+}
+
 #define UNICODE_DECOMP_LEN_MAX 18
 
 typedef enum {
diff --git a/libunicode.h b/libunicode.h
index d7d6a49..ae20a49 100644
--- a/libunicode.h
+++ b/libunicode.h
@@ -107,6 +107,7 @@ int cr_regexp_canonicalize(CharRange *cr, BOOL is_unicode);
 
 LRE_BOOL lre_is_id_start(uint32_t c);
 LRE_BOOL lre_is_id_continue(uint32_t c);
+LRE_BOOL lre_is_white_space(uint32_t c); /* table lookup for the White_Space property; used by RegExp.escape in quickjs.c */
 
 int unicode_normalize(uint32_t **pdst, const uint32_t *src, int src_len,
                       UnicodeNormalizationEnum n_type,
diff --git a/quickjs.c b/quickjs.c
index 73819c0..7542ce3 100644
--- a/quickjs.c
+++ b/quickjs.c
@@ -43836,6 +43836,53 @@ void *lre_realloc(void *opaque, void *ptr, size_t size)
     return js_realloc_rt(ctx->rt, ptr, size);
 }
 
+static JSValue js_regexp_escape(JSContext *ctx, JSValue this_val,
+                                int argc, JSValue *argv)
+{ /* RegExp.escape(S): builds a copy of string argv[0] with pattern-significant characters escaped */
+    StringBuffer b_s, *b = &b_s;
+    JSString *p;
+    uint32_t c, i;
+    char s[16]; /* scratch for a single "\xHH" or "\uHHHH" sequence */
+
+    if (!JS_IsString(argv[0])) /* no coercion: a non-string argument throws TypeError */
+        return JS_ThrowTypeError(ctx, "not a string");
+    p = JS_VALUE_GET_STRING(argv[0]);
+    string_buffer_init2(ctx, b, 0, p->is_wide_char);
+    for (i = 0; i < p->len; i++) { /* walks the string one code unit (8- or 16-bit) at a time */
+        c = p->is_wide_char ? (uint32_t)p->u.str16[i] : (uint32_t)p->u.str8[i];
+        if (c < 33) { /* C0 controls and space */
+            if (c >= 9 && c <= 13) { /* TAB LF VT FF CR -> \t \n \v \f \r */
+                string_buffer_putc8(b, '\\');
+                string_buffer_putc8(b, "tnvfr"[c - 9]);
+            } else {
+                goto hex2;
+            }
+        } else if (c < 128) {
+            if ((c >= '0' && c <= '9')
+             || (c >= 'A' && c <= 'Z')
+             || (c >= 'a' && c <= 'z')) {
+                if (i == 0) /* only a leading ASCII letter/digit is escaped (as \xHH) */
+                    goto hex2;
+            } else if (strchr(",-=<>#&!%:;@~'`\"", c)) { /* these punctuators get \xHH, not a backslash prefix */
+                goto hex2;
+            } else if (c != '_' && c != 0x7F) { /* '_' and DEL are copied as-is: "\<DEL>" is not a valid escape in unicode-mode patterns */
+                string_buffer_putc8(b, '\\'); /* what remains here is ^ $ \ . * + ? ( ) [ ] { } | and / */
+            }
+            string_buffer_putc8(b, c);
+        } else if (c < 256) { /* NOTE(review): every 0x80-0xFF unit is hex-escaped, not only white space -- confirm intended */
+        hex2:
+            snprintf(s, sizeof(s), "\\x%02x", c);
+            string_buffer_puts8(b, s);
+        } else if (is_surrogate(c) || lre_is_white_space(c) || c == 0xFEFF) { /* NOTE(review): tested per code unit, so paired surrogates are escaped too */
+            snprintf(s, sizeof(s), "\\u%04x", c);
+            string_buffer_puts8(b, s);
+        } else {
+            string_buffer_putc16(b, c); /* any other code unit >= 0x100 is copied unchanged */
+        }
+    }
+    return string_buffer_end(b);
+}
+
 static JSValue js_regexp_exec(JSContext *ctx, JSValue this_val,
                               int argc, JSValue *argv)
 {
@@ -44864,6 +44911,7 @@ done:
 }
 
 static const JSCFunctionListEntry js_regexp_funcs[] = {
+    JS_CFUNC_DEF("escape", 1, js_regexp_escape ), /* listed next to [Symbol.species] in js_regexp_funcs[] */
     JS_CGETSET_DEF("[Symbol.species]", js_get_this, NULL ),
 };
 
diff --git a/test262.conf b/test262.conf
index e8710fd..a4d4841 100644
--- a/test262.conf
+++ b/test262.conf
@@ -174,7 +174,7 @@ regexp-modifiers=skip
 regexp-named-groups
 regexp-unicode-property-escapes
 regexp-v-flag
-RegExp.escape=skip
+RegExp.escape
 resizable-arraybuffer
 rest-parameters
 Set
diff --git a/unicode_gen.c b/unicode_gen.c
index 2cba190..76806e6 100644
--- a/unicode_gen.c
+++ b/unicode_gen.c
@@ -1574,6 +1574,7 @@ void build_flags_tables(FILE *f)
     build_prop_table(f, PROP_Case_Ignorable, TRUE);
     build_prop_table(f, PROP_ID_Start, TRUE);
     build_prop_table(f, PROP_ID_Continue1, TRUE);
+    build_prop_table(f, PROP_White_Space, TRUE); /* emitted here; build_prop_list_table() skips it as already generated */
 }
 
 void dump_name_table(FILE *f, const char *cname, const char **tab_name, int len,
@@ -1813,7 +1814,8 @@ void build_prop_list_table(FILE *f)
     for(i = 0; i < PROP_TABLE_COUNT; i++) {
         if (i == PROP_ID_Start ||
             i == PROP_Case_Ignorable ||
-            i == PROP_ID_Continue1) {
+            i == PROP_ID_Continue1 ||
+            i == PROP_White_Space) { /* White_Space is now written by build_flags_tables() */
             /* already generated */
         } else {
             build_prop_table(f, i, FALSE);