diff --git a/gen/function_source.c b/gen/function_source.c index c1beaaa..4d4ca0e 100644 Binary files a/gen/function_source.c and b/gen/function_source.c differ diff --git a/gen/hello.c b/gen/hello.c index a957441..9edb4ed 100644 Binary files a/gen/hello.c and b/gen/hello.c differ diff --git a/gen/hello_module.c b/gen/hello_module.c index 45e6c37..0f8dd33 100644 Binary files a/gen/hello_module.c and b/gen/hello_module.c differ diff --git a/gen/repl.c b/gen/repl.c index 39af7f9..a034792 100644 Binary files a/gen/repl.c and b/gen/repl.c differ diff --git a/gen/test_fib.c b/gen/test_fib.c index 4cbe0cf..d167461 100644 Binary files a/gen/test_fib.c and b/gen/test_fib.c differ diff --git a/libregexp-opcode.h b/libregexp-opcode.h index 7fdc887..5c1714a 100644 --- a/libregexp-opcode.h +++ b/libregexp-opcode.h @@ -51,8 +51,7 @@ DEF(range32, 3) /* variable length */ DEF(lookahead, 5) DEF(negative_lookahead, 5) DEF(push_char_pos, 1) /* push the character position on the stack */ -DEF(bne_char_pos, 5) /* pop one stack element and jump if equal to the character - position */ +DEF(check_advance, 1) /* pop one stack element and check that it is different from the character position */ DEF(prev, 1) /* go to the previous char */ DEF(simple_greedy_quant, 17) diff --git a/libregexp.c b/libregexp.c index 165158a..799c8f6 100644 --- a/libregexp.c +++ b/libregexp.c @@ -283,7 +283,6 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf, case REOP_loop: case REOP_lookahead: case REOP_negative_lookahead: - case REOP_bne_char_pos: val = get_u32(buf + pos + 1); val += (pos + 5); printf(" %u", val); @@ -921,21 +920,17 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp) } /* Return: - 1 if the opcodes in bc_buf[] always advance the character pointer. - 0 if the character pointer may not be advanced. - -1 if the code may depend on side effects of its previous execution (backreference) + - true if the opcodes may not advance the char pointer + - false if the opcodes always advance the char pointer */ -static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len) +static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len) { - int pos, opcode, ret, len, i; - uint32_t val, last; - BOOL has_back_reference; - uint8_t capture_bitmap[CAPTURE_COUNT_MAX]; + int pos, opcode, len; + uint32_t val; + BOOL ret; - ret = -2; /* not known yet */ + ret = TRUE; pos = 0; - has_back_reference = FALSE; - memset(capture_bitmap, 0, sizeof(capture_bitmap)); while (pos < bc_buf_len) { opcode = bc_buf[pos]; @@ -955,8 +950,7 @@ static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len) case REOP_dot: case REOP_any: simple_char: - if (ret == -2) - ret = 1; + ret = FALSE; break; case REOP_line_start: case REOP_line_end: @@ -970,41 +964,16 @@ static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len) break; case REOP_save_start: case REOP_save_end: - val = bc_buf[pos + 1]; - capture_bitmap[val] |= 1; - break; case REOP_save_reset: - { - val = bc_buf[pos + 1]; - last = bc_buf[pos + 2]; - while (val < last) - capture_bitmap[val++] |= 1; - } - break; case REOP_back_reference: case REOP_backward_back_reference: - val = bc_buf[pos + 1]; - capture_bitmap[val] |= 2; - has_back_reference = TRUE; break; default: /* safe behvior: we cannot predict the outcome */ - if (ret == -2) - ret = 0; - break; + return TRUE; } pos += len; } - if (has_back_reference) { - /* check if there is back reference which references a capture - made in the some code */ - for(i = 0; i < CAPTURE_COUNT_MAX; i++) { - if (capture_bitmap[i] == 3) - return -1; - } - } - if (ret == -2) - ret = 0; return ret; } @@ -1583,8 +1552,8 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir) running the atom after the first quant_min times, then there is no match. We remove this test when we are sure the atom always advances the position. */ - add_zero_advance_check = (re_check_advance(s->byte_code.buf + last_atom_start, - s->byte_code.size - last_atom_start) == 0); + add_zero_advance_check = re_need_check_advance(s->byte_code.buf + last_atom_start, + s->byte_code.size - last_atom_start); { int len, pos; @@ -1601,38 +1570,34 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir) } if (quant_max == 0) { s->byte_code.size = last_atom_start; - } else if (quant_max == 1) { - if (dbuf_insert(&s->byte_code, last_atom_start, 5)) - goto out_of_memory; - s->byte_code.buf[last_atom_start] = REOP_split_goto_first + - greedy; - put_u32(s->byte_code.buf + last_atom_start + 1, len); - } else if (quant_max == INT32_MAX) { + } else if (quant_max == 1 || quant_max == INT32_MAX) { + BOOL has_goto = (quant_max == INT32_MAX); if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check)) goto out_of_memory; s->byte_code.buf[last_atom_start] = REOP_split_goto_first + greedy; put_u32(s->byte_code.buf + last_atom_start + 1, - len + 5 + add_zero_advance_check); + len + 5 * has_goto + add_zero_advance_check * 2); if (add_zero_advance_check) { - /* avoid infinite loop by stoping the - recursion if no advance was made in the - atom (only works if the atom has no - side effect) */ s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos; - re_emit_goto(s, REOP_bne_char_pos, last_atom_start); - } else { - re_emit_goto(s, REOP_goto, last_atom_start); + re_emit_op(s, REOP_check_advance); } + if (has_goto) + re_emit_goto(s, REOP_goto, last_atom_start); } else { - if (dbuf_insert(&s->byte_code, last_atom_start, 10)) + if (dbuf_insert(&s->byte_code, last_atom_start, 10 + add_zero_advance_check)) goto out_of_memory; pos = last_atom_start; s->byte_code.buf[pos++] = REOP_push_i32; put_u32(s->byte_code.buf + pos, quant_max); pos += 4; s->byte_code.buf[pos++] = REOP_split_goto_first + greedy; - put_u32(s->byte_code.buf + pos, len + 5); + put_u32(s->byte_code.buf + pos, len + 5 + add_zero_advance_check * 2); + pos += 4; + if (add_zero_advance_check) { + s->byte_code.buf[pos++] = REOP_push_char_pos; + re_emit_op(s, REOP_check_advance); + } re_emit_goto(s, REOP_loop, last_atom_start + 5); re_emit_op(s, REOP_drop); } @@ -1656,22 +1621,25 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir) if (quant_max == INT32_MAX) { pos = s->byte_code.size; re_emit_op_u32(s, REOP_split_goto_first + greedy, - len + 5 + add_zero_advance_check); + len + 5 + add_zero_advance_check * 2); if (add_zero_advance_check) re_emit_op(s, REOP_push_char_pos); /* copy the atom */ dbuf_put_self(&s->byte_code, last_atom_start, len); if (add_zero_advance_check) - re_emit_goto(s, REOP_bne_char_pos, pos); - else - re_emit_goto(s, REOP_goto, pos); + re_emit_op(s, REOP_check_advance); + re_emit_goto(s, REOP_goto, pos); } else if (quant_max > quant_min) { re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min); pos = s->byte_code.size; - re_emit_op_u32(s, REOP_split_goto_first + greedy, len + 5); + re_emit_op_u32(s, REOP_split_goto_first + greedy, + len + 5 + add_zero_advance_check * 2); + if (add_zero_advance_check) + re_emit_op(s, REOP_push_char_pos); /* copy the atom */ dbuf_put_self(&s->byte_code, last_atom_start, len); - + if (add_zero_advance_check) + re_emit_op(s, REOP_check_advance); re_emit_goto(s, REOP_loop, pos); re_emit_op(s, REOP_drop); } @@ -1785,7 +1753,7 @@ static int lre_compute_stack_size(const uint8_t *bc_buf, int bc_buf_len) } break; case REOP_drop: - case REOP_bne_char_pos: + case REOP_check_advance: assert(stack_size > 0); stack_size--; break; @@ -2281,11 +2249,9 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, case REOP_push_char_pos: stack[stack_len++] = (uintptr_t)cptr; break; - case REOP_bne_char_pos: - val = get_u32(pc); - pc += 4; - if (stack[--stack_len] != (uintptr_t)cptr) - pc += (int)val; + case REOP_check_advance: + if (stack[--stack_len] == (uintptr_t)cptr) + goto no_match; break; case REOP_word_boundary: case REOP_not_word_boundary: diff --git a/quickjs.c b/quickjs.c index 22acf26..ae87f22 100644 --- a/quickjs.c +++ b/quickjs.c @@ -33312,7 +33312,7 @@ typedef enum BCTagEnum { BC_TAG_SET, } BCTagEnum; -#define BC_VERSION 13 +#define BC_VERSION 14 typedef struct BCWriterState { JSContext *ctx; diff --git a/test262_errors.txt b/test262_errors.txt index 930d217..8193447 100644 --- a/test262_errors.txt +++ b/test262_errors.txt @@ -18,10 +18,6 @@ test262/test/built-ins/AsyncFromSyncIteratorPrototype/throw/throw-undefined-retu test262/test/built-ins/AsyncFromSyncIteratorPrototype/throw/throw-undefined-return-not-object.js:72: strict mode: TypeError: $DONE() not called test262/test/built-ins/AsyncFromSyncIteratorPrototype/throw/throw-undefined-return-object.js:66: TypeError: $DONE() not called test262/test/built-ins/AsyncFromSyncIteratorPrototype/throw/throw-undefined-return-object.js:66: strict mode: TypeError: $DONE() not called -test262/test/built-ins/RegExp/lookahead-quantifier-match-groups.js:27: Test262Error: Expected [a, abc] and [a, undefined] to have the same contents. ? quantifier -test262/test/built-ins/RegExp/lookahead-quantifier-match-groups.js:27: strict mode: Test262Error: Expected [a, abc] and [a, undefined] to have the same contents. ? quantifier -test262/test/built-ins/RegExp/nullable-quantifier.js:21: Test262Error: The regex is expected to match the whole string Expected SameValue(«a», «ab») to be true -test262/test/built-ins/RegExp/nullable-quantifier.js:21: strict mode: Test262Error: The regex is expected to match the whole string Expected SameValue(«a», «ab») to be true test262/test/built-ins/RegExp/property-escapes/generated/Alphabetic.js:16: Test262Error: `\p{Alphabetic}` should match U+02EBF0 (`𮯰`) test262/test/built-ins/RegExp/property-escapes/generated/Alphabetic.js:16: strict mode: Test262Error: `\p{Alphabetic}` should match U+02EBF0 (`𮯰`) test262/test/built-ins/RegExp/property-escapes/generated/Assigned.js:16: Test262Error: `\p{Assigned}` should match U+002FFC (`⿼`) diff --git a/tests/test_builtin.js b/tests/test_builtin.js index 1193f63..411784e 100644 --- a/tests/test_builtin.js +++ b/tests/test_builtin.js @@ -775,6 +775,14 @@ function test_regexp() /* test zero length matches */ a = /()*?a/.exec(","); assert(a, null); + a = /(?:(?=(abc)))a/.exec("abc"); + assert(a, ["a", "abc"]); + a = /(?:(?=(abc)))?a/.exec("abc"); + assert(a, ["a", undefined]); + a = /(?:(?=(abc))){0,2}a/.exec("abc"); + assert(a, ["a", undefined]); + a = /(?:|[\w])+([0-9])/.exec("123a23"); + assert(a, ["123a23", "3"]); } function test_symbol()