regexp: fixed the zero advance logic in quantifiers

Ref: 10fc744ae4
This commit is contained in:
Saúl Ibarra Corretgé 2024-09-14 22:00:48 +02:00 committed by GitHub
parent 56da486312
commit 5f5170796e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 47 additions and 78 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -51,8 +51,7 @@ DEF(range32, 3) /* variable length */
DEF(lookahead, 5) DEF(lookahead, 5)
DEF(negative_lookahead, 5) DEF(negative_lookahead, 5)
DEF(push_char_pos, 1) /* push the character position on the stack */ DEF(push_char_pos, 1) /* push the character position on the stack */
DEF(bne_char_pos, 5) /* pop one stack element and jump if equal to the character DEF(check_advance, 1) /* pop one stack element and check that it is different from the character position */
position */
DEF(prev, 1) /* go to the previous char */ DEF(prev, 1) /* go to the previous char */
DEF(simple_greedy_quant, 17) DEF(simple_greedy_quant, 17)

View file

@ -283,7 +283,6 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
case REOP_loop: case REOP_loop:
case REOP_lookahead: case REOP_lookahead:
case REOP_negative_lookahead: case REOP_negative_lookahead:
case REOP_bne_char_pos:
val = get_u32(buf + pos + 1); val = get_u32(buf + pos + 1);
val += (pos + 5); val += (pos + 5);
printf(" %u", val); printf(" %u", val);
@ -921,21 +920,17 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
} }
/* Return: /* Return:
1 if the opcodes in bc_buf[] always advance the character pointer. - true if the opcodes may not advance the char pointer
0 if the character pointer may not be advanced. - false if the opcodes always advance the char pointer
-1 if the code may depend on side effects of its previous execution (backreference)
*/ */
static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len) static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len)
{ {
int pos, opcode, ret, len, i; int pos, opcode, len;
uint32_t val, last; uint32_t val;
BOOL has_back_reference; BOOL ret;
uint8_t capture_bitmap[CAPTURE_COUNT_MAX];
ret = -2; /* not known yet */ ret = TRUE;
pos = 0; pos = 0;
has_back_reference = FALSE;
memset(capture_bitmap, 0, sizeof(capture_bitmap));
while (pos < bc_buf_len) { while (pos < bc_buf_len) {
opcode = bc_buf[pos]; opcode = bc_buf[pos];
@ -955,8 +950,7 @@ static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len)
case REOP_dot: case REOP_dot:
case REOP_any: case REOP_any:
simple_char: simple_char:
if (ret == -2) ret = FALSE;
ret = 1;
break; break;
case REOP_line_start: case REOP_line_start:
case REOP_line_end: case REOP_line_end:
@ -970,41 +964,16 @@ static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len)
break; break;
case REOP_save_start: case REOP_save_start:
case REOP_save_end: case REOP_save_end:
val = bc_buf[pos + 1];
capture_bitmap[val] |= 1;
break;
case REOP_save_reset: case REOP_save_reset:
{
val = bc_buf[pos + 1];
last = bc_buf[pos + 2];
while (val < last)
capture_bitmap[val++] |= 1;
}
break;
case REOP_back_reference: case REOP_back_reference:
case REOP_backward_back_reference: case REOP_backward_back_reference:
val = bc_buf[pos + 1];
capture_bitmap[val] |= 2;
has_back_reference = TRUE;
break; break;
default: default:
/* safe behvior: we cannot predict the outcome */ /* safe behvior: we cannot predict the outcome */
if (ret == -2) return TRUE;
ret = 0;
break;
} }
pos += len; pos += len;
} }
if (has_back_reference) {
/* check if there is back reference which references a capture
made in the some code */
for(i = 0; i < CAPTURE_COUNT_MAX; i++) {
if (capture_bitmap[i] == 3)
return -1;
}
}
if (ret == -2)
ret = 0;
return ret; return ret;
} }
@ -1583,8 +1552,8 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
running the atom after the first quant_min times, running the atom after the first quant_min times,
then there is no match. We remove this test when we then there is no match. We remove this test when we
are sure the atom always advances the position. */ are sure the atom always advances the position. */
add_zero_advance_check = (re_check_advance(s->byte_code.buf + last_atom_start, add_zero_advance_check = re_need_check_advance(s->byte_code.buf + last_atom_start,
s->byte_code.size - last_atom_start) == 0); s->byte_code.size - last_atom_start);
{ {
int len, pos; int len, pos;
@ -1601,38 +1570,34 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
} }
if (quant_max == 0) { if (quant_max == 0) {
s->byte_code.size = last_atom_start; s->byte_code.size = last_atom_start;
} else if (quant_max == 1) { } else if (quant_max == 1 || quant_max == INT32_MAX) {
if (dbuf_insert(&s->byte_code, last_atom_start, 5)) BOOL has_goto = (quant_max == INT32_MAX);
goto out_of_memory;
s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
greedy;
put_u32(s->byte_code.buf + last_atom_start + 1, len);
} else if (quant_max == INT32_MAX) {
if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check)) if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check))
goto out_of_memory; goto out_of_memory;
s->byte_code.buf[last_atom_start] = REOP_split_goto_first + s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
greedy; greedy;
put_u32(s->byte_code.buf + last_atom_start + 1, put_u32(s->byte_code.buf + last_atom_start + 1,
len + 5 + add_zero_advance_check); len + 5 * has_goto + add_zero_advance_check * 2);
if (add_zero_advance_check) { if (add_zero_advance_check) {
/* avoid infinite loop by stoping the
recursion if no advance was made in the
atom (only works if the atom has no
side effect) */
s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos; s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos;
re_emit_goto(s, REOP_bne_char_pos, last_atom_start); re_emit_op(s, REOP_check_advance);
} else {
re_emit_goto(s, REOP_goto, last_atom_start);
} }
if (has_goto)
re_emit_goto(s, REOP_goto, last_atom_start);
} else { } else {
if (dbuf_insert(&s->byte_code, last_atom_start, 10)) if (dbuf_insert(&s->byte_code, last_atom_start, 10 + add_zero_advance_check))
goto out_of_memory; goto out_of_memory;
pos = last_atom_start; pos = last_atom_start;
s->byte_code.buf[pos++] = REOP_push_i32; s->byte_code.buf[pos++] = REOP_push_i32;
put_u32(s->byte_code.buf + pos, quant_max); put_u32(s->byte_code.buf + pos, quant_max);
pos += 4; pos += 4;
s->byte_code.buf[pos++] = REOP_split_goto_first + greedy; s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
put_u32(s->byte_code.buf + pos, len + 5); put_u32(s->byte_code.buf + pos, len + 5 + add_zero_advance_check * 2);
pos += 4;
if (add_zero_advance_check) {
s->byte_code.buf[pos++] = REOP_push_char_pos;
re_emit_op(s, REOP_check_advance);
}
re_emit_goto(s, REOP_loop, last_atom_start + 5); re_emit_goto(s, REOP_loop, last_atom_start + 5);
re_emit_op(s, REOP_drop); re_emit_op(s, REOP_drop);
} }
@ -1656,22 +1621,25 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
if (quant_max == INT32_MAX) { if (quant_max == INT32_MAX) {
pos = s->byte_code.size; pos = s->byte_code.size;
re_emit_op_u32(s, REOP_split_goto_first + greedy, re_emit_op_u32(s, REOP_split_goto_first + greedy,
len + 5 + add_zero_advance_check); len + 5 + add_zero_advance_check * 2);
if (add_zero_advance_check) if (add_zero_advance_check)
re_emit_op(s, REOP_push_char_pos); re_emit_op(s, REOP_push_char_pos);
/* copy the atom */ /* copy the atom */
dbuf_put_self(&s->byte_code, last_atom_start, len); dbuf_put_self(&s->byte_code, last_atom_start, len);
if (add_zero_advance_check) if (add_zero_advance_check)
re_emit_goto(s, REOP_bne_char_pos, pos); re_emit_op(s, REOP_check_advance);
else re_emit_goto(s, REOP_goto, pos);
re_emit_goto(s, REOP_goto, pos);
} else if (quant_max > quant_min) { } else if (quant_max > quant_min) {
re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min); re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min);
pos = s->byte_code.size; pos = s->byte_code.size;
re_emit_op_u32(s, REOP_split_goto_first + greedy, len + 5); re_emit_op_u32(s, REOP_split_goto_first + greedy,
len + 5 + add_zero_advance_check * 2);
if (add_zero_advance_check)
re_emit_op(s, REOP_push_char_pos);
/* copy the atom */ /* copy the atom */
dbuf_put_self(&s->byte_code, last_atom_start, len); dbuf_put_self(&s->byte_code, last_atom_start, len);
if (add_zero_advance_check)
re_emit_op(s, REOP_check_advance);
re_emit_goto(s, REOP_loop, pos); re_emit_goto(s, REOP_loop, pos);
re_emit_op(s, REOP_drop); re_emit_op(s, REOP_drop);
} }
@ -1785,7 +1753,7 @@ static int lre_compute_stack_size(const uint8_t *bc_buf, int bc_buf_len)
} }
break; break;
case REOP_drop: case REOP_drop:
case REOP_bne_char_pos: case REOP_check_advance:
assert(stack_size > 0); assert(stack_size > 0);
stack_size--; stack_size--;
break; break;
@ -2281,11 +2249,9 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
case REOP_push_char_pos: case REOP_push_char_pos:
stack[stack_len++] = (uintptr_t)cptr; stack[stack_len++] = (uintptr_t)cptr;
break; break;
case REOP_bne_char_pos: case REOP_check_advance:
val = get_u32(pc); if (stack[--stack_len] == (uintptr_t)cptr)
pc += 4; goto no_match;
if (stack[--stack_len] != (uintptr_t)cptr)
pc += (int)val;
break; break;
case REOP_word_boundary: case REOP_word_boundary:
case REOP_not_word_boundary: case REOP_not_word_boundary:

View file

@ -33312,7 +33312,7 @@ typedef enum BCTagEnum {
BC_TAG_SET, BC_TAG_SET,
} BCTagEnum; } BCTagEnum;
#define BC_VERSION 13 #define BC_VERSION 14
typedef struct BCWriterState { typedef struct BCWriterState {
JSContext *ctx; JSContext *ctx;

View file

@ -18,10 +18,6 @@ test262/test/built-ins/AsyncFromSyncIteratorPrototype/throw/throw-undefined-retu
test262/test/built-ins/AsyncFromSyncIteratorPrototype/throw/throw-undefined-return-not-object.js:72: strict mode: TypeError: $DONE() not called test262/test/built-ins/AsyncFromSyncIteratorPrototype/throw/throw-undefined-return-not-object.js:72: strict mode: TypeError: $DONE() not called
test262/test/built-ins/AsyncFromSyncIteratorPrototype/throw/throw-undefined-return-object.js:66: TypeError: $DONE() not called test262/test/built-ins/AsyncFromSyncIteratorPrototype/throw/throw-undefined-return-object.js:66: TypeError: $DONE() not called
test262/test/built-ins/AsyncFromSyncIteratorPrototype/throw/throw-undefined-return-object.js:66: strict mode: TypeError: $DONE() not called test262/test/built-ins/AsyncFromSyncIteratorPrototype/throw/throw-undefined-return-object.js:66: strict mode: TypeError: $DONE() not called
test262/test/built-ins/RegExp/lookahead-quantifier-match-groups.js:27: Test262Error: Expected [a, abc] and [a, undefined] to have the same contents. ? quantifier
test262/test/built-ins/RegExp/lookahead-quantifier-match-groups.js:27: strict mode: Test262Error: Expected [a, abc] and [a, undefined] to have the same contents. ? quantifier
test262/test/built-ins/RegExp/nullable-quantifier.js:21: Test262Error: The regex is expected to match the whole string Expected SameValue(«a», «ab») to be true
test262/test/built-ins/RegExp/nullable-quantifier.js:21: strict mode: Test262Error: The regex is expected to match the whole string Expected SameValue(«a», «ab») to be true
test262/test/built-ins/RegExp/property-escapes/generated/Alphabetic.js:16: Test262Error: `\p{Alphabetic}` should match U+02EBF0 (`𮯰`) test262/test/built-ins/RegExp/property-escapes/generated/Alphabetic.js:16: Test262Error: `\p{Alphabetic}` should match U+02EBF0 (`𮯰`)
test262/test/built-ins/RegExp/property-escapes/generated/Alphabetic.js:16: strict mode: Test262Error: `\p{Alphabetic}` should match U+02EBF0 (`𮯰`) test262/test/built-ins/RegExp/property-escapes/generated/Alphabetic.js:16: strict mode: Test262Error: `\p{Alphabetic}` should match U+02EBF0 (`𮯰`)
test262/test/built-ins/RegExp/property-escapes/generated/Assigned.js:16: Test262Error: `\p{Assigned}` should match U+002FFC (`⿼`) test262/test/built-ins/RegExp/property-escapes/generated/Assigned.js:16: Test262Error: `\p{Assigned}` should match U+002FFC (`⿼`)

View file

@ -775,6 +775,14 @@ function test_regexp()
/* test zero length matches */ /* test zero length matches */
a = /()*?a/.exec(","); a = /()*?a/.exec(",");
assert(a, null); assert(a, null);
a = /(?:(?=(abc)))a/.exec("abc");
assert(a, ["a", "abc"]);
a = /(?:(?=(abc)))?a/.exec("abc");
assert(a, ["a", undefined]);
a = /(?:(?=(abc))){0,2}a/.exec("abc");
assert(a, ["a", undefined]);
a = /(?:|[\w])+([0-9])/.exec("123a23");
assert(a, ["123a23", "3"]);
} }
function test_symbol() function test_symbol()