Improve surrogate handling readability

- add inline functions to test and convert surrogates:
  is_surrogate(c), is_hi_surrogate(c), is_lo_surrogate(c),
  get_hi_surrogate(c), get_lo_surrogate(c), from_surrogate(hi, lo)
- use names for BC header offsets and lengths in libregexp.c
- remove strict aliasing violations in `lre_exec_backtrack()`
- pass all context variables to XXX_CHAR macros in `lre_exec_backtrack()`
This commit is contained in:
Charlie Gordon 2024-02-20 00:22:32 +01:00
parent 8d932deb49
commit 12c91df577
3 changed files with 150 additions and 124 deletions

View file

@ -45,9 +45,10 @@
#ifndef countof #ifndef countof
#define countof(x) (sizeof(x) / sizeof((x)[0])) #define countof(x) (sizeof(x) / sizeof((x)[0]))
#endif #endif
#ifndef container_of
/* return the pointer of type 'type *' containing 'ptr' as field 'member' */ /* return the pointer of type 'type *' containing 'ptr' as field 'member' */
#define container_of(ptr, type, member) ((type *)((uint8_t *)(ptr) - offsetof(type, member))) #define container_of(ptr, type, member) ((type *)((uint8_t *)(ptr) - offsetof(type, member)))
#endif
typedef int BOOL; typedef int BOOL;
@ -207,17 +208,22 @@ static inline void put_u8(uint8_t *tab, uint8_t val)
*tab = val; *tab = val;
} }
#ifndef bswap16
static inline uint16_t bswap16(uint16_t x) static inline uint16_t bswap16(uint16_t x)
{ {
return (x >> 8) | (x << 8); return (x >> 8) | (x << 8);
} }
#endif
#ifndef bswap32
static inline uint32_t bswap32(uint32_t v) static inline uint32_t bswap32(uint32_t v)
{ {
return ((v & 0xff000000) >> 24) | ((v & 0x00ff0000) >> 8) | return ((v & 0xff000000) >> 24) | ((v & 0x00ff0000) >> 8) |
((v & 0x0000ff00) << 8) | ((v & 0x000000ff) << 24); ((v & 0x0000ff00) << 8) | ((v & 0x000000ff) << 24);
} }
#endif
#ifndef bswap64
static inline uint64_t bswap64(uint64_t v) static inline uint64_t bswap64(uint64_t v)
{ {
return ((v & ((uint64_t)0xff << (7 * 8))) >> (7 * 8)) | return ((v & ((uint64_t)0xff << (7 * 8))) >> (7 * 8)) |
@ -229,6 +235,7 @@ static inline uint64_t bswap64(uint64_t v)
((v & ((uint64_t)0xff << (1 * 8))) << (5 * 8)) | ((v & ((uint64_t)0xff << (1 * 8))) << (5 * 8)) |
((v & ((uint64_t)0xff << (0 * 8))) << (7 * 8)); ((v & ((uint64_t)0xff << (0 * 8))) << (7 * 8));
} }
#endif
/* XXX: should take an extra argument to pass slack information to the caller */ /* XXX: should take an extra argument to pass slack information to the caller */
typedef void *DynBufReallocFunc(void *opaque, void *ptr, size_t size); typedef void *DynBufReallocFunc(void *opaque, void *ptr, size_t size);
@ -278,6 +285,36 @@ static inline void dbuf_set_error(DynBuf *s)
int unicode_to_utf8(uint8_t *buf, unsigned int c); int unicode_to_utf8(uint8_t *buf, unsigned int c);
int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp); int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp);
/* Return non-zero when `c` is a UTF-16 surrogate code unit of either
   kind, i.e. lies in the range 0xD800..0xDFFF inclusive. */
static inline BOOL is_surrogate(uint32_t c)
{
    return c >= 0xD800 && c <= 0xDFFF;
}
/* Return non-zero when `c` is a high (leading) surrogate,
   i.e. lies in the range 0xD800..0xDBFF inclusive. */
static inline BOOL is_hi_surrogate(uint32_t c)
{
    /* drop the 10 payload bits and compare the remaining prefix */
    return (c & ~(uint32_t)0x3FF) == 0xD800;
}
/* Return non-zero when `c` is a low (trailing) surrogate,
   i.e. lies in the range 0xDC00..0xDFFF inclusive. */
static inline BOOL is_lo_surrogate(uint32_t c)
{
    /* drop the 10 payload bits and compare the remaining prefix */
    return (c & ~(uint32_t)0x3FF) == 0xDC00;
}
/* Return the high (leading) surrogate of the UTF-16 pair encoding the
   code point `c`. The result is only a valid surrogate when `c` is in
   0x10000..0x10FFFF; no range check is performed here. */
static inline uint32_t get_hi_surrogate(uint32_t c)
{
    /* 0xD7C0 == 0xD800 - (0x10000 >> 10): the supplementary-plane
       offset is folded into the surrogate base */
    return 0xD7C0 + (c >> 10);
}
/* Return the low (trailing) surrogate of the UTF-16 pair encoding the
   code point `c`: the low 10 bits of `c` placed on the 0xDC00 base. */
static inline uint32_t get_lo_surrogate(uint32_t c)
{
    uint32_t payload = c & 0x3FF;

    /* 0xDC00 has its low 10 bits clear, so adding equals OR-ing */
    return 0xDC00 + payload;
}
/* Combine a high surrogate `hi` and a low surrogate `lo` into the code
   point they encode. The arguments are not validated: the result is
   only meaningful when `hi` is in 0xD800..0xDBFF and `lo` is in
   0xDC00..0xDFFF. */
static inline uint32_t from_surrogate(uint32_t hi, uint32_t lo)
{
    uint32_t upper = (hi - 0xD800) << 10;   /* top 10 bits of the offset */
    uint32_t lower = lo - 0xDC00;           /* bottom 10 bits of the offset */

    return upper + lower + 0x10000;
}
static inline int from_hex(int c) static inline int from_hex(int c)
{ {
if (c >= '0' && c <= '9') if (c >= '0' && c <= '9')

View file

@ -100,6 +100,7 @@ static const REOpCode reopcode_info[REOP_COUNT] = {
#define RE_HEADER_FLAGS 0 #define RE_HEADER_FLAGS 0
#define RE_HEADER_CAPTURE_COUNT 1 #define RE_HEADER_CAPTURE_COUNT 1
#define RE_HEADER_STACK_SIZE 2 #define RE_HEADER_STACK_SIZE 2
#define RE_HEADER_BYTECODE_LEN 3
#define RE_HEADER_LEN 7 #define RE_HEADER_LEN 7
@ -224,16 +225,16 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
assert(buf_len >= RE_HEADER_LEN); assert(buf_len >= RE_HEADER_LEN);
re_flags = buf[0]; re_flags = lre_get_flags(buf);
bc_len = get_u32(buf + 3); bc_len = get_u32(buf + RE_HEADER_BYTECODE_LEN);
assert(bc_len + RE_HEADER_LEN <= buf_len); assert(bc_len + RE_HEADER_LEN <= buf_len);
printf("flags: 0x%x capture_count=%d stack_size=%d\n", printf("flags: 0x%x capture_count=%d stack_size=%d\n",
re_flags, buf[1], buf[2]); re_flags, buf[RE_HEADER_CAPTURE_COUNT], buf[RE_HEADER_STACK_SIZE]);
if (re_flags & LRE_FLAG_NAMED_GROUPS) { if (re_flags & LRE_FLAG_NAMED_GROUPS) {
const char *p; const char *p;
p = (char *)buf + RE_HEADER_LEN + bc_len; p = (char *)buf + RE_HEADER_LEN + bc_len;
printf("named groups: "); printf("named groups: ");
for(i = 1; i < buf[1]; i++) { for(i = 1; i < buf[RE_HEADER_CAPTURE_COUNT]; i++) {
if (i != 1) if (i != 1)
printf(","); printf(",");
printf("<%s>", p); printf("<%s>", p);
@ -494,7 +495,7 @@ int lre_parse_escape(const uint8_t **pp, int allow_utf16)
} }
c = (c << 4) | h; c = (c << 4) | h;
} }
if (c >= 0xd800 && c < 0xdc00 && if (is_hi_surrogate(c) &&
allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') { allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') {
/* convert an escaped surrogate pair into a /* convert an escaped surrogate pair into a
unicode char */ unicode char */
@ -505,9 +506,9 @@ int lre_parse_escape(const uint8_t **pp, int allow_utf16)
break; break;
c1 = (c1 << 4) | h; c1 = (c1 << 4) | h;
} }
if (i == 4 && c1 >= 0xdc00 && c1 < 0xe000) { if (i == 4 && is_lo_surrogate(c1)) {
p += 6; p += 6;
c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000; c = from_surrogate(c, c1);
} }
} }
} }
@ -936,7 +937,7 @@ static BOOL re_need_check_advance(const uint8_t *bc_buf, int bc_buf_len)
case REOP_backward_back_reference: case REOP_backward_back_reference:
break; break;
default: default:
/* safe behvior: we cannot predict the outcome */ /* safe behavior: we cannot predict the outcome */
return TRUE; return TRUE;
} }
pos += len; pos += len;
@ -1005,10 +1006,10 @@ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
break; break;
} else if (c >= 128) { } else if (c >= 128) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
if (c >= 0xD800 && c <= 0xDBFF) { if (is_hi_surrogate(c)) {
d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1); d = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p1);
if (d >= 0xDC00 && d <= 0xDFFF) { if (is_lo_surrogate(d)) {
c = 0x10000 + 0x400 * (c - 0xD800) + (d - 0xDC00); c = from_surrogate(c, d);
p = p1; p = p1;
} }
} }
@ -1116,9 +1117,10 @@ static int find_group_name(REParseState *s, const char *name)
size_t len, name_len; size_t len, name_len;
int capture_index; int capture_index;
name_len = strlen(name);
p = (char *)s->group_names.buf; p = (char *)s->group_names.buf;
if (!p) return -1;
buf_end = (char *)s->group_names.buf + s->group_names.size; buf_end = (char *)s->group_names.buf + s->group_names.size;
name_len = strlen(name);
capture_index = 1; capture_index = 1;
while (p < buf_end) { while (p < buf_end) {
len = strlen(p); len = strlen(p);
@ -1813,7 +1815,8 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count; s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count;
s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size; s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size;
put_u32(s->byte_code.buf + 3, s->byte_code.size - RE_HEADER_LEN); put_u32(s->byte_code.buf + RE_HEADER_BYTECODE_LEN,
s->byte_code.size - RE_HEADER_LEN);
/* add the named groups if needed */ /* add the named groups if needed */
if (s->group_names.size > (s->capture_count - 1)) { if (s->group_names.size > (s->capture_count - 1)) {
@ -1844,93 +1847,86 @@ static BOOL is_word_char(uint32_t c)
(c == '_')); (c == '_'));
} }
#define GET_CHAR(c, cptr, cbuf_end) \ #define GET_CHAR(c, cptr, cbuf_end, cbuf_type) \
do { \ do { \
if (cbuf_type == 0) { \ if (cbuf_type == 0) { \
c = *cptr++; \ c = *cptr++; \
} else { \ } else { \
uint32_t __c1; \ const uint16_t *_p = (const uint16_t *)cptr; \
c = *(uint16_t *)cptr; \ const uint16_t *_end = (const uint16_t *)cbuf_end; \
cptr += 2; \ c = *_p++; \
if (c >= 0xd800 && c < 0xdc00 && \ if (is_hi_surrogate(c) && cbuf_type == 2) { \
cbuf_type == 2 && cptr < cbuf_end) { \ if (_p < _end && is_lo_surrogate(*_p)) { \
__c1 = *(uint16_t *)cptr; \ c = from_surrogate(c, *_p++); \
if (__c1 >= 0xdc00 && __c1 < 0xe000) { \
c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
cptr += 2; \
} \ } \
} \ } \
cptr = (const void *)_p; \
} \ } \
} while (0) } while (0)
#define PEEK_CHAR(c, cptr, cbuf_end) \ #define PEEK_CHAR(c, cptr, cbuf_end, cbuf_type) \
do { \ do { \
if (cbuf_type == 0) { \ if (cbuf_type == 0) { \
c = cptr[0]; \ c = cptr[0]; \
} else { \ } else { \
uint32_t __c1; \ const uint16_t *_p = (const uint16_t *)cptr; \
c = ((uint16_t *)cptr)[0]; \ const uint16_t *_end = (const uint16_t *)cbuf_end; \
if (c >= 0xd800 && c < 0xdc00 && \ c = *_p++; \
cbuf_type == 2 && (cptr + 2) < cbuf_end) { \ if (is_hi_surrogate(c) && cbuf_type == 2) { \
__c1 = ((uint16_t *)cptr)[1]; \ if (_p < _end && is_lo_surrogate(*_p)) { \
if (__c1 >= 0xdc00 && __c1 < 0xe000) { \ c = from_surrogate(c, *_p); \
c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
} \ } \
} \ } \
} \ } \
} while (0) } while (0)
#define PEEK_PREV_CHAR(c, cptr, cbuf_start) \ #define PEEK_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \
do { \ do { \
if (cbuf_type == 0) { \ if (cbuf_type == 0) { \
c = cptr[-1]; \ c = cptr[-1]; \
} else { \ } else { \
uint32_t __c1; \ const uint16_t *_p = (const uint16_t *)cptr - 1; \
c = ((uint16_t *)cptr)[-1]; \ const uint16_t *_start = (const uint16_t *)cbuf_start; \
if (c >= 0xdc00 && c < 0xe000 && \ c = *_p; \
cbuf_type == 2 && (cptr - 4) >= cbuf_start) { \ if (is_lo_surrogate(c) && cbuf_type == 2) { \
__c1 = ((uint16_t *)cptr)[-2]; \ if (_p > _start && is_hi_surrogate(_p[-1])) { \
if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \ c = from_surrogate(*--_p, c); \
c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
} \ } \
} \ } \
} \ } \
} while (0) } while (0)
#define GET_PREV_CHAR(c, cptr, cbuf_start) \ #define GET_PREV_CHAR(c, cptr, cbuf_start, cbuf_type) \
do { \ do { \
if (cbuf_type == 0) { \ if (cbuf_type == 0) { \
cptr--; \ cptr--; \
c = cptr[0]; \ c = cptr[0]; \
} else { \ } else { \
uint32_t __c1; \ const uint16_t *_p = (const uint16_t *)cptr - 1; \
cptr -= 2; \ const uint16_t *_start = (const uint16_t *)cbuf_start; \
c = ((uint16_t *)cptr)[0]; \ c = *_p; \
if (c >= 0xdc00 && c < 0xe000 && \ if (is_lo_surrogate(c) && cbuf_type == 2) { \
cbuf_type == 2 && cptr > cbuf_start) { \ if (_p > _start && is_hi_surrogate(_p[-1])) { \
__c1 = ((uint16_t *)cptr)[-1]; \ c = from_surrogate(*--_p, c); \
if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \
cptr -= 2; \
c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
} \ } \
} \ } \
cptr = (const void *)_p; \
} \ } \
} while (0) } while (0)
#define PREV_CHAR(cptr, cbuf_start) \ #define PREV_CHAR(cptr, cbuf_start, cbuf_type) \
do { \ do { \
if (cbuf_type == 0) { \ if (cbuf_type == 0) { \
cptr--; \ cptr--; \
} else { \ } else { \
cptr -= 2; \ const uint16_t *_p = (const uint16_t *)cptr - 1; \
if (cbuf_type == 2) { \ const uint16_t *_start = (const uint16_t *)cbuf_start; \
c = ((uint16_t *)cptr)[0]; \ if (is_lo_surrogate(*_p) && cbuf_type == 2) { \
if (c >= 0xdc00 && c < 0xe000 && cptr > cbuf_start) { \ if (_p > _start && is_hi_surrogate(_p[-1])) { \
c = ((uint16_t *)cptr)[-1]; \ --_p; \
if (c >= 0xd800 && c < 0xdc00) \
cptr -= 2; \
} \ } \
} \ } \
cptr = (const void *)_p; \
} \ } \
} while (0) } while (0)
@ -2070,7 +2066,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
/* go backward */ /* go backward */
char_count = get_u32(pc + 12); char_count = get_u32(pc + 12);
for(i = 0; i < char_count; i++) { for(i = 0; i < char_count; i++) {
PREV_CHAR(cptr, s->cbuf); PREV_CHAR(cptr, s->cbuf, cbuf_type);
} }
pc = (pc + 16) + (int)get_u32(pc); pc = (pc + 16) + (int)get_u32(pc);
rs->cptr = cptr; rs->cptr = cptr;
@ -2105,7 +2101,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
test_char: test_char:
if (cptr >= cbuf_end) if (cptr >= cbuf_end)
goto no_match; goto no_match;
GET_CHAR(c, cptr, cbuf_end); GET_CHAR(c, cptr, cbuf_end, cbuf_type);
if (s->ignore_case) { if (s->ignore_case) {
c = lre_canonicalize(c, s->is_unicode); c = lre_canonicalize(c, s->is_unicode);
} }
@ -2152,7 +2148,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
break; break;
if (!s->multi_line) if (!s->multi_line)
goto no_match; goto no_match;
PEEK_PREV_CHAR(c, cptr, s->cbuf); PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
if (!is_line_terminator(c)) if (!is_line_terminator(c))
goto no_match; goto no_match;
break; break;
@ -2161,21 +2157,21 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
break; break;
if (!s->multi_line) if (!s->multi_line)
goto no_match; goto no_match;
PEEK_CHAR(c, cptr, cbuf_end); PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
if (!is_line_terminator(c)) if (!is_line_terminator(c))
goto no_match; goto no_match;
break; break;
case REOP_dot: case REOP_dot:
if (cptr == cbuf_end) if (cptr == cbuf_end)
goto no_match; goto no_match;
GET_CHAR(c, cptr, cbuf_end); GET_CHAR(c, cptr, cbuf_end, cbuf_type);
if (is_line_terminator(c)) if (is_line_terminator(c))
goto no_match; goto no_match;
break; break;
case REOP_any: case REOP_any:
if (cptr == cbuf_end) if (cptr == cbuf_end)
goto no_match; goto no_match;
GET_CHAR(c, cptr, cbuf_end); GET_CHAR(c, cptr, cbuf_end, cbuf_type);
break; break;
case REOP_save_start: case REOP_save_start:
case REOP_save_end: case REOP_save_end:
@ -2227,14 +2223,14 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
if (cptr == s->cbuf) { if (cptr == s->cbuf) {
v1 = FALSE; v1 = FALSE;
} else { } else {
PEEK_PREV_CHAR(c, cptr, s->cbuf); PEEK_PREV_CHAR(c, cptr, s->cbuf, cbuf_type);
v1 = is_word_char(c); v1 = is_word_char(c);
} }
/* current char */ /* current char */
if (cptr >= cbuf_end) { if (cptr >= cbuf_end) {
v2 = FALSE; v2 = FALSE;
} else { } else {
PEEK_CHAR(c, cptr, cbuf_end); PEEK_CHAR(c, cptr, cbuf_end, cbuf_type);
v2 = is_word_char(c); v2 = is_word_char(c);
} }
if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode)) if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode))
@ -2259,8 +2255,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
while (cptr1 < cptr1_end) { while (cptr1 < cptr1_end) {
if (cptr >= cbuf_end) if (cptr >= cbuf_end)
goto no_match; goto no_match;
GET_CHAR(c1, cptr1, cptr1_end); GET_CHAR(c1, cptr1, cptr1_end, cbuf_type);
GET_CHAR(c2, cptr, cbuf_end); GET_CHAR(c2, cptr, cbuf_end, cbuf_type);
if (s->ignore_case) { if (s->ignore_case) {
c1 = lre_canonicalize(c1, s->is_unicode); c1 = lre_canonicalize(c1, s->is_unicode);
c2 = lre_canonicalize(c2, s->is_unicode); c2 = lre_canonicalize(c2, s->is_unicode);
@ -2273,8 +2269,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
while (cptr1 > cptr1_start) { while (cptr1 > cptr1_start) {
if (cptr == s->cbuf) if (cptr == s->cbuf)
goto no_match; goto no_match;
GET_PREV_CHAR(c1, cptr1, cptr1_start); GET_PREV_CHAR(c1, cptr1, cptr1_start, cbuf_type);
GET_PREV_CHAR(c2, cptr, s->cbuf); GET_PREV_CHAR(c2, cptr, s->cbuf, cbuf_type);
if (s->ignore_case) { if (s->ignore_case) {
c1 = lre_canonicalize(c1, s->is_unicode); c1 = lre_canonicalize(c1, s->is_unicode);
c2 = lre_canonicalize(c2, s->is_unicode); c2 = lre_canonicalize(c2, s->is_unicode);
@ -2294,7 +2290,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
pc += 2; pc += 2;
if (cptr >= cbuf_end) if (cptr >= cbuf_end)
goto no_match; goto no_match;
GET_CHAR(c, cptr, cbuf_end); GET_CHAR(c, cptr, cbuf_end, cbuf_type);
if (s->ignore_case) { if (s->ignore_case) {
c = lre_canonicalize(c, s->is_unicode); c = lre_canonicalize(c, s->is_unicode);
} }
@ -2334,7 +2330,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
pc += 2; pc += 2;
if (cptr >= cbuf_end) if (cptr >= cbuf_end)
goto no_match; goto no_match;
GET_CHAR(c, cptr, cbuf_end); GET_CHAR(c, cptr, cbuf_end, cbuf_type);
if (s->ignore_case) { if (s->ignore_case) {
c = lre_canonicalize(c, s->is_unicode); c = lre_canonicalize(c, s->is_unicode);
} }
@ -2366,7 +2362,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
/* go to the previous char */ /* go to the previous char */
if (cptr == s->cbuf) if (cptr == s->cbuf)
goto no_match; goto no_match;
PREV_CHAR(cptr, s->cbuf); PREV_CHAR(cptr, s->cbuf, cbuf_type);
break; break;
case REOP_simple_greedy_quant: case REOP_simple_greedy_quant:
{ {
@ -2425,7 +2421,7 @@ int lre_exec(uint8_t **capture,
int re_flags, i, alloca_size, ret; int re_flags, i, alloca_size, ret;
StackInt *stack_buf; StackInt *stack_buf;
re_flags = bc_buf[RE_HEADER_FLAGS]; re_flags = lre_get_flags(bc_buf);
s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0; s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0; s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0; s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0;
@ -2472,8 +2468,8 @@ const char *lre_get_groupnames(const uint8_t *bc_buf)
uint32_t re_bytecode_len; uint32_t re_bytecode_len;
if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0) if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0)
return NULL; return NULL;
re_bytecode_len = get_u32(bc_buf + 3); re_bytecode_len = get_u32(bc_buf + RE_HEADER_BYTECODE_LEN);
return (const char *)(bc_buf + 7 + re_bytecode_len); return (const char *)(bc_buf + RE_HEADER_LEN + re_bytecode_len);
} }
#ifdef TEST #ifdef TEST
@ -2490,25 +2486,26 @@ void *lre_realloc(void *opaque, void *ptr, size_t size)
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
int len, ret, i; int len, flags, ret, i;
uint8_t *bc; uint8_t *bc;
char error_msg[64]; char error_msg[64];
uint8_t *capture[CAPTURE_COUNT_MAX * 2]; uint8_t *capture[CAPTURE_COUNT_MAX * 2];
const char *input; const char *input;
int input_len, capture_count; int input_len, capture_count;
if (argc < 3) { if (argc < 4) {
printf("usage: %s regexp input\n", argv[0]); printf("usage: %s regexp flags input\n", argv[0]);
exit(1); return 1;
} }
flags = atoi(argv[2]);
bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1], bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1],
strlen(argv[1]), 0, NULL); strlen(argv[1]), flags, NULL);
if (!bc) { if (!bc) {
fprintf(stderr, "error: %s\n", error_msg); fprintf(stderr, "error: %s\n", error_msg);
exit(1); exit(1);
} }
input = argv[2]; input = argv[3];
input_len = strlen(input); input_len = strlen(input);
ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL); ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);

View file

@ -3685,10 +3685,9 @@ static int string_buffer_putc(StringBuffer *s, uint32_t c)
{ {
if (unlikely(c >= 0x10000)) { if (unlikely(c >= 0x10000)) {
/* surrogate pair */ /* surrogate pair */
c -= 0x10000; if (string_buffer_putc16(s, get_hi_surrogate(c)))
if (string_buffer_putc16(s, (c >> 10) + 0xd800))
return -1; return -1;
c = (c & 0x3ff) + 0xdc00; c = get_lo_surrogate(c);
} }
return string_buffer_putc16(s, c); return string_buffer_putc16(s, c);
} }
@ -3699,10 +3698,10 @@ static int string_getc(const JSString *p, int *pidx)
idx = *pidx; idx = *pidx;
if (p->is_wide_char) { if (p->is_wide_char) {
c = p->u.str16[idx++]; c = p->u.str16[idx++];
if (c >= 0xd800 && c < 0xdc00 && idx < p->len) { if (is_hi_surrogate(c) && idx < p->len) {
c1 = p->u.str16[idx]; c1 = p->u.str16[idx];
if (c1 >= 0xdc00 && c1 < 0xe000) { if (is_lo_surrogate(c1)) {
c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000; c = from_surrogate(c, c1);
idx++; idx++;
} }
} }
@ -3900,9 +3899,8 @@ JSValue JS_NewStringLen(JSContext *ctx, const char *buf, size_t buf_len)
} else if (c <= 0x10FFFF) { } else if (c <= 0x10FFFF) {
p = p_next; p = p_next;
/* surrogate pair */ /* surrogate pair */
c -= 0x10000; string_buffer_putc16(b, get_hi_surrogate(c));
string_buffer_putc16(b, (c >> 10) + 0xd800); c = get_lo_surrogate(c);
c = (c & 0x3ff) + 0xdc00;
} else { } else {
/* invalid char */ /* invalid char */
c = 0xfffd; c = 0xfffd;
@ -4040,13 +4038,12 @@ const char *JS_ToCStringLen2(JSContext *ctx, size_t *plen, JSValueConst val1, BO
if (c < 0x80) { if (c < 0x80) {
*q++ = c; *q++ = c;
} else { } else {
if (c >= 0xd800 && c < 0xdc00) { if (is_hi_surrogate(c)) {
if (pos < len && !cesu8) { if (pos < len && !cesu8) {
c1 = src[pos]; c1 = src[pos];
if (c1 >= 0xdc00 && c1 < 0xe000) { if (is_lo_surrogate(c1)) {
pos++; pos++;
/* surrogate pair */ c = from_surrogate(c, c1);
c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
} else { } else {
/* Keep unmatched surrogate code points */ /* Keep unmatched surrogate code points */
/* c = 0xfffd; */ /* error */ /* c = 0xfffd; */ /* error */
@ -11729,7 +11726,7 @@ static JSValue JS_ToQuotedString(JSContext *ctx, JSValueConst val1)
goto fail; goto fail;
break; break;
default: default:
if (c < 32 || (c >= 0xd800 && c < 0xe000)) { if (c < 32 || is_surrogate(c)) {
snprintf(buf, sizeof(buf), "\\u%04x", c); snprintf(buf, sizeof(buf), "\\u%04x", c);
if (string_buffer_puts8(b, buf)) if (string_buffer_puts8(b, buf))
goto fail; goto fail;
@ -41583,18 +41580,18 @@ static int64_t string_advance_index(JSString *p, int64_t index, BOOL unicode)
-1 if none */ -1 if none */
static int js_string_find_invalid_codepoint(JSString *p) static int js_string_find_invalid_codepoint(JSString *p)
{ {
int i, c; int i;
if (!p->is_wide_char) if (!p->is_wide_char)
return -1; return -1;
for(i = 0; i < p->len; i++) { for(i = 0; i < p->len; i++) {
c = p->u.str16[i]; uint32_t c = p->u.str16[i];
if (c >= 0xD800 && c <= 0xDFFF) { if (is_surrogate(c)) {
if (c >= 0xDC00 || (i + 1) >= p->len) if (is_hi_surrogate(c) && (i + 1) < p->len
&& is_lo_surrogate(p->u.str16[i + 1])) {
i++;
} else {
return i; return i;
c = p->u.str16[i + 1]; }
if (c < 0xDC00 || c > 0xDFFF)
return i;
i++;
} }
} }
return -1; return -1;
@ -41621,7 +41618,7 @@ static JSValue js_string_toWellFormed(JSContext *ctx, JSValueConst this_val,
{ {
JSValue str, ret; JSValue str, ret;
JSString *p; JSString *p;
int c, i; int i;
str = JS_ToStringCheckObject(ctx, this_val); str = JS_ToStringCheckObject(ctx, this_val);
if (JS_IsException(str)) if (JS_IsException(str))
@ -41640,17 +41637,13 @@ static JSValue js_string_toWellFormed(JSContext *ctx, JSValueConst this_val,
p = JS_VALUE_GET_STRING(ret); p = JS_VALUE_GET_STRING(ret);
for (; i < p->len; i++) { for (; i < p->len; i++) {
c = p->u.str16[i]; uint32_t c = p->u.str16[i];
if (c >= 0xD800 && c <= 0xDFFF) { if (is_surrogate(c)) {
if (c >= 0xDC00 || (i + 1) >= p->len) { if (is_hi_surrogate(c) && (i + 1) < p->len
p->u.str16[i] = 0xFFFD; && is_lo_surrogate(p->u.str16[i + 1])) {
i++;
} else { } else {
c = p->u.str16[i + 1]; p->u.str16[i] = 0xFFFD;
if (c < 0xDC00 || c > 0xDFFF) {
p->u.str16[i] = 0xFFFD;
} else {
i++;
}
} }
} }
} }
@ -42427,10 +42420,10 @@ static int string_prevc(JSString *p, int *pidx)
idx--; idx--;
if (p->is_wide_char) { if (p->is_wide_char) {
c = p->u.str16[idx]; c = p->u.str16[idx];
if (c >= 0xdc00 && c < 0xe000 && idx > 0) { if (is_lo_surrogate(c) && idx > 0) {
c1 = p->u.str16[idx - 1]; c1 = p->u.str16[idx - 1];
if (c1 >= 0xd800 && c1 <= 0xdc00) { if (is_hi_surrogate(c1)) {
c = (((c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; c = from_surrogate(c1, c);
idx--; idx--;
} }
} }
@ -49114,8 +49107,7 @@ static JSValue js_global_decodeURI(JSContext *ctx, JSValueConst this_val,
} }
c = (c << 6) | (c1 & 0x3f); c = (c << 6) | (c1 & 0x3f);
} }
if (c < c_min || c > 0x10FFFF || if (c < c_min || c > 0x10FFFF || is_surrogate(c)) {
(c >= 0xd800 && c < 0xe000)) {
js_throw_URIError(ctx, "malformed UTF-8"); js_throw_URIError(ctx, "malformed UTF-8");
goto fail; goto fail;
} }
@ -49190,21 +49182,21 @@ static JSValue js_global_encodeURI(JSContext *ctx, JSValueConst this_val,
if (isURIUnescaped(c, isComponent)) { if (isURIUnescaped(c, isComponent)) {
string_buffer_putc16(b, c); string_buffer_putc16(b, c);
} else { } else {
if (c >= 0xdc00 && c <= 0xdfff) { if (is_lo_surrogate(c)) {
js_throw_URIError(ctx, "invalid character"); js_throw_URIError(ctx, "invalid character");
goto fail; goto fail;
} else if (c >= 0xd800 && c <= 0xdbff) { } else if (is_hi_surrogate(c)) {
if (k >= p->len) { if (k >= p->len) {
js_throw_URIError(ctx, "expecting surrogate pair"); js_throw_URIError(ctx, "expecting surrogate pair");
goto fail; goto fail;
} }
c1 = string_get(p, k); c1 = string_get(p, k);
k++; k++;
if (c1 < 0xdc00 || c1 > 0xdfff) { if (!is_lo_surrogate(c1)) {
js_throw_URIError(ctx, "expecting surrogate pair"); js_throw_URIError(ctx, "expecting surrogate pair");
goto fail; goto fail;
} }
c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000; c = from_surrogate(c, c1);
} }
if (c < 0x80) { if (c < 0x80) {
encodeURI_hex(b, c); encodeURI_hex(b, c);