diff options
Diffstat (limited to 'src/regexp.c')
| -rw-r--r-- | src/regexp.c | 3044 |
1 files changed, 3044 insertions, 0 deletions
diff --git a/src/regexp.c b/src/regexp.c new file mode 100644 index 0000000..6f3544e --- /dev/null +++ b/src/regexp.c @@ -0,0 +1,3044 @@ +/* vim: set et ts=4 sw=4 : */ +/* + * Regular Expression Engine + * + * Copyright (c) 2017-2018 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include <assert.h> +#include <alloca.h> +#include <inttypes.h> +#include <stdarg.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "regexp.h" + +/* must be provided by the user */ +bool lre_check_stack_overflow(void *opaque, size_t alloca_size) { + (void)opaque; + (void)alloca_size; + return false; +} + +void *lre_realloc(void *opaque, void *ptr, size_t size) { + (void)opaque; + return realloc(ptr, size); +} + +/* quickjs/cutils: */ +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#ifndef countof +#define countof(x) (sizeof(x) / sizeof((x)[0])) +#endif + +static inline int max_int(int a, int b) +{ + if (a > b) + return a; + else + return b; +} + +void pstrcpy(char *buf, int buf_size, const char *str) +{ + int c; + char *q = buf; + + if (buf_size <= 0) + return; + + for(;;) { + c = *str++; + if (c == 0 || q >= buf + buf_size - 1) + break; + *q++ = c; + } + *q = '\0'; +} + +typedef void *DynBufReallocFunc(void *opaque, void *ptr, size_t size); + +typedef struct DynBuf { + uint8_t *buf; + size_t size; + size_t allocated_size; + bool error; /* true if a memory allocation error occurred */ + DynBufReallocFunc *realloc_func; + void *opaque; /* for realloc_func */ +} DynBuf; + +int dbuf_put(DynBuf *s, const uint8_t *data, size_t len); + +struct __attribute__((packed)) packed_u16 { + uint16_t v; +}; + +struct __attribute__((packed)) packed_u32 { + uint32_t v; +}; + +static inline uint32_t get_u16(const uint8_t *tab) +{ + return ((const struct packed_u16 *)tab)->v; +} + +static inline uint32_t get_u32(const uint8_t *tab) +{ + return ((const struct packed_u32 *)tab)->v; +} + +static inline void put_u32(uint8_t *tab, uint32_t val) +{ + ((struct packed_u32 *)tab)->v = val; +} + +static inline int dbuf_put_u16(DynBuf *s, uint16_t val) +{ + return dbuf_put(s, (uint8_t *)&val, 2); +} +static inline int dbuf_put_u32(DynBuf *s, uint32_t val) +{ + return dbuf_put(s, (uint8_t *)&val, 4); +} + +static inline bool dbuf_error(DynBuf *s) { + return s->error; +} + +/* return < 0 if error */ +int dbuf_realloc(DynBuf *s, size_t new_size) +{ + size_t size; + uint8_t *new_buf; + if (new_size > s->allocated_size) { + if (s->error) + return -1; + size = s->allocated_size * 3 / 2; + if (size > new_size) + new_size = size; + new_buf = s->realloc_func(s->opaque, s->buf, new_size); + if (!new_buf) { + s->error = true; + return -1; + } + s->buf = new_buf; + s->allocated_size = new_size; + } + return 0; +} + +void dbuf_free(DynBuf *s) +{ + /* we test s->buf as a fail safe to avoid crashing if dbuf_free() + is called twice */ + if (s->buf) { + s->realloc_func(s->opaque, s->buf, 0); + } + memset(s, 0, sizeof(*s)); +} + +static void *dbuf_default_realloc(void *opaque, void *ptr, size_t size) +{ + (void)opaque; + return realloc(ptr, size); +} + +void dbuf_init2(DynBuf *s, void *opaque, DynBufReallocFunc *realloc_func) +{ + memset(s, 0, sizeof(*s)); + if (!realloc_func) + realloc_func = dbuf_default_realloc; + s->opaque = opaque; + s->realloc_func = realloc_func; +} + +int dbuf_put(DynBuf *s, const uint8_t *data, size_t len) +{ + if (unlikely((s->size + len) > s->allocated_size)) { + if (dbuf_realloc(s, s->size + len)) + return -1; + } + memcpy(s->buf + s->size, data, len); + s->size += len; + return 0; +} + +int dbuf_put_self(DynBuf *s, size_t offset, size_t len) +{ + if (unlikely((s->size + len) > s->allocated_size)) { + if (dbuf_realloc(s, s->size + len)) + return -1; + } + memcpy(s->buf + s->size, s->buf + offset, len); + s->size += len; + return 0; +} + +int dbuf_putc(DynBuf *s, uint8_t c) +{ + return dbuf_put(s, &c, 1); +} + +static inline int from_hex(int c) +{ + if (c >= '0' && c <= '9') + return c - '0'; + else if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + else if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + else + return -1; +} + +#define UTF8_CHAR_LEN_MAX 6 + +/* Note: at most 31 bits are encoded. At most UTF8_CHAR_LEN_MAX bytes + are output. */ +int unicode_to_utf8(uint8_t *buf, unsigned int c) +{ + uint8_t *q = buf; + + if (c < 0x80) { + *q++ = c; + } else { + if (c < 0x800) { + *q++ = (c >> 6) | 0xc0; + } else { + if (c < 0x10000) { + *q++ = (c >> 12) | 0xe0; + } else { + if (c < 0x00200000) { + *q++ = (c >> 18) | 0xf0; + } else { + if (c < 0x04000000) { + *q++ = (c >> 24) | 0xf8; + } else if (c < 0x80000000) { + *q++ = (c >> 30) | 0xfc; + *q++ = ((c >> 24) & 0x3f) | 0x80; + } else { + return 0; + } + *q++ = ((c >> 18) & 0x3f) | 0x80; + } + *q++ = ((c >> 12) & 0x3f) | 0x80; + } + *q++ = ((c >> 6) & 0x3f) | 0x80; + } + *q++ = (c & 0x3f) | 0x80; + } + return q - buf; +} + +static const unsigned int utf8_min_code[5] = { + 0x80, 0x800, 0x10000, 0x00200000, 0x04000000, +}; + +static const unsigned char utf8_first_code_mask[5] = { + 0x1f, 0xf, 0x7, 0x3, 0x1, +}; + +/* return -1 if error. *pp is not updated in this case. max_len must + be >= 1. The maximum length for a UTF8 byte sequence is 6 bytes. */ +int unicode_from_utf8(const uint8_t *p, int max_len, const uint8_t **pp) +{ + int l, c, b, i; + + c = *p++; + if (c < 0x80) { + *pp = p; + return c; + } + switch(c) { + case 0xc0: case 0xc1: case 0xc2: case 0xc3: + case 0xc4: case 0xc5: case 0xc6: case 0xc7: + case 0xc8: case 0xc9: case 0xca: case 0xcb: + case 0xcc: case 0xcd: case 0xce: case 0xcf: + case 0xd0: case 0xd1: case 0xd2: case 0xd3: + case 0xd4: case 0xd5: case 0xd6: case 0xd7: + case 0xd8: case 0xd9: case 0xda: case 0xdb: + case 0xdc: case 0xdd: case 0xde: case 0xdf: + l = 1; + break; + case 0xe0: case 0xe1: case 0xe2: case 0xe3: + case 0xe4: case 0xe5: case 0xe6: case 0xe7: + case 0xe8: case 0xe9: case 0xea: case 0xeb: + case 0xec: case 0xed: case 0xee: case 0xef: + l = 2; + break; + case 0xf0: case 0xf1: case 0xf2: case 0xf3: + case 0xf4: case 0xf5: case 0xf6: case 0xf7: + l = 3; + break; + case 0xf8: case 0xf9: case 0xfa: case 0xfb: + l = 4; + break; + case 0xfc: case 0xfd: + l = 5; + break; + default: + return -1; + } + /* check that we have enough characters */ + if (l > (max_len - 1)) + return -1; + c &= utf8_first_code_mask[l - 1]; + for(i = 0; i < l; i++) { + b = *p++; + if (b < 0x80 || b >= 0xc0) + return -1; + c = (c << 6) | (b & 0x3f); + } + if (c < (int)utf8_min_code[l - 1]) + return -1; + *pp = p; + return c; +} + +/* quickjs/libunicode: */ +#include "unicode-table.h" + +#define LRE_CC_RES_LEN_MAX 3 + +enum { + RUN_TYPE_U, + RUN_TYPE_L, + RUN_TYPE_UF, + RUN_TYPE_LF, + RUN_TYPE_UL, + RUN_TYPE_LSU, + RUN_TYPE_U2L_399_EXT2, + RUN_TYPE_UF_D20, + RUN_TYPE_UF_D1_EXT, + RUN_TYPE_U_EXT, + RUN_TYPE_LF_EXT, + RUN_TYPE_U_EXT2, + RUN_TYPE_L_EXT2, + RUN_TYPE_U_EXT3, +}; + +typedef struct { + int len; /* in points, always even */ + int size; + uint32_t *points; /* points sorted by increasing value */ + void *mem_opaque; + void *(*realloc_func)(void *opaque, void *ptr, size_t size); +} CharRange; + +typedef enum { + CR_OP_UNION, + CR_OP_INTER, + CR_OP_XOR, +} CharRangeOpEnum; + +static void *cr_default_realloc(void *opaque, void *ptr, size_t size) +{ + (void)opaque; + return realloc(ptr, size); +} + +void cr_init(CharRange *cr, void *mem_opaque, DynBufReallocFunc *realloc_func) +{ + cr->len = cr->size = 0; + cr->points = NULL; + cr->mem_opaque = mem_opaque; + cr->realloc_func = realloc_func ? realloc_func : cr_default_realloc; +} + +void cr_free(CharRange *cr) +{ + cr->realloc_func(cr->mem_opaque, cr->points, 0); +} + +int cr_realloc(CharRange *cr, int size) +{ + int new_size; + uint32_t *new_buf; + + if (size > cr->size) { + new_size = max_int(size, cr->size * 3 / 2); + new_buf = cr->realloc_func(cr->mem_opaque, cr->points, + new_size * sizeof(cr->points[0])); + if (!new_buf) + return -1; + cr->points = new_buf; + cr->size = new_size; + } + return 0; +} + +static void cr_compress(CharRange *cr) +{ + int i, j, k, len; + uint32_t *pt; + + pt = cr->points; + len = cr->len; + i = 0; + j = 0; + k = 0; + while ((i + 1) < len) { + if (pt[i] == pt[i + 1]) { + /* empty interval */ + i += 2; + } else { + j = i; + while ((j + 3) < len && pt[j + 1] == pt[j + 2]) + j += 2; + /* just copy */ + pt[k] = pt[i]; + pt[k + 1] = pt[j + 1]; + k += 2; + i = j + 2; + } + } + cr->len = k; +} + +static inline int cr_add_point(CharRange *cr, uint32_t v) +{ + if (cr->len >= cr->size) { + if (cr_realloc(cr, cr->len + 1)) + return -1; + } + cr->points[cr->len++] = v; + return 0; +} + +int cr_invert(CharRange *cr) +{ + int len; + len = cr->len; + if (cr_realloc(cr, len + 2)) + return -1; + memmove(cr->points + 1, cr->points, len * sizeof(cr->points[0])); + cr->points[0] = 0; + cr->points[len + 1] = UINT32_MAX; + cr->len = len + 2; + cr_compress(cr); + return 0; +} + +int cr_op(CharRange *cr, const uint32_t *a_pt, int a_len, + const uint32_t *b_pt, int b_len, int op) +{ + int a_idx, b_idx, is_in; + uint32_t v; + + a_idx = 0; + b_idx = 0; + for(;;) { + /* get one more point from a or b in increasing order */ + if (a_idx < a_len && b_idx < b_len) { + if (a_pt[a_idx] < b_pt[b_idx]) { + goto a_add; + } else if (a_pt[a_idx] == b_pt[b_idx]) { + v = a_pt[a_idx]; + a_idx++; + b_idx++; + } else { + goto b_add; + } + } else if (a_idx < a_len) { + a_add: + v = a_pt[a_idx++]; + } else if (b_idx < b_len) { + b_add: + v = b_pt[b_idx++]; + } else { + break; + } + /* add the point if the in/out status changes */ + switch(op) { + case CR_OP_UNION: + is_in = (a_idx & 1) | (b_idx & 1); + break; + case CR_OP_INTER: + is_in = (a_idx & 1) & (b_idx & 1); + break; + case CR_OP_XOR: + is_in = (a_idx & 1) ^ (b_idx & 1); + break; + default: + abort(); + } + if (is_in != (cr->len & 1)) { + if (cr_add_point(cr, v)) + return -1; + } + } + cr_compress(cr); + return 0; +} + +int cr_union1(CharRange *cr, const uint32_t *b_pt, int b_len) +{ + CharRange a = *cr; + int ret; + cr->len = 0; + cr->size = 0; + cr->points = NULL; + ret = cr_op(cr, a.points, a.len, b_pt, b_len, CR_OP_UNION); + cr_free(&a); + return ret; +} + +static inline int cr_union_interval(CharRange *cr, uint32_t c1, uint32_t c2) +{ + uint32_t b_pt[2]; + b_pt[0] = c1; + b_pt[1] = c2 + 1; + return cr_union1(cr, b_pt, 2); +} + +/* conv_type: + 0 = to upper + 1 = to lower + 2 = case folding (= to lower with modifications) +*/ +int lre_case_conv(uint32_t *res, uint32_t c, int conv_type) +{ + if (c < 128) { + if (conv_type) { + if (c >= 'A' && c <= 'Z') { + c = c - 'A' + 'a'; + } + } else { + if (c >= 'a' && c <= 'z') { + c = c - 'a' + 'A'; + } + } + } else { + uint32_t v, code, data, type, len, a, is_lower; + int idx, idx_min, idx_max; + + is_lower = (conv_type != 0); + idx_min = 0; + idx_max = countof(case_conv_table1) - 1; + while (idx_min <= idx_max) { + idx = (unsigned)(idx_max + idx_min) / 2; + v = case_conv_table1[idx]; + code = v >> (32 - 17); + len = (v >> (32 - 17 - 7)) & 0x7f; + if (c < code) { + idx_max = idx - 1; + } else if (c >= code + len) { + idx_min = idx + 1; + } else { + type = (v >> (32 - 17 - 7 - 4)) & 0xf; + data = ((v & 0xf) << 8) | case_conv_table2[idx]; + switch(type) { + case RUN_TYPE_U: + case RUN_TYPE_L: + case RUN_TYPE_UF: + case RUN_TYPE_LF: + if ((uint32_t)conv_type == (type & 1) || + (type >= RUN_TYPE_UF && conv_type == 2)) { + c = c - code + (case_conv_table1[data] >> (32 - 17)); + } + break; + case RUN_TYPE_UL: + a = c - code; + if ((a & 1) != (1 - is_lower)) + break; + c = (a ^ 1) + code; + break; + case RUN_TYPE_LSU: + a = c - code; + if (a == 1) { + c += 2 * is_lower - 1; + } else if (a == (1 - is_lower) * 2) { + c += (2 * is_lower - 1) * 2; + } + break; + case RUN_TYPE_U2L_399_EXT2: + if (!is_lower) { + res[0] = c - code + case_conv_ext[data >> 6]; + res[1] = 0x399; + return 2; + } else { + c = c - code + case_conv_ext[data & 0x3f]; + } + break; + case RUN_TYPE_UF_D20: + if (conv_type == 1) + break; + c = data + (conv_type == 2) * 0x20; + break; + case RUN_TYPE_UF_D1_EXT: + if (conv_type == 1) + break; + c = case_conv_ext[data] + (conv_type == 2); + break; + case RUN_TYPE_U_EXT: + case RUN_TYPE_LF_EXT: + if (is_lower != (type - RUN_TYPE_U_EXT)) + break; + c = case_conv_ext[data]; + break; + case RUN_TYPE_U_EXT2: + case RUN_TYPE_L_EXT2: + if ((uint32_t)conv_type != (type - RUN_TYPE_U_EXT2)) + break; + res[0] = c - code + case_conv_ext[data >> 6]; + res[1] = case_conv_ext[data & 0x3f]; + return 2; + default: + case RUN_TYPE_U_EXT3: + if (conv_type != 0) + break; + res[0] = case_conv_ext[data >> 8]; + res[1] = case_conv_ext[(data >> 4) & 0xf]; + res[2] = case_conv_ext[data & 0xf]; + return 3; + } + break; + } + } + } + res[0] = c; + return 1; +} + +/* quickjs/libregexp: */ +typedef enum { +#define DEF(id, size) REOP_ ## id, +#include "regexp-opcode.h" +#undef DEF + REOP_COUNT, +} REOPCodeEnum; + +#define CAPTURE_COUNT_MAX 255 +#define STACK_SIZE_MAX 255 + +/* unicode code points */ +#define CP_LS 0x2028 +#define CP_PS 0x2029 + +#define TMP_BUF_SIZE 128 + +typedef struct { + DynBuf byte_code; + const uint8_t *buf_ptr; + const uint8_t *buf_end; + const uint8_t *buf_start; + int re_flags; + bool is_utf16; + bool ignore_case; + bool dotall; + int capture_count; + int total_capture_count; /* -1 = not computed yet */ + int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */ + void *mem_opaque; + DynBuf group_names; + union { + char error_msg[TMP_BUF_SIZE]; + char tmp_buf[TMP_BUF_SIZE]; + } u; +} REParseState; + +typedef struct { + uint8_t size; +} REOpCode; + +static const REOpCode reopcode_info[REOP_COUNT] = { +#define DEF(id, size) { size }, +#include "regexp-opcode.h" +#undef DEF +}; + +#define RE_HEADER_FLAGS 0 +#define RE_HEADER_CAPTURE_COUNT 1 +#define RE_HEADER_STACK_SIZE 2 + +#define RE_HEADER_LEN 7 + +static inline int is_digit(int c) { + return c >= '0' && c <= '9'; +} + +/* insert 'len' bytes at position 'pos'. Return < 0 if error. */ +static int dbuf_insert(DynBuf *s, int pos, int len) +{ + if (dbuf_realloc(s, s->size + len)) + return -1; + memmove(s->buf + pos + len, s->buf + pos, s->size - pos); + s->size += len; + return 0; +} + +/* canonicalize with the specific JS regexp rules */ +static uint32_t lre_canonicalize(uint32_t c, bool is_utf16) +{ + uint32_t res[LRE_CC_RES_LEN_MAX]; + int len; + if (is_utf16) { + if (likely(c < 128)) { + if (c >= 'A' && c <= 'Z') + c = c - 'A' + 'a'; + } else { + lre_case_conv(res, c, 2); + c = res[0]; + } + } else { + if (likely(c < 128)) { + if (c >= 'a' && c <= 'z') + c = c - 'a' + 'A'; + } else { + /* legacy regexp: to upper case if single char >= 128 */ + len = lre_case_conv(res, c, false); + if (len == 1 && res[0] >= 128) + c = res[0]; + } + } + return c; +} + +static const uint16_t char_range_d[] = { + 1, + 0x0030, 0x0039 + 1, +}; + +/* code point ranges for Zs,Zl or Zp property */ +static const uint16_t char_range_s[] = { + 10, + 0x0009, 0x000D + 1, + 0x0020, 0x0020 + 1, + 0x00A0, 0x00A0 + 1, + 0x1680, 0x1680 + 1, + 0x2000, 0x200A + 1, + /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */ + /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */ + 0x2028, 0x2029 + 1, + 0x202F, 0x202F + 1, + 0x205F, 0x205F + 1, + 0x3000, 0x3000 + 1, + /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */ + 0xFEFF, 0xFEFF + 1, +}; + +bool lre_is_space(int c) +{ + int i, n, low, high; + n = (countof(char_range_s) - 1) / 2; + for(i = 0; i < n; i++) { + low = char_range_s[2 * i + 1]; + if (c < low) + return false; + high = char_range_s[2 * i + 2]; + if (c < high) + return true; + } + return false; +} + +uint32_t const lre_id_start_table_ascii[4] = { + /* $ A-Z _ a-z */ + 0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE +}; + +uint32_t const lre_id_continue_table_ascii[4] = { + /* $ 0-9 A-Z _ a-z */ + 0x00000000, 0x03FF0010, 0x87FFFFFE, 0x07FFFFFE +}; + + +static const uint16_t char_range_w[] = { + 4, + 0x0030, 0x0039 + 1, + 0x0041, 0x005A + 1, + 0x005F, 0x005F + 1, + 0x0061, 0x007A + 1, +}; + +#define CLASS_RANGE_BASE 0x40000000 + +typedef enum { + CHAR_RANGE_d, + CHAR_RANGE_D, + CHAR_RANGE_s, + CHAR_RANGE_S, + CHAR_RANGE_w, + CHAR_RANGE_W, +} CharRangeEnum; + +static const uint16_t *char_range_table[] = { + char_range_d, + char_range_s, + char_range_w, +}; + +static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c) +{ + bool invert; + const uint16_t *c_pt; + int len, i; + + invert = c & 1; + c_pt = char_range_table[c >> 1]; + len = *c_pt++; + cr_init(cr, s->mem_opaque, lre_realloc); + for(i = 0; i < len * 2; i++) { + if (cr_add_point(cr, c_pt[i])) + goto fail; + } + if (invert) { + if (cr_invert(cr)) + goto fail; + } + return 0; + fail: + cr_free(cr); + return -1; +} + +static int cr_canonicalize(CharRange *cr) +{ + CharRange a; + uint32_t pt[2]; + int i, ret; + + cr_init(&a, cr->mem_opaque, lre_realloc); + pt[0] = 'a'; + pt[1] = 'z' + 1; + ret = cr_op(&a, cr->points, cr->len, pt, 2, CR_OP_INTER); + if (ret) + goto fail; + /* convert to upper case */ + /* XXX: the generic unicode case would be much more complicated + and not really useful */ + for(i = 0; i < a.len; i++) { + a.points[i] += 'A' - 'a'; + } + /* Note: for simplicity we keep the lower case ranges */ + ret = cr_union1(cr, a.points, a.len); + fail: + cr_free(&a); + return ret; +} + +static void re_emit_op(REParseState *s, int op) +{ + dbuf_putc(&s->byte_code, op); +} + +/* return the offset of the u32 value */ +static int re_emit_op_u32(REParseState *s, int op, uint32_t val) +{ + int pos; + dbuf_putc(&s->byte_code, op); + pos = s->byte_code.size; + dbuf_put_u32(&s->byte_code, val); + return pos; +} + +static int re_emit_goto(REParseState *s, int op, uint32_t val) +{ + int pos; + dbuf_putc(&s->byte_code, op); + pos = s->byte_code.size; + dbuf_put_u32(&s->byte_code, val - (pos + 4)); + return pos; +} + +static void re_emit_op_u8(REParseState *s, int op, uint32_t val) +{ + dbuf_putc(&s->byte_code, op); + dbuf_putc(&s->byte_code, val); +} + +static void re_emit_op_u16(REParseState *s, int op, uint32_t val) +{ + dbuf_putc(&s->byte_code, op); + dbuf_put_u16(&s->byte_code, val); +} + +static int __attribute__((format(printf, 2, 3))) re_parse_error(REParseState *s, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap); + va_end(ap); + return -1; +} + +static int re_parse_out_of_memory(REParseState *s) +{ + return re_parse_error(s, "out of memory"); +} + +/* If allow_overflow is false, return -1 in case of + overflow. Otherwise return INT32_MAX. */ +static int parse_digits(const uint8_t **pp, bool allow_overflow) +{ + const uint8_t *p; + uint64_t v; + int c; + + p = *pp; + v = 0; + for(;;) { + c = *p; + if (c < '0' || c > '9') + break; + v = v * 10 + c - '0'; + if (v >= INT32_MAX) { + if (allow_overflow) + v = INT32_MAX; + else + return -1; + } + p++; + } + *pp = p; + return v; +} + +static int re_parse_expect(REParseState *s, const uint8_t **pp, int c) +{ + const uint8_t *p; + p = *pp; + if (*p != c) + return re_parse_error(s, "expecting '%c'", c); + p++; + *pp = p; + return 0; +} + +/* Parse an escape sequence, *pp points after the '\': + allow_utf16 value: + 0 : no UTF-16 escapes allowed + 1 : UTF-16 escapes allowed + 2 : UTF-16 escapes allowed and escapes of surrogate pairs are + converted to a unicode character (unicode regexp case). + + Return the unicode char and update *pp if recognized, + return -1 if malformed escape, + return -2 otherwise. */ +int lre_parse_escape(const uint8_t **pp, int allow_utf16) +{ + const uint8_t *p; + uint32_t c; + + p = *pp; + c = *p++; + switch(c) { + case 'b': + c = '\b'; + break; + case 'f': + c = '\f'; + break; + case 'n': + c = '\n'; + break; + case 'r': + c = '\r'; + break; + case 't': + c = '\t'; + break; + case 'v': + c = '\v'; + break; + case 'x': + case 'u': + { + int h, n, i; + uint32_t c1; + + if (*p == '{' && allow_utf16) { + p++; + c = 0; + for(;;) { + h = from_hex(*p++); + if (h < 0) + return -1; + c = (c << 4) | h; + if (c > 0x10FFFF) + return -1; + if (*p == '}') + break; + } + p++; + } else { + if (c == 'x') { + n = 2; + } else { + n = 4; + } + + c = 0; + for(i = 0; i < n; i++) { + h = from_hex(*p++); + if (h < 0) { + return -1; + } + c = (c << 4) | h; + } + if (c >= 0xd800 && c < 0xdc00 && + allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') { + /* convert an escaped surrogate pair into a + unicode char */ + c1 = 0; + for(i = 0; i < 4; i++) { + h = from_hex(p[2 + i]); + if (h < 0) + break; + c1 = (c1 << 4) | h; + } + if (i == 4 && c1 >= 0xdc00 && c1 < 0xe000) { + p += 6; + c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000; + } + } + } + } + break; + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + c -= '0'; + if (allow_utf16 == 2) { + /* only accept \0 not followed by digit */ + if (c != 0 || is_digit(*p)) + return -1; + } else { + /* parse a legacy octal sequence */ + uint32_t v; + v = *p - '0'; + if (v > 7) + break; + c = (c << 3) | v; + p++; + if (c >= 32) + break; + v = *p - '0'; + if (v > 7) + break; + c = (c << 3) | v; + p++; + } + break; + default: + return -2; + } + *pp = p; + return c; +} + +#ifdef CONFIG_ALL_UNICODE +/* XXX: we use the same chars for name and value */ +static bool is_unicode_char(int c) +{ + return ((c >= '0' && c <= '9') || + (c >= 'A' && c <= 'Z') || + (c >= 'a' && c <= 'z') || + (c == '_')); +} + +static int parse_unicode_property(REParseState *s, CharRange *cr, + const uint8_t **pp, bool is_inv) +{ + const uint8_t *p; + char name[64], value[64]; + char *q; + bool script_ext; + int ret; + + p = *pp; + if (*p != '{') + return re_parse_error(s, "expecting '{' after \\p"); + p++; + q = name; + while (is_unicode_char(*p)) { + if ((q - name) > sizeof(name) - 1) + goto unknown_property_name; + *q++ = *p++; + } + *q = '\0'; + q = value; + if (*p == '=') { + p++; + while (is_unicode_char(*p)) { + if ((q - value) > sizeof(value) - 1) + return re_parse_error(s, "unknown unicode property value"); + *q++ = *p++; + } + } + *q = '\0'; + if (*p != '}') + return re_parse_error(s, "expecting '}'"); + p++; + // printf("name=%s value=%s\n", name, value); + + if (!strcmp(name, "Script") || !strcmp(name, "sc")) { + script_ext = false; + goto do_script; + } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) { + script_ext = true; + do_script: + cr_init(cr, s->mem_opaque, lre_realloc); + ret = unicode_script(cr, value, script_ext); + if (ret) { + cr_free(cr); + if (ret == -2) + return re_parse_error(s, "unknown unicode script"); + else + goto out_of_memory; + } + } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) { + cr_init(cr, s->mem_opaque, lre_realloc); + ret = unicode_general_category(cr, value); + if (ret) { + cr_free(cr); + if (ret == -2) + return re_parse_error(s, "unknown unicode general category"); + else + goto out_of_memory; + } + } else if (value[0] == '\0') { + cr_init(cr, s->mem_opaque, lre_realloc); + ret = unicode_general_category(cr, name); + if (ret == -1) { + cr_free(cr); + goto out_of_memory; + } + if (ret < 0) { + ret = unicode_prop(cr, name); + if (ret) { + cr_free(cr); + if (ret == -2) + goto unknown_property_name; + else + goto out_of_memory; + } + } + } else { + unknown_property_name: + return re_parse_error(s, "unknown unicode property name"); + } + + if (is_inv) { + if (cr_invert(cr)) { + cr_free(cr); + return -1; + } + } + *pp = p; + return 0; + out_of_memory: + return re_parse_out_of_memory(s); +} +#endif /* CONFIG_ALL_UNICODE */ + +/* return -1 if error otherwise the character or a class range + (CLASS_RANGE_BASE). In case of class range, 'cr' is + initialized. Otherwise, it is ignored. */ +static int get_class_atom(REParseState *s, CharRange *cr, + const uint8_t **pp, bool inclass) +{ + const uint8_t *p; + uint32_t c; + int ret; + + p = *pp; + + c = *p; + switch(c) { + case '\\': + p++; + if (p >= s->buf_end) + goto unexpected_end; + c = *p++; + switch(c) { + case 'd': + c = CHAR_RANGE_d; + goto class_range; + case 'D': + c = CHAR_RANGE_D; + goto class_range; + case 's': + c = CHAR_RANGE_s; + goto class_range; + case 'S': + c = CHAR_RANGE_S; + goto class_range; + case 'w': + c = CHAR_RANGE_w; + goto class_range; + case 'W': + c = CHAR_RANGE_W; + class_range: + if (cr_init_char_range(s, cr, c)) + return -1; + c = CLASS_RANGE_BASE; + break; + case 'c': + c = *p; + if ((c >= 'a' && c <= 'z') || + (c >= 'A' && c <= 'Z') || + (((c >= '0' && c <= '9') || c == '_') && + inclass && !s->is_utf16)) { /* Annex B.1.4 */ + c &= 0x1f; + p++; + } else if (s->is_utf16) { + goto invalid_escape; + } else { + /* otherwise return '\' and 'c' */ + p--; + c = '\\'; + } + break; +#ifdef CONFIG_ALL_UNICODE + case 'p': + case 'P': + if (s->is_utf16) { + if (parse_unicode_property(s, cr, &p, (c == 'P'))) + return -1; + c = CLASS_RANGE_BASE; + break; + } + /* fall thru */ +#endif + default: + p--; + ret = lre_parse_escape(&p, s->is_utf16 * 2); + if (ret >= 0) { + c = ret; + } else { + if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) { + /* always valid to escape these characters */ + goto normal_char; + } else if (s->is_utf16) { + invalid_escape: + return re_parse_error(s, "invalid escape sequence in regular expression"); + } else { + /* just ignore the '\' */ + goto normal_char; + } + } + break; + } + break; + case '\0': + if (p >= s->buf_end) { + unexpected_end: + return re_parse_error(s, "unexpected end"); + } + /* fall thru */ + default: + normal_char: + /* normal char */ + if (c >= 128) { + c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); + if ((unsigned)c > 0xffff && !s->is_utf16) { + /* XXX: should handle non BMP-1 code points */ + return re_parse_error(s, "malformed unicode char"); + } + } else { + p++; + } + break; + } + *pp = p; + return c; +} + +static int re_emit_range(REParseState *s, const CharRange *cr) +{ + int len, i; + uint32_t high; + + len = (unsigned)cr->len / 2; + if (len >= 65535) + return re_parse_error(s, "too many ranges"); + if (len == 0) { + /* not sure it can really happen. Emit a match that is always + false */ + re_emit_op_u32(s, REOP_char32, -1); + } else { + high = cr->points[cr->len - 1]; + if (high == UINT32_MAX) + high = cr->points[cr->len - 2]; + if (high <= 0xffff) { + /* can use 16 bit ranges with the conversion that 0xffff = + infinity */ + re_emit_op_u16(s, REOP_range, len); + for(i = 0; i < cr->len; i += 2) { + dbuf_put_u16(&s->byte_code, cr->points[i]); + high = cr->points[i + 1] - 1; + if (high == UINT32_MAX - 1) + high = 0xffff; + dbuf_put_u16(&s->byte_code, high); + } + } else { + re_emit_op_u16(s, REOP_range32, len); + for(i = 0; i < cr->len; i += 2) { + dbuf_put_u32(&s->byte_code, cr->points[i]); + dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1); + } + } + } + return 0; +} + +static int re_parse_char_class(REParseState *s, const uint8_t **pp) +{ + const uint8_t *p; + uint32_t c1, c2; + CharRange cr_s, *cr = &cr_s; + CharRange cr1_s, *cr1 = &cr1_s; + bool invert; + + cr_init(cr, s->mem_opaque, lre_realloc); + p = *pp; + p++; /* skip '[' */ + invert = false; + if (*p == '^') { + p++; + invert = true; + } + for(;;) { + if (*p == ']') + break; + c1 = get_class_atom(s, cr1, &p, true); + if ((int)c1 < 0) + goto fail; + if (*p == '-' && p[1] != ']') { + const uint8_t *p0 = p + 1; + if (c1 >= CLASS_RANGE_BASE) { + if (s->is_utf16) { + cr_free(cr1); + goto invalid_class_range; + } + /* Annex B: match '-' character */ + goto class_atom; + } + c2 = get_class_atom(s, cr1, &p0, true); + if ((int)c2 < 0) + goto fail; + if (c2 >= CLASS_RANGE_BASE) { + cr_free(cr1); + if (s->is_utf16) { + goto invalid_class_range; + } + /* Annex B: match '-' character */ + goto class_atom; + } + p = p0; + if (c2 < c1) { + invalid_class_range: + re_parse_error(s, "invalid class range"); + goto fail; + } + if (cr_union_interval(cr, c1, c2)) + goto memory_error; + } else { + class_atom: + if (c1 >= CLASS_RANGE_BASE) { + int ret; + ret = cr_union1(cr, cr1->points, cr1->len); + cr_free(cr1); + if (ret) + goto memory_error; + } else { + if (cr_union_interval(cr, c1, c1)) + goto memory_error; + } + } + } + if (s->ignore_case) { + if (cr_canonicalize(cr)) + goto memory_error; + } + if (invert) { + if (cr_invert(cr)) + goto memory_error; + } + if (re_emit_range(s, cr)) + goto fail; + cr_free(cr); + p++; /* skip ']' */ + *pp = p; + return 0; + memory_error: + re_parse_out_of_memory(s); + fail: + cr_free(cr); + return -1; +} + +/* Return: + 1 if the opcodes in bc_buf[] always advance the character pointer. + 0 if the character pointer may not be advanced. + -1 if the code may depend on side effects of its previous execution (backreference) +*/ +static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len) +{ + int pos, opcode, ret, len, i; + uint32_t val, last; + bool has_back_reference; + uint8_t capture_bitmap[CAPTURE_COUNT_MAX]; + + ret = -2; /* not known yet */ + pos = 0; + has_back_reference = false; + memset(capture_bitmap, 0, sizeof(capture_bitmap)); + + while (pos < bc_buf_len) { + opcode = bc_buf[pos]; + len = reopcode_info[opcode].size; + switch(opcode) { + case REOP_range: + val = get_u16(bc_buf + pos + 1); + len += val * 4; + goto simple_char; + case REOP_range32: + val = get_u16(bc_buf + pos + 1); + len += val * 8; + goto simple_char; + case REOP_char: + case REOP_char32: + case REOP_dot: + case REOP_any: + simple_char: + if (ret == -2) + ret = 1; + break; + case REOP_line_start: + case REOP_line_end: + case REOP_push_i32: + case REOP_push_char_pos: + case REOP_drop: + case REOP_word_boundary: + case REOP_not_word_boundary: + case REOP_prev: + /* no effect */ + break; + case REOP_save_start: + case REOP_save_end: + val = bc_buf[pos + 1]; + capture_bitmap[val] |= 1; + break; + case REOP_save_reset: + { + val = bc_buf[pos + 1]; + last = bc_buf[pos + 2]; + while (val < last) + capture_bitmap[val++] |= 1; + } + break; + case REOP_back_reference: + case REOP_backward_back_reference: + val = bc_buf[pos + 1]; + capture_bitmap[val] |= 2; + has_back_reference = true; + break; + default: + /* safe behvior: we cannot predict the outcome */ + if (ret == -2) + ret = 0; + break; + } + pos += len; + } + if (has_back_reference) { + /* check if there is back reference which references a capture + made in the some code */ + for(i = 0; i < CAPTURE_COUNT_MAX; i++) { + if (capture_bitmap[i] == 3) + return -1; + } + } + if (ret == -2) + ret = 0; + return ret; +} + +/* return -1 if a simple quantifier cannot be used. Otherwise return + the number of characters in the atom. */ +static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len) +{ + int pos, opcode, len, count; + uint32_t val; + + count = 0; + pos = 0; + while (pos < bc_buf_len) { + opcode = bc_buf[pos]; + len = reopcode_info[opcode].size; + switch(opcode) { + case REOP_range: + val = get_u16(bc_buf + pos + 1); + len += val * 4; + goto simple_char; + case REOP_range32: + val = get_u16(bc_buf + pos + 1); + len += val * 8; + goto simple_char; + case REOP_char: + case REOP_char32: + case REOP_dot: + case REOP_any: + simple_char: + count++; + break; + case REOP_line_start: + case REOP_line_end: + case REOP_word_boundary: + case REOP_not_word_boundary: + break; + default: + return -1; + } + pos += len; + } + return count; +} + +/* '*pp' is the first char after '<' */ +static int re_parse_group_name(char *buf, int buf_size, + const uint8_t **pp, bool is_utf16) +{ + const uint8_t *p; + uint32_t c; + char *q; + + p = *pp; + q = buf; + for(;;) { + c = *p; + if (c == '\\') { + p++; + if (*p != 'u') + return -1; + c = lre_parse_escape(&p, is_utf16 * 2); + } else if (c == '>') { + break; + } else if (c >= 128) { + c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p); + } else { + p++; + } + if (c > 0x10FFFF) + return -1; + if (q == buf) { + if (!lre_js_is_ident_first(c)) + return -1; + } else { + if (!lre_js_is_ident_next(c)) + return -1; + } + if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size) + return -1; + if (c < 128) { + *q++ = c; + } else { + q += unicode_to_utf8((uint8_t*)q, c); + } + } + if (q == buf) + return -1; + *q = '\0'; + p++; + *pp = p; + return 0; +} + +/* if capture_name = NULL: return the number of captures + 1. + Otherwise, return the capture index corresponding to capture_name + or -1 if none */ +static int re_parse_captures(REParseState *s, int *phas_named_captures, + const char *capture_name) +{ + const uint8_t *p; + int capture_index; + char name[TMP_BUF_SIZE]; + + capture_index = 1; + *phas_named_captures = 0; + for (p = s->buf_start; p < s->buf_end; p++) { + switch (*p) { + case '(': + if (p[1] == '?') { + if (p[2] == '<' && p[3] != '=' && p[3] != '!') { + *phas_named_captures = 1; + /* potential named capture */ + if (capture_name) { + p += 3; + if (re_parse_group_name(name, sizeof(name), &p, + s->is_utf16) == 0) { + if (!strcmp(name, capture_name)) + return capture_index; + } + } + capture_index++; + } + } else { + capture_index++; + } + break; + case '\\': + p++; + break; + case '[': + for (p += 1 + (*p == ']'); p < s->buf_end && *p != ']'; p++) { + if (*p == '\\') + p++; + } + break; + } + } + if (capture_name) + return -1; + else + return capture_index; +} + +static int re_count_captures(REParseState *s) +{ + if (s->total_capture_count < 0) { + s->total_capture_count = re_parse_captures(s, &s->has_named_captures, + NULL); + } + return s->total_capture_count; +} + +static bool re_has_named_captures(REParseState *s) +{ + if (s->has_named_captures < 0) + re_count_captures(s); + return s->has_named_captures; +} + +static int find_group_name(REParseState *s, const char *name) +{ + const char *p, *buf_end; + size_t len, name_len; + int capture_index; + + name_len = strlen(name); + p = (char *)s->group_names.buf; + buf_end = (char *)s->group_names.buf + s->group_names.size; + capture_index = 1; + while (p < buf_end) { + len = strlen(p); + if (len == name_len && memcmp(name, p, name_len) == 0) + return capture_index; + p += len + 1; + capture_index++; + } + return -1; +} + +static int re_parse_disjunction(REParseState *s, bool is_backward_dir); + +static int re_parse_term(REParseState *s, bool is_backward_dir) +{ + const uint8_t *p; + int c, last_atom_start, quant_min, quant_max, last_capture_count; + bool greedy, add_zero_advance_check, is_neg, is_backward_lookahead; + CharRange cr_s, *cr = &cr_s; + + last_atom_start = -1; + last_capture_count = 0; + p = s->buf_ptr; + c = *p; + switch(c) { + case '^': + p++; + re_emit_op(s, REOP_line_start); + break; + case '$': + p++; + re_emit_op(s, REOP_line_end); + break; + case '.': + p++; + last_atom_start = s->byte_code.size; + last_capture_count = s->capture_count; + if (is_backward_dir) + re_emit_op(s, REOP_prev); + re_emit_op(s, s->dotall ? REOP_any : REOP_dot); + if (is_backward_dir) + re_emit_op(s, REOP_prev); + break; + case '{': + if (s->is_utf16) { + return re_parse_error(s, "syntax error"); + } else if (!is_digit(p[1])) { + /* Annex B: we accept '{' not followed by digits as a + normal atom */ + goto parse_class_atom; + } else { + const uint8_t *p1 = p + 1; + /* Annex B: error if it is like a repetition count */ + parse_digits(&p1, true); + if (*p1 == ',') { + p1++; + if (is_digit(*p1)) { + parse_digits(&p1, true); + } + } + if (*p1 != '}') { + goto parse_class_atom; + } + } + /* fall thru */ + case '*': + case '+': + case '?': + return re_parse_error(s, "nothing to repeat"); + case '(': + if (p[1] == '?') { + if (p[2] == ':') { + p += 3; + last_atom_start = s->byte_code.size; + last_capture_count = s->capture_count; + s->buf_ptr = p; + if (re_parse_disjunction(s, is_backward_dir)) + return -1; + p = s->buf_ptr; + if (re_parse_expect(s, &p, ')')) + return -1; + } else if ((p[2] == '=' || p[2] == '!')) { + is_neg = (p[2] == '!'); + is_backward_lookahead = false; + p += 3; + goto lookahead; + } else if (p[2] == '<' && + (p[3] == '=' || p[3] == '!')) { + int pos; + is_neg = (p[3] == '!'); + is_backward_lookahead = true; + p += 4; + /* lookahead */ + lookahead: + /* Annex B allows lookahead to be used as an atom for + the quantifiers */ + if (!s->is_utf16 && !is_backward_lookahead) { + last_atom_start = s->byte_code.size; + last_capture_count = s->capture_count; + } + pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0); + s->buf_ptr = p; + if (re_parse_disjunction(s, is_backward_lookahead)) + return -1; + p = s->buf_ptr; + if (re_parse_expect(s, &p, ')')) + return -1; + re_emit_op(s, REOP_match); + /* jump after the 'match' after the lookahead is successful */ + if (dbuf_error(&s->byte_code)) + return -1; + put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4)); + } else if (p[2] == '<') { + p += 3; + if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf), + &p, s->is_utf16)) { + return re_parse_error(s, "invalid group name"); + } + if (find_group_name(s, s->u.tmp_buf) > 0) { + return re_parse_error(s, "duplicate group name"); + } + /* group name with a trailing zero */ + dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf, + strlen(s->u.tmp_buf) + 1); + s->has_named_captures = 1; + goto parse_capture; + } else { + return re_parse_error(s, "invalid group"); + } + } else { + int capture_index; + p++; + /* capture without group name */ + dbuf_putc(&s->group_names, 0); + parse_capture: + if (s->capture_count >= CAPTURE_COUNT_MAX) + return re_parse_error(s, "too many captures"); + last_atom_start = s->byte_code.size; + last_capture_count = s->capture_count; + capture_index = s->capture_count++; + re_emit_op_u8(s, REOP_save_start + is_backward_dir, + capture_index); + + s->buf_ptr = p; + if (re_parse_disjunction(s, is_backward_dir)) + return -1; + p = s->buf_ptr; + + re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir, + capture_index); + + if (re_parse_expect(s, &p, ')')) + return -1; + } + break; + case '\\': + switch(p[1]) { + case 'b': + case 'B': + re_emit_op(s, REOP_word_boundary + (p[1] != 'b')); + p += 2; + break; + case 'k': + { + const uint8_t *p1; + int dummy_res; + + p1 = p; + if (p1[2] != '<') { + /* annex B: we tolerate invalid group names in non + unicode mode if there is no named capture + definition */ + if (s->is_utf16 || re_has_named_captures(s)) + return re_parse_error(s, "expecting group name"); + else + goto parse_class_atom; + } + p1 += 3; + if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf), + &p1, s->is_utf16)) { + if (s->is_utf16 || re_has_named_captures(s)) + return re_parse_error(s, "invalid group name"); + else + goto parse_class_atom; + } + c = find_group_name(s, s->u.tmp_buf); + if (c < 0) { + /* no capture name parsed before, try to look + after (inefficient, but hopefully not common */ + c = re_parse_captures(s, &dummy_res, s->u.tmp_buf); + if (c < 0) { + if (s->is_utf16 || re_has_named_captures(s)) + return re_parse_error(s, "group name not defined"); + else + goto parse_class_atom; + } + } + p = p1; + } + goto emit_back_reference; + case '0': + p += 2; + c = 0; + if (s->is_utf16) { + if (is_digit(*p)) { + return re_parse_error(s, "invalid decimal escape in regular expression"); + } + } else { + /* Annex B.1.4: accept legacy octal */ + if (*p >= '0' && *p <= '7') { + c = *p++ - '0'; + if (*p >= '0' && *p <= '7') { + c = (c << 3) + *p++ - '0'; + } + } + } + goto normal_char; + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': + case '9': + { + const uint8_t *q = ++p; + + c = parse_digits(&p, false); + if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) { + if (!s->is_utf16) { + /* Annex B.1.4: accept legacy octal */ + p = q; + if (*p <= '7') { + c = 0; + if (*p <= '3') + c = *p++ - '0'; + if (*p >= '0' && *p <= '7') { + c = (c << 3) + *p++ - '0'; + if (*p >= '0' && *p <= '7') { + c = (c << 3) + *p++ - '0'; + } + } + } else { + c = *p++; + } + goto normal_char; + } + return re_parse_error(s, "back reference out of range in regular expression"); + } + emit_back_reference: + last_atom_start = s->byte_code.size; + last_capture_count = s->capture_count; + re_emit_op_u8(s, REOP_back_reference + is_backward_dir, c); + } + break; + default: + goto parse_class_atom; + } + break; + case '[': + last_atom_start = s->byte_code.size; + last_capture_count = s->capture_count; + if (is_backward_dir) + re_emit_op(s, REOP_prev); + if (re_parse_char_class(s, &p)) + return -1; + if (is_backward_dir) + re_emit_op(s, REOP_prev); + break; + case ']': + case '}': + if (s->is_utf16) + return re_parse_error(s, "syntax error"); + goto parse_class_atom; + default: + parse_class_atom: + c = get_class_atom(s, cr, &p, false); + if ((int)c < 0) + return -1; + normal_char: + last_atom_start = s->byte_code.size; + last_capture_count = s->capture_count; + if (is_backward_dir) + re_emit_op(s, REOP_prev); + if (c >= CLASS_RANGE_BASE) { + int ret; + /* Note: canonicalization is not needed */ + ret = re_emit_range(s, cr); + cr_free(cr); + if (ret) + return -1; + } else { + if (s->ignore_case) + c = lre_canonicalize(c, s->is_utf16); + if (c <= 0xffff) + re_emit_op_u16(s, REOP_char, c); + else + re_emit_op_u32(s, REOP_char32, c); + } + if (is_backward_dir) + re_emit_op(s, REOP_prev); + break; + } + + /* quantifier */ + if (last_atom_start >= 0) { + c = *p; + switch(c) { + case '*': + p++; + quant_min = 0; + quant_max = INT32_MAX; + goto quantifier; + case '+': + p++; + quant_min = 1; + quant_max = INT32_MAX; + goto quantifier; + case '?': + p++; + quant_min = 0; + quant_max = 1; + goto quantifier; + case '{': + { + const uint8_t *p1 = p; + /* As an extension (see ES6 annex B), we accept '{' not + followed by digits as a normal atom */ + if (!is_digit(p[1])) { + if (s->is_utf16) + goto invalid_quant_count; + break; + } + p++; + quant_min = parse_digits(&p, true); + quant_max = quant_min; + if (*p == ',') { + p++; + if (is_digit(*p)) { + quant_max = parse_digits(&p, true); + if (quant_max < quant_min) { + invalid_quant_count: + return re_parse_error(s, "invalid repetition count"); + } + } else { + quant_max = INT32_MAX; /* infinity */ + } + } + if (*p != '}' && !s->is_utf16) { + /* Annex B: normal atom if invalid '{' syntax */ + p = p1; + break; + } + if (re_parse_expect(s, &p, '}')) + return -1; + } + quantifier: + greedy = true; + if (*p == '?') { + p++; + greedy = false; + } + if (last_atom_start < 0) { + return re_parse_error(s, "nothing to repeat"); + } + if (greedy) { + int len, pos; + + if (quant_max > 0) { + /* specific optimization for simple quantifiers */ + if (dbuf_error(&s->byte_code)) + goto out_of_memory; + len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start, + s->byte_code.size - last_atom_start); + if (len > 0) { + re_emit_op(s, REOP_match); + + if (dbuf_insert(&s->byte_code, last_atom_start, 17)) + goto out_of_memory; + pos = last_atom_start; + s->byte_code.buf[pos++] = REOP_simple_greedy_quant; + put_u32(&s->byte_code.buf[pos], + s->byte_code.size - last_atom_start - 17); + pos += 4; + put_u32(&s->byte_code.buf[pos], quant_min); + pos += 4; + put_u32(&s->byte_code.buf[pos], quant_max); + pos += 4; + put_u32(&s->byte_code.buf[pos], len); + pos += 4; + goto done; + } + } + + if (dbuf_error(&s->byte_code)) + goto out_of_memory; + add_zero_advance_check = (re_check_advance(s->byte_code.buf + last_atom_start, + s->byte_code.size - last_atom_start) == 0); + } else { + add_zero_advance_check = false; + } + + { + int len, pos; + len = s->byte_code.size - last_atom_start; + if (quant_min == 0) { + /* need to reset the capture in case the atom is + not executed */ + if (last_capture_count != s->capture_count) { + if (dbuf_insert(&s->byte_code, last_atom_start, 3)) + goto out_of_memory; + s->byte_code.buf[last_atom_start++] = REOP_save_reset; + s->byte_code.buf[last_atom_start++] = last_capture_count; + s->byte_code.buf[last_atom_start++] = s->capture_count - 1; + } + if (quant_max == 0) { + s->byte_code.size = last_atom_start; + } else if (quant_max == 1) { + if (dbuf_insert(&s->byte_code, last_atom_start, 5)) + goto out_of_memory; + s->byte_code.buf[last_atom_start] = REOP_split_goto_first + + greedy; + put_u32(s->byte_code.buf + last_atom_start + 1, len); + } else if (quant_max == INT32_MAX) { + if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check)) + goto out_of_memory; + s->byte_code.buf[last_atom_start] = REOP_split_goto_first + + greedy; + put_u32(s->byte_code.buf + last_atom_start + 1, + len + 5 + add_zero_advance_check); + if (add_zero_advance_check) { + /* avoid infinite loop by stoping the + recursion if no advance was made in the + atom (only works if the atom has no + side effect) */ + s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos; + re_emit_goto(s, REOP_bne_char_pos, last_atom_start); + } else { + re_emit_goto(s, REOP_goto, last_atom_start); + } + } else { + if (dbuf_insert(&s->byte_code, last_atom_start, 10)) + goto out_of_memory; + pos = last_atom_start; + s->byte_code.buf[pos++] = REOP_push_i32; + put_u32(s->byte_code.buf + pos, quant_max); + pos += 4; + s->byte_code.buf[pos++] = REOP_split_goto_first + greedy; + put_u32(s->byte_code.buf + pos, len + 5); + re_emit_goto(s, REOP_loop, last_atom_start + 5); + re_emit_op(s, REOP_drop); + } + } else if (quant_min == 1 && quant_max == INT32_MAX && + !add_zero_advance_check) { + re_emit_goto(s, REOP_split_next_first - greedy, + last_atom_start); + } else { + if (quant_min == 1) { + /* nothing to add */ + } else { + if (dbuf_insert(&s->byte_code, last_atom_start, 5)) + goto out_of_memory; + s->byte_code.buf[last_atom_start] = REOP_push_i32; + put_u32(s->byte_code.buf + last_atom_start + 1, + quant_min); + last_atom_start += 5; + re_emit_goto(s, REOP_loop, last_atom_start); + re_emit_op(s, REOP_drop); + } + if (quant_max == INT32_MAX) { + pos = s->byte_code.size; + re_emit_op_u32(s, REOP_split_goto_first + greedy, + len + 5 + add_zero_advance_check); + if (add_zero_advance_check) + re_emit_op(s, REOP_push_char_pos); + /* copy the atom */ + dbuf_put_self(&s->byte_code, last_atom_start, len); + if (add_zero_advance_check) + re_emit_goto(s, REOP_bne_char_pos, pos); + else + re_emit_goto(s, REOP_goto, pos); + } else if (quant_max > quant_min) { + re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min); + pos = s->byte_code.size; + re_emit_op_u32(s, REOP_split_goto_first + greedy, len + 5); + /* copy the atom */ + dbuf_put_self(&s->byte_code, last_atom_start, len); + + re_emit_goto(s, REOP_loop, pos); + re_emit_op(s, REOP_drop); + } + } + last_atom_start = -1; + } + break; + default: + break; + } + } + done: + s->buf_ptr = p; + return 0; + out_of_memory: + return re_parse_out_of_memory(s); +} + +static int re_parse_alternative(REParseState *s, bool is_backward_dir) +{ + const uint8_t *p; + int ret; + size_t start, term_start, end, term_size; + + start = s->byte_code.size; + for(;;) { + p = s->buf_ptr; + if (p >= s->buf_end) + break; + if (*p == '|' || *p == ')') + break; + term_start = s->byte_code.size; + ret = re_parse_term(s, is_backward_dir); + if (ret) + return ret; + if (is_backward_dir) { + /* reverse the order of the terms (XXX: inefficient, but + speed is not really critical here) */ + end = s->byte_code.size; + term_size = end - term_start; + if (dbuf_realloc(&s->byte_code, end + term_size)) + return -1; + memmove(s->byte_code.buf + start + term_size, + s->byte_code.buf + start, + end - start); + memcpy(s->byte_code.buf + start, s->byte_code.buf + end, + term_size); + } + } + return 0; +} + +static int re_parse_disjunction(REParseState *s, bool is_backward_dir) +{ + int start, len, pos; + + start = s->byte_code.size; + if (re_parse_alternative(s, is_backward_dir)) + return -1; + while (*s->buf_ptr == '|') { + s->buf_ptr++; + + len = s->byte_code.size - start; + + /* insert a split before the first alternative */ + if (dbuf_insert(&s->byte_code, start, 5)) { + return re_parse_out_of_memory(s); + } + s->byte_code.buf[start] = REOP_split_next_first; + put_u32(s->byte_code.buf + start + 1, len + 5); + + pos = re_emit_op_u32(s, REOP_goto, 0); + + if (re_parse_alternative(s, is_backward_dir)) + return -1; + + /* patch the goto */ + len = s->byte_code.size - (pos + 4); + put_u32(s->byte_code.buf + pos, len); + } + return 0; +} + +/* the control flow is recursive so the analysis can be linear */ +static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len) +{ + int stack_size, stack_size_max, pos, opcode, len; + uint32_t val; + + stack_size = 0; + stack_size_max = 0; + bc_buf += RE_HEADER_LEN; + bc_buf_len -= RE_HEADER_LEN; + pos = 0; + while (pos < bc_buf_len) { + opcode = bc_buf[pos]; + len = reopcode_info[opcode].size; + assert(opcode < REOP_COUNT); + assert((pos + len) <= bc_buf_len); + switch(opcode) { + case REOP_push_i32: + case REOP_push_char_pos: + stack_size++; + if (stack_size > stack_size_max) { + if (stack_size > STACK_SIZE_MAX) + return -1; + stack_size_max = stack_size; + } + break; + case REOP_drop: + case REOP_bne_char_pos: + assert(stack_size > 0); + stack_size--; + break; + case REOP_range: + val = get_u16(bc_buf + pos + 1); + len += val * 4; + break; + case REOP_range32: + val = get_u16(bc_buf + pos + 1); + len += val * 8; + break; + } + pos += len; + } + return stack_size_max; +} + +/* 'buf' must be a zero terminated UTF-8 string of length buf_len. + Return NULL if error and allocate an error message in *perror_msg, + otherwise the compiled bytecode and its length in plen. +*/ +uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size, + const char *buf, size_t buf_len, int re_flags, + void *opaque) +{ + REParseState s_s, *s = &s_s; + int stack_size; + bool is_sticky; + + memset(s, 0, sizeof(*s)); + s->mem_opaque = opaque; + s->buf_ptr = (const uint8_t *)buf; + s->buf_end = s->buf_ptr + buf_len; + s->buf_start = s->buf_ptr; + s->re_flags = re_flags; + s->is_utf16 = ((re_flags & LRE_FLAG_UTF16) != 0); + is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0); + s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0); + s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0); + s->capture_count = 1; + s->total_capture_count = -1; + s->has_named_captures = -1; + + dbuf_init2(&s->byte_code, opaque, lre_realloc); + dbuf_init2(&s->group_names, opaque, lre_realloc); + + dbuf_putc(&s->byte_code, re_flags); /* first element is the flags */ + dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */ + dbuf_putc(&s->byte_code, 0); /* stack size */ + dbuf_put_u32(&s->byte_code, 0); /* bytecode length */ + + if (!is_sticky) { + /* iterate thru all positions (about the same as .*?( ... ) ) + . We do it without an explicit loop so that lock step + thread execution will be possible in an optimized + implementation */ + re_emit_op_u32(s, REOP_split_goto_first, 1 + 5); + re_emit_op(s, REOP_any); + re_emit_op_u32(s, REOP_goto, -(5 + 1 + 5)); + } + re_emit_op_u8(s, REOP_save_start, 0); + + if (re_parse_disjunction(s, false)) { + error: + dbuf_free(&s->byte_code); + dbuf_free(&s->group_names); + pstrcpy(error_msg, error_msg_size, s->u.error_msg); + *plen = 0; + return NULL; + } + + re_emit_op_u8(s, REOP_save_end, 0); + + re_emit_op(s, REOP_match); + + if (*s->buf_ptr != '\0') { + re_parse_error(s, "extraneous characters at the end"); + goto error; + } + + if (dbuf_error(&s->byte_code)) { + re_parse_out_of_memory(s); + goto error; + } + + stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size); + if (stack_size < 0) { + re_parse_error(s, "too many imbricated quantifiers"); + goto error; + } + + s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count; + s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size; + put_u32(s->byte_code.buf + 3, s->byte_code.size - RE_HEADER_LEN); + + /* add the named groups if needed */ + if (s->group_names.size > (size_t)(s->capture_count - 1)) { + dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size); + s->byte_code.buf[RE_HEADER_FLAGS] |= LRE_FLAG_NAMED_GROUPS; + } + dbuf_free(&s->group_names); + + error_msg[0] = '\0'; + *plen = s->byte_code.size; + return s->byte_code.buf; +} + +static bool is_line_terminator(uint32_t c) +{ + return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS); +} + +static bool is_word_char(uint32_t c) +{ + return ((c >= '0' && c <= '9') || + (c >= 'a' && c <= 'z') || + (c >= 'A' && c <= 'Z') || + (c == '_')); +} + +#define GET_CHAR(c, cptr, cbuf_end) \ + do { \ + if (cbuf_type == 0) { \ + c = *cptr++; \ + } else { \ + uint32_t __c1; \ + c = *(uint16_t *)cptr; \ + cptr += 2; \ + if (c >= 0xd800 && c < 0xdc00 && \ + cbuf_type == 2 && cptr < cbuf_end) { \ + __c1 = *(uint16_t *)cptr; \ + if (__c1 >= 0xdc00 && __c1 < 0xe000) { \ + c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \ + cptr += 2; \ + } \ + } \ + } \ + } while (0) + +#define PEEK_CHAR(c, cptr, cbuf_end) \ + do { \ + if (cbuf_type == 0) { \ + c = cptr[0]; \ + } else { \ + uint32_t __c1; \ + c = ((uint16_t *)cptr)[0]; \ + if (c >= 0xd800 && c < 0xdc00 && \ + cbuf_type == 2 && (cptr + 2) < cbuf_end) { \ + __c1 = ((uint16_t *)cptr)[1]; \ + if (__c1 >= 0xdc00 && __c1 < 0xe000) { \ + c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \ + } \ + } \ + } \ + } while (0) + +#define PEEK_PREV_CHAR(c, cptr, cbuf_start) \ + do { \ + if (cbuf_type == 0) { \ + c = cptr[-1]; \ + } else { \ + uint32_t __c1; \ + c = ((uint16_t *)cptr)[-1]; \ + if (c >= 0xdc00 && c < 0xe000 && \ + cbuf_type == 2 && (cptr - 4) >= cbuf_start) { \ + __c1 = ((uint16_t *)cptr)[-2]; \ + if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \ + c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \ + } \ + } \ + } \ + } while (0) + +#define GET_PREV_CHAR(c, cptr, cbuf_start) \ + do { \ + if (cbuf_type == 0) { \ + cptr--; \ + c = cptr[0]; \ + } else { \ + uint32_t __c1; \ + cptr -= 2; \ + c = ((uint16_t *)cptr)[0]; \ + if (c >= 0xdc00 && c < 0xe000 && \ + cbuf_type == 2 && cptr > cbuf_start) { \ + __c1 = ((uint16_t *)cptr)[-1]; \ + if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \ + cptr -= 2; \ + c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \ + } \ + } \ + } \ + } while (0) + +#define PREV_CHAR(cptr, cbuf_start) \ + do { \ + if (cbuf_type == 0) { \ + cptr--; \ + } else { \ + cptr -= 2; \ + if (cbuf_type == 2) { \ + c = ((uint16_t *)cptr)[0]; \ + if (c >= 0xdc00 && c < 0xe000 && cptr > cbuf_start) { \ + c = ((uint16_t *)cptr)[-1]; \ + if (c >= 0xd800 && c < 0xdc00) \ + cptr -= 2; \ + } \ + } \ + } \ + } while (0) + +typedef uintptr_t StackInt; + +typedef enum { + RE_EXEC_STATE_SPLIT, + RE_EXEC_STATE_LOOKAHEAD, + RE_EXEC_STATE_NEGATIVE_LOOKAHEAD, + RE_EXEC_STATE_GREEDY_QUANT, +} REExecStateEnum; + +typedef struct REExecState { + REExecStateEnum type : 8; + uint8_t stack_len; + size_t count; /* only used for RE_EXEC_STATE_GREEDY_QUANT */ + const uint8_t *cptr; + const uint8_t *pc; + void *buf[]; +} REExecState; + +typedef struct { + const uint8_t *cbuf; + const uint8_t *cbuf_end; + /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */ + int cbuf_type; + int capture_count; + int stack_size_max; + bool multi_line; + bool ignore_case; + bool is_utf16; + void *opaque; /* used for stack overflow check */ + + size_t state_size; + uint8_t *state_stack; + size_t state_stack_size; + size_t state_stack_len; +} REExecContext; + +static int push_state(REExecContext *s, + uint8_t **capture, + StackInt *stack, size_t stack_len, + const uint8_t *pc, const uint8_t *cptr, + REExecStateEnum type, size_t count) +{ + REExecState *rs; + uint8_t *new_stack; + size_t new_size, i, n; + StackInt *stack_buf; + + if (unlikely((s->state_stack_len + 1) > s->state_stack_size)) { + /* reallocate the stack */ + new_size = s->state_stack_size * 3 / 2; + if (new_size < 8) + new_size = 8; + new_stack = lre_realloc(s->opaque, s->state_stack, new_size * s->state_size); + if (!new_stack) + return -1; + s->state_stack_size = new_size; + s->state_stack = new_stack; + } + rs = (REExecState *)(s->state_stack + s->state_stack_len * s->state_size); + s->state_stack_len++; + rs->type = type; + rs->count = count; + rs->stack_len = stack_len; + rs->cptr = cptr; + rs->pc = pc; + n = 2 * s->capture_count; + for(i = 0; i < n; i++) + rs->buf[i] = capture[i]; + stack_buf = (StackInt *)(rs->buf + n); + for(i = 0; i < stack_len; i++) + stack_buf[i] = stack[i]; + return 0; +} + +/* return 1 if match, 0 if not match or -1 if error. */ +static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture, + StackInt *stack, int stack_len, + const uint8_t *pc, const uint8_t *cptr, + bool no_recurse) +{ + int opcode, ret; + int cbuf_type; + uint32_t val, c; + const uint8_t *cbuf_end; + + cbuf_type = s->cbuf_type; + cbuf_end = s->cbuf_end; + + for(;;) { + // printf("top=%p: pc=%d\n", th_list.top, (int)(pc - (bc_buf + RE_HEADER_LEN))); + opcode = *pc++; + switch(opcode) { + case REOP_match: + { + REExecState *rs; + if (no_recurse) + return (intptr_t)cptr; + ret = 1; + goto recurse; + no_match: + if (no_recurse) + return 0; + ret = 0; + recurse: + for(;;) { + if (s->state_stack_len == 0) + return ret; + rs = (REExecState *)(s->state_stack + + (s->state_stack_len - 1) * s->state_size); + if (rs->type == RE_EXEC_STATE_SPLIT) { + if (!ret) { + pop_state: + memcpy(capture, rs->buf, + sizeof(capture[0]) * 2 * s->capture_count); + pop_state1: + pc = rs->pc; + cptr = rs->cptr; + stack_len = rs->stack_len; + memcpy(stack, rs->buf + 2 * s->capture_count, + stack_len * sizeof(stack[0])); + s->state_stack_len--; + break; + } + } else if (rs->type == RE_EXEC_STATE_GREEDY_QUANT) { + if (!ret) { + uint32_t char_count, i; + memcpy(capture, rs->buf, + sizeof(capture[0]) * 2 * s->capture_count); + stack_len = rs->stack_len; + memcpy(stack, rs->buf + 2 * s->capture_count, + stack_len * sizeof(stack[0])); + pc = rs->pc; + cptr = rs->cptr; + /* go backward */ + char_count = get_u32(pc + 12); + for(i = 0; i < char_count; i++) { + PREV_CHAR(cptr, s->cbuf); + } + pc = (pc + 16) + (int)get_u32(pc); + rs->cptr = cptr; + rs->count--; + if (rs->count == 0) { + s->state_stack_len--; + } + break; + } + } else { + ret = ((rs->type == RE_EXEC_STATE_LOOKAHEAD && ret) || + (rs->type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD && !ret)); + if (ret) { + /* keep the capture in case of positive lookahead */ + if (rs->type == RE_EXEC_STATE_LOOKAHEAD) + goto pop_state1; + else + goto pop_state; + } + } + s->state_stack_len--; + } + } + break; + case REOP_char32: + val = get_u32(pc); + pc += 4; + goto test_char; + case REOP_char: + val = get_u16(pc); + pc += 2; + test_char: + if (cptr >= cbuf_end) + goto no_match; + GET_CHAR(c, cptr, cbuf_end); + if (s->ignore_case) { + c = lre_canonicalize(c, s->is_utf16); + } + if (val != c) + goto no_match; + break; + case REOP_split_goto_first: + case REOP_split_next_first: + { + const uint8_t *pc1; + + val = get_u32(pc); + pc += 4; + if (opcode == REOP_split_next_first) { + pc1 = pc + (int)val; + } else { + pc1 = pc; + pc = pc + (int)val; + } + ret = push_state(s, capture, stack, stack_len, + pc1, cptr, RE_EXEC_STATE_SPLIT, 0); + if (ret < 0) + return -1; + break; + } + case REOP_lookahead: + case REOP_negative_lookahead: + val = get_u32(pc); + pc += 4; + ret = push_state(s, capture, stack, stack_len, + pc + (int)val, cptr, + RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead, + 0); + if (ret < 0) + return -1; + break; + + case REOP_goto: + val = get_u32(pc); + pc += 4 + (int)val; + break; + case REOP_line_start: + if (cptr == s->cbuf) + break; + if (!s->multi_line) + goto no_match; + PEEK_PREV_CHAR(c, cptr, s->cbuf); + if (!is_line_terminator(c)) + goto no_match; + break; + case REOP_line_end: + if (cptr == cbuf_end) + break; + if (!s->multi_line) + goto no_match; + PEEK_CHAR(c, cptr, cbuf_end); + if (!is_line_terminator(c)) + goto no_match; + break; + case REOP_dot: + if (cptr == cbuf_end) + goto no_match; + GET_CHAR(c, cptr, cbuf_end); + if (is_line_terminator(c)) + goto no_match; + break; + case REOP_any: + if (cptr == cbuf_end) + goto no_match; + GET_CHAR(c, cptr, cbuf_end); + break; + case REOP_save_start: + case REOP_save_end: + val = *pc++; + assert(val < (uint32_t)s->capture_count); + capture[2 * val + opcode - REOP_save_start] = (uint8_t *)cptr; + break; + case REOP_save_reset: + { + uint32_t val2; + val = pc[0]; + val2 = pc[1]; + pc += 2; + assert(val2 < (uint32_t)s->capture_count); + while (val <= val2) { + capture[2 * val] = NULL; + capture[2 * val + 1] = NULL; + val++; + } + } + break; + case REOP_push_i32: + val = get_u32(pc); + pc += 4; + stack[stack_len++] = val; + break; + case REOP_drop: + stack_len--; + break; + case REOP_loop: + val = get_u32(pc); + pc += 4; + if (--stack[stack_len - 1] != 0) { + pc += (int)val; + } + break; + case REOP_push_char_pos: + stack[stack_len++] = (uintptr_t)cptr; + break; + case REOP_bne_char_pos: + val = get_u32(pc); + pc += 4; + if (stack[--stack_len] != (uintptr_t)cptr) + pc += (int)val; + break; + case REOP_word_boundary: + case REOP_not_word_boundary: + { + bool v1, v2; + /* char before */ + if (cptr == s->cbuf) { + v1 = false; + } else { + PEEK_PREV_CHAR(c, cptr, s->cbuf); + v1 = is_word_char(c); + } + /* current char */ + if (cptr >= cbuf_end) { + v2 = false; + } else { + PEEK_CHAR(c, cptr, cbuf_end); + v2 = is_word_char(c); + } + if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode)) + goto no_match; + } + break; + case REOP_back_reference: + case REOP_backward_back_reference: + { + const uint8_t *cptr1, *cptr1_end, *cptr1_start; + uint32_t c1, c2; + + val = *pc++; + if (val >= (uint32_t)s->capture_count) + goto no_match; + cptr1_start = capture[2 * val]; + cptr1_end = capture[2 * val + 1]; + if (!cptr1_start || !cptr1_end) + break; + if (opcode == REOP_back_reference) { + cptr1 = cptr1_start; + while (cptr1 < cptr1_end) { + if (cptr >= cbuf_end) + goto no_match; + GET_CHAR(c1, cptr1, cptr1_end); + GET_CHAR(c2, cptr, cbuf_end); + if (s->ignore_case) { + c1 = lre_canonicalize(c1, s->is_utf16); + c2 = lre_canonicalize(c2, s->is_utf16); + } + if (c1 != c2) + goto no_match; + } + } else { + cptr1 = cptr1_end; + while (cptr1 > cptr1_start) { + if (cptr == s->cbuf) + goto no_match; + GET_PREV_CHAR(c1, cptr1, cptr1_start); + GET_PREV_CHAR(c2, cptr, s->cbuf); + if (s->ignore_case) { + c1 = lre_canonicalize(c1, s->is_utf16); + c2 = lre_canonicalize(c2, s->is_utf16); + } + if (c1 != c2) + goto no_match; + } + } + } + break; + case REOP_range: + { + int n; + uint32_t low, high, idx_min, idx_max, idx; + + n = get_u16(pc); /* n must be >= 1 */ + pc += 2; + if (cptr >= cbuf_end) + goto no_match; + GET_CHAR(c, cptr, cbuf_end); + if (s->ignore_case) { + c = lre_canonicalize(c, s->is_utf16); + } + idx_min = 0; + low = get_u16(pc + 0 * 4); + if (c < low) + goto no_match; + idx_max = n - 1; + high = get_u16(pc + idx_max * 4 + 2); + /* 0xffff in for last value means +infinity */ + if (unlikely(c >= 0xffff) && high == 0xffff) + goto range_match; + if (c > high) + goto no_match; + while (idx_min <= idx_max) { + idx = (idx_min + idx_max) / 2; + low = get_u16(pc + idx * 4); + high = get_u16(pc + idx * 4 + 2); + if (c < low) + idx_max = idx - 1; + else if (c > high) + idx_min = idx + 1; + else + goto range_match; + } + goto no_match; + range_match: + pc += 4 * n; + } + break; + case REOP_range32: + { + int n; + uint32_t low, high, idx_min, idx_max, idx; + + n = get_u16(pc); /* n must be >= 1 */ + pc += 2; + if (cptr >= cbuf_end) + goto no_match; + GET_CHAR(c, cptr, cbuf_end); + if (s->ignore_case) { + c = lre_canonicalize(c, s->is_utf16); + } + idx_min = 0; + low = get_u32(pc + 0 * 8); + if (c < low) + goto no_match; + idx_max = n - 1; + high = get_u32(pc + idx_max * 8 + 4); + if (c > high) + goto no_match; + while (idx_min <= idx_max) { + idx = (idx_min + idx_max) / 2; + low = get_u32(pc + idx * 8); + high = get_u32(pc + idx * 8 + 4); + if (c < low) + idx_max = idx - 1; + else if (c > high) + idx_min = idx + 1; + else + goto range32_match; + } + goto no_match; + range32_match: + pc += 8 * n; + } + break; + case REOP_prev: + /* go to the previous char */ + if (cptr == s->cbuf) + goto no_match; + PREV_CHAR(cptr, s->cbuf); + break; + case REOP_simple_greedy_quant: + { + uint32_t next_pos, quant_min, quant_max; + size_t q; + intptr_t res; + const uint8_t *pc1; + + next_pos = get_u32(pc); + quant_min = get_u32(pc + 4); + quant_max = get_u32(pc + 8); + pc += 16; + pc1 = pc; + pc += (int)next_pos; + + q = 0; + for(;;) { + res = lre_exec_backtrack(s, capture, stack, stack_len, + pc1, cptr, true); + if (res == -1) + return res; + if (!res) + break; + cptr = (uint8_t *)res; + q++; + if (q >= quant_max && quant_max != INT32_MAX) + break; + } + if (q < quant_min) + goto no_match; + if (q > quant_min) { + /* will examine all matches down to quant_min */ + ret = push_state(s, capture, stack, stack_len, + pc1 - 16, cptr, + RE_EXEC_STATE_GREEDY_QUANT, + q - quant_min); + if (ret < 0) + return -1; + } + } + break; + default: + abort(); + } + } +} + +/* Return 1 if match, 0 if not match or -1 if error. cindex is the + starting position of the match and must be such as 0 <= cindex <= + clen. */ +int lre_exec(uint8_t **capture, + const uint8_t *bc_buf, const uint8_t *cbuf, int cindex, int clen, + int cbuf_type, void *opaque) +{ + REExecContext s_s, *s = &s_s; + int re_flags, i, alloca_size, ret; + StackInt *stack_buf; + + re_flags = bc_buf[RE_HEADER_FLAGS]; + s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0; + s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0; + s->is_utf16 = (re_flags & LRE_FLAG_UTF16) != 0; + s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT]; + s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE]; + s->cbuf = cbuf; + s->cbuf_end = cbuf + (clen << cbuf_type); + s->cbuf_type = cbuf_type; + if (s->cbuf_type == 1 && s->is_utf16) + s->cbuf_type = 2; + s->opaque = opaque; + + s->state_size = sizeof(REExecState) + + s->capture_count * sizeof(capture[0]) * 2 + + s->stack_size_max * sizeof(stack_buf[0]); + s->state_stack = NULL; + s->state_stack_len = 0; + s->state_stack_size = 0; + + for(i = 0; i < s->capture_count * 2; i++) + capture[i] = NULL; + alloca_size = s->stack_size_max * sizeof(stack_buf[0]); + stack_buf = alloca(alloca_size); + ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN, + cbuf + (cindex << cbuf_type), false); + lre_realloc(s->opaque, s->state_stack, 0); + return ret; +} + +int lre_get_capture_count(const uint8_t *bc_buf) +{ + return bc_buf[RE_HEADER_CAPTURE_COUNT]; +} + +int lre_get_flags(const uint8_t *bc_buf) +{ + return bc_buf[RE_HEADER_FLAGS]; +} + +/* Return NULL if no group names. Otherwise, return a pointer to + 'capture_count - 1' zero terminated UTF-8 strings. */ +const char *lre_get_groupnames(const uint8_t *bc_buf) +{ + uint32_t re_bytecode_len; + if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0) + return NULL; + re_bytecode_len = get_u32(bc_buf + 3); + return (const char *)(bc_buf + 7 + re_bytecode_len); +} + +#ifdef TEST + +bool lre_check_stack_overflow(void *opaque, size_t alloca_size) +{ + return false; +} + +void *lre_realloc(void *opaque, void *ptr, size_t size) +{ + return realloc(ptr, size); +} + +int main(int argc, char **argv) +{ + int len, ret, i; + uint8_t *bc; + char error_msg[64]; + uint8_t *capture[CAPTURE_COUNT_MAX * 2]; + const char *input; + int input_len, capture_count; + + if (argc < 3) { + printf("usage: %s regexp input\n", argv[0]); + exit(1); + } + bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1], + strlen(argv[1]), 0, NULL); + if (!bc) { + fprintf(stderr, "error: %s\n", error_msg); + exit(1); + } + + input = argv[2]; + input_len = strlen(input); + + ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL); + printf("ret=%d\n", ret); + if (ret == 1) { + capture_count = lre_get_capture_count(bc); + for(i = 0; i < 2 * capture_count; i++) { + uint8_t *ptr; + ptr = capture[i]; + printf("%d: ", i); + if (!ptr) + printf("<nil>"); + else + printf("%u", (int)(ptr - (uint8_t *)input)); + printf("\n"); + } + } + return 0; +} +#endif |
