aboutsummaryrefslogtreecommitdiffstats
path: root/regexec.c
diff options
context:
space:
mode:
authorHiroya Fujinami <make.just.on@gmail.com>2023-10-30 13:10:42 +0900
committerGitHub <noreply@github.com>2023-10-30 13:10:42 +0900
commit34cb174800e1e41323807c99386641b688927adc (patch)
tree62a1ac23fa566aff85a18042655bebbd2bd750ab /regexec.c
parent13c9cbe09ef310c7ddf055d57ebf4586e9f9a111 (diff)
downloadruby-34cb174800e1e41323807c99386641b688927adc.tar.gz
Optimize regexp matching for look-around and atomic groups (#7931)
Diffstat (limited to 'regexec.c')
-rw-r--r--regexec.c375
1 files changed, 224 insertions, 151 deletions
diff --git a/regexec.c b/regexec.c
index ba0e7c2cd2..f841fbffb5 100644
--- a/regexec.c
+++ b/regexec.c
@@ -242,9 +242,6 @@ The `Regexp#match` optimization by using a cache.
"cache opcode"
A cachable opcode (e.g. `OP_PUSH`, `OP_REPEAT`, etc).
It is corresponding to some cache points.
-`OP_NULL_CHECK_START` and `OP_NULL_CHECK_END_MEMST` are exceptions.
-They are counted as cache opcodes, but they have no corresponding cache points.
-They are used for resetting a match cache buffer when a null loop is detected.
"cache point"
A cachable point on matching.
@@ -258,7 +255,7 @@ We encode a match cache point to an integer value by the following equation:
"match cache point" = "position on input string" * "total number of cache points" + "cache point"
"match cache buffer"
-A bit-array for recording (caching) match cache points once arrived.
+A bit-array for memoizing (recording) match cache points once backtracked.
*/
/* count the total number of cache opcodes for allocating a match cache buffer. */
@@ -272,7 +269,7 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
OnigEncoding enc = reg->enc;
MemNumType current_repeat_mem = -1;
long num_cache_opcodes = 0;
- long num_cache_opcodes_at_null_check_start = -1;
+ int lookaround_nesting = 0;
while (p < pend) {
switch (*p++) {
@@ -326,7 +323,7 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
break;
case OP_ANYCHAR_STAR:
case OP_ANYCHAR_ML_STAR:
- num_cache_opcodes++;break;
+ num_cache_opcodes++; break;
case OP_ANYCHAR_STAR_PEEK_NEXT:
case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
p++; num_cache_opcodes++; break;
@@ -370,7 +367,12 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
case OP_MEMORY_END_PUSH_REC:
case OP_MEMORY_END:
case OP_MEMORY_END_REC:
- p += SIZE_MEMNUM; break;
+ p += SIZE_MEMNUM;
+ // A memory (capture) in look-around is found.
+ if (lookaround_nesting != 0) {
+ goto impossible;
+ }
+ break;
case OP_KEEP:
break;
@@ -422,37 +424,60 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
goto impossible;
case OP_NULL_CHECK_START:
p += SIZE_MEMNUM;
- if (num_cache_opcodes_at_null_check_start != -1) {
- // A nested OP_NULL_CHECK_START is not yet supported.
- goto impossible;
- }
- num_cache_opcodes_at_null_check_start = num_cache_opcodes;
break;
case OP_NULL_CHECK_END:
case OP_NULL_CHECK_END_MEMST_PUSH:
p += SIZE_MEMNUM;
- num_cache_opcodes_at_null_check_start = -1;
break;
case OP_NULL_CHECK_END_MEMST:
p += SIZE_MEMNUM;
- // OP_NULL_CHECK_START and OP_NULL_CHECK_END_MEMST are counted as cache opcodes.
- if (num_cache_opcodes_at_null_check_start < num_cache_opcodes) {
- num_cache_opcodes += 2;
- }
- num_cache_opcodes_at_null_check_start = -1;
break;
case OP_PUSH_POS:
- case OP_POP_POS:
+ if (lookaround_nesting < 0) {
+ // A look-around nested in a atomic grouping is found.
+ goto impossible;
+ }
+ lookaround_nesting++;
+ break;
case OP_PUSH_POS_NOT:
+ if (lookaround_nesting < 0) {
+ // A look-around nested in a atomic grouping is found.
+ goto impossible;
+ }
+ p += SIZE_RELADDR;
+ lookaround_nesting++;
+ break;
+ case OP_PUSH_LOOK_BEHIND_NOT:
+ if (lookaround_nesting < 0) {
+ // A look-around nested in a atomic grouping is found.
+ goto impossible;
+ }
+ p += SIZE_RELADDR;
+ p += SIZE_LENGTH;
+ lookaround_nesting++;
+ break;
+ case OP_POP_POS:
case OP_FAIL_POS:
- goto impossible;
+ case OP_FAIL_LOOK_BEHIND_NOT:
+ lookaround_nesting--;
+ break;
case OP_PUSH_STOP_BT:
+ if (lookaround_nesting != 0) {
+ // A nested atomic grouping is found.
+ goto impossible;
+ }
+ lookaround_nesting++;
+ lookaround_nesting *= -1;
+ break;
case OP_POP_STOP_BT:
+ lookaround_nesting *= -1;
+ lookaround_nesting--;
break;
case OP_LOOK_BEHIND:
- case OP_PUSH_LOOK_BEHIND_NOT:
- case OP_FAIL_LOOK_BEHIND_NOT:
+ p += SIZE_LENGTH;
+ break;
+
case OP_PUSH_ABSENT_POS:
case OP_ABSENT_END:
case OP_ABSENT:
@@ -506,19 +531,34 @@ init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num
MemNumType current_repeat_mem = -1;
long cache_point = 0;
long num_cache_points_at_repeat = 0;
+ int lookaround_nesting = 0;
+ const OnigCacheOpcode* cache_opcodes_begin = cache_opcodes;
OnigCacheOpcode* cache_opcodes_at_repeat = NULL;
- UChar* null_check_start_addr = NULL;
- OnigCacheOpcode* cache_opcodes_at_null_check_start = NULL;
-# define INC_CACHE_OPCODES do {\
+# define INC_CACHE_OPCODES do {\
cache_opcodes->addr = pbegin;\
cache_opcodes->cache_point = cache_point - num_cache_points_at_repeat;\
cache_opcodes->outer_repeat_mem = current_repeat_mem;\
cache_opcodes->num_cache_points_at_outer_repeat = num_cache_points_at_repeat;\
cache_opcodes->num_cache_points_in_outer_repeat = 0;\
- cache_point++;\
+ cache_opcodes->lookaround_nesting = lookaround_nesting;\
+ cache_point += lookaround_nesting > 0 ? 2 : 1;\
cache_opcodes++;\
- } while (0)
+ } while (0)
+
+# define INC_LOOKAROUND_NESTING lookaround_nesting++
+# define DEC_LOOKAROUND_NESTING do {\
+ UChar* match_addr = p - 1;\
+ OnigCacheOpcode* cache_opcodes_in_lookaround = cache_opcodes - 1;\
+ while (\
+ cache_opcodes_in_lookaround >= cache_opcodes_begin &&\
+ cache_opcodes_in_lookaround->lookaround_nesting == lookaround_nesting\
+ ) {\
+ cache_opcodes_in_lookaround->match_addr = match_addr;\
+ cache_opcodes_in_lookaround--;\
+ }\
+ lookaround_nesting--;\
+ } while (0)
while (p < pend) {
pbegin = p;
@@ -620,7 +660,11 @@ init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num
case OP_MEMORY_END_PUSH_REC:
case OP_MEMORY_END:
case OP_MEMORY_END_REC:
- p += SIZE_MEMNUM; break;
+ p += SIZE_MEMNUM;
+ if (lookaround_nesting != 0) {
+ goto unexpected_bytecode_error;
+ }
+ break;
case OP_KEEP:
break;
@@ -679,46 +723,45 @@ init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num
case OP_REPEAT_INC_NG_SG:
goto unexpected_bytecode_error;
case OP_NULL_CHECK_START:
- if (cache_opcodes_at_null_check_start != NULL) {
- goto unexpected_bytecode_error;
- }
p += SIZE_MEMNUM;
- null_check_start_addr = pbegin;
- cache_opcodes_at_null_check_start = cache_opcodes;
break;
case OP_NULL_CHECK_END:
case OP_NULL_CHECK_END_MEMST_PUSH:
p += SIZE_MEMNUM;
- cache_opcodes_at_null_check_start = NULL;
break;
case OP_NULL_CHECK_END_MEMST:
p += SIZE_MEMNUM;
- {
- if (cache_opcodes_at_null_check_start < cache_opcodes) {
- *cache_opcodes = *(cache_opcodes - 1);
- cache_opcodes->addr = pbegin;
- cache_opcodes++;
- long num_cache_opcodes_in_null_check = (cache_opcodes - cache_opcodes_at_null_check_start);
- xmemmove(cache_opcodes_at_null_check_start + 1, cache_opcodes_at_null_check_start, sizeof(OnigCacheOpcode) * num_cache_opcodes_in_null_check);
- cache_opcodes_at_null_check_start->addr = null_check_start_addr;
- cache_opcodes++;
- }
- cache_opcodes_at_null_check_start = NULL;
- }
break;
case OP_PUSH_POS:
- case OP_POP_POS:
+ INC_LOOKAROUND_NESTING;
+ break;
case OP_PUSH_POS_NOT:
+ p += SIZE_RELADDR;
+ INC_LOOKAROUND_NESTING;
+ break;
+ case OP_PUSH_LOOK_BEHIND_NOT:
+ p += SIZE_RELADDR;
+ p += SIZE_LENGTH;
+ INC_LOOKAROUND_NESTING;
+ break;
+ case OP_POP_POS:
case OP_FAIL_POS:
- goto unexpected_bytecode_error;
+ case OP_FAIL_LOOK_BEHIND_NOT:
+ DEC_LOOKAROUND_NESTING;
+ break;
case OP_PUSH_STOP_BT:
+ INC_LOOKAROUND_NESTING;
+ lookaround_nesting *= -1;
+ break;
case OP_POP_STOP_BT:
+ lookaround_nesting *= -1;
+ DEC_LOOKAROUND_NESTING;
break;
case OP_LOOK_BEHIND:
- case OP_PUSH_LOOK_BEHIND_NOT:
- case OP_FAIL_LOOK_BEHIND_NOT:
- case OP_PUSH_ABSENT_POS:
+ p += SIZE_LENGTH;
+ break;
+
case OP_ABSENT_END:
case OP_ABSENT:
goto unexpected_bytecode_error;
@@ -933,31 +976,33 @@ onig_region_copy(OnigRegion* to, const OnigRegion* from)
/* stack type */
/* used by normal-POP */
-#define STK_ALT 0x0001
-#define STK_LOOK_BEHIND_NOT 0x0002
-#define STK_POS_NOT 0x0003
+#define STK_ALT 0x0001
+#define STK_LOOK_BEHIND_NOT 0x0002
+#define STK_POS_NOT 0x0003
/* handled by normal-POP */
-#define STK_MEM_START 0x0100
-#define STK_MEM_END 0x8200
-#define STK_REPEAT_INC 0x0300
-#define STK_STATE_CHECK_MARK 0x1000
+#define STK_MEM_START 0x0100
+#define STK_MEM_END 0x8200
+#define STK_REPEAT_INC 0x0300
+#define STK_STATE_CHECK_MARK 0x1000
/* avoided by normal-POP */
-#define STK_NULL_CHECK_START 0x3000
-#define STK_NULL_CHECK_END 0x5000 /* for recursive call */
-#define STK_MEM_END_MARK 0x8400
-#define STK_POS 0x0500 /* used when POP-POS */
-#define STK_STOP_BT 0x0600 /* mark for "(?>...)" */
-#define STK_REPEAT 0x0700
-#define STK_CALL_FRAME 0x0800
-#define STK_RETURN 0x0900
-#define STK_VOID 0x0a00 /* for fill a blank */
-#define STK_ABSENT_POS 0x0b00 /* for absent */
-#define STK_ABSENT 0x0c00 /* absent inner loop marker */
+#define STK_NULL_CHECK_START 0x3000
+#define STK_NULL_CHECK_END 0x5000 /* for recursive call */
+#define STK_MEM_END_MARK 0x8400
+#define STK_POS 0x0500 /* used when POP-POS */
+#define STK_STOP_BT 0x0600 /* mark for "(?>...)" */
+#define STK_REPEAT 0x0700
+#define STK_CALL_FRAME 0x0800
+#define STK_RETURN 0x0900
+#define STK_VOID 0x0a00 /* for fill a blank */
+#define STK_ABSENT_POS 0x0b00 /* for absent */
+#define STK_ABSENT 0x0c00 /* absent inner loop marker */
+#define STK_MATCH_CACHE_POINT 0x0d00 /* for the match cache optimization */
+#define STK_ATOMIC_MATCH_CACHE_POINT 0x0e00
/* stack type check mask */
-#define STK_MASK_POP_USED 0x00ff
-#define STK_MASK_TO_VOID_TARGET 0x10ff
-#define STK_MASK_MEM_END_OR_MARK 0x8000 /* MEM_END or MEM_END_MARK */
+#define STK_MASK_POP_USED 0x00ff
+#define STK_MASK_TO_VOID_TARGET 0x10ff
+#define STK_MASK_MEM_END_OR_MARK 0x8000 /* MEM_END or MEM_END_MARK */
#ifdef USE_MATCH_CACHE
#define MATCH_ARG_INIT_MATCH_CACHE(msa) do {\
@@ -1387,6 +1432,15 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
STACK_INC;\
} while(0)
+#define STACK_PUSH_MATCH_CACHE_POINT(match_cache_point_index, match_cache_point_mask) do {\
+ STACK_ENSURE(1);\
+ stk->type = STK_MATCH_CACHE_POINT;\
+ stk->null_check = stk == stk_base ? 0 : (stk-1)->null_check;\
+ stk->u.match_cache_point.index = (match_cache_point_index);\
+ stk->u.match_cache_point.mask = (match_cache_point_mask);\
+ STACK_INC;\
+} while(0)
+
#ifdef ONIG_DEBUG
# define STACK_BASE_CHECK(p, at) \
@@ -1398,6 +1452,42 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
# define STACK_BASE_CHECK(p, at)
#endif
+#ifdef ONIG_DEBUG_MATCH_CACHE
+# define MATCH_CACHE_DEBUG_MEMOIZE(stkp) fprintf(stderr, "MATCH CACHE: memoize (index=%ld mask=%d)\n", stkp->u.match_cache_point.index, stkp->u.match_cache_point.mask);
+#else
+# define MATCH_CACHE_DEBUG_MEMOIZE(stkp) ((void) 0)
+#endif
+
+#ifdef USE_MATCH_CACHE
+# define INC_NUM_FAILS msa->num_fails++
+# define MEMOIZE_MATCH_CACHE_POINT do {\
+ if (stk->type == STK_MATCH_CACHE_POINT) {\
+ msa->match_cache_buf[stk->u.match_cache_point.index] |= stk->u.match_cache_point.mask;\
+ MATCH_CACHE_DEBUG_MEMOIZE(stk);\
+ } else if (stk->type == STK_ATOMIC_MATCH_CACHE_POINT) {\
+ memoize_extended_match_cache_point(msa->match_cache_buf, stk->u.match_cache_point.index, stk->u.match_cache_point.mask);\
+ MATCH_CACHE_DEBUG_MEMOIZE(stkp);\
+ }\
+ } while(0)
+# define MEMOIZE_LOOKAROUND_MATCH_CACHE_POINT(stkp) do {\
+ if (stkp->type == STK_MATCH_CACHE_POINT) {\
+ stkp->type = STK_VOID;\
+ memoize_extended_match_cache_point(msa->match_cache_buf, stkp->u.match_cache_point.index, stkp->u.match_cache_point.mask);\
+ MATCH_CACHE_DEBUG_MEMOIZE(stkp);\
+ }\
+ } while(0)
+# define MEMOIZE_ATOMIC_MATCH_CACHE_POINT do {\
+ if (stk->type == STK_MATCH_CACHE_POINT) {\
+ memoize_extended_match_cache_point(msa->match_cache_buf, stk->u.match_cache_point.index, stk->u.match_cache_point.mask);\
+ MATCH_CACHE_DEBUG_MEMOIZE(stkp);\
+ }\
+ } while(0)
+#else
+# define INC_NUM_FAILS ((void) 0)
+# define MEMOIZE_MATCH_CACHE_POINT ((void) 0)
+# define MEMOIZE_LOOKAROUND_MATCH_CACHE_POINT ((void) 0)
+#endif
+
#define STACK_POP_ONE do {\
stk--;\
STACK_BASE_CHECK(stk, "STACK_POP_ONE"); \
@@ -1411,6 +1501,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
STACK_BASE_CHECK(stk, "STACK_POP"); \
if ((stk->type & STK_MASK_POP_USED) != 0) break;\
ELSE_IF_STATE_CHECK_MARK(stk);\
+ MEMOIZE_MATCH_CACHE_POINT;\
}\
break;\
case STACK_POP_LEVEL_MEM_START:\
@@ -1423,6 +1514,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\
}\
ELSE_IF_STATE_CHECK_MARK(stk);\
+ MEMOIZE_MATCH_CACHE_POINT;\
}\
break;\
default:\
@@ -1442,6 +1534,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\
}\
ELSE_IF_STATE_CHECK_MARK(stk);\
+ MEMOIZE_MATCH_CACHE_POINT;\
}\
break;\
}\
@@ -1463,7 +1556,11 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\
mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\
}\
+ else if (IS_TO_VOID_TARGET(stk)) {\
+ INC_NUM_FAILS;\
+ }\
ELSE_IF_STATE_CHECK_MARK(stk);\
+ MEMOIZE_LOOKAROUND_MATCH_CACHE_POINT(stk);\
}\
} while(0)
@@ -1520,12 +1617,14 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
k--;\
STACK_BASE_CHECK(k, "STACK_POS_END"); \
if (IS_TO_VOID_TARGET(k)) {\
+ INC_NUM_FAILS;\
k->type = STK_VOID;\
}\
else if (k->type == STK_POS) {\
k->type = STK_VOID;\
break;\
}\
+ MEMOIZE_LOOKAROUND_MATCH_CACHE_POINT(k);\
}\
} while(0)
@@ -1535,12 +1634,28 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
k--;\
STACK_BASE_CHECK(k, "STACK_STOP_BT_END"); \
if (IS_TO_VOID_TARGET(k)) {\
+ INC_NUM_FAILS;\
k->type = STK_VOID;\
}\
else if (k->type == STK_STOP_BT) {\
k->type = STK_VOID;\
break;\
}\
+ else if (k->type == STK_MATCH_CACHE_POINT) {\
+ k->type = STK_ATOMIC_MATCH_CACHE_POINT;\
+ }\
+ }\
+} while(0)
+
+#define STACK_STOP_BT_FAIL do {\
+ while (1) {\
+ stk--;\
+ STACK_BASE_CHECK(stk, "STACK_STOP_BT_END"); \
+ if (stk->type == STK_STOP_BT) {\
+ stk->type = STK_VOID;\
+ break;\
+ }\
+ MEMOIZE_ATOMIC_MATCH_CACHE_POINT;\
}\
} while(0)
@@ -1579,7 +1694,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
}\
} while(0)
-#define STACK_NULL_CHECK_MEMST(isnull,ischange,id,s,reg) do {\
+#define STACK_NULL_CHECK_MEMST(isnull,id,s,reg) do {\
OnigStackType* k = STACK_AT((stk-1)->null_check)+1;\
while (1) {\
k--;\
@@ -1596,14 +1711,14 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
while (k < stk) {\
if (k->type == STK_MEM_START) {\
if (k->u.mem.end == INVALID_STACK_INDEX) {\
- (isnull) = 0; (ischange) = 1; break;\
+ (isnull) = 0; break;\
}\
if (BIT_STATUS_AT(reg->bt_mem_end, k->u.mem.num))\
endp = STACK_AT(k->u.mem.end)->u.mem.pstr;\
else\
endp = (UChar* )k->u.mem.end;\
if (STACK_AT(k->u.mem.start)->u.mem.pstr != endp) {\
- (isnull) = 0; (ischange) = 1; break;\
+ (isnull) = 0; break;\
}\
else if (endp != s) {\
(isnull) = -1; /* empty, but position changed */ \
@@ -2008,6 +2123,7 @@ stack_type_str(int stack_type)
case STK_VOID: return "Void ";
case STK_ABSENT_POS: return "AbsPos";
case STK_ABSENT: return "Absent";
+ case STK_MATCH_CACHE_POINT: return "MCache";
default: return " ";
}
}
@@ -2029,7 +2145,7 @@ bsearch_cache_opcodes(const OnigCacheOpcode *cache_opcodes, long num_cache_opcod
}
static long
-find_cache_point(regex_t* reg, const OnigCacheOpcode* cache_opcodes, long num_cache_opcodes, const UChar* p, const OnigStackType *stk, const OnigStackIndex *repeat_stk)
+find_cache_point(regex_t* reg, const OnigCacheOpcode* cache_opcodes, long num_cache_opcodes, const UChar* p, const OnigStackType *stk, const OnigStackIndex *repeat_stk, const OnigCacheOpcode **cache_opcode_ptr)
{
long m;
const OnigCacheOpcode* cache_opcode;
@@ -2048,6 +2164,7 @@ find_cache_point(regex_t* reg, const OnigCacheOpcode* cache_opcodes, long num_ca
}
cache_opcode = &cache_opcodes[m];
+ *cache_opcode_ptr = &cache_opcodes[m];
cache_point = cache_opcode->cache_point;
if (cache_opcode->outer_repeat_mem == -1) {
return cache_point;
@@ -2079,54 +2196,23 @@ find_cache_point(regex_t* reg, const OnigCacheOpcode* cache_opcodes, long num_ca
cache_point;
}
-static void
-reset_match_cache_on_null_check(regex_t* reg, const UChar* pstart, const UChar* pend, long pos, uint8_t* match_cache_buf, const OnigCacheOpcode* cache_opcodes, long num_cache_opcodes, long num_cache_points, const OnigStackType *stk, const OnigStackIndex *repeat_stk)
-{
- long cache_point_start, cache_point_end;
- long match_cache_point_start, match_cache_point_end;
- long match_cache_point_start_index, match_cache_point_end_index;
- uint8_t match_cache_point_start_bit, match_cache_point_end_bit;
-
-#ifdef ONIG_DEBUG_MATCH_CACHE
- fprintf(stderr, "MATCH CACHE: reset start=%p end=%p pos=%ld cache_opcodes=%p num_cache_opcodes=%ld\n", pstart, pend, pos, cache_opcodes, num_cache_opcodes);
-#endif
-
- cache_point_start = find_cache_point(reg, cache_opcodes, num_cache_opcodes, pstart, stk, repeat_stk);
- cache_point_end = find_cache_point(reg, cache_opcodes, num_cache_opcodes, pend, stk, repeat_stk);
- // Skip to reset if `pstart` or `pend` is not found.
- // This situation occurs when there is no cache point
- // between OP_NULL_CHECK_START and OP_NULL_CHECK_END_MEMST.
- if (cache_point_start == -1 || cache_point_end == -1) return;
-
- match_cache_point_start = pos * num_cache_points + cache_point_start;
- match_cache_point_end = pos * num_cache_points + cache_point_end;
- match_cache_point_start_index = match_cache_point_start >> 3;
- match_cache_point_end_index = match_cache_point_end >> 3;
- match_cache_point_start_bit = match_cache_point_start & 7;
- match_cache_point_end_bit = match_cache_point_end & 7;
-
-#ifdef ONIG_DEBUG_MATCH_CACHE
- fprintf(stderr, "MATCH CACHE: reset start %ld (%ld index=%ld bit=%d)\n", match_cache_point_start, cache_point_start, match_cache_point_start_index, match_cache_point_start_bit);
- fprintf(stderr, "MATCH CACHE: reset end %ld (%ld index=%ld bit=%d)\n", match_cache_point_end, cache_point_end, match_cache_point_end_index, match_cache_point_end_bit);
-#endif
+static int check_extended_match_cache_point(uint8_t *match_cache_buf, long match_cache_point_index, uint8_t match_cache_point_mask) {
+ if (match_cache_point_mask & 0x80) {
+ return (match_cache_buf[match_cache_point_index + 1] & 0x01) > 0;
+ } else {
+ return (match_cache_buf[match_cache_point_index] & (match_cache_point_mask << 1)) > 0;
+ }
+}
- if (match_cache_point_start_index == match_cache_point_end_index) {
- match_cache_buf[match_cache_point_start_index] &=
- (((1 << (8 - match_cache_point_end_bit - 1)) - 1) << (match_cache_point_end_bit + 1)) |
- ((1 << match_cache_point_start_bit) - 1);
+static void memoize_extended_match_cache_point(uint8_t *match_cache_buf, long match_cache_point_index, uint8_t match_cache_point_mask) {
+ match_cache_buf[match_cache_point_index] |= match_cache_point_mask;
+ if (match_cache_point_mask & 0x80) {
+ match_cache_buf[match_cache_point_index + 1] |= 0x01;
} else {
- if (match_cache_point_start_bit) {
- match_cache_buf[match_cache_point_start_index] &= (1 << (match_cache_point_start_bit - 1)) - 1;
- match_cache_point_start_index++;
- }
- if (match_cache_point_start_index < match_cache_point_end_index) {
- xmemset(&match_cache_buf[match_cache_point_start_index], 0, match_cache_point_end_index - match_cache_point_start_index);
- if (match_cache_point_end_bit) {
- match_cache_buf[match_cache_point_end_index] &= (((1 << (8 - match_cache_point_end_bit - 1)) - 1) << (match_cache_point_end_bit + 1));
- }
- }
+ match_cache_buf[match_cache_point_index] |= match_cache_point_mask << 1;
}
}
+
#endif /* USE_MATCH_CACHE */
/* match data(str - end) from position (sstart). */
@@ -2447,7 +2533,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
# define CHECK_MATCH_CACHE do {\
if (msa->match_cache_status == MATCH_CACHE_STATUS_ENABLED) {\
- long cache_point = find_cache_point(reg, msa->cache_opcodes, msa->num_cache_opcodes, pbegin, stk_base, repeat_stk);\
+ const OnigCacheOpcode *cache_opcode;\
+ long cache_point = find_cache_point(reg, msa->cache_opcodes, msa->num_cache_opcodes, pbegin, stk_base, repeat_stk, &cache_opcode);\
if (cache_point >= 0) {\
long match_cache_point = msa->num_cache_points * (long)(s - str) + cache_point;\
long match_cache_point_index = match_cache_point >> 3;\
@@ -2455,9 +2542,21 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
MATCH_CACHE_DEBUG;\
if (msa->match_cache_buf[match_cache_point_index] & match_cache_point_mask) {\
MATCH_CACHE_DEBUG_HIT;\
- goto fail;\
+ if (cache_opcode->lookaround_nesting == 0) goto fail;\
+ else if (cache_opcode->lookaround_nesting < 0) {\
+ if (check_extended_match_cache_point(msa->match_cache_buf, match_cache_point_index, match_cache_point_mask)) {\
+ STACK_STOP_BT_FAIL;\
+ goto fail;\
+ } else goto fail;\
+ } else {\
+ if (check_extended_match_cache_point(msa->match_cache_buf, match_cache_point_index, match_cache_point_mask)) {\
+ p = cache_opcode->match_addr;\
+ MOP_OUT;\
+ JUMP;\
+ } else goto fail;\
+ }\
}\
- msa->match_cache_buf[match_cache_point_index] |= match_cache_point_mask;\
+ STACK_PUSH_MATCH_CACHE_POINT(match_cache_point_index, match_cache_point_mask);\
}\
}\
} while (0)
@@ -3538,10 +3637,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
CASE(OP_NULL_CHECK_END_MEMST) MOP_IN(OP_NULL_CHECK_END_MEMST);
{
int isnull;
- int ischanged = 0; // set 1 when a loop is empty but memory status is changed.
GET_MEMNUM_INC(mem, p); /* mem: null check id */
- STACK_NULL_CHECK_MEMST(isnull, ischanged, mem, s, reg);
+ STACK_NULL_CHECK_MEMST(isnull, mem, s, reg);
if (isnull) {
# ifdef ONIG_DEBUG_MATCH
fprintf(stderr, "NULL_CHECK_END_MEMST: skip id:%d, s:%"PRIuPTR" (%p)\n",
@@ -3550,31 +3648,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
if (isnull == -1) goto fail;
goto null_check_found;
}
-# ifdef USE_MATCH_CACHE
- if (ischanged && msa->match_cache_status == MATCH_CACHE_STATUS_ENABLED) {
- RelAddrType rel;
- OnigUChar *null_check_start;
- OnigUChar *null_check_end = pbegin;
- MemNumType mem;
- UChar* tmp = p;
- switch (*tmp++) {
- case OP_JUMP:
- case OP_PUSH:
- GET_RELADDR_INC(rel, tmp);
- null_check_start = tmp + rel;
- break;
- case OP_REPEAT_INC:
- case OP_REPEAT_INC_NG:
- GET_MEMNUM_INC(mem, tmp);
- // Skip OP_REPEAT (or OP_REPEAT_NG)
- null_check_start = STACK_AT(repeat_stk[mem])->u.repeat.pcode;
- break;
- default:
- goto unexpected_bytecode_error;
- }
- reset_match_cache_on_null_check(reg, null_check_start, null_check_end, (long)(s - str), msa->match_cache_buf, msa->cache_opcodes, msa->num_cache_opcodes, msa->num_cache_points, stk_base, repeat_stk);
- }
-# endif
}
MOP_OUT;
JUMP;
@@ -4002,7 +4075,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
fprintf(stderr, "MATCH CACHE: #cache points = %ld\n", msa->num_cache_points);
fprintf(stderr, "MATCH CACHE: cache opcodes (%p):\n", msa->cache_opcodes);
for (int i = 0; i < msa->num_cache_opcodes; i++) {
- fprintf(stderr, "MATCH CACHE: [%p] cache_point=%ld outer_repeat_mem=%d num_cache_opcodes_at_outer_repeat=%ld num_cache_opcodes_in_outer_repeat=%ld\n", msa->cache_opcodes[i].addr, msa->cache_opcodes[i].cache_point, msa->cache_opcodes[i].outer_repeat_mem, msa->cache_opcodes[i].num_cache_points_at_outer_repeat, msa->cache_opcodes[i].num_cache_points_in_outer_repeat);
+ fprintf(stderr, "MATCH CACHE: [%p] cache_point=%ld outer_repeat_mem=%d num_cache_opcodes_at_outer_repeat=%ld num_cache_opcodes_in_outer_repeat=%ld lookaround_nesting=%d match_addr=%p\n", msa->cache_opcodes[i].addr, msa->cache_opcodes[i].cache_point, msa->cache_opcodes[i].outer_repeat_mem, msa->cache_opcodes[i].num_cache_points_at_outer_repeat, msa->cache_opcodes[i].num_cache_points_in_outer_repeat, msa->cache_opcodes[i].lookaround_nesting, msa->cache_opcodes[i].match_addr);
}
#endif
}