From 0046f355c1109dfc2883f1a3c64c61222aab0ce9 Mon Sep 17 00:00:00 2001 From: kosako Date: Sun, 27 Aug 2006 12:58:22 +0000 Subject: merge Oniguruma 4.4.0 git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@10782 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- regexec.c | 243 +++++++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 218 insertions(+), 25 deletions(-) (limited to 'regexec.c') diff --git a/regexec.c b/regexec.c index 78d8094202..1ba060f0ec 100644 --- a/regexec.c +++ b/regexec.c @@ -306,6 +306,9 @@ typedef struct _StackType { UChar *pcode; /* byte code position */ UChar *pstr; /* string position */ UChar *pstr_prev; /* previous char position of pstr */ +#ifdef USE_COMBINATION_EXPLOSION_CHECK + unsigned int state_check; +#endif } state; struct { int count; /* for OP_REPEAT_INC, OP_REPEAT_INC_NG */ @@ -339,29 +342,28 @@ typedef struct _StackType { /* stack type */ /* used by normal-POP */ #define STK_ALT 0x0001 -#define STK_LOOK_BEHIND_NOT 0x0003 -#define STK_POS_NOT 0x0005 -/* avoided by normal-POP, but value should be small */ -#define STK_NULL_CHECK_START 0x0100 +#define STK_LOOK_BEHIND_NOT 0x0002 +#define STK_POS_NOT 0x0003 /* handled by normal-POP */ -#define STK_MEM_START 0x0200 -#define STK_MEM_END 0x0300 -#define STK_REPEAT_INC 0x0400 +#define STK_MEM_START 0x0100 +#define STK_MEM_END 0x8200 +#define STK_REPEAT_INC 0x0300 +#define STK_STATE_CHECK_MARK 0x1000 /* avoided by normal-POP */ +#define STK_NULL_CHECK_START 0x3000 +#define STK_NULL_CHECK_END 0x5000 /* for recursive call */ +#define STK_MEM_END_MARK 0x8400 #define STK_POS 0x0500 /* used when POP-POS */ #define STK_STOP_BT 0x0600 /* mark for "(?>...)" */ #define STK_REPEAT 0x0700 #define STK_CALL_FRAME 0x0800 #define STK_RETURN 0x0900 -#define STK_MEM_END_MARK 0x0a00 -#define STK_VOID 0x0b00 /* for fill a blank */ -#define STK_NULL_CHECK_END 0x0c00 /* for recursive call */ +#define STK_VOID 0x0a00 /* for fill a blank */ /* stack type check mask */ -#define STK_MASK_POP_USED 0x00ff -#define IS_TO_VOID_TARGET(stk) \ - (((stk)->type & STK_MASK_POP_USED) || \ - (stk)->type == STK_NULL_CHECK_START || (stk)->type == STK_NULL_CHECK_END) +#define STK_MASK_POP_USED 0x00ff +#define STK_MASK_TO_VOID_TARGET 0x10ff +#define STK_MASK_MEM_END_OR_MARK 0x8000 /* MEM_END or MEM_END_MARK */ typedef struct { void* stack_p; @@ -369,6 +371,10 @@ typedef struct { OnigOptionType options; OnigRegion* region; const UChar* start; /* search start position (for \G: BEGIN_POSITION) */ +#ifdef USE_COMBINATION_EXPLOSION_CHECK + void* state_check_buff; + int state_check_buff_size; +#endif } MatchArg; #define MATCH_ARG_INIT(msa, arg_option, arg_region, arg_start) do {\ @@ -378,7 +384,36 @@ typedef struct { (msa).start = (arg_start);\ } while (0) -#define MATCH_ARG_FREE(msa) if ((msa).stack_p) xfree((msa).stack_p) +#ifdef USE_COMBINATION_EXPLOSION_CHECK + +#define STATE_CHECK_BUFF_MALLOC_THRESHOLD_SIZE 16 + +#define STATE_CHECK_BUFF_INIT(msa, str_len, state_num) do { \ + (msa).state_check_buff = (void* )0;\ + if ((state_num) > 0 && str_len >= STATE_CHECK_STRING_THRESHOLD_LEN) {\ + int size = ((int )((str_len) + 1) * (state_num) + 7) / 8;\ + (msa).state_check_buff_size = size; \ + if (size > 0 && size < STATE_CHECK_BUFF_MAX_SIZE) {\ + if (size >= STATE_CHECK_BUFF_MALLOC_THRESHOLD_SIZE) \ + (msa).state_check_buff = (void* )xmalloc(size);\ + else \ + (msa).state_check_buff = (void* )xalloca(size);\ + xmemset((msa).state_check_buff, 0, (size_t )size);\ + }\ + }\ +} while (0) + +#define MATCH_ARG_FREE(msa) do {\ + if ((msa).stack_p) xfree((msa).stack_p);\ + if ((msa).state_check_buff_size >= STATE_CHECK_BUFF_MALLOC_THRESHOLD_SIZE) { \ + if ((msa).state_check_buff) xfree((msa).state_check_buff);\ + }\ +} while (0); +#else +#define STATE_CHECK_BUFF_INIT(msa, str_len, state_num) +#define MATCH_ARG_FREE(msa) if ((msa).stack_p) xfree((msa).stack_p) +#endif + #define STACK_INIT(alloc_addr, ptr_num, stack_num) do {\ @@ -472,26 +507,88 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, #define STACK_AT(index) (stk_base + (index)) #define GET_STACK_INDEX(stk) ((stk) - stk_base) +#define STACK_PUSH_TYPE(stack_type) do {\ + STACK_ENSURE(1);\ + stk->type = (stack_type);\ + STACK_INC;\ +} while(0) + +#define IS_TO_VOID_TARGET(stk) (((stk)->type & STK_MASK_TO_VOID_TARGET) != 0) + +#ifdef USE_COMBINATION_EXPLOSION_CHECK +#define STATE_CHECK_POS(s,snum) \ + (((s) - str) * num_comb_exp_check + ((snum) - 1)) +#define STATE_CHECK_VAL(v,snum) do {\ + if (state_check_buff != NULL) {\ + int x = STATE_CHECK_POS(s,snum);\ + (v) = state_check_buff[x/8] & (1<<(x%8));\ + }\ + else (v) = 0;\ +} while(0) + + +#define ELSE_IF_STATE_CHECK_MARK(stk) \ + else if ((stk)->type == STK_STATE_CHECK_MARK) { \ + int x = STATE_CHECK_POS(stk->u.state.pstr, stk->u.state.state_check);\ + state_check_buff[x/8] |= (1<<(x%8)); \ + } + #define STACK_PUSH(stack_type,pat,s,sprev) do {\ STACK_ENSURE(1);\ stk->type = (stack_type);\ stk->u.state.pcode = (pat);\ stk->u.state.pstr = (s);\ stk->u.state.pstr_prev = (sprev);\ + stk->u.state.state_check = 0;\ STACK_INC;\ } while(0) #define STACK_PUSH_ENSURED(stack_type,pat) do {\ stk->type = (stack_type);\ stk->u.state.pcode = (pat);\ + stk->u.state.state_check = 0;\ STACK_INC;\ } while(0) -#define STACK_PUSH_TYPE(stack_type) do {\ +#define STACK_PUSH_ALT_WITH_STATE_CHECK(pat,s,sprev,snum) do {\ + STACK_ENSURE(1);\ + stk->type = STK_ALT;\ + stk->u.state.pcode = (pat);\ + stk->u.state.pstr = (s);\ + stk->u.state.pstr_prev = (sprev);\ + stk->u.state.state_check = ((state_check_buff != NULL) ? (snum) : 0);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_STATE_CHECK(s,snum) do {\ + if (state_check_buff != NULL) {\ + STACK_ENSURE(1);\ + stk->type = STK_STATE_CHECK_MARK;\ + stk->u.state.pstr = (s);\ + stk->u.state.state_check = (snum);\ + STACK_INC;\ + }\ +} while(0) + +#else /* USE_COMBINATION_EXPLOSION_CHECK */ + +#define ELSE_IF_STATE_CHECK_MARK(stk) + +#define STACK_PUSH(stack_type,pat,s,sprev) do {\ STACK_ENSURE(1);\ stk->type = (stack_type);\ + stk->u.state.pcode = (pat);\ + stk->u.state.pstr = (s);\ + stk->u.state.pstr_prev = (sprev);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_ENSURED(stack_type,pat) do {\ + stk->type = (stack_type);\ + stk->u.state.pcode = (pat);\ STACK_INC;\ } while(0) +#endif /* USE_COMBINATION_EXPLOSION_CHECK */ #define STACK_PUSH_ALT(pat,s,sprev) STACK_PUSH(STK_ALT,pat,s,sprev) #define STACK_PUSH_POS(s,sprev) STACK_PUSH(STK_POS,NULL_UCHARP,s,sprev) @@ -551,7 +648,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, k = stk;\ while (k > stk_base) {\ k--;\ - if ((k->type == STK_MEM_END_MARK || k->type == STK_MEM_END) \ + if ((k->type & STK_MASK_MEM_END_OR_MARK) != 0 \ && k->u.mem.num == (mnum)) {\ level++;\ }\ @@ -631,6 +728,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, stk--;\ STACK_BASE_CHECK(stk, "STACK_POP"); \ if ((stk->type & STK_MASK_POP_USED) != 0) break;\ + ELSE_IF_STATE_CHECK_MARK(stk);\ }\ break;\ case STACK_POP_LEVEL_MEM_START:\ @@ -642,6 +740,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ }\ + ELSE_IF_STATE_CHECK_MARK(stk);\ }\ break;\ default:\ @@ -660,6 +759,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ }\ + ELSE_IF_STATE_CHECK_MARK(stk);\ }\ break;\ }\ @@ -681,6 +781,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ }\ + ELSE_IF_STATE_CHECK_MARK(stk);\ }\ } while(0) @@ -700,6 +801,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ }\ + ELSE_IF_STATE_CHECK_MARK(stk);\ }\ } while(0) @@ -947,6 +1049,7 @@ static int string_cmp_ic(OnigEncoding enc, int ambig_flag, is_fail = 0; \ } while(0) + #define ON_STR_BEGIN(s) ((s) == str) #define ON_STR_END(s) ((s) == end) #define IS_EMPTY_STR (str == end) @@ -1314,6 +1417,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, StackIndex si; StackIndex *repeat_stk; StackIndex *mem_start_stk, *mem_end_stk; +#ifdef USE_COMBINATION_EXPLOSION_CHECK + int scv; + unsigned char* state_check_buff = msa->state_check_buff; + int num_comb_exp_check = reg->num_comb_exp_check; +#endif n = reg->num_repeat + reg->num_mem * 2; STACK_INIT(alloca_base, n, INIT_MATCH_STACK_SIZE); @@ -1924,6 +2032,47 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, STAT_OP_OUT; break; +#ifdef USE_COMBINATION_EXPLOSION_CHECK + case OP_STATE_CHECK_ANYCHAR_STAR: STAT_OP_IN(OP_STATE_CHECK_ANYCHAR_STAR); + GET_STATE_CHECK_NUM_INC(mem, p); + while (s < end) { + STATE_CHECK_VAL(scv, mem); + if (scv) goto fail; + + STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem); + n = enc_len(encode, s); + DATA_ENSURE(n); + if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; + sprev = s; + s += n; + } + STAT_OP_OUT; + break; + + case OP_STATE_CHECK_ANYCHAR_ML_STAR: + STAT_OP_IN(OP_STATE_CHECK_ANYCHAR_ML_STAR); + + GET_STATE_CHECK_NUM_INC(mem, p); + while (s < end) { + STATE_CHECK_VAL(scv, mem); + if (scv) goto fail; + + STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem); + n = enc_len(encode, s); + if (n > 1) { + DATA_ENSURE(n); + sprev = s; + s += n; + } + else { + sprev = s; + s++; + } + } + STAT_OP_OUT; + break; +#endif /* USE_COMBINATION_EXPLOSION_CHECK */ + case OP_WORD: STAT_OP_IN(OP_WORD); DATA_ENSURE(1); if (! ONIGENC_IS_MBC_WORD(encode, s, end)) @@ -2154,11 +2303,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, goto backref; break; - case OP_BACKREF3: STAT_OP_IN(OP_BACKREF3); - mem = 3; - goto backref; - break; - case OP_BACKREFN: STAT_OP_IN(OP_BACKREFN); GET_MEMNUM_INC(mem, p); backref: @@ -2451,6 +2595,43 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, continue; break; +#ifdef USE_COMBINATION_EXPLOSION_CHECK + case OP_STATE_CHECK_PUSH: STAT_OP_IN(OP_STATE_CHECK_PUSH); + GET_STATE_CHECK_NUM_INC(mem, p); + STATE_CHECK_VAL(scv, mem); + if (scv) goto fail; + + GET_RELADDR_INC(addr, p); + STACK_PUSH_ALT_WITH_STATE_CHECK(p + addr, s, sprev, mem); + STAT_OP_OUT; + continue; + break; + + case OP_STATE_CHECK_PUSH_OR_JUMP: STAT_OP_IN(OP_STATE_CHECK_PUSH_OR_JUMP); + GET_STATE_CHECK_NUM_INC(mem, p); + GET_RELADDR_INC(addr, p); + STATE_CHECK_VAL(scv, mem); + if (scv) { + p += addr; + } + else { + STACK_PUSH_ALT_WITH_STATE_CHECK(p + addr, s, sprev, mem); + } + STAT_OP_OUT; + continue; + break; + + case OP_STATE_CHECK: STAT_OP_IN(OP_STATE_CHECK); + GET_STATE_CHECK_NUM_INC(mem, p); + STATE_CHECK_VAL(scv, mem); + if (scv) goto fail; + + STACK_PUSH_STATE_CHECK(s, mem); + STAT_OP_OUT; + continue; + break; +#endif /* USE_COMBINATION_EXPLOSION_CHECK */ + case OP_POP: STAT_OP_IN(OP_POP); STACK_POP_ONE; STAT_OP_OUT; @@ -2525,7 +2706,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, repeat_inc: stkp->u.repeat.count++; - if (stkp->u.repeat.count == reg->repeat_range[mem].upper) { + if (stkp->u.repeat.count >= reg->repeat_range[mem].upper) { /* end of repeat. Nothing to do. */ } else if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { @@ -2555,8 +2736,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, repeat_inc_ng: stkp->u.repeat.count++; - if (stkp->u.repeat.count < reg->repeat_range[mem].upper || - IS_REPEAT_INFINITE(reg->repeat_range[mem].upper)) { + if (stkp->u.repeat.count < reg->repeat_range[mem].upper) { if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { UChar* pcode = stkp->u.repeat.pcode; @@ -2685,6 +2865,14 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, p = stk->u.state.pcode; s = stk->u.state.pstr; sprev = stk->u.state.pstr_prev; + +#ifdef USE_COMBINATION_EXPLOSION_CHECK + if (stk->u.state.state_check != 0) { + stk->type = STK_STATE_CHECK_MARK; + stk++; + } +#endif + STAT_OP_OUT; continue; break; @@ -3073,6 +3261,7 @@ onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, On #endif /* USE_RECOMPILE_API && USE_MULTI_THREAD_SYSTEM */ MATCH_ARG_INIT(msa, option, region, at); + STATE_CHECK_BUFF_INIT(msa, end - str, reg->num_comb_exp_check); if (region #ifdef USE_POSIX_REGION_OPTION @@ -3475,6 +3664,9 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, prev = (UChar* )NULL; MATCH_ARG_INIT(msa, option, region, start); +#ifdef USE_COMBINATION_EXPLOSION_CHECK + msa.state_check_buff = (void* )0; +#endif MATCH_AND_RETURN_CHECK; goto mismatch; } @@ -3487,6 +3679,7 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, #endif MATCH_ARG_INIT(msa, option, region, orig_start); + STATE_CHECK_BUFF_INIT(msa, end - str, reg->num_comb_exp_check); s = (UChar* )start; if (range > start) { /* forward search */ -- cgit v1.2.3