From a67d4fa01c43124048be957f7d8e4090b7255393 Mon Sep 17 00:00:00 2001 From: akr Date: Tue, 16 Sep 2008 16:48:05 +0000 Subject: * include/ruby/oniguruma.h (OnigEncodingTypeST): add precise_ret argument for mbc_to_code. (ONIGENC_MBC_TO_CODE): provide NULL for precise_ret. (ONIGENC_MBC_PRECISE_CODEPOINT): defined. * include/ruby/encoding.h (rb_enc_mbc_precise_codepoint): defined. * regenc.h (onigenc_single_byte_mbc_to_code): precise_ret argument added. (onigenc_mbn_mbc_to_code): ditto. * regenc.c (onigenc_single_byte_mbc_to_code): precise_ret argument added. (onigenc_mbn_mbc_to_code): ditto. * string.c (count_utf8_lead_bytes_with_word): removed. (str_utf8_nth): removed. (str_utf8_offset): removed. (str_strlen): UTF-8 codepoint oriented optimization removed. (rb_str_substr): ditto. (enc_succ_char): use rb_enc_mbc_precise_codepoint. (enc_pred_char): ditto. (rb_str_succ): ditto. * encoding.c (rb_enc_ascget): check length with rb_enc_mbc_precise_codepoint. (rb_enc_codepoint): use rb_enc_mbc_precise_codepoint. * regexec.c (string_cmp_ic): add text_end argument. (match_at): check end of character after exact string matches. * enc/utf_8.c (graphme_table): defined for extended grapheme cluster boundary. (grapheme_cmp): defined. (get_grapheme_properties): defined. (grapheme_boundary_p): defined. (MAX_BYTES_LENGTH): defined. (comb_char_enc_len): defined. (mbc_to_code0): extracted from mbc_to_code. (mbc_to_code): use mbc_to_code0. (left_adjust_combchar_head): defined. (utf_8): use an extended grapheme cluster as a unit. * enc/unicode.c (onigenc_unicode_mbc_case_fold): use ONIGENC_MBC_PRECISE_CODEPOINT to extract codepoints. (onigenc_unicode_get_case_fold_codes_by_str): ditto. * enc/euc_jp.c (mbc_to_code): follow mbc_to_code field change. use onigenc_mbn_mbc_to_code. * enc/shift_jis.c (mbc_to_code): ditto. * enc/emacs_mule.c (mbc_to_code): ditto. * enc/gbk.c (gbk_mbc_to_code): follow mbc_to_code field and onigenc_mbn_mbc_to_code change. * enc/cp949.c (cp949_mbc_to_code): ditto. 
* enc/big5.c (big5_mbc_to_code): ditto. * enc/euc_tw.c (euctw_mbc_to_code): ditto. * enc/euc_kr.c (euckr_mbc_to_code): ditto. * enc/gb18030.c (gb18030_mbc_to_code): ditto. * enc/utf_32be.c (utf32be_mbc_to_code): follow mbc_to_code field change. * enc/utf_16be.c (utf16be_mbc_to_code): ditto. * enc/utf_32le.c (utf32le_mbc_to_code): ditto. * enc/utf_16le.c (utf16le_mbc_to_code): ditto. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@19389 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- regexec.c | 52 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 13 deletions(-) (limited to 'regexec.c') diff --git a/regexec.c b/regexec.c index c936a60352..93240dae81 100644 --- a/regexec.c +++ b/regexec.c @@ -977,25 +977,24 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, }\ } while(0) -#define STRING_CMP_IC(case_fold_flag,s1,ps2,len) do {\ - if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len) == 0) \ +#define STRING_CMP_IC(case_fold_flag,s1,ps2,len,text_end) do {\ + if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len, text_end) == 0) \ goto fail; \ } while(0) static int string_cmp_ic(OnigEncoding enc, int case_fold_flag, - UChar* s1, UChar** ps2, int mblen) + UChar* s1, UChar** ps2, int mblen, const UChar* text_end) { UChar buf1[ONIGENC_MBC_CASE_FOLD_MAXLEN]; UChar buf2[ONIGENC_MBC_CASE_FOLD_MAXLEN]; - UChar *p1, *p2, *end1, *s2, *end2; + UChar *p1, *p2, *end1, *s2; int len1, len2; s2 = *ps2; end1 = s1 + mblen; - end2 = s2 + mblen; while (s1 < end1) { len1 = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &s1, end1, buf1); - len2 = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &s2, end2, buf2); + len2 = ONIGENC_MBC_CASE_FOLD(enc, case_fold_flag, &s2, text_end, buf2); if (len1 != len2) return 0; p1 = buf1; p2 = buf2; @@ -1019,8 +1018,8 @@ static int string_cmp_ic(OnigEncoding enc, int case_fold_flag, }\ } while(0) -#define STRING_CMP_VALUE_IC(case_fold_flag,s1,ps2,len,is_fail) do {\ - if (string_cmp_ic(encode, case_fold_flag, 
s1, ps2, len) == 0) \ +#define STRING_CMP_VALUE_IC(case_fold_flag,s1,ps2,len,text_end,is_fail) do {\ + if (string_cmp_ic(encode, case_fold_flag, s1, ps2, len, text_end) == 0) \ is_fail = 1; \ else \ is_fail = 0; \ @@ -1126,7 +1125,7 @@ static int backref_match_at_nested_level(regex_t* reg if (ignore_case != 0) { if (string_cmp_ic(reg->enc, case_fold_flag, - pstart, &ss, (int )(pend - pstart)) == 0) + pstart, &ss, (int )(pend - pstart), send) == 0) return 0; /* or goto next_mem; */ } else { @@ -1442,6 +1441,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*p != *s++) goto fail; DATA_ENSURE(0); p++; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; break; @@ -1464,6 +1465,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; q++; } } + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; break; @@ -1474,6 +1477,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*p != *s) goto fail; sprev = s; p++; s++; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -1487,6 +1492,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*p != *s) goto fail; sprev = s; p++; s++; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -1502,6 +1509,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*p != *s) goto fail; sprev = s; p++; s++; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -1519,6 +1528,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*p != *s) goto fail; sprev = s; p++; s++; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -1530,6 +1541,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, if (*p++ != *s++) goto 
fail; } sprev = s - 1; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -1557,7 +1570,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } } } - + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -1568,6 +1582,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; s++; if (*p != *s) goto fail; p++; s++; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; break; @@ -1582,6 +1598,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; s++; if (*p != *s) goto fail; p++; s++; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -1601,6 +1619,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; s++; if (*p != *s) goto fail; p++; s++; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -1615,6 +1635,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; s++; } sprev = s - 2; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -1631,6 +1653,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; s++; } sprev = s - 3; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -1645,6 +1669,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p++; s++; } sprev = s - tlen; + if (s != end && ONIGENC_LEFT_ADJUST_CHAR_HEAD(encode, str, s, end) != s) + goto fail; MOP_OUT; continue; break; @@ -2199,7 +2225,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, n = pend - pstart; DATA_ENSURE(n); sprev = s; - STRING_CMP_IC(case_fold_flag, pstart, &s, n); + STRING_CMP_IC(case_fold_flag, pstart, &s, n, end); while (sprev + (len = enclen(encode, sprev, end)) < 
s) sprev += len; @@ -2271,7 +2297,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, DATA_ENSURE(n); sprev = s; swork = s; - STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, is_fail); + STRING_CMP_VALUE_IC(case_fold_flag, pstart, &swork, n, end, is_fail); if (is_fail) continue; s = swork; while (sprev + (len = enclen(encode, sprev, end)) < s) @@ -2780,7 +2806,7 @@ slow_search(OnigEncoding enc, UChar* target, UChar* target_end, if (target_end == t || memcmp(t, p, target_end - t) == 0) return s; } - s += enclen(enc, s, end); + s += enclen(enc, s, text_end); } return (UChar* )NULL; -- cgit v1.2.3