From 1b7d03b9d0d60d4dede7d38739503e59a8b25d06 Mon Sep 17 00:00:00 2001 From: naruse Date: Sat, 10 Dec 2016 17:47:04 +0000 Subject: Merge Onigmo 6.0.0 * https://github.com/k-takata/Onigmo/blob/Onigmo-6.0.0/HISTORY * fix for ruby 2.4: https://github.com/k-takata/Onigmo/pull/78 * suppress warning: https://github.com/k-takata/Onigmo/pull/79 * include/ruby/oniguruma.h: include onigmo.h. * template/encdb.h.tmpl: ignore duplicated definition of EUC-CN in enc/euc_kr.c. It is defined in enc/gb2313.c with CRuby macro. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@57045 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- NEWS | 2 + enc/ascii.c | 9 +- enc/big5.c | 6 +- enc/cp949.c | 2 +- enc/emacs_mule.c | 4 +- enc/euc_jp.c | 4 +- enc/euc_kr.c | 27 +- enc/euc_tw.c | 2 +- enc/gb18030.c | 3 +- enc/gbk.c | 2 +- enc/iso_8859_1.c | 33 +- enc/iso_8859_10.c | 41 +- enc/iso_8859_11.c | 2 +- enc/iso_8859_13.c | 41 +- enc/iso_8859_14.c | 50 +-- enc/iso_8859_15.c | 47 +- enc/iso_8859_16.c | 46 +- enc/iso_8859_2.c | 34 +- enc/iso_8859_3.c | 43 +- enc/iso_8859_4.c | 29 +- enc/iso_8859_5.c | 22 +- enc/iso_8859_6.c | 2 +- enc/iso_8859_7.c | 48 +- enc/iso_8859_8.c | 2 +- enc/iso_8859_9.c | 47 +- enc/koi8_r.c | 3 +- enc/koi8_u.c | 2 +- enc/mktable.c | 42 +- enc/shift_jis.c | 4 +- enc/unicode.c | 248 +++++------ enc/us_ascii.c | 9 +- enc/utf_16be.c | 2 +- enc/utf_16le.c | 2 +- enc/utf_32be.c | 3 +- enc/utf_32le.c | 2 +- enc/utf_8.c | 14 +- enc/windows_1250.c | 35 +- enc/windows_1251.c | 35 +- enc/windows_1252.c | 29 +- enc/windows_1253.c | 43 +- enc/windows_1254.c | 47 +- enc/windows_1257.c | 50 +-- enc/windows_31j.c | 4 +- include/ruby/onigmo.h | 934 +++++++++++++++++++++++++++++++++++++++ include/ruby/oniguruma.h | 880 +------------------------------------ re.c | 3 +- regcomp.c | 583 ++++++++++++------------ regenc.c | 67 +-- regenc.h | 136 +++--- regerror.c | 65 +-- regexec.c | 895 ++++++++++++++++++------------------- regint.h | 416 ++++++++---------- regparse.c | 1099 
+++++++++++++++++++++------------------------- regparse.h | 56 +-- regsyntax.c | 15 +- template/encdb.h.tmpl | 3 +- tool/enc-unicode.rb | 17 +- 57 files changed, 3108 insertions(+), 3183 deletions(-) create mode 100644 include/ruby/onigmo.h diff --git a/NEWS b/NEWS index cf10cd9868..fc80975967 100644 --- a/NEWS +++ b/NEWS @@ -138,6 +138,8 @@ with all sufficient information, see the ChangeLog file or Redmine * meta character \X matches Unicode 9.0 characters with some workarounds for UTR #51 Unicode Emoji, Version 4.0 emoji zwj sequences. + * Update Onigmo 6.0.0. + * Regexp/String: Updated Unicode version from 8.0.0 to 9.0.0 [Feature #12513] * RubyVM::Env diff --git a/enc/ascii.c b/enc/ascii.c index d34cc20582..8b32c414fe 100644 --- a/enc/ascii.c +++ b/enc/ascii.c @@ -29,9 +29,12 @@ */ #include "regenc.h" -#include "encindex.h" +#ifdef RUBY +# include "encindex.h" +#endif + #ifndef ENCINDEX_ASCII -#define ENCINDEX_ASCII 0 +# define ENCINDEX_ASCII 0 #endif OnigEncodingDefine(ascii, ASCII) = { @@ -51,9 +54,9 @@ OnigEncodingDefine(ascii, ASCII) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, + onigenc_single_byte_ascii_only_case_map, ENCINDEX_ASCII, ONIGENC_FLAG_NONE, - onigenc_single_byte_ascii_only_case_map, }; ENC_ALIAS("BINARY", "ASCII-8BIT") ENC_REPLICATE("IBM437", "ASCII-8BIT") diff --git a/enc/big5.c b/enc/big5.c index fc2bcadcc1..ab4fb69819 100644 --- a/enc/big5.c +++ b/enc/big5.c @@ -300,9 +300,9 @@ OnigEncodingDefine(big5, BIG5) = { onigenc_not_support_get_ctype_code_range, big5_left_adjust_char_head, big5_is_allowed_reverse_match, + onigenc_ascii_only_case_map, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; /* @@ -335,9 +335,9 @@ OnigEncodingDefine(big5_hkscs, BIG5_HKSCS) = { onigenc_not_support_get_ctype_code_range, big5_left_adjust_char_head, big5_is_allowed_reverse_match, + onigenc_ascii_only_case_map, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; 
ENC_ALIAS("Big5-HKSCS:2008", "Big5-HKSCS") @@ -370,7 +370,7 @@ OnigEncodingDefine(big5_uao, BIG5_UAO) = { onigenc_not_support_get_ctype_code_range, big5_left_adjust_char_head, big5_is_allowed_reverse_match, + onigenc_ascii_only_case_map, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; diff --git a/enc/cp949.c b/enc/cp949.c index f832cd5758..bd2c8d21a4 100644 --- a/enc/cp949.c +++ b/enc/cp949.c @@ -211,9 +211,9 @@ OnigEncodingDefine(cp949, CP949) = { onigenc_not_support_get_ctype_code_range, cp949_left_adjust_char_head, cp949_is_allowed_reverse_match, + onigenc_ascii_only_case_map, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; /* * Name: CP949 diff --git a/enc/emacs_mule.c b/enc/emacs_mule.c index a53f243dfe..f92eb183cf 100644 --- a/enc/emacs_mule.c +++ b/enc/emacs_mule.c @@ -27,7 +27,7 @@ * SUCH DAMAGE. */ -#include "regint.h" +#include "regenc.h" #define emacsmule_islead(c) ((UChar )(c) < 0x9e) @@ -334,9 +334,9 @@ OnigEncodingDefine(emacs_mule, Emacs_Mule) = { onigenc_not_support_get_ctype_code_range, left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, + onigenc_ascii_only_case_map, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; ENC_REPLICATE("stateless-ISO-2022-JP", "Emacs-Mule") diff --git a/enc/euc_jp.c b/enc/euc_jp.c index f9604b8d6e..ded051af69 100644 --- a/enc/euc_jp.c +++ b/enc/euc_jp.c @@ -28,7 +28,7 @@ * SUCH DAMAGE. 
*/ -#include "regint.h" +#include "regenc.h" #define eucjp_islead(c) ((UChar )((c) - 0xa1) > 0xfe - 0xa1) @@ -576,9 +576,9 @@ OnigEncodingDefine(euc_jp, EUC_JP) = { get_ctype_code_range, left_adjust_char_head, is_allowed_reverse_match, + onigenc_ascii_only_case_map, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; /* * Name: EUC-JP diff --git a/enc/euc_kr.c b/enc/euc_kr.c index eb17f476e9..21d6ab4e1c 100644 --- a/enc/euc_kr.c +++ b/enc/euc_kr.c @@ -188,8 +188,33 @@ OnigEncodingDefine(euc_kr, EUC_KR) = { onigenc_not_support_get_ctype_code_range, euckr_left_adjust_char_head, euckr_is_allowed_reverse_match, + onigenc_ascii_only_case_map, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; ENC_ALIAS("eucKR", "EUC-KR") + +#ifndef RUBY +/* Same with OnigEncodingEUC_KR except the name */ +OnigEncodingDefine(euc_cn, EUC_CN) = { + euckr_mbc_enc_len, + "EUC-CN", /* name */ + 2, /* max enc length */ + 1, /* min enc length */ + onigenc_is_mbc_newline_0x0a, + euckr_mbc_to_code, + onigenc_mb2_code_to_mbclen, + euckr_code_to_mbc, + euckr_mbc_case_fold, + onigenc_ascii_apply_all_case_fold, + onigenc_ascii_get_case_fold_codes_by_str, + onigenc_minimum_property_name_to_ctype, + euckr_is_code_ctype, + onigenc_not_support_get_ctype_code_range, + euckr_left_adjust_char_head, + euckr_is_allowed_reverse_match, + onigenc_ascii_only_case_map, + 0, + ONIGENC_FLAG_NONE, +}; +#endif /* RUBY */ diff --git a/enc/euc_tw.c b/enc/euc_tw.c index e7d5187c4a..1c5659cb1d 100644 --- a/enc/euc_tw.c +++ b/enc/euc_tw.c @@ -221,8 +221,8 @@ OnigEncodingDefine(euc_tw, EUC_TW) = { onigenc_not_support_get_ctype_code_range, euctw_left_adjust_char_head, euctw_is_allowed_reverse_match, + onigenc_ascii_only_case_map, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; ENC_ALIAS("eucTW", "EUC-TW") diff --git a/enc/gb18030.c b/enc/gb18030.c index 8a00332991..63d2e633ec 100644 --- a/enc/gb18030.c +++ b/enc/gb18030.c @@ -597,8 +597,7 @@ OnigEncodingDefine(gb18030, GB18030) = { 
onigenc_not_support_get_ctype_code_range, gb18030_left_adjust_char_head, gb18030_is_allowed_reverse_match, + onigenc_ascii_only_case_map, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; - diff --git a/enc/gbk.c b/enc/gbk.c index d3bb1a5864..31032553bf 100644 --- a/enc/gbk.c +++ b/enc/gbk.c @@ -211,9 +211,9 @@ OnigEncodingDefine(gbk, GBK) = { onigenc_not_support_get_ctype_code_range, gbk_left_adjust_char_head, gbk_is_allowed_reverse_match, + onigenc_ascii_only_case_map, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; /* * Name: GBK diff --git a/enc/iso_8859_1.c b/enc/iso_8859_1.c index 2440c9f5a1..7af0888c3e 100644 --- a/enc/iso_8859_1.c +++ b/enc/iso_8859_1.c @@ -256,45 +256,46 @@ is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc ARG_UNUSE } static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) +case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, + const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, + const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp=0xA0 && code<=0xBF) + if (code >= 0xA0 && code <= 0xBF) code -= 0x10; else code -= 0x20; } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); + if (flags & ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ + flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); + return (int )(to - to_start); } OnigEncodingDefine(iso_8859_10, ISO_8859_10) = { @@ -286,8 +287,8 @@ OnigEncodingDefine(iso_8859_10, ISO_8859_10) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, + case_map, 
0, ONIGENC_FLAG_NONE, - case_map, }; ENC_ALIAS("ISO8859-10", "ISO-8859-10") diff --git a/enc/iso_8859_11.c b/enc/iso_8859_11.c index a5522da2e3..85e8f2cdb4 100644 --- a/enc/iso_8859_11.c +++ b/enc/iso_8859_11.c @@ -93,9 +93,9 @@ OnigEncodingDefine(iso_8859_11, ISO_8859_11) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, + onigenc_single_byte_ascii_only_case_map, 0, ONIGENC_FLAG_NONE, - onigenc_single_byte_ascii_only_case_map, }; ENC_ALIAS("ISO8859-11", "ISO-8859-11") diff --git a/enc/iso_8859_13.c b/enc/iso_8859_13.c index 6e49e16dfb..fe1ddd7065 100644 --- a/enc/iso_8859_13.c +++ b/enc/iso_8859_13.c @@ -208,9 +208,9 @@ apply_all_case_fold(OnigCaseFoldType flag, static int get_case_fold_codes_by_str(OnigCaseFoldType flag, - const OnigUChar* p, const OnigUChar* end, - OnigCaseFoldCodeItem items[], - OnigEncoding enc ARG_UNUSED) + const OnigUChar* p, const OnigUChar* end, + OnigCaseFoldCodeItem items[], + OnigEncoding enc ARG_UNUSED) { return onigenc_get_case_fold_codes_by_str_with_map( numberof(CaseFoldMap), CaseFoldMap, 1, @@ -218,38 +218,39 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, } static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) +case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, + const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, + const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp=0xB1 && code<=0xBF){ + else if ((EncISO_8859_2_CtypeTable[code] & BIT_CTYPE_LOWER) + && (flags & ONIGENC_CASE_UPCASE)) { + if (code >= 0xB1 && code <= 0xBF) { flags |= ONIGENC_CASE_MODIFIED; code -= 0x10; } - else{ + else { flags |= ONIGENC_CASE_MODIFIED; code -= 0x20; } } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - 
flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); + if (flags & ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ + flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); + return (int )(to - to_start); } OnigEncodingDefine(iso_8859_2, ISO_8859_2) = { @@ -284,8 +284,8 @@ OnigEncodingDefine(iso_8859_2, ISO_8859_2) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, + case_map, 0, ONIGENC_FLAG_NONE, - case_map, }; ENC_ALIAS("ISO8859-2", "ISO-8859-2") diff --git a/enc/iso_8859_3.c b/enc/iso_8859_3.c index 365d9a77de..2a343eac63 100644 --- a/enc/iso_8859_3.c +++ b/enc/iso_8859_3.c @@ -223,45 +223,46 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, #define DOTLESS_i (0xB9) #define I_WITH_DOT_ABOVE (0xA9) static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) +case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, + const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, + const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp=0xB0 && code<=0xBF ) { + else if (code >= 0xB0 && code <= 0xBF) { code -= 0x10; } else { @@ -269,11 +270,11 @@ case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, } } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); + if (flags & ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ + flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); + return (int )(to - to_start); } OnigEncodingDefine(iso_8859_3, ISO_8859_3) 
= { @@ -293,8 +294,8 @@ OnigEncodingDefine(iso_8859_3, ISO_8859_3) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, + case_map, 0, ONIGENC_FLAG_NONE, - case_map, }; ENC_ALIAS("ISO8859-3", "ISO-8859-3") diff --git a/enc/iso_8859_4.c b/enc/iso_8859_4.c index 6d27300e22..e2134e8c0b 100644 --- a/enc/iso_8859_4.c +++ b/enc/iso_8859_4.c @@ -232,31 +232,32 @@ case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp=0xA0&&code<=0xBF) { - if (code==0xBF) + if (code >= 0xA0 && code <= 0xBF) { + if (code == 0xBF) code -= 0x02; else code -= 0x10; @@ -265,11 +266,11 @@ case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, code -= 0x20; } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); + if (flags & ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ + flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); + return (int )(to - to_start); } OnigEncodingDefine(iso_8859_4, ISO_8859_4) = { @@ -289,8 +290,8 @@ OnigEncodingDefine(iso_8859_4, ISO_8859_4) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, + case_map, 0, ONIGENC_FLAG_NONE, - case_map, }; ENC_ALIAS("ISO8859-4", "ISO-8859-4") diff --git a/enc/iso_8859_5.c b/enc/iso_8859_5.c index 5d67639f5e..6fafc35823 100644 --- a/enc/iso_8859_5.c +++ b/enc/iso_8859_5.c @@ -210,35 +210,35 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, } static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) +case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, + 
const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, + const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp=0xDD && code<=0xDF) { - code-=0x25; + else if (code >= 0xDD && code <= 0xDF) { + code -= 0x25; } - else if (code==0xFC) { - code-=0x40; + else if (code == 0xFC) { + code -= 0x40; } - else if (code==0xFD || code==0xFE) { - code-=0x3F; + else if (code == 0xFD || code == 0xFE) { + code -= 0x3F; } else { - code-=0x20; + code -= 0x20; } } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); + if (flags & ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ + flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); + return (int )(to - to_start); } OnigEncodingDefine(iso_8859_7, ISO_8859_7) = { @@ -277,8 +277,8 @@ OnigEncodingDefine(iso_8859_7, ISO_8859_7) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, + case_map, 0, ONIGENC_FLAG_NONE, - case_map, }; ENC_ALIAS("ISO8859-7", "ISO-8859-7") diff --git a/enc/iso_8859_8.c b/enc/iso_8859_8.c index 4777762849..0a7a29e82e 100644 --- a/enc/iso_8859_8.c +++ b/enc/iso_8859_8.c @@ -93,9 +93,9 @@ OnigEncodingDefine(iso_8859_8, ISO_8859_8) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, + onigenc_single_byte_ascii_only_case_map, 0, ONIGENC_FLAG_NONE, - onigenc_single_byte_ascii_only_case_map, }; ENC_ALIAS("ISO8859-8", "ISO-8859-8") diff --git a/enc/iso_8859_9.c b/enc/iso_8859_9.c index 064a04d480..004eec310f 100644 --- a/enc/iso_8859_9.c +++ b/enc/iso_8859_9.c @@ -204,9 +204,9 @@ apply_all_case_fold(OnigCaseFoldType flag, static int 
get_case_fold_codes_by_str(OnigCaseFoldType flag, - const OnigUChar* p, const OnigUChar* end, - OnigCaseFoldCodeItem items[], - OnigEncoding enc ARG_UNUSED) + const OnigUChar* p, const OnigUChar* end, + OnigCaseFoldCodeItem items[], + OnigEncoding enc ARG_UNUSED) { return onigenc_get_case_fold_codes_by_str_with_map( numberof(CaseFoldMap), CaseFoldMap, 1, @@ -216,53 +216,54 @@ get_case_fold_codes_by_str(OnigCaseFoldType flag, #define DOTLESS_i (0xFD) #define I_WITH_DOT_ABOVE (0xDD) static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) +case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, + const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, + const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp + * Copyright (c) 2002-2016 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -31,7 +31,10 @@ #include #include +#ifndef __USE_ISOC99 #define __USE_ISOC99 +#endif + #include #include "regenc.h" @@ -1108,11 +1111,13 @@ static int exec(FILE* fp, ENC_INFO* einfo) #define NCOL 8 int c, val, enc; + int r; enc = einfo->num; - fprintf(fp, "static const unsigned short Enc%s_CtypeTable[256] = {\n", - einfo->name); + r = fprintf(fp, "static const unsigned short Enc%s_CtypeTable[256] = {\n", + einfo->name); + if (r < 0) return -1; for (c = 0; c < 256; c++) { val = 0; @@ -1131,20 +1136,33 @@ static int exec(FILE* fp, ENC_INFO* einfo) if (IsWord (enc, c)) val |= BIT_CTYPE_WORD; if (IsAscii (enc, c)) val |= BIT_CTYPE_ASCII; - if (c % NCOL == 0) fputs(" ", fp); - fprintf(fp, "0x%04x", val); - if (c != 255) fputs(",", fp); + if (c % NCOL == 0) { + r = fputs(" ", fp); + if (r < 0) return -1; + } + r = fprintf(fp, "0x%04x", val); + if (r < 0) return -1; + + if (c != 255) { + r = fputs(",", fp); + if (r < 0) return -1; + } if (c != 0 && c % 
NCOL == (NCOL-1)) - fputs("\n", fp); + r = fputs("\n", fp); else - fputs(" ", fp); + r = fputs(" ", fp); + + if (r < 0) return -1; } - fprintf(fp, "};\n"); + r = fprintf(fp, "};\n"); + if (r < 0) return -1; + return 0; } extern int main(int argc ARG_UNUSED, char* argv[] ARG_UNUSED) { + int r; int i; FILE* fp = stdout; @@ -1155,7 +1173,11 @@ extern int main(int argc ARG_UNUSED, char* argv[] ARG_UNUSED) /* setlocale(LC_ALL, "fr_FR.iso88591"); */ for (i = 0; i < (int )(sizeof(Info)/sizeof(ENC_INFO)); i++) { - exec(fp, &Info[i]); + r = exec(fp, &Info[i]); + if (r < 0) { + fprintf(stderr, "FAIL exec(): %d\n", r); + return -1; + } } return 0; diff --git a/enc/shift_jis.c b/enc/shift_jis.c index c1552bfd13..eacca9a5db 100644 --- a/enc/shift_jis.c +++ b/enc/shift_jis.c @@ -28,7 +28,7 @@ * SUCH DAMAGE. */ -#include "regint.h" +#include "regenc.h" static const int EncLen_SJIS[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -563,9 +563,9 @@ OnigEncodingDefine(shift_jis, Shift_JIS) = { get_ctype_code_range, left_adjust_char_head, is_allowed_reverse_match, + onigenc_ascii_only_case_map, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; /* * Name: Shift_JIS diff --git a/enc/unicode.c b/enc/unicode.c index 39fb24408f..72ff5a96e7 100644 --- a/enc/unicode.c +++ b/enc/unicode.c @@ -139,17 +139,17 @@ code3_equal(const OnigCodePoint *x, const OnigCodePoint *y) /* macros related to ONIGENC_CASE flags */ /* defined here because not used in other files */ -#define ONIGENC_CASE_SPECIALS (ONIGENC_CASE_TITLECASE|ONIGENC_CASE_IS_TITLECASE|ONIGENC_CASE_UP_SPECIAL|ONIGENC_CASE_DOWN_SPECIAL) +#define ONIGENC_CASE_SPECIALS (ONIGENC_CASE_TITLECASE | ONIGENC_CASE_IS_TITLECASE | ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL) /* macros for length in CaseMappingSpecials array in enc/unicode/casefold.h */ #define SpecialsLengthOffset 25 /* needs to be higher than the 22 bits used for Unicode codepoints */ -#define SpecialsLengthExtract(n) ((n)>>SpecialsLengthOffset) -#define 
SpecialsCodepointExtract(n) ((n)&((1<> SpecialsLengthOffset) +#define SpecialsCodepointExtract(n) ((n) & ((1 << SpecialsLengthOffset) - 1)) +#define SpecialsLengthEncode(n) ((n) << SpecialsLengthOffset) -#define OnigSpecialIndexMask (((1<>OnigSpecialIndexShift) +#define OnigSpecialIndexMask (((1 << OnigSpecialIndexWidth) - 1) << OnigSpecialIndexShift) +#define OnigSpecialIndexEncode(n) ((n) << OnigSpecialIndexShift) +#define OnigSpecialIndexDecode(n) (((n) & OnigSpecialIndexMask) >> OnigSpecialIndexShift) /* macros to shorten "enc/unicode/casefold.h", undefined immediately after including the file */ #define U ONIGENC_CASE_UPCASE @@ -660,128 +660,130 @@ onigenc_unicode_case_map(OnigCaseFoldType* flagP, OnigUChar* to, OnigUChar* to_end, const struct OnigEncodingTypeST* enc) { - OnigCodePoint code; - OnigUChar *to_start = to; - OnigCaseFoldType flags = *flagP; - int codepoint_length; - - to_end -= CASE_MAPPING_SLACK; - /* copy flags ONIGENC_CASE_UPCASE and ONIGENC_CASE_DOWNCASE over to - * ONIGENC_CASE_UP_SPECIAL and ONIGENC_CASE_DOWN_SPECIAL */ - flags |= (flags&(ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE))<='a' && code<='z') { - if (flags&ONIGENC_CASE_UPCASE) { - MODIFIED; - if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code=='i') - code = I_WITH_DOT_ABOVE; - else - code += 'A'-'a'; - } - } - else if (code>='A' && code<='Z') { - if (flags&(ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD)) { - MODIFIED; - if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code=='I') - code = DOTLESS_i; - else - code += 'a'-'A'; - } - } + OnigCodePoint code; + OnigUChar *to_start = to; + OnigCaseFoldType flags = *flagP; + int codepoint_length; + + to_end -= CASE_MAPPING_SLACK; + /* copy flags ONIGENC_CASE_UPCASE and ONIGENC_CASE_DOWNCASE over to + * ONIGENC_CASE_UP_SPECIAL and ONIGENC_CASE_DOWN_SPECIAL */ + flags |= (flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) << ONIGENC_CASE_SPECIAL_OFFSET; + + while (*pp < end && to <= to_end) { + codepoint_length = ONIGENC_PRECISE_MBC_ENC_LEN(enc, *pp, 
end); + if (codepoint_length < 0) + return codepoint_length; /* encoding invalid */ + code = ONIGENC_MBC_TO_CODE(enc, *pp, end); + *pp += codepoint_length; + + if (code <= 'z') { /* ASCII comes first */ + if (code >= 'a' && code <= 'z') { + if (flags & ONIGENC_CASE_UPCASE) { + MODIFIED; + if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'i') + code = I_WITH_DOT_ABOVE; + else + code += 'A' - 'a'; } - else if (!(flags&ONIGENC_CASE_ASCII_ONLY) && code>=0x00B5) { /* deal with non-ASCII; micron sign (U+00B5) is lowest affected */ - const CodePointList3 *folded; - - if (code==I_WITH_DOT_ABOVE) { - if (flags&(ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD)) { - MODIFIED; - code = 'i'; - if (!(flags&ONIGENC_CASE_FOLD_TURKISH_AZERI)) { /* make dot above explicit */ - to += ONIGENC_CODE_TO_MBC(enc, code, to); - code = DOT_ABOVE; - } - } - } - else if (code==DOTLESS_i) { /* handle this manually, because it isn't involved in folding */ - if (flags&ONIGENC_CASE_UPCASE) - MODIFIED, code = 'I'; + } + else if (code >= 'A' && code <= 'Z') { + if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) { + MODIFIED; + if (flags & ONIGENC_CASE_FOLD_TURKISH_AZERI && code == 'I') + code = DOTLESS_i; + else + code += 'a' - 'A'; + } + } + } + else if (!(flags & ONIGENC_CASE_ASCII_ONLY) && code >= 0x00B5) { /* deal with non-ASCII; micron sign (U+00B5) is lowest affected */ + const CodePointList3 *folded; + + if (code == I_WITH_DOT_ABOVE) { + if (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD)) { + MODIFIED; + code = 'i'; + if (!(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI)) { /* make dot above explicit */ + to += ONIGENC_CODE_TO_MBC(enc, code, to); + code = DOT_ABOVE; + } + } + } + else if (code == DOTLESS_i) { /* handle this manually, because it isn't involved in folding */ + if (flags & ONIGENC_CASE_UPCASE) { + MODIFIED; + code = 'I'; + } + } + else if ((folded = onigenc_unicode_fold_lookup(code)) != 0) { /* data about character found in CaseFold_11_Table */ + if ((flags & 
ONIGENC_CASE_TITLECASE) /* Titlecase needed, */ + && (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */ + /* already Titlecase, no changes needed */ + } + else if (flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */ + const OnigCodePoint *next; + int count; + + MODIFIED; + if (flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_SPECIALS) { /* special */ + const OnigCodePoint *SpecialsStart = CaseMappingSpecials + OnigSpecialIndexDecode(folded->n); + + if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_IS_TITLECASE) { /* swapCASE available */ + if ((flags & (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) + == (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE)) /* swapCASE needed */ + goto SpecialsCopy; + else /* swapCASE not needed */ + SpecialsStart += SpecialsLengthExtract(*SpecialsStart); } - else if ((folded = onigenc_unicode_fold_lookup(code)) != 0) { /* data about character found in CaseFold_11_Table */ - if ((flags&ONIGENC_CASE_TITLECASE) /* Titlecase needed, */ - && (OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_IS_TITLECASE)) { /* but already Titlecase */ - /* already Titlecase, no changes needed */ - } - else if (flags&OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */ - const OnigCodePoint *next; - int count; - - MODIFIED; - if (flags&OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_SPECIALS) { /* special */ - const OnigCodePoint *SpecialsStart = CaseMappingSpecials + OnigSpecialIndexDecode(folded->n); - - if (OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_IS_TITLECASE) { /* swapCASE available */ - if ((flags&(ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) - == (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) /* swapCASE needed */ - goto SpecialsCopy; - else /* swapCASE not needed */ - SpecialsStart += SpecialsLengthExtract(*SpecialsStart); - } - if (OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_TITLECASE) { /* Titlecase available */ - if (flags&ONIGENC_CASE_TITLECASE) /* Titlecase needed, but not yet 
Titlecase */ - goto SpecialsCopy; - else /* Titlecase not needed */ - SpecialsStart += SpecialsLengthExtract(*SpecialsStart); - } - if (OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_DOWN_SPECIAL) { - if (!(flags&ONIGENC_CASE_DOWN_SPECIAL)) - SpecialsStart += SpecialsLengthExtract(*SpecialsStart); - } - /* here, we know we use ONIGENC_CASE_UP_SPECIAL, and the position is right */ - SpecialsCopy: - count = SpecialsLengthExtract(*SpecialsStart); - next = SpecialsStart; - code = SpecialsCodepointExtract(*next++); - } - else { /* no specials */ - count = OnigCodePointCount(folded->n); - next = folded->code; - code = *next++; - } - if (count==1) - ; - else if (count==2) { - to += ONIGENC_CODE_TO_MBC(enc, code, to); - code = *next; - } - else { /* count == 3 */ - to += ONIGENC_CODE_TO_MBC(enc, code, to); - to += ONIGENC_CODE_TO_MBC(enc, *next++, to); - code = *next; - } - } + if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) { /* Titlecase available */ + if (flags & ONIGENC_CASE_TITLECASE) /* Titlecase needed, but not yet Titlecase */ + goto SpecialsCopy; + else /* Titlecase not needed */ + SpecialsStart += SpecialsLengthExtract(*SpecialsStart); } - else if ((folded = onigenc_unicode_unfold1_lookup(code)) != 0 /* data about character found in CaseUnfold_11_Table */ - && flags&OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */ - MODIFIED; - code = folded->code[(flags&OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_TITLECASE) ? 
1 : 0]; + if (OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_DOWN_SPECIAL) { + if (!(flags & ONIGENC_CASE_DOWN_SPECIAL)) + SpecialsStart += SpecialsLengthExtract(*SpecialsStart); } + /* here, we know we use ONIGENC_CASE_UP_SPECIAL, and the position is right */ +SpecialsCopy: + count = SpecialsLengthExtract(*SpecialsStart); + next = SpecialsStart; + code = SpecialsCodepointExtract(*next++); + } + else { /* no specials */ + count = OnigCodePointCount(folded->n); + next = folded->code; + code = *next++; + } + if (count == 1) + ; + else if (count == 2) { + to += ONIGENC_CODE_TO_MBC(enc, code, to); + code = *next; + } + else { /* count == 3 */ + to += ONIGENC_CODE_TO_MBC(enc, code, to); + to += ONIGENC_CODE_TO_MBC(enc, *next++, to); + code = *next; + } } - to += ONIGENC_CODE_TO_MBC(enc, code, to); - /* switch from titlecase to lowercase for capitalize */ - if (flags & ONIGENC_CASE_TITLECASE) - flags ^= (ONIGENC_CASE_UPCASE |ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE| - ONIGENC_CASE_UP_SPECIAL|ONIGENC_CASE_DOWN_SPECIAL); + } + else if ((folded = onigenc_unicode_unfold1_lookup(code)) != 0 /* data about character found in CaseUnfold_11_Table */ + && flags & OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */ + MODIFIED; + code = folded->code[(flags & OnigCaseFoldFlags(folded->n) & ONIGENC_CASE_TITLECASE) ? 
1 : 0]; + } } - *flagP = flags; - return (int)(to-to_start); + to += ONIGENC_CODE_TO_MBC(enc, code, to); + /* switch from titlecase to lowercase for capitalize */ + if (flags & ONIGENC_CASE_TITLECASE) + flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE | + ONIGENC_CASE_UP_SPECIAL | ONIGENC_CASE_DOWN_SPECIAL); + } + *flagP = flags; + return (int )(to - to_start); } #if 0 diff --git a/enc/us_ascii.c b/enc/us_ascii.c index cf835e6538..08f9072c43 100644 --- a/enc/us_ascii.c +++ b/enc/us_ascii.c @@ -1,7 +1,10 @@ #include "regenc.h" -#include "encindex.h" +#ifdef RUBY +# include "encindex.h" +#endif + #ifndef ENCINDEX_US_ASCII -#define ENCINDEX_US_ASCII 0 +# define ENCINDEX_US_ASCII 0 #endif static int @@ -29,9 +32,9 @@ OnigEncodingDefine(us_ascii, US_ASCII) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, + onigenc_single_byte_ascii_only_case_map, ENCINDEX_US_ASCII, ONIGENC_FLAG_NONE, - onigenc_single_byte_ascii_only_case_map, }; ENC_ALIAS("ASCII", "US-ASCII") ENC_ALIAS("ANSI_X3.4-1968", "US-ASCII") diff --git a/enc/utf_16be.c b/enc/utf_16be.c index e8b97983bf..f9dd7119d6 100644 --- a/enc/utf_16be.c +++ b/enc/utf_16be.c @@ -249,8 +249,8 @@ OnigEncodingDefine(utf_16be, UTF_16BE) = { onigenc_utf16_32_get_ctype_code_range, utf16be_left_adjust_char_head, onigenc_always_false_is_allowed_reverse_match, + onigenc_unicode_case_map, 0, ONIGENC_FLAG_UNICODE, - onigenc_unicode_case_map, }; ENC_ALIAS("UCS-2BE", "UTF-16BE") diff --git a/enc/utf_16le.c b/enc/utf_16le.c index 67ec2ad178..2c8438d0be 100644 --- a/enc/utf_16le.c +++ b/enc/utf_16le.c @@ -242,7 +242,7 @@ OnigEncodingDefine(utf_16le, UTF_16LE) = { onigenc_utf16_32_get_ctype_code_range, utf16le_left_adjust_char_head, onigenc_always_false_is_allowed_reverse_match, + onigenc_unicode_case_map, 0, ONIGENC_FLAG_UNICODE, - onigenc_unicode_case_map, }; diff --git a/enc/utf_32be.c b/enc/utf_32be.c index 
a57b854674..995c9d8ed5 100644 --- a/enc/utf_32be.c +++ b/enc/utf_32be.c @@ -187,9 +187,8 @@ OnigEncodingDefine(utf_32be, UTF_32BE) = { onigenc_utf16_32_get_ctype_code_range, utf32be_left_adjust_char_head, onigenc_always_false_is_allowed_reverse_match, + onigenc_unicode_case_map, 0, ONIGENC_FLAG_UNICODE, - onigenc_unicode_case_map, }; ENC_ALIAS("UCS-4BE", "UTF-32BE") - diff --git a/enc/utf_32le.c b/enc/utf_32le.c index c48089d6ed..e255f0e246 100644 --- a/enc/utf_32le.c +++ b/enc/utf_32le.c @@ -187,8 +187,8 @@ OnigEncodingDefine(utf_32le, UTF_32LE) = { onigenc_utf16_32_get_ctype_code_range, utf32le_left_adjust_char_head, onigenc_always_false_is_allowed_reverse_match, + onigenc_unicode_case_map, 0, ONIGENC_FLAG_UNICODE, - onigenc_unicode_case_map, }; ENC_ALIAS("UCS-4LE", "UTF-32LE") diff --git a/enc/utf_8.c b/enc/utf_8.c index 862b13fd9b..3dad2f729b 100644 --- a/enc/utf_8.c +++ b/enc/utf_8.c @@ -28,17 +28,20 @@ */ #include "regenc.h" -#include "encindex.h" +#ifdef RUBY +# include "encindex.h" +#endif + #ifndef ENCINDEX_UTF_8 -#define ENCINDEX_UTF_8 0 +# define ENCINDEX_UTF_8 0 #endif #define USE_INVALID_CODE_SCHEME #ifdef USE_INVALID_CODE_SCHEME /* virtual codepoint values for invalid encoding byte 0xfe and 0xff */ -#define INVALID_CODE_FE 0xfffffffe -#define INVALID_CODE_FF 0xffffffff +# define INVALID_CODE_FE 0xfffffffe +# define INVALID_CODE_FF 0xffffffff #endif #define VALID_CODE_LIMIT 0x0010ffff @@ -428,9 +431,9 @@ OnigEncodingDefine(utf_8, UTF_8) = { get_ctype_code_range, left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, + onigenc_unicode_case_map, ENCINDEX_UTF_8, ONIGENC_FLAG_UNICODE, - onigenc_unicode_case_map, }; ENC_ALIAS("CP65001", "UTF-8") @@ -444,4 +447,3 @@ ENC_ALIAS("CP65001", "UTF-8") ENC_REPLICATE("UTF8-MAC", "UTF-8") ENC_ALIAS("UTF-8-MAC", "UTF8-MAC") ENC_ALIAS("UTF-8-HFS", "UTF8-MAC") /* Emacs 23.2 */ - diff --git a/enc/windows_1250.c b/enc/windows_1250.c index 47317ddaf6..d2cf7b16bc 100644 --- a/enc/windows_1250.c +++ 
b/enc/windows_1250.c @@ -191,40 +191,41 @@ cp1250_get_case_fold_codes_by_str(OnigCaseFoldType flag, } static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) +case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, + const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, + const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp= 0x8A && code <= 0xBF && code!=0xB9) code -= 0x10; @@ -232,11 +233,11 @@ case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, code -= 0x20; } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); + if (flags & ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ + flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); + return (int )(to - to_start); } OnigEncodingDefine(windows_1250, Windows_1250) = { @@ -256,9 +257,9 @@ OnigEncodingDefine(windows_1250, Windows_1250) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, + case_map, 0, ONIGENC_FLAG_NONE, - case_map, }; /* * Name: windows-1250 diff --git a/enc/windows_1251.c b/enc/windows_1251.c index 0f9b7fa69a..fcd0f1015d 100644 --- a/enc/windows_1251.c +++ b/enc/windows_1251.c @@ -181,49 +181,50 @@ cp1251_get_case_fold_codes_by_str(OnigCaseFoldType flag, } static int -case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) +case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, + const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, + const struct OnigEncodingTypeST* enc) { OnigCodePoint code; 
OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp=0xDD && code<=0xDF) + else if (code >= 0xDD && code <= 0xDF) code -= 0x25; - else if (code==0xFC) + else if (code == 0xFC) code = 0xBC; - else if (code==0xFD || code==0xFE) + else if (code == 0xFD || code == 0xFE) code -= 0x3F; else code -= 0x20; } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); + if (flags & ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ + flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); + return (int )(to - to_start); } OnigEncodingDefine(windows_1253, Windows_1253) = { @@ -289,8 +290,8 @@ OnigEncodingDefine(windows_1253, Windows_1253) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, + case_map, 0, ONIGENC_FLAG_NONE, - case_map, }; ENC_ALIAS("CP1253", "Windows-1253") diff --git a/enc/windows_1254.c b/enc/windows_1254.c index 2ccf966b8e..9ae66978a2 100644 --- a/enc/windows_1254.c +++ b/enc/windows_1254.c @@ -212,9 +212,9 @@ apply_all_case_fold(OnigCaseFoldType flag, static int get_case_fold_codes_by_str(OnigCaseFoldType flag, - const OnigUChar* p, const OnigUChar* end, - OnigCaseFoldCodeItem items[], - OnigEncoding enc ARG_UNUSED) + const OnigUChar* p, const OnigUChar* end, + OnigCaseFoldCodeItem items[], + OnigEncoding enc ARG_UNUSED) { return onigenc_get_case_fold_codes_by_str_with_map( numberof(CaseFoldMap), CaseFoldMap, 1, @@ -232,49 +232,50 @@ case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp=0xB0 && code<=0xBF ) + else if (code >= 0xB0 && code <= 0xBF) code -= 0x10; else code -= 0x20; } *to++ = code; - if (flags&ONIGENC_CASE_TITLECASE) /* switch from titlecase to 
lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); + if (flags & ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ + flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); + return (int )(to - to_start); } OnigEncodingDefine(windows_1257, Windows_1257) = { @@ -296,9 +297,8 @@ OnigEncodingDefine(windows_1257, Windows_1257) = { onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, onigenc_always_true_is_allowed_reverse_match, + case_map, 0, ONIGENC_FLAG_NONE, - case_map, }; - ENC_ALIAS("CP1257", "Windows-1257") diff --git a/enc/windows_31j.c b/enc/windows_31j.c index 71836c1f13..174f8983c4 100644 --- a/enc/windows_31j.c +++ b/enc/windows_31j.c @@ -33,7 +33,7 @@ OnigEncodingDefine(windows_31j, Windows_31J) = { mbc_enc_len, - "Windows-31J", /* name */ + "Windows-31J", /* name */ 2, /* max byte length */ 1, /* min byte length */ onigenc_is_mbc_newline_0x0a, @@ -48,9 +48,9 @@ OnigEncodingDefine(windows_31j, Windows_31J) = { get_ctype_code_range, left_adjust_char_head, is_allowed_reverse_match, + onigenc_ascii_only_case_map, 0, ONIGENC_FLAG_NONE, - onigenc_ascii_only_case_map, }; /* * Name: Windows-31J diff --git a/include/ruby/onigmo.h b/include/ruby/onigmo.h new file mode 100644 index 0000000000..228aa77ea5 --- /dev/null +++ b/include/ruby/onigmo.h @@ -0,0 +1,934 @@ +#ifndef ONIGMO_H +#define ONIGMO_H +/********************************************************************** + onigmo.h - Onigmo (Oniguruma-mod) (regular expression library) +**********************************************************************/ +/*- + * Copyright (c) 2002-2009 K.Kosako + * Copyright (c) 2011-2016 K.Takata + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifdef __cplusplus +extern "C" { +# if 0 +} /* satisfy cc-mode */ +# endif +#endif + +#define ONIGMO_VERSION_MAJOR 6 +#define ONIGMO_VERSION_MINOR 0 +#define ONIGMO_VERSION_TEENY 0 + +#ifndef ONIG_EXTERN +# ifdef RUBY_EXTERN +# define ONIG_EXTERN RUBY_EXTERN +# else +# if defined(_WIN32) && !defined(__GNUC__) +# if defined(EXPORT) || defined(RUBY_EXPORT) +# define ONIG_EXTERN extern __declspec(dllexport) +# else +# define ONIG_EXTERN extern __declspec(dllimport) +# endif +# endif +# endif +#endif + +#ifndef ONIG_EXTERN +# define ONIG_EXTERN extern +#endif + +#ifndef RUBY +# ifndef RUBY_SYMBOL_EXPORT_BEGIN +# define RUBY_SYMBOL_EXPORT_BEGIN +# define RUBY_SYMBOL_EXPORT_END +# endif +#endif + +RUBY_SYMBOL_EXPORT_BEGIN + +#include <stddef.h> /* for size_t */ + +/* PART: character encoding */ + +#ifndef ONIG_ESCAPE_UCHAR_COLLISION +# define UChar OnigUChar +#endif + +typedef unsigned char OnigUChar; +typedef unsigned int OnigCodePoint; +typedef unsigned int OnigCtype; +typedef size_t OnigDistance; +typedef ptrdiff_t OnigPosition; + +#define ONIG_INFINITE_DISTANCE ~((OnigDistance )0) + +/* + * Onig casefold/case mapping flags and related definitions + * + * Subfields (starting with 0 at LSB): + * 0-2: Code point count in casefold.h + * 3-12: Index into SpecialCaseMapping array in casefold.h + * 13-22: Case folding/mapping flags + */ +typedef unsigned int OnigCaseFoldType; /* case fold flag */ + +ONIG_EXTERN OnigCaseFoldType OnigDefaultCaseFoldFlag; + +/* bits for actual code point count; 3 bits is more than enough, currently only 2 used */ +#define OnigCodePointMaskWidth 3 +#define OnigCodePointMask ((1< Unicode:0x1ffc */ + +/* code range */ +#define ONIGENC_CODE_RANGE_NUM(range) ((int )range[0]) +#define ONIGENC_CODE_RANGE_FROM(range,i) range[((i)*2) + 1] +#define ONIGENC_CODE_RANGE_TO(range,i) range[((i)*2) + 2] + +typedef struct { + int byte_len; /* argument(original) character(s) byte length */ + int code_len; /* number of code */ + OnigCodePoint 
code[ONIGENC_MAX_COMP_CASE_FOLD_CODE_LEN]; +} OnigCaseFoldCodeItem; + +typedef struct { + OnigCodePoint esc; + OnigCodePoint anychar; + OnigCodePoint anytime; + OnigCodePoint zero_or_one_time; + OnigCodePoint one_or_more_time; + OnigCodePoint anychar_anytime; +} OnigMetaCharTableType; + +typedef int (*OnigApplyAllCaseFoldFunc)(OnigCodePoint from, OnigCodePoint* to, int to_len, void* arg); + +typedef struct OnigEncodingTypeST { + int (*precise_mbc_enc_len)(const OnigUChar* p,const OnigUChar* e, const struct OnigEncodingTypeST* enc); + const char* name; + int max_enc_len; + int min_enc_len; + int (*is_mbc_newline)(const OnigUChar* p, const OnigUChar* end, const struct OnigEncodingTypeST* enc); + OnigCodePoint (*mbc_to_code)(const OnigUChar* p, const OnigUChar* end, const struct OnigEncodingTypeST* enc); + int (*code_to_mbclen)(OnigCodePoint code, const struct OnigEncodingTypeST* enc); + int (*code_to_mbc)(OnigCodePoint code, OnigUChar *buf, const struct OnigEncodingTypeST* enc); + int (*mbc_case_fold)(OnigCaseFoldType flag, const OnigUChar** pp, const OnigUChar* end, OnigUChar* to, const struct OnigEncodingTypeST* enc); + int (*apply_all_case_fold)(OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void* arg, const struct OnigEncodingTypeST* enc); + int (*get_case_fold_codes_by_str)(OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem acs[], const struct OnigEncodingTypeST* enc); + int (*property_name_to_ctype)(const struct OnigEncodingTypeST* enc, const OnigUChar* p, const OnigUChar* end); + int (*is_code_ctype)(OnigCodePoint code, OnigCtype ctype, const struct OnigEncodingTypeST* enc); + int (*get_ctype_code_range)(OnigCtype ctype, OnigCodePoint* sb_out, const OnigCodePoint* ranges[], const struct OnigEncodingTypeST* enc); + OnigUChar* (*left_adjust_char_head)(const OnigUChar* start, const OnigUChar* p, const OnigUChar* end, const struct OnigEncodingTypeST* enc); + int (*is_allowed_reverse_match)(const OnigUChar* p, const OnigUChar* 
end, const struct OnigEncodingTypeST* enc); + int (*case_map)(OnigCaseFoldType* flagP, const OnigUChar** pp, const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, const struct OnigEncodingTypeST* enc); + int ruby_encoding_index; + unsigned int flags; +} OnigEncodingType; + +typedef const OnigEncodingType* OnigEncoding; + +ONIG_EXTERN const OnigEncodingType OnigEncodingASCII; +#ifndef RUBY +ONIG_EXTERN const OnigEncodingType OnigEncodingISO_8859_1; +ONIG_EXTERN const OnigEncodingType OnigEncodingISO_8859_2; +ONIG_EXTERN const OnigEncodingType OnigEncodingISO_8859_3; +ONIG_EXTERN const OnigEncodingType OnigEncodingISO_8859_4; +ONIG_EXTERN const OnigEncodingType OnigEncodingISO_8859_5; +ONIG_EXTERN const OnigEncodingType OnigEncodingISO_8859_6; +ONIG_EXTERN const OnigEncodingType OnigEncodingISO_8859_7; +ONIG_EXTERN const OnigEncodingType OnigEncodingISO_8859_8; +ONIG_EXTERN const OnigEncodingType OnigEncodingISO_8859_9; +ONIG_EXTERN const OnigEncodingType OnigEncodingISO_8859_10; +ONIG_EXTERN const OnigEncodingType OnigEncodingISO_8859_11; +ONIG_EXTERN const OnigEncodingType OnigEncodingISO_8859_13; +ONIG_EXTERN const OnigEncodingType OnigEncodingISO_8859_14; +ONIG_EXTERN const OnigEncodingType OnigEncodingISO_8859_15; +ONIG_EXTERN const OnigEncodingType OnigEncodingISO_8859_16; +ONIG_EXTERN const OnigEncodingType OnigEncodingUTF_8; +ONIG_EXTERN const OnigEncodingType OnigEncodingUTF_16BE; +ONIG_EXTERN const OnigEncodingType OnigEncodingUTF_16LE; +ONIG_EXTERN const OnigEncodingType OnigEncodingUTF_32BE; +ONIG_EXTERN const OnigEncodingType OnigEncodingUTF_32LE; +ONIG_EXTERN const OnigEncodingType OnigEncodingEUC_JP; +ONIG_EXTERN const OnigEncodingType OnigEncodingEUC_TW; +ONIG_EXTERN const OnigEncodingType OnigEncodingEUC_KR; +ONIG_EXTERN const OnigEncodingType OnigEncodingEUC_CN; +ONIG_EXTERN const OnigEncodingType OnigEncodingShift_JIS; +ONIG_EXTERN const OnigEncodingType OnigEncodingWindows_31J; +/* ONIG_EXTERN const OnigEncodingType OnigEncodingKOI8; */ 
+ONIG_EXTERN const OnigEncodingType OnigEncodingKOI8_R; +ONIG_EXTERN const OnigEncodingType OnigEncodingKOI8_U; +ONIG_EXTERN const OnigEncodingType OnigEncodingWindows_1250; +ONIG_EXTERN const OnigEncodingType OnigEncodingWindows_1251; +ONIG_EXTERN const OnigEncodingType OnigEncodingWindows_1252; +ONIG_EXTERN const OnigEncodingType OnigEncodingWindows_1253; +ONIG_EXTERN const OnigEncodingType OnigEncodingWindows_1254; +ONIG_EXTERN const OnigEncodingType OnigEncodingWindows_1257; +ONIG_EXTERN const OnigEncodingType OnigEncodingBIG5; +ONIG_EXTERN const OnigEncodingType OnigEncodingGB18030; +#endif /* RUBY */ + +#define ONIG_ENCODING_ASCII (&OnigEncodingASCII) +#ifndef RUBY +# define ONIG_ENCODING_ISO_8859_1 (&OnigEncodingISO_8859_1) +# define ONIG_ENCODING_ISO_8859_2 (&OnigEncodingISO_8859_2) +# define ONIG_ENCODING_ISO_8859_3 (&OnigEncodingISO_8859_3) +# define ONIG_ENCODING_ISO_8859_4 (&OnigEncodingISO_8859_4) +# define ONIG_ENCODING_ISO_8859_5 (&OnigEncodingISO_8859_5) +# define ONIG_ENCODING_ISO_8859_6 (&OnigEncodingISO_8859_6) +# define ONIG_ENCODING_ISO_8859_7 (&OnigEncodingISO_8859_7) +# define ONIG_ENCODING_ISO_8859_8 (&OnigEncodingISO_8859_8) +# define ONIG_ENCODING_ISO_8859_9 (&OnigEncodingISO_8859_9) +# define ONIG_ENCODING_ISO_8859_10 (&OnigEncodingISO_8859_10) +# define ONIG_ENCODING_ISO_8859_11 (&OnigEncodingISO_8859_11) +# define ONIG_ENCODING_ISO_8859_13 (&OnigEncodingISO_8859_13) +# define ONIG_ENCODING_ISO_8859_14 (&OnigEncodingISO_8859_14) +# define ONIG_ENCODING_ISO_8859_15 (&OnigEncodingISO_8859_15) +# define ONIG_ENCODING_ISO_8859_16 (&OnigEncodingISO_8859_16) +# define ONIG_ENCODING_UTF_8 (&OnigEncodingUTF_8) +# define ONIG_ENCODING_UTF_16BE (&OnigEncodingUTF_16BE) +# define ONIG_ENCODING_UTF_16LE (&OnigEncodingUTF_16LE) +# define ONIG_ENCODING_UTF_32BE (&OnigEncodingUTF_32BE) +# define ONIG_ENCODING_UTF_32LE (&OnigEncodingUTF_32LE) +# define ONIG_ENCODING_EUC_JP (&OnigEncodingEUC_JP) +# define ONIG_ENCODING_EUC_TW (&OnigEncodingEUC_TW) +# 
define ONIG_ENCODING_EUC_KR (&OnigEncodingEUC_KR) +# define ONIG_ENCODING_EUC_CN (&OnigEncodingEUC_CN) +# define ONIG_ENCODING_SHIFT_JIS (&OnigEncodingShift_JIS) +# define ONIG_ENCODING_WINDOWS_31J (&OnigEncodingWindows_31J) +/* # define ONIG_ENCODING_KOI8 (&OnigEncodingKOI8) */ +# define ONIG_ENCODING_KOI8_R (&OnigEncodingKOI8_R) +# define ONIG_ENCODING_KOI8_U (&OnigEncodingKOI8_U) +# define ONIG_ENCODING_WINDOWS_1250 (&OnigEncodingWindows_1250) +# define ONIG_ENCODING_WINDOWS_1251 (&OnigEncodingWindows_1251) +# define ONIG_ENCODING_WINDOWS_1252 (&OnigEncodingWindows_1252) +# define ONIG_ENCODING_WINDOWS_1253 (&OnigEncodingWindows_1253) +# define ONIG_ENCODING_WINDOWS_1254 (&OnigEncodingWindows_1254) +# define ONIG_ENCODING_WINDOWS_1257 (&OnigEncodingWindows_1257) +# define ONIG_ENCODING_BIG5 (&OnigEncodingBIG5) +# define ONIG_ENCODING_GB18030 (&OnigEncodingGB18030) + +/* old names */ +# define ONIG_ENCODING_SJIS ONIG_ENCODING_SHIFT_JIS +# define ONIG_ENCODING_CP932 ONIG_ENCODING_WINDOWS_31J +# define ONIG_ENCODING_CP1250 ONIG_ENCODING_WINDOWS_1250 +# define ONIG_ENCODING_CP1251 ONIG_ENCODING_WINDOWS_1251 +# define ONIG_ENCODING_CP1252 ONIG_ENCODING_WINDOWS_1252 +# define ONIG_ENCODING_CP1253 ONIG_ENCODING_WINDOWS_1253 +# define ONIG_ENCODING_CP1254 ONIG_ENCODING_WINDOWS_1254 +# define ONIG_ENCODING_CP1257 ONIG_ENCODING_WINDOWS_1257 +# define ONIG_ENCODING_UTF8 ONIG_ENCODING_UTF_8 +# define ONIG_ENCODING_UTF16_BE ONIG_ENCODING_UTF_16BE +# define ONIG_ENCODING_UTF16_LE ONIG_ENCODING_UTF_16LE +# define ONIG_ENCODING_UTF32_BE ONIG_ENCODING_UTF_32BE +# define ONIG_ENCODING_UTF32_LE ONIG_ENCODING_UTF_32LE +#endif /* RUBY */ + +#define ONIG_ENCODING_UNDEF ((OnigEncoding )0) + +/* this declaration needs to be here because it is used in string.c in Ruby */ +ONIG_EXTERN +int onigenc_ascii_only_case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, const struct OnigEncodingTypeST* enc); + + +/* work size */ +#define 
ONIGENC_CODE_TO_MBC_MAXLEN 7 +#define ONIGENC_MBC_CASE_FOLD_MAXLEN 18 +/* 18: 6(max-byte) * 3(case-fold chars) */ + +/* character types */ +#define ONIGENC_CTYPE_NEWLINE 0 +#define ONIGENC_CTYPE_ALPHA 1 +#define ONIGENC_CTYPE_BLANK 2 +#define ONIGENC_CTYPE_CNTRL 3 +#define ONIGENC_CTYPE_DIGIT 4 +#define ONIGENC_CTYPE_GRAPH 5 +#define ONIGENC_CTYPE_LOWER 6 +#define ONIGENC_CTYPE_PRINT 7 +#define ONIGENC_CTYPE_PUNCT 8 +#define ONIGENC_CTYPE_SPACE 9 +#define ONIGENC_CTYPE_UPPER 10 +#define ONIGENC_CTYPE_XDIGIT 11 +#define ONIGENC_CTYPE_WORD 12 +#define ONIGENC_CTYPE_ALNUM 13 /* alpha || digit */ +#define ONIGENC_CTYPE_ASCII 14 +#define ONIGENC_MAX_STD_CTYPE ONIGENC_CTYPE_ASCII + +/* flags */ +#define ONIGENC_FLAG_NONE 0U +#define ONIGENC_FLAG_UNICODE 1U + +#define onig_enc_len(enc,p,e) ONIGENC_MBC_ENC_LEN(enc, p, e) + +#define ONIGENC_IS_UNDEF(enc) ((enc) == ONIG_ENCODING_UNDEF) +#define ONIGENC_IS_SINGLEBYTE(enc) (ONIGENC_MBC_MAXLEN(enc) == 1) +#define ONIGENC_IS_MBC_HEAD(enc,p,e) (ONIGENC_MBC_ENC_LEN(enc,p,e) != 1) +#define ONIGENC_IS_MBC_ASCII(p) (*(p) < 128) +#define ONIGENC_IS_CODE_ASCII(code) ((code) < 128) +#define ONIGENC_IS_MBC_WORD(enc,s,end) \ + ONIGENC_IS_CODE_WORD(enc,ONIGENC_MBC_TO_CODE(enc,s,end)) +#define ONIGENC_IS_MBC_ASCII_WORD(enc,s,end) \ + onigenc_ascii_is_code_ctype( \ + ONIGENC_MBC_TO_CODE(enc,s,end),ONIGENC_CTYPE_WORD,enc) +#define ONIGENC_IS_UNICODE(enc) ((enc)->flags & ONIGENC_FLAG_UNICODE) + + +#define ONIGENC_NAME(enc) ((enc)->name) + +#define ONIGENC_MBC_CASE_FOLD(enc,flag,pp,end,buf) \ + (enc)->mbc_case_fold(flag,(const OnigUChar** )pp,end,buf,enc) +#define ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc,s,end) \ + (enc)->is_allowed_reverse_match(s,end,enc) +#define ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc,start,s,end) \ + (enc)->left_adjust_char_head(start, s, end, enc) +#define ONIGENC_APPLY_ALL_CASE_FOLD(enc,case_fold_flag,f,arg) \ + (enc)->apply_all_case_fold(case_fold_flag,f,arg,enc) +#define 
ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc,case_fold_flag,p,end,acs) \ + (enc)->get_case_fold_codes_by_str(case_fold_flag,p,end,acs,enc) +#define ONIGENC_STEP_BACK(enc,start,s,end,n) \ + onigenc_step_back((enc),(start),(s),(end),(n)) + +#define ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n) (n) +#define ONIGENC_MBCLEN_CHARFOUND_P(r) (0 < (r)) +#define ONIGENC_MBCLEN_CHARFOUND_LEN(r) (r) + +#define ONIGENC_CONSTRUCT_MBCLEN_INVALID() (-1) +#define ONIGENC_MBCLEN_INVALID_P(r) ((r) == -1) + +#define ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n) (-1-(n)) +#define ONIGENC_MBCLEN_NEEDMORE_P(r) ((r) < -1) +#define ONIGENC_MBCLEN_NEEDMORE_LEN(r) (-1-(r)) + +#define ONIGENC_PRECISE_MBC_ENC_LEN(enc,p,e) (enc)->precise_mbc_enc_len(p,e,enc) + +ONIG_EXTERN +int onigenc_mbclen_approximate(const OnigUChar* p,const OnigUChar* e, const struct OnigEncodingTypeST* enc); + +#define ONIGENC_MBC_ENC_LEN(enc,p,e) onigenc_mbclen_approximate(p,e,enc) +#define ONIGENC_MBC_MAXLEN(enc) ((enc)->max_enc_len) +#define ONIGENC_MBC_MAXLEN_DIST(enc) ONIGENC_MBC_MAXLEN(enc) +#define ONIGENC_MBC_MINLEN(enc) ((enc)->min_enc_len) +#define ONIGENC_IS_MBC_NEWLINE(enc,p,end) (enc)->is_mbc_newline((p),(end),enc) +#define ONIGENC_MBC_TO_CODE(enc,p,end) (enc)->mbc_to_code((p),(end),enc) +#define ONIGENC_CODE_TO_MBCLEN(enc,code) (enc)->code_to_mbclen(code,enc) +#define ONIGENC_CODE_TO_MBC(enc,code,buf) (enc)->code_to_mbc(code,buf,enc) +#define ONIGENC_PROPERTY_NAME_TO_CTYPE(enc,p,end) \ + (enc)->property_name_to_ctype(enc,p,end) + +#define ONIGENC_IS_CODE_CTYPE(enc,code,ctype) (enc)->is_code_ctype(code,ctype,enc) + +#define ONIGENC_IS_CODE_NEWLINE(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_NEWLINE) +#define ONIGENC_IS_CODE_GRAPH(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_GRAPH) +#define ONIGENC_IS_CODE_PRINT(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_PRINT) +#define ONIGENC_IS_CODE_ALNUM(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_ALNUM) +#define 
ONIGENC_IS_CODE_ALPHA(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_ALPHA) +#define ONIGENC_IS_CODE_LOWER(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_LOWER) +#define ONIGENC_IS_CODE_UPPER(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_UPPER) +#define ONIGENC_IS_CODE_CNTRL(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_CNTRL) +#define ONIGENC_IS_CODE_PUNCT(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_PUNCT) +#define ONIGENC_IS_CODE_SPACE(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_SPACE) +#define ONIGENC_IS_CODE_BLANK(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_BLANK) +#define ONIGENC_IS_CODE_DIGIT(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_DIGIT) +#define ONIGENC_IS_CODE_XDIGIT(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_XDIGIT) +#define ONIGENC_IS_CODE_WORD(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_WORD) + +#define ONIGENC_GET_CTYPE_CODE_RANGE(enc,ctype,sbout,ranges) \ + (enc)->get_ctype_code_range(ctype,sbout,ranges,enc) + +ONIG_EXTERN +OnigUChar* onigenc_step_back(OnigEncoding enc, const OnigUChar* start, const OnigUChar* s, const OnigUChar* end, int n); + + +/* encoding API */ +ONIG_EXTERN +int onigenc_init(void); +ONIG_EXTERN +int onigenc_set_default_encoding(OnigEncoding enc); +ONIG_EXTERN +OnigEncoding onigenc_get_default_encoding(void); +ONIG_EXTERN +OnigUChar* onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc, const OnigUChar* start, const OnigUChar* s, const OnigUChar* end, const OnigUChar** prev); +ONIG_EXTERN +OnigUChar* onigenc_get_prev_char_head(OnigEncoding enc, const OnigUChar* start, const OnigUChar* s, const OnigUChar* end); +ONIG_EXTERN +OnigUChar* onigenc_get_left_adjust_char_head(OnigEncoding enc, const OnigUChar* start, const OnigUChar* s, const OnigUChar* end); +ONIG_EXTERN +OnigUChar* onigenc_get_right_adjust_char_head(OnigEncoding enc, const OnigUChar* start, const OnigUChar* s, const 
OnigUChar* end); +ONIG_EXTERN +int onigenc_strlen(OnigEncoding enc, const OnigUChar* p, const OnigUChar* end); +ONIG_EXTERN +int onigenc_strlen_null(OnigEncoding enc, const OnigUChar* p); +ONIG_EXTERN +int onigenc_str_bytelen_null(OnigEncoding enc, const OnigUChar* p); + + + +/* PART: regular expression */ + +/* config parameters */ +#define ONIG_NREGION 10 +#define ONIG_MAX_CAPTURE_GROUP_NUM 32767 +#define ONIG_MAX_BACKREF_NUM 1000 +#define ONIG_MAX_REPEAT_NUM 100000 +#define ONIG_MAX_MULTI_BYTE_RANGES_NUM 10000 +/* constants */ +#define ONIG_MAX_ERROR_MESSAGE_LEN 90 + +typedef unsigned int OnigOptionType; + +#define ONIG_OPTION_DEFAULT ONIG_OPTION_NONE + +/* options */ +#define ONIG_OPTION_NONE 0U +#define ONIG_OPTION_IGNORECASE 1U +#define ONIG_OPTION_EXTEND (ONIG_OPTION_IGNORECASE << 1) +#define ONIG_OPTION_MULTILINE (ONIG_OPTION_EXTEND << 1) +#define ONIG_OPTION_DOTALL ONIG_OPTION_MULTILINE +#define ONIG_OPTION_SINGLELINE (ONIG_OPTION_MULTILINE << 1) +#define ONIG_OPTION_FIND_LONGEST (ONIG_OPTION_SINGLELINE << 1) +#define ONIG_OPTION_FIND_NOT_EMPTY (ONIG_OPTION_FIND_LONGEST << 1) +#define ONIG_OPTION_NEGATE_SINGLELINE (ONIG_OPTION_FIND_NOT_EMPTY << 1) +#define ONIG_OPTION_DONT_CAPTURE_GROUP (ONIG_OPTION_NEGATE_SINGLELINE << 1) +#define ONIG_OPTION_CAPTURE_GROUP (ONIG_OPTION_DONT_CAPTURE_GROUP << 1) +/* options (search time) */ +#define ONIG_OPTION_NOTBOL (ONIG_OPTION_CAPTURE_GROUP << 1) +#define ONIG_OPTION_NOTEOL (ONIG_OPTION_NOTBOL << 1) +#define ONIG_OPTION_NOTBOS (ONIG_OPTION_NOTEOL << 1) +#define ONIG_OPTION_NOTEOS (ONIG_OPTION_NOTBOS << 1) +/* options (ctype range) */ +#define ONIG_OPTION_ASCII_RANGE (ONIG_OPTION_NOTEOS << 1) +#define ONIG_OPTION_POSIX_BRACKET_ALL_RANGE (ONIG_OPTION_ASCII_RANGE << 1) +#define ONIG_OPTION_WORD_BOUND_ALL_RANGE (ONIG_OPTION_POSIX_BRACKET_ALL_RANGE << 1) +/* options (newline) */ +#define ONIG_OPTION_NEWLINE_CRLF (ONIG_OPTION_WORD_BOUND_ALL_RANGE << 1) +#define ONIG_OPTION_MAXBIT ONIG_OPTION_NEWLINE_CRLF /* limit */ + 
+#define ONIG_OPTION_ON(options,regopt) ((options) |= (regopt)) +#define ONIG_OPTION_OFF(options,regopt) ((options) &= ~(regopt)) +#define ONIG_IS_OPTION_ON(options,option) ((options) & (option)) + +/* syntax */ +typedef struct { + unsigned int op; + unsigned int op2; + unsigned int behavior; + OnigOptionType options; /* default option */ + OnigMetaCharTableType meta_char_table; +} OnigSyntaxType; + +ONIG_EXTERN const OnigSyntaxType OnigSyntaxASIS; +ONIG_EXTERN const OnigSyntaxType OnigSyntaxPosixBasic; +ONIG_EXTERN const OnigSyntaxType OnigSyntaxPosixExtended; +ONIG_EXTERN const OnigSyntaxType OnigSyntaxEmacs; +ONIG_EXTERN const OnigSyntaxType OnigSyntaxGrep; +ONIG_EXTERN const OnigSyntaxType OnigSyntaxGnuRegex; +ONIG_EXTERN const OnigSyntaxType OnigSyntaxJava; +ONIG_EXTERN const OnigSyntaxType OnigSyntaxPerl58; +ONIG_EXTERN const OnigSyntaxType OnigSyntaxPerl58_NG; +ONIG_EXTERN const OnigSyntaxType OnigSyntaxPerl; +ONIG_EXTERN const OnigSyntaxType OnigSyntaxRuby; +ONIG_EXTERN const OnigSyntaxType OnigSyntaxPython; + +/* predefined syntaxes (see regsyntax.c) */ +#define ONIG_SYNTAX_ASIS (&OnigSyntaxASIS) +#define ONIG_SYNTAX_POSIX_BASIC (&OnigSyntaxPosixBasic) +#define ONIG_SYNTAX_POSIX_EXTENDED (&OnigSyntaxPosixExtended) +#define ONIG_SYNTAX_EMACS (&OnigSyntaxEmacs) +#define ONIG_SYNTAX_GREP (&OnigSyntaxGrep) +#define ONIG_SYNTAX_GNU_REGEX (&OnigSyntaxGnuRegex) +#define ONIG_SYNTAX_JAVA (&OnigSyntaxJava) +#define ONIG_SYNTAX_PERL58 (&OnigSyntaxPerl58) +#define ONIG_SYNTAX_PERL58_NG (&OnigSyntaxPerl58_NG) +#define ONIG_SYNTAX_PERL (&OnigSyntaxPerl) +#define ONIG_SYNTAX_RUBY (&OnigSyntaxRuby) +#define ONIG_SYNTAX_PYTHON (&OnigSyntaxPython) + +/* default syntax */ +ONIG_EXTERN const OnigSyntaxType* OnigDefaultSyntax; +#define ONIG_SYNTAX_DEFAULT OnigDefaultSyntax + +/* syntax (operators) */ +#define ONIG_SYN_OP_VARIABLE_META_CHARACTERS (1U<<0) +#define ONIG_SYN_OP_DOT_ANYCHAR (1U<<1) /* . 
*/ +#define ONIG_SYN_OP_ASTERISK_ZERO_INF (1U<<2) /* * */ +#define ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF (1U<<3) +#define ONIG_SYN_OP_PLUS_ONE_INF (1U<<4) /* + */ +#define ONIG_SYN_OP_ESC_PLUS_ONE_INF (1U<<5) +#define ONIG_SYN_OP_QMARK_ZERO_ONE (1U<<6) /* ? */ +#define ONIG_SYN_OP_ESC_QMARK_ZERO_ONE (1U<<7) +#define ONIG_SYN_OP_BRACE_INTERVAL (1U<<8) /* {lower,upper} */ +#define ONIG_SYN_OP_ESC_BRACE_INTERVAL (1U<<9) /* \{lower,upper\} */ +#define ONIG_SYN_OP_VBAR_ALT (1U<<10) /* | */ +#define ONIG_SYN_OP_ESC_VBAR_ALT (1U<<11) /* \| */ +#define ONIG_SYN_OP_LPAREN_SUBEXP (1U<<12) /* (...) */ +#define ONIG_SYN_OP_ESC_LPAREN_SUBEXP (1U<<13) /* \(...\) */ +#define ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR (1U<<14) /* \A, \Z, \z */ +#define ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR (1U<<15) /* \G */ +#define ONIG_SYN_OP_DECIMAL_BACKREF (1U<<16) /* \num */ +#define ONIG_SYN_OP_BRACKET_CC (1U<<17) /* [...] */ +#define ONIG_SYN_OP_ESC_W_WORD (1U<<18) /* \w, \W */ +#define ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END (1U<<19) /* \<. \> */ +#define ONIG_SYN_OP_ESC_B_WORD_BOUND (1U<<20) /* \b, \B */ +#define ONIG_SYN_OP_ESC_S_WHITE_SPACE (1U<<21) /* \s, \S */ +#define ONIG_SYN_OP_ESC_D_DIGIT (1U<<22) /* \d, \D */ +#define ONIG_SYN_OP_LINE_ANCHOR (1U<<23) /* ^, $ */ +#define ONIG_SYN_OP_POSIX_BRACKET (1U<<24) /* [:xxxx:] */ +#define ONIG_SYN_OP_QMARK_NON_GREEDY (1U<<25) /* ??,*?,+?,{n,m}? */ +#define ONIG_SYN_OP_ESC_CONTROL_CHARS (1U<<26) /* \n,\r,\t,\a ... */ +#define ONIG_SYN_OP_ESC_C_CONTROL (1U<<27) /* \cx */ +#define ONIG_SYN_OP_ESC_OCTAL3 (1U<<28) /* \OOO */ +#define ONIG_SYN_OP_ESC_X_HEX2 (1U<<29) /* \xHH */ +#define ONIG_SYN_OP_ESC_X_BRACE_HEX8 (1U<<30) /* \x{7HHHHHHH} */ +#define ONIG_SYN_OP_ESC_O_BRACE_OCTAL (1U<<31) /* \o{OOO} */ + +#define ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE (1U<<0) /* \Q...\E */ +#define ONIG_SYN_OP2_QMARK_GROUP_EFFECT (1U<<1) /* (?...) 
*/ +#define ONIG_SYN_OP2_OPTION_PERL (1U<<2) /* (?imsxadlu), (?-imsx), (?^imsxalu) */ +#define ONIG_SYN_OP2_OPTION_RUBY (1U<<3) /* (?imxadu), (?-imx) */ +#define ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT (1U<<4) /* ?+,*+,++ */ +#define ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL (1U<<5) /* {n,m}+ */ +#define ONIG_SYN_OP2_CCLASS_SET_OP (1U<<6) /* [...&&..[..]..] */ +#define ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP (1U<<7) /* (?<name>...) */ +#define ONIG_SYN_OP2_ESC_K_NAMED_BACKREF (1U<<8) /* \k<name> */ +#define ONIG_SYN_OP2_ESC_G_SUBEXP_CALL (1U<<9) /* \g<name>, \g<n> */ +#define ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY (1U<<10) /* (?@..),(?@<x>..) */ +#define ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL (1U<<11) /* \C-x */ +#define ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META (1U<<12) /* \M-x */ +#define ONIG_SYN_OP2_ESC_V_VTAB (1U<<13) /* \v as VTAB */ +#define ONIG_SYN_OP2_ESC_U_HEX4 (1U<<14) /* \uHHHH */ +#define ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR (1U<<15) /* \`, \' */ +#define ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY (1U<<16) /* \p{...}, \P{...} */ +#define ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT (1U<<17) /* \p{^..}, \P{^..} */ +/* #define ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS (1U<<18) */ +#define ONIG_SYN_OP2_ESC_H_XDIGIT (1U<<19) /* \h, \H */ +#define ONIG_SYN_OP2_INEFFECTIVE_ESCAPE (1U<<20) /* \ */ +#define ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK (1U<<21) /* \R as (?>\x0D\x0A|[\x0A-\x0D\x{85}\x{2028}\x{2029}]) */ +#define ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER (1U<<22) /* \X */ +#define ONIG_SYN_OP2_ESC_V_VERTICAL_WHITESPACE (1U<<23) /* \v, \V -- Perl */ /* NOTIMPL */ +#define ONIG_SYN_OP2_ESC_H_HORIZONTAL_WHITESPACE (1U<<24) /* \h, \H -- Perl */ /* NOTIMPL */ +#define ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP (1U<<25) /* \K */ +#define ONIG_SYN_OP2_ESC_G_BRACE_BACKREF (1U<<26) /* \g{name}, \g{n} */ +#define ONIG_SYN_OP2_QMARK_SUBEXP_CALL (1U<<27) /* (?&name), (?n), (?R), (?0) */ +#define ONIG_SYN_OP2_QMARK_VBAR_BRANCH_RESET (1U<<28) /* (?|...) 
*/ /* NOTIMPL */ +#define ONIG_SYN_OP2_QMARK_LPAREN_CONDITION (1U<<29) /* (?(cond)yes...|no...) */ +#define ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP (1U<<30) /* (?P<name>...), (?P=name), (?P>name) -- Python/PCRE */ +#define ONIG_SYN_OP2_OPTION_JAVA (1U<<31) /* (?idmsux), (?-idmsux) */ /* NOTIMPL */ + +/* syntax (behavior) */ +#define ONIG_SYN_CONTEXT_INDEP_ANCHORS (1U<<31) /* not implemented */ +#define ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS (1U<<0) /* ?, *, +, {n,m} */ +#define ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS (1U<<1) /* error or ignore */ +#define ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP (1U<<2) /* ...)... */ +#define ONIG_SYN_ALLOW_INVALID_INTERVAL (1U<<3) /* {??? */ +#define ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV (1U<<4) /* {,n} => {0,n} */ +#define ONIG_SYN_STRICT_CHECK_BACKREF (1U<<5) /* /(\1)/,/\1()/ ..*/ +#define ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND (1U<<6) /* (?<=a|bc) */ +#define ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP (1U<<7) /* see doc/RE */ +#define ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME (1U<<8) /* (?<x>)(?<x>) */ +#define ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY (1U<<9) /* a{n}?=(?:a{n})? */ +#define ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME_CALL (1U<<10) /* (?<x>)(?<x>)(?&x) */ +#define ONIG_SYN_USE_LEFT_MOST_NAMED_GROUP (1U<<11) /* (?<x>)(?<x>)\k<x> */ + +/* syntax (behavior) in char class [...] */ +#define ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1U<<20) /* [^...] */ +#define ONIG_SYN_BACKSLASH_ESCAPE_IN_CC (1U<<21) /* [..\w..] etc.. 
*/ +#define ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC (1U<<22) +#define ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC (1U<<23) /* [0-9-a]=[0-9\-a] */ +/* syntax (behavior) warning */ +#define ONIG_SYN_WARN_CC_OP_NOT_ESCAPED (1U<<24) /* [,-,] */ +#define ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT (1U<<25) /* (?:a*)+ */ +#define ONIG_SYN_WARN_CC_DUP (1U<<26) /* [aa] */ + +/* meta character specifiers (onig_set_meta_char()) */ +#define ONIG_META_CHAR_ESCAPE 0 +#define ONIG_META_CHAR_ANYCHAR 1 +#define ONIG_META_CHAR_ANYTIME 2 +#define ONIG_META_CHAR_ZERO_OR_ONE_TIME 3 +#define ONIG_META_CHAR_ONE_OR_MORE_TIME 4 +#define ONIG_META_CHAR_ANYCHAR_ANYTIME 5 + +#define ONIG_INEFFECTIVE_META_CHAR 0 + +/* error codes */ +#define ONIG_IS_PATTERN_ERROR(ecode) ((ecode) <= -100 && (ecode) > -1000) +/* normal return */ +#define ONIG_NORMAL 0 +#define ONIG_MISMATCH -1 +#define ONIG_NO_SUPPORT_CONFIG -2 + +/* internal error */ +#define ONIGERR_MEMORY -5 +#define ONIGERR_TYPE_BUG -6 +#define ONIGERR_PARSER_BUG -11 +#define ONIGERR_STACK_BUG -12 +#define ONIGERR_UNDEFINED_BYTECODE -13 +#define ONIGERR_UNEXPECTED_BYTECODE -14 +#define ONIGERR_MATCH_STACK_LIMIT_OVER -15 +#define ONIGERR_PARSE_DEPTH_LIMIT_OVER -16 +#define ONIGERR_DEFAULT_ENCODING_IS_NOT_SET -21 +#define ONIGERR_SPECIFIED_ENCODING_CANT_CONVERT_TO_WIDE_CHAR -22 +/* general error */ +#define ONIGERR_INVALID_ARGUMENT -30 +/* syntax error */ +#define ONIGERR_END_PATTERN_AT_LEFT_BRACE -100 +#define ONIGERR_END_PATTERN_AT_LEFT_BRACKET -101 +#define ONIGERR_EMPTY_CHAR_CLASS -102 +#define ONIGERR_PREMATURE_END_OF_CHAR_CLASS -103 +#define ONIGERR_END_PATTERN_AT_ESCAPE -104 +#define ONIGERR_END_PATTERN_AT_META -105 +#define ONIGERR_END_PATTERN_AT_CONTROL -106 +#define ONIGERR_META_CODE_SYNTAX -108 +#define ONIGERR_CONTROL_CODE_SYNTAX -109 +#define ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE -110 +#define ONIGERR_CHAR_CLASS_VALUE_AT_START_OF_RANGE -111 +#define ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS -112 +#define 
ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED -113 +#define ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID -114 +#define ONIGERR_NESTED_REPEAT_OPERATOR -115 +#define ONIGERR_UNMATCHED_CLOSE_PARENTHESIS -116 +#define ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS -117 +#define ONIGERR_END_PATTERN_IN_GROUP -118 +#define ONIGERR_UNDEFINED_GROUP_OPTION -119 +#define ONIGERR_INVALID_POSIX_BRACKET_TYPE -121 +#define ONIGERR_INVALID_LOOK_BEHIND_PATTERN -122 +#define ONIGERR_INVALID_REPEAT_RANGE_PATTERN -123 +#define ONIGERR_INVALID_CONDITION_PATTERN -124 +/* values error (syntax error) */ +#define ONIGERR_TOO_BIG_NUMBER -200 +#define ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE -201 +#define ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE -202 +#define ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS -203 +#define ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE -204 +#define ONIGERR_TOO_MANY_MULTI_BYTE_RANGES -205 +#define ONIGERR_TOO_SHORT_MULTI_BYTE_STRING -206 +#define ONIGERR_TOO_BIG_BACKREF_NUMBER -207 +#define ONIGERR_INVALID_BACKREF -208 +#define ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED -209 +#define ONIGERR_TOO_MANY_CAPTURE_GROUPS -210 +#define ONIGERR_TOO_SHORT_DIGITS -211 +#define ONIGERR_TOO_LONG_WIDE_CHAR_VALUE -212 +#define ONIGERR_EMPTY_GROUP_NAME -214 +#define ONIGERR_INVALID_GROUP_NAME -215 +#define ONIGERR_INVALID_CHAR_IN_GROUP_NAME -216 +#define ONIGERR_UNDEFINED_NAME_REFERENCE -217 +#define ONIGERR_UNDEFINED_GROUP_REFERENCE -218 +#define ONIGERR_MULTIPLEX_DEFINED_NAME -219 +#define ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL -220 +#define ONIGERR_NEVER_ENDING_RECURSION -221 +#define ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY -222 +#define ONIGERR_INVALID_CHAR_PROPERTY_NAME -223 +#define ONIGERR_INVALID_CODE_POINT_VALUE -400 +#define ONIGERR_INVALID_WIDE_CHAR_VALUE -400 +#define ONIGERR_TOO_BIG_WIDE_CHAR_VALUE -401 +#define ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION -402 +#define ONIGERR_INVALID_COMBINATION_OF_OPTIONS -403 + +/* errors related to thread */ +/* #define 
ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT -1001 */ + + +/* must be smaller than BIT_STATUS_BITS_NUM (unsigned int * 8) */ +#define ONIG_MAX_CAPTURE_HISTORY_GROUP 31 +#define ONIG_IS_CAPTURE_HISTORY_GROUP(r, i) \ + ((i) <= ONIG_MAX_CAPTURE_HISTORY_GROUP && (r)->list && (r)->list[i]) + +typedef struct OnigCaptureTreeNodeStruct { + int group; /* group number */ + OnigPosition beg; + OnigPosition end; + int allocated; + int num_childs; + struct OnigCaptureTreeNodeStruct** childs; +} OnigCaptureTreeNode; + +/* match result region type */ +struct re_registers { + int allocated; + int num_regs; + OnigPosition* beg; + OnigPosition* end; + /* extended */ + OnigCaptureTreeNode* history_root; /* capture history tree root */ +}; + +/* capture tree traverse */ +#define ONIG_TRAVERSE_CALLBACK_AT_FIRST 1 +#define ONIG_TRAVERSE_CALLBACK_AT_LAST 2 +#define ONIG_TRAVERSE_CALLBACK_AT_BOTH \ + ( ONIG_TRAVERSE_CALLBACK_AT_FIRST | ONIG_TRAVERSE_CALLBACK_AT_LAST ) + + +#define ONIG_REGION_NOTPOS -1 + +typedef struct re_registers OnigRegion; + +typedef struct { + OnigEncoding enc; + OnigUChar* par; + OnigUChar* par_end; +} OnigErrorInfo; + +typedef struct { + int lower; + int upper; +} OnigRepeatRange; + +typedef void (*OnigWarnFunc)(const char* s); +extern void onig_null_warn(const char* s); +#define ONIG_NULL_WARN onig_null_warn + +#define ONIG_CHAR_TABLE_SIZE 256 + +typedef struct re_pattern_buffer { + /* common members of BBuf(bytes-buffer) */ + unsigned char* p; /* compiled pattern */ + unsigned int used; /* used space for p */ + unsigned int alloc; /* allocated space for p */ + + int num_mem; /* used memory(...) num counted from 1 */ + int num_repeat; /* OP_REPEAT/OP_REPEAT_NG id-counter */ + int num_null_check; /* OP_NULL_CHECK_START/END id counter */ + int num_comb_exp_check; /* combination explosion check */ + int num_call; /* number of subexp call */ + unsigned int capture_history; /* (?@...) 
flag (1-31) */ + unsigned int bt_mem_start; /* need backtrack flag */ + unsigned int bt_mem_end; /* need backtrack flag */ + int stack_pop_level; + int repeat_range_alloc; + + OnigOptionType options; + + OnigRepeatRange* repeat_range; + + OnigEncoding enc; + const OnigSyntaxType* syntax; + void* name_table; + OnigCaseFoldType case_fold_flag; + + /* optimization info (string search, char-map and anchors) */ + int optimize; /* optimize flag */ + int threshold_len; /* search str-length for apply optimize */ + int anchor; /* BEGIN_BUF, BEGIN_POS, (SEMI_)END_BUF */ + OnigDistance anchor_dmin; /* (SEMI_)END_BUF anchor distance */ + OnigDistance anchor_dmax; /* (SEMI_)END_BUF anchor distance */ + int sub_anchor; /* start-anchor for exact or map */ + unsigned char *exact; + unsigned char *exact_end; + unsigned char map[ONIG_CHAR_TABLE_SIZE]; /* used as BM skip or char-map */ + int *int_map; /* BM skip for exact_len > 255 */ + int *int_map_backward; /* BM skip for backward search */ + OnigDistance dmin; /* min-distance of exact or map */ + OnigDistance dmax; /* max-distance of exact or map */ + + /* regex_t link chain */ + struct re_pattern_buffer* chain; /* escape compile-conflict */ +} OnigRegexType; + +typedef OnigRegexType* OnigRegex; + +#ifndef ONIG_ESCAPE_REGEX_T_COLLISION +typedef OnigRegexType regex_t; +#endif + + +typedef struct { + int num_of_elements; + OnigEncoding pattern_enc; + OnigEncoding target_enc; + const OnigSyntaxType* syntax; + OnigOptionType option; + OnigCaseFoldType case_fold_flag; +} OnigCompileInfo; + +/* Oniguruma Native API */ +ONIG_EXTERN +int onig_initialize(OnigEncoding encodings[], int n); +ONIG_EXTERN +int onig_init(void); +ONIG_EXTERN +int onig_error_code_to_str(OnigUChar* s, OnigPosition err_code, ...); +ONIG_EXTERN +void onig_set_warn_func(OnigWarnFunc f); +ONIG_EXTERN +void onig_set_verb_warn_func(OnigWarnFunc f); +ONIG_EXTERN +int onig_new(OnigRegex*, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigOptionType option, 
OnigEncoding enc, const OnigSyntaxType* syntax, OnigErrorInfo* einfo); +ONIG_EXTERN +int onig_reg_init(OnigRegex reg, OnigOptionType option, OnigCaseFoldType case_fold_flag, OnigEncoding enc, const OnigSyntaxType* syntax); +ONIG_EXTERN +int onig_new_without_alloc(OnigRegex, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo); +ONIG_EXTERN +int onig_new_deluxe(OnigRegex* reg, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo); +ONIG_EXTERN +void onig_free(OnigRegex); +ONIG_EXTERN +void onig_free_body(OnigRegex); +ONIG_EXTERN +OnigPosition onig_scan(OnigRegex reg, const OnigUChar* str, const OnigUChar* end, OnigRegion* region, OnigOptionType option, int (*scan_callback)(OnigPosition, OnigPosition, OnigRegion*, void*), void* callback_arg); +ONIG_EXTERN +OnigPosition onig_search(OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegion* region, OnigOptionType option); +ONIG_EXTERN +OnigPosition onig_search_gpos(OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* global_pos, const OnigUChar* start, const OnigUChar* range, OnigRegion* region, OnigOptionType option); +ONIG_EXTERN +OnigPosition onig_match(OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* at, OnigRegion* region, OnigOptionType option); +ONIG_EXTERN +OnigRegion* onig_region_new(void); +ONIG_EXTERN +void onig_region_init(OnigRegion* region); +ONIG_EXTERN +void onig_region_free(OnigRegion* region, int free_self); +ONIG_EXTERN +void onig_region_copy(OnigRegion* to, const OnigRegion* from); +ONIG_EXTERN +void onig_region_clear(OnigRegion* region); +ONIG_EXTERN +int onig_region_resize(OnigRegion* region, int n); +ONIG_EXTERN +int onig_region_set(OnigRegion* region, int at, int beg, int end); +ONIG_EXTERN +int onig_name_to_group_numbers(OnigRegex reg, const OnigUChar* name, const 
OnigUChar* name_end, int** nums); +ONIG_EXTERN +int onig_name_to_backref_number(OnigRegex reg, const OnigUChar* name, const OnigUChar* name_end, const OnigRegion *region); +ONIG_EXTERN +int onig_foreach_name(OnigRegex reg, int (*func)(const OnigUChar*, const OnigUChar*,int,int*,OnigRegex,void*), void* arg); +ONIG_EXTERN +int onig_number_of_names(const OnigRegexType *reg); +ONIG_EXTERN +int onig_number_of_captures(const OnigRegexType *reg); +ONIG_EXTERN +int onig_number_of_capture_histories(const OnigRegexType *reg); +ONIG_EXTERN +OnigCaptureTreeNode* onig_get_capture_tree(OnigRegion* region); +ONIG_EXTERN +int onig_capture_tree_traverse(OnigRegion* region, int at, int(*callback_func)(int,OnigPosition,OnigPosition,int,int,void*), void* arg); +ONIG_EXTERN +int onig_noname_group_capture_is_active(const OnigRegexType *reg); +ONIG_EXTERN +OnigEncoding onig_get_encoding(const OnigRegexType *reg); +ONIG_EXTERN +OnigOptionType onig_get_options(const OnigRegexType *reg); +ONIG_EXTERN +OnigCaseFoldType onig_get_case_fold_flag(const OnigRegexType *reg); +ONIG_EXTERN +const OnigSyntaxType* onig_get_syntax(const OnigRegexType *reg); +ONIG_EXTERN +int onig_set_default_syntax(const OnigSyntaxType* syntax); +ONIG_EXTERN +void onig_copy_syntax(OnigSyntaxType* to, const OnigSyntaxType* from); +ONIG_EXTERN +unsigned int onig_get_syntax_op(const OnigSyntaxType* syntax); +ONIG_EXTERN +unsigned int onig_get_syntax_op2(const OnigSyntaxType* syntax); +ONIG_EXTERN +unsigned int onig_get_syntax_behavior(const OnigSyntaxType* syntax); +ONIG_EXTERN +OnigOptionType onig_get_syntax_options(const OnigSyntaxType* syntax); +ONIG_EXTERN +void onig_set_syntax_op(OnigSyntaxType* syntax, unsigned int op); +ONIG_EXTERN +void onig_set_syntax_op2(OnigSyntaxType* syntax, unsigned int op2); +ONIG_EXTERN +void onig_set_syntax_behavior(OnigSyntaxType* syntax, unsigned int behavior); +ONIG_EXTERN +void onig_set_syntax_options(OnigSyntaxType* syntax, OnigOptionType options); +ONIG_EXTERN +int 
onig_set_meta_char(OnigSyntaxType* syntax, unsigned int what, OnigCodePoint code); +ONIG_EXTERN +void onig_copy_encoding(OnigEncodingType *to, OnigEncoding from); +ONIG_EXTERN +OnigCaseFoldType onig_get_default_case_fold_flag(void); +ONIG_EXTERN +int onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag); +ONIG_EXTERN +unsigned int onig_get_match_stack_limit_size(void); +ONIG_EXTERN +int onig_set_match_stack_limit_size(unsigned int size); +ONIG_EXTERN +unsigned int onig_get_parse_depth_limit(void); +ONIG_EXTERN +int onig_set_parse_depth_limit(unsigned int depth); +ONIG_EXTERN +int onig_end(void); +ONIG_EXTERN +const char* onig_version(void); +ONIG_EXTERN +const char* onig_copyright(void); + +RUBY_SYMBOL_EXPORT_END + +#ifdef __cplusplus +# if 0 +{ /* satisfy cc-mode */ +# endif +} +#endif + +#endif /* ONIGMO_H */ diff --git a/include/ruby/oniguruma.h b/include/ruby/oniguruma.h index 1d8a0198d8..dc83754aca 100644 --- a/include/ruby/oniguruma.h +++ b/include/ruby/oniguruma.h @@ -1,880 +1,8 @@ #ifndef ONIGURUMA_H #define ONIGURUMA_H -/********************************************************************** - oniguruma.h - Onigmo (Oniguruma-mod) (regular expression library) -**********************************************************************/ -/*- - * Copyright (c) 2002-2009 K.Kosako - * Copyright (c) 2011-2014 K.Takata - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#ifdef __cplusplus -extern "C" { -#if 0 -} /* satisfy cc-mode */ -#endif -#endif - +#include "onigmo.h" #define ONIGURUMA -#define ONIGURUMA_VERSION_MAJOR 5 -#define ONIGURUMA_VERSION_MINOR 15 -#define ONIGURUMA_VERSION_TEENY 0 - -#ifdef __cplusplus -# ifndef HAVE_PROTOTYPES -# define HAVE_PROTOTYPES 1 -# endif -# ifndef HAVE_STDARG_PROTOTYPES -# define HAVE_STDARG_PROTOTYPES 1 -# endif -#endif - -/* escape Mac OS X/Xcode 2.4/gcc 4.0.1 problem */ -#if defined(__APPLE__) && defined(__GNUC__) && __GNUC__ >= 4 -# ifndef HAVE_STDARG_PROTOTYPES -# define HAVE_STDARG_PROTOTYPES 1 -# endif -#endif - -#ifdef HAVE_STDARG_H -# ifndef HAVE_STDARG_PROTOTYPES -# define HAVE_STDARG_PROTOTYPES 1 -# endif -#endif - -#ifndef P_ -#if defined(__STDC__) || defined(_WIN32) -# define P_(args) args -#else -# define P_(args) () -#endif -#endif - -#ifndef PV_ -#ifdef HAVE_STDARG_PROTOTYPES -# define PV_(args) args -#else -# define PV_(args) () -#endif -#endif - -#ifndef ONIG_EXTERN -#ifdef RUBY_EXTERN -#define ONIG_EXTERN RUBY_EXTERN -#else -#if defined(_WIN32) && !defined(__GNUC__) -#if defined(EXPORT) || defined(RUBY_EXPORT) -#define ONIG_EXTERN extern __declspec(dllexport) -#else -#define ONIG_EXTERN extern 
__declspec(dllimport) -#endif -#endif -#endif -#endif - -#ifndef ONIG_EXTERN -#define ONIG_EXTERN extern -#endif - -RUBY_SYMBOL_EXPORT_BEGIN - -#include <stddef.h> /* for size_t */ - -/* PART: character encoding */ - -#ifndef ONIG_ESCAPE_UCHAR_COLLISION -#define UChar OnigUChar -#endif - -typedef unsigned char OnigUChar; -typedef unsigned int OnigCodePoint; -typedef unsigned int OnigCtype; -typedef size_t OnigDistance; -typedef ptrdiff_t OnigPosition; - -#define ONIG_INFINITE_DISTANCE ~((OnigDistance )0) - -/* - * Onig casefold/case mapping flags and related definitions - * - * Subfields (starting with 0 at LSB): - * 0-2: Code point count in casefold.h - * 3-12: Index into SpecialCaseMapping array in casefold.h - * 13-22: Case folding/mapping flags - */ -typedef unsigned int OnigCaseFoldType; /* case fold flag */ - -ONIG_EXTERN OnigCaseFoldType OnigDefaultCaseFoldFlag; - -/* bits for actual code point count; 3 bits is more than enough, currently only 2 used */ -#define OnigCodePointMaskWidth 3 -#define OnigCodePointMask ((1< Unicode:0x1ffc */ - -/* code range */ -#define ONIGENC_CODE_RANGE_NUM(range) ((int )range[0]) -#define ONIGENC_CODE_RANGE_FROM(range,i) range[((i)*2) + 1] -#define ONIGENC_CODE_RANGE_TO(range,i) range[((i)*2) + 2] - -typedef struct { - int byte_len; /* argument(original) character(s) byte length */ - int code_len; /* number of code */ - OnigCodePoint code[ONIGENC_MAX_COMP_CASE_FOLD_CODE_LEN]; -} OnigCaseFoldCodeItem; - -typedef struct { - OnigCodePoint esc; - OnigCodePoint anychar; - OnigCodePoint anytime; - OnigCodePoint zero_or_one_time; - OnigCodePoint one_or_more_time; - OnigCodePoint anychar_anytime; -} OnigMetaCharTableType; - -typedef int (*OnigApplyAllCaseFoldFunc)(OnigCodePoint from, OnigCodePoint* to, int to_len, void* arg); - -typedef struct OnigEncodingTypeST { - int (*precise_mbc_enc_len)(const OnigUChar* p,const OnigUChar* e, const struct OnigEncodingTypeST* enc); - const char* name; - int max_enc_len; - int min_enc_len; - int 
(*is_mbc_newline)(const OnigUChar* p, const OnigUChar* end, const struct OnigEncodingTypeST* enc); - OnigCodePoint (*mbc_to_code)(const OnigUChar* p, const OnigUChar* end, const struct OnigEncodingTypeST* enc); - int (*code_to_mbclen)(OnigCodePoint code, const struct OnigEncodingTypeST* enc); - int (*code_to_mbc)(OnigCodePoint code, OnigUChar *buf, const struct OnigEncodingTypeST* enc); - int (*mbc_case_fold)(OnigCaseFoldType flag, const OnigUChar** pp, const OnigUChar* end, OnigUChar* to, const struct OnigEncodingTypeST* enc); - int (*apply_all_case_fold)(OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void* arg, const struct OnigEncodingTypeST* enc); - int (*get_case_fold_codes_by_str)(OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem acs[], const struct OnigEncodingTypeST* enc); - int (*property_name_to_ctype)(const struct OnigEncodingTypeST* enc, const OnigUChar* p, const OnigUChar* end); - int (*is_code_ctype)(OnigCodePoint code, OnigCtype ctype, const struct OnigEncodingTypeST* enc); - int (*get_ctype_code_range)(OnigCtype ctype, OnigCodePoint* sb_out, const OnigCodePoint* ranges[], const struct OnigEncodingTypeST* enc); - OnigUChar* (*left_adjust_char_head)(const OnigUChar* start, const OnigUChar* p, const OnigUChar* end, const struct OnigEncodingTypeST* enc); - int (*is_allowed_reverse_match)(const OnigUChar* p, const OnigUChar* end, const struct OnigEncodingTypeST* enc); - int ruby_encoding_index; - unsigned int flags; - int (*case_map)(OnigCaseFoldType* flagP, const OnigUChar** pp, const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, const struct OnigEncodingTypeST* enc); -} OnigEncodingType; - -typedef const OnigEncodingType* OnigEncoding; - -ONIG_EXTERN const OnigEncodingType OnigEncodingASCII; - -#define ONIG_ENCODING_ASCII (&OnigEncodingASCII) - -#define ONIG_ENCODING_UNDEF ((OnigEncoding )0) - -/* this declaration needs to be here because it is used in string.c */ -ONIG_EXTERN int onigenc_ascii_only_case_map 
P_((OnigCaseFoldType* flagP, - const OnigUChar** pp, const OnigUChar* end, - OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc)); - - -/* work size */ -#define ONIGENC_CODE_TO_MBC_MAXLEN 7 -#define ONIGENC_MBC_CASE_FOLD_MAXLEN 18 -/* 18: 6(max-byte) * 3(case-fold chars) */ - -/* character types */ -#define ONIGENC_CTYPE_NEWLINE 0 -#define ONIGENC_CTYPE_ALPHA 1 -#define ONIGENC_CTYPE_BLANK 2 -#define ONIGENC_CTYPE_CNTRL 3 -#define ONIGENC_CTYPE_DIGIT 4 -#define ONIGENC_CTYPE_GRAPH 5 -#define ONIGENC_CTYPE_LOWER 6 -#define ONIGENC_CTYPE_PRINT 7 -#define ONIGENC_CTYPE_PUNCT 8 -#define ONIGENC_CTYPE_SPACE 9 -#define ONIGENC_CTYPE_UPPER 10 -#define ONIGENC_CTYPE_XDIGIT 11 -#define ONIGENC_CTYPE_WORD 12 -#define ONIGENC_CTYPE_ALNUM 13 /* alpha || digit */ -#define ONIGENC_CTYPE_ASCII 14 -#define ONIGENC_MAX_STD_CTYPE ONIGENC_CTYPE_ASCII - -/* flags */ -#define ONIGENC_FLAG_NONE 0U -#define ONIGENC_FLAG_UNICODE 1U - -#define onig_enc_len(enc,p,e) ONIGENC_MBC_ENC_LEN(enc, p, e) - -#define ONIGENC_IS_UNDEF(enc) ((enc) == ONIG_ENCODING_UNDEF) -#define ONIGENC_IS_SINGLEBYTE(enc) (ONIGENC_MBC_MAXLEN(enc) == 1) -#define ONIGENC_IS_MBC_HEAD(enc,p,e) (ONIGENC_MBC_ENC_LEN(enc,p,e) != 1) -#define ONIGENC_IS_MBC_ASCII(p) (*(p) < 128) -#define ONIGENC_IS_CODE_ASCII(code) ((code) < 128) -#define ONIGENC_IS_MBC_WORD(enc,s,end) \ - ONIGENC_IS_CODE_WORD(enc,ONIGENC_MBC_TO_CODE(enc,s,end)) -#define ONIGENC_IS_MBC_ASCII_WORD(enc,s,end) \ - onigenc_ascii_is_code_ctype( \ - ONIGENC_MBC_TO_CODE(enc,s,end),ONIGENC_CTYPE_WORD,enc) -#define ONIGENC_IS_UNICODE(enc) ((enc)->flags & ONIGENC_FLAG_UNICODE) - - -#define ONIGENC_NAME(enc) ((enc)->name) - -#define ONIGENC_MBC_CASE_FOLD(enc,flag,pp,end,buf) \ - (enc)->mbc_case_fold(flag,(const OnigUChar** )pp,end,buf,enc) -#define ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc,s,end) \ - (enc)->is_allowed_reverse_match(s,end,enc) -#define ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc,start,s,end) \ - (enc)->left_adjust_char_head(start, s, end, enc) 
-#define ONIGENC_APPLY_ALL_CASE_FOLD(enc,case_fold_flag,f,arg) \ - (enc)->apply_all_case_fold(case_fold_flag,f,arg,enc) -#define ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc,case_fold_flag,p,end,acs) \ - (enc)->get_case_fold_codes_by_str(case_fold_flag,p,end,acs,enc) -#define ONIGENC_STEP_BACK(enc,start,s,end,n) \ - onigenc_step_back((enc),(start),(s),(end),(n)) - -#define ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n) (n) -#define ONIGENC_MBCLEN_CHARFOUND_P(r) (0 < (r)) -#define ONIGENC_MBCLEN_CHARFOUND_LEN(r) (r) - -#define ONIGENC_CONSTRUCT_MBCLEN_INVALID() (-1) -#define ONIGENC_MBCLEN_INVALID_P(r) ((r) == -1) - -#define ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n) (-1-(n)) -#define ONIGENC_MBCLEN_NEEDMORE_P(r) ((r) < -1) -#define ONIGENC_MBCLEN_NEEDMORE_LEN(r) (-1-(r)) - -#define ONIGENC_PRECISE_MBC_ENC_LEN(enc,p,e) (enc)->precise_mbc_enc_len(p,e,enc) - -ONIG_EXTERN -int onigenc_mbclen_approximate P_((const OnigUChar* p,const OnigUChar* e, const struct OnigEncodingTypeST* enc)); - -#define ONIGENC_MBC_ENC_LEN(enc,p,e) onigenc_mbclen_approximate(p,e,enc) -#define ONIGENC_MBC_MAXLEN(enc) ((enc)->max_enc_len) -#define ONIGENC_MBC_MAXLEN_DIST(enc) ONIGENC_MBC_MAXLEN(enc) -#define ONIGENC_MBC_MINLEN(enc) ((enc)->min_enc_len) -#define ONIGENC_IS_MBC_NEWLINE(enc,p,end) (enc)->is_mbc_newline((p),(end),enc) -#define ONIGENC_MBC_TO_CODE(enc,p,end) (enc)->mbc_to_code((p),(end),enc) -#define ONIGENC_CODE_TO_MBCLEN(enc,code) (enc)->code_to_mbclen(code,enc) -#define ONIGENC_CODE_TO_MBC(enc,code,buf) (enc)->code_to_mbc(code,buf,enc) -#define ONIGENC_PROPERTY_NAME_TO_CTYPE(enc,p,end) \ - (enc)->property_name_to_ctype(enc,p,end) - -#define ONIGENC_IS_CODE_CTYPE(enc,code,ctype) (enc)->is_code_ctype(code,ctype,enc) - -#define ONIGENC_IS_CODE_NEWLINE(enc,code) \ - ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_NEWLINE) -#define ONIGENC_IS_CODE_GRAPH(enc,code) \ - ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_GRAPH) -#define ONIGENC_IS_CODE_PRINT(enc,code) \ - 
ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_PRINT) -#define ONIGENC_IS_CODE_ALNUM(enc,code) \ - ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_ALNUM) -#define ONIGENC_IS_CODE_ALPHA(enc,code) \ - ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_ALPHA) -#define ONIGENC_IS_CODE_LOWER(enc,code) \ - ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_LOWER) -#define ONIGENC_IS_CODE_UPPER(enc,code) \ - ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_UPPER) -#define ONIGENC_IS_CODE_CNTRL(enc,code) \ - ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_CNTRL) -#define ONIGENC_IS_CODE_PUNCT(enc,code) \ - ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_PUNCT) -#define ONIGENC_IS_CODE_SPACE(enc,code) \ - ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_SPACE) -#define ONIGENC_IS_CODE_BLANK(enc,code) \ - ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_BLANK) -#define ONIGENC_IS_CODE_DIGIT(enc,code) \ - ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_DIGIT) -#define ONIGENC_IS_CODE_XDIGIT(enc,code) \ - ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_XDIGIT) -#define ONIGENC_IS_CODE_WORD(enc,code) \ - ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_WORD) - -#define ONIGENC_GET_CTYPE_CODE_RANGE(enc,ctype,sbout,ranges) \ - (enc)->get_ctype_code_range(ctype,sbout,ranges,enc) - -ONIG_EXTERN -OnigUChar* onigenc_step_back P_((OnigEncoding enc, const OnigUChar* start, const OnigUChar* s, const OnigUChar* end, int n)); - - -/* encoding API */ -ONIG_EXTERN -int onigenc_init P_((void)); -ONIG_EXTERN -int onigenc_set_default_encoding P_((OnigEncoding enc)); -PUREFUNC(ONIG_EXTERN OnigEncoding onigenc_get_default_encoding P_((void))); -PUREFUNC(ONIG_EXTERN void onigenc_set_default_caseconv_table P_((const OnigUChar* table))); -ONIG_EXTERN -OnigUChar* onigenc_get_right_adjust_char_head_with_prev P_((OnigEncoding enc, const OnigUChar* start, const OnigUChar* s, const OnigUChar* end, const OnigUChar** prev)); -ONIG_EXTERN -OnigUChar* onigenc_get_prev_char_head P_((OnigEncoding enc, const OnigUChar* start, const OnigUChar* s, const 
OnigUChar* end)); -ONIG_EXTERN -OnigUChar* onigenc_get_left_adjust_char_head P_((OnigEncoding enc, const OnigUChar* start, const OnigUChar* s, const OnigUChar* end)); -ONIG_EXTERN -OnigUChar* onigenc_get_right_adjust_char_head P_((OnigEncoding enc, const OnigUChar* start, const OnigUChar* s, const OnigUChar* end)); -ONIG_EXTERN -int onigenc_strlen P_((OnigEncoding enc, const OnigUChar* p, const OnigUChar* end)); -ONIG_EXTERN -int onigenc_strlen_null P_((OnigEncoding enc, const OnigUChar* p)); -ONIG_EXTERN -int onigenc_str_bytelen_null P_((OnigEncoding enc, const OnigUChar* p)); - - - -/* PART: regular expression */ - -/* config parameters */ -#define ONIG_NREGION 10 -#define ONIG_MAX_BACKREF_NUM 1000 -#define ONIG_MAX_CAPTURE_GROUP_NUM 32767 -#define ONIG_MAX_REPEAT_NUM 100000 -#define ONIG_MAX_MULTI_BYTE_RANGES_NUM 10000 -/* constants */ -#define ONIG_MAX_ERROR_MESSAGE_LEN 90 - -typedef unsigned int OnigOptionType; - -#define ONIG_OPTION_DEFAULT ONIG_OPTION_NONE - -/* options */ -#define ONIG_OPTION_NONE 0U -#define ONIG_OPTION_IGNORECASE 1U -#define ONIG_OPTION_EXTEND (ONIG_OPTION_IGNORECASE << 1) -#define ONIG_OPTION_MULTILINE (ONIG_OPTION_EXTEND << 1) -#define ONIG_OPTION_DOTALL ONIG_OPTION_MULTILINE -#define ONIG_OPTION_SINGLELINE (ONIG_OPTION_MULTILINE << 1) -#define ONIG_OPTION_FIND_LONGEST (ONIG_OPTION_SINGLELINE << 1) -#define ONIG_OPTION_FIND_NOT_EMPTY (ONIG_OPTION_FIND_LONGEST << 1) -#define ONIG_OPTION_NEGATE_SINGLELINE (ONIG_OPTION_FIND_NOT_EMPTY << 1) -#define ONIG_OPTION_DONT_CAPTURE_GROUP (ONIG_OPTION_NEGATE_SINGLELINE << 1) -#define ONIG_OPTION_CAPTURE_GROUP (ONIG_OPTION_DONT_CAPTURE_GROUP << 1) -/* options (search time) */ -#define ONIG_OPTION_NOTBOL (ONIG_OPTION_CAPTURE_GROUP << 1) -#define ONIG_OPTION_NOTEOL (ONIG_OPTION_NOTBOL << 1) -#define ONIG_OPTION_POSIX_REGION (ONIG_OPTION_NOTEOL << 1) -/* options (ctype range) */ -#define ONIG_OPTION_ASCII_RANGE (ONIG_OPTION_POSIX_REGION << 1) -#define ONIG_OPTION_POSIX_BRACKET_ALL_RANGE 
(ONIG_OPTION_ASCII_RANGE << 1) -#define ONIG_OPTION_WORD_BOUND_ALL_RANGE (ONIG_OPTION_POSIX_BRACKET_ALL_RANGE << 1) -/* options (newline) */ -#define ONIG_OPTION_NEWLINE_CRLF (ONIG_OPTION_WORD_BOUND_ALL_RANGE << 1) -#define ONIG_OPTION_NOTBOS (ONIG_OPTION_NEWLINE_CRLF << 1) -#define ONIG_OPTION_NOTEOS (ONIG_OPTION_NOTBOS << 1) -#define ONIG_OPTION_MAXBIT ONIG_OPTION_NOTEOS /* limit */ - -#define ONIG_OPTION_ON(options,regopt) ((options) |= (regopt)) -#define ONIG_OPTION_OFF(options,regopt) ((options) &= ~(regopt)) -#define ONIG_IS_OPTION_ON(options,option) ((options) & (option)) - -/* syntax */ -typedef struct { - unsigned int op; - unsigned int op2; - unsigned int behavior; - OnigOptionType options; /* default option */ - OnigMetaCharTableType meta_char_table; -} OnigSyntaxType; - -ONIG_EXTERN const OnigSyntaxType OnigSyntaxASIS; -ONIG_EXTERN const OnigSyntaxType OnigSyntaxPosixBasic; -ONIG_EXTERN const OnigSyntaxType OnigSyntaxPosixExtended; -ONIG_EXTERN const OnigSyntaxType OnigSyntaxEmacs; -ONIG_EXTERN const OnigSyntaxType OnigSyntaxGrep; -ONIG_EXTERN const OnigSyntaxType OnigSyntaxGnuRegex; -ONIG_EXTERN const OnigSyntaxType OnigSyntaxJava; -ONIG_EXTERN const OnigSyntaxType OnigSyntaxPerl58; -ONIG_EXTERN const OnigSyntaxType OnigSyntaxPerl58_NG; -ONIG_EXTERN const OnigSyntaxType OnigSyntaxPerl; -ONIG_EXTERN const OnigSyntaxType OnigSyntaxRuby; -ONIG_EXTERN const OnigSyntaxType OnigSyntaxPython; - -/* predefined syntaxes (see regsyntax.c) */ -#define ONIG_SYNTAX_ASIS (&OnigSyntaxASIS) -#define ONIG_SYNTAX_POSIX_BASIC (&OnigSyntaxPosixBasic) -#define ONIG_SYNTAX_POSIX_EXTENDED (&OnigSyntaxPosixExtended) -#define ONIG_SYNTAX_EMACS (&OnigSyntaxEmacs) -#define ONIG_SYNTAX_GREP (&OnigSyntaxGrep) -#define ONIG_SYNTAX_GNU_REGEX (&OnigSyntaxGnuRegex) -#define ONIG_SYNTAX_JAVA (&OnigSyntaxJava) -#define ONIG_SYNTAX_PERL58 (&OnigSyntaxPerl58) -#define ONIG_SYNTAX_PERL58_NG (&OnigSyntaxPerl58_NG) -#define ONIG_SYNTAX_PERL (&OnigSyntaxPerl) -#define ONIG_SYNTAX_RUBY 
(&OnigSyntaxRuby) -#define ONIG_SYNTAX_PYTHON (&OnigSyntaxPython) - -/* default syntax */ -ONIG_EXTERN const OnigSyntaxType* OnigDefaultSyntax; -#define ONIG_SYNTAX_DEFAULT OnigDefaultSyntax - -/* syntax (operators) */ -#define ONIG_SYN_OP_VARIABLE_META_CHARACTERS (1U<<0) -#define ONIG_SYN_OP_DOT_ANYCHAR (1U<<1) /* . */ -#define ONIG_SYN_OP_ASTERISK_ZERO_INF (1U<<2) /* * */ -#define ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF (1U<<3) -#define ONIG_SYN_OP_PLUS_ONE_INF (1U<<4) /* + */ -#define ONIG_SYN_OP_ESC_PLUS_ONE_INF (1U<<5) -#define ONIG_SYN_OP_QMARK_ZERO_ONE (1U<<6) /* ? */ -#define ONIG_SYN_OP_ESC_QMARK_ZERO_ONE (1U<<7) -#define ONIG_SYN_OP_BRACE_INTERVAL (1U<<8) /* {lower,upper} */ -#define ONIG_SYN_OP_ESC_BRACE_INTERVAL (1U<<9) /* \{lower,upper\} */ -#define ONIG_SYN_OP_VBAR_ALT (1U<<10) /* | */ -#define ONIG_SYN_OP_ESC_VBAR_ALT (1U<<11) /* \| */ -#define ONIG_SYN_OP_LPAREN_SUBEXP (1U<<12) /* (...) */ -#define ONIG_SYN_OP_ESC_LPAREN_SUBEXP (1U<<13) /* \(...\) */ -#define ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR (1U<<14) /* \A, \Z, \z */ -#define ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR (1U<<15) /* \G */ -#define ONIG_SYN_OP_DECIMAL_BACKREF (1U<<16) /* \num */ -#define ONIG_SYN_OP_BRACKET_CC (1U<<17) /* [...] */ -#define ONIG_SYN_OP_ESC_W_WORD (1U<<18) /* \w, \W */ -#define ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END (1U<<19) /* \<. \> */ -#define ONIG_SYN_OP_ESC_B_WORD_BOUND (1U<<20) /* \b, \B */ -#define ONIG_SYN_OP_ESC_S_WHITE_SPACE (1U<<21) /* \s, \S */ -#define ONIG_SYN_OP_ESC_D_DIGIT (1U<<22) /* \d, \D */ -#define ONIG_SYN_OP_LINE_ANCHOR (1U<<23) /* ^, $ */ -#define ONIG_SYN_OP_POSIX_BRACKET (1U<<24) /* [:xxxx:] */ -#define ONIG_SYN_OP_QMARK_NON_GREEDY (1U<<25) /* ??,*?,+?,{n,m}? */ -#define ONIG_SYN_OP_ESC_CONTROL_CHARS (1U<<26) /* \n,\r,\t,\a ... 
*/ -#define ONIG_SYN_OP_ESC_C_CONTROL (1U<<27) /* \cx */ -#define ONIG_SYN_OP_ESC_OCTAL3 (1U<<28) /* \OOO */ -#define ONIG_SYN_OP_ESC_X_HEX2 (1U<<29) /* \xHH */ -#define ONIG_SYN_OP_ESC_X_BRACE_HEX8 (1U<<30) /* \x{7HHHHHHH} */ -#define ONIG_SYN_OP_ESC_O_BRACE_OCTAL (1U<<31) /* \o{OOO} */ /* NOTIMPL */ - -#define ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE (1U<<0) /* \Q...\E */ -#define ONIG_SYN_OP2_QMARK_GROUP_EFFECT (1U<<1) /* (?...) */ -#define ONIG_SYN_OP2_OPTION_PERL (1U<<2) /* (?imsxadlu), (?-imsx), (?^imsxalu) */ -#define ONIG_SYN_OP2_OPTION_RUBY (1U<<3) /* (?imxadu), (?-imx) */ -#define ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT (1U<<4) /* ?+,*+,++ */ -#define ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL (1U<<5) /* {n,m}+ */ -#define ONIG_SYN_OP2_CCLASS_SET_OP (1U<<6) /* [...&&..[..]..] */ -#define ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP (1U<<7) /* (?...) */ -#define ONIG_SYN_OP2_ESC_K_NAMED_BACKREF (1U<<8) /* \k */ -#define ONIG_SYN_OP2_ESC_G_SUBEXP_CALL (1U<<9) /* \g, \g */ -#define ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY (1U<<10) /* (?@..),(?@..) 
*/ -#define ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL (1U<<11) /* \C-x */ -#define ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META (1U<<12) /* \M-x */ -#define ONIG_SYN_OP2_ESC_V_VTAB (1U<<13) /* \v as VTAB */ -#define ONIG_SYN_OP2_ESC_U_HEX4 (1U<<14) /* \uHHHH */ -#define ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR (1U<<15) /* \`, \' */ -#define ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY (1U<<16) /* \p{...}, \P{...} */ -#define ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT (1U<<17) /* \p{^..}, \P{^..} */ -/* #define ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS (1U<<18) */ -#define ONIG_SYN_OP2_ESC_H_XDIGIT (1U<<19) /* \h, \H */ -#define ONIG_SYN_OP2_INEFFECTIVE_ESCAPE (1U<<20) /* \ */ -#define ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK (1U<<21) /* \R as (?>\x0D\x0A|[\x0A-\x0D\x{85}\x{2028}\x{2029}]) */ -#define ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER (1U<<22) /* \X as (?>\P{M}\p{M}*) */ -#define ONIG_SYN_OP2_ESC_V_VERTICAL_WHITESPACE (1U<<23) /* \v, \V -- Perl */ /* NOTIMPL */ -#define ONIG_SYN_OP2_ESC_H_HORIZONTAL_WHITESPACE (1U<<24) /* \h, \H -- Perl */ /* NOTIMPL */ -#define ONIG_SYN_OP2_ESC_CAPITAL_K_KEEP (1U<<25) /* \K */ -#define ONIG_SYN_OP2_ESC_G_BRACE_BACKREF (1U<<26) /* \g{name}, \g{n} */ -#define ONIG_SYN_OP2_QMARK_SUBEXP_CALL (1U<<27) /* (?&name), (?n), (?R), (?0) */ -#define ONIG_SYN_OP2_QMARK_VBAR_BRANCH_RESET (1U<<28) /* (?|...) */ /* NOTIMPL */ -#define ONIG_SYN_OP2_QMARK_LPAREN_CONDITION (1U<<29) /* (?(cond)yes...|no...) */ -#define ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP (1U<<30) /* (?P...), (?P=name), (?P>name) -- Python/PCRE */ -#define ONIG_SYN_OP2_OPTION_JAVA (1U<<31) /* (?idmsux), (?-idmsux) */ /* NOTIMPL */ - -/* syntax (behavior) */ -#define ONIG_SYN_CONTEXT_INDEP_ANCHORS (1U<<31) /* not implemented */ -#define ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS (1U<<0) /* ?, *, +, {n,m} */ -#define ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS (1U<<1) /* error or ignore */ -#define ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP (1U<<2) /* ...)... */ -#define ONIG_SYN_ALLOW_INVALID_INTERVAL (1U<<3) /* {??? 
*/ -#define ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV (1U<<4) /* {,n} => {0,n} */ -#define ONIG_SYN_STRICT_CHECK_BACKREF (1U<<5) /* /(\1)/,/\1()/ ..*/ -#define ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND (1U<<6) /* (?<=a|bc) */ -#define ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP (1U<<7) /* see doc/RE */ -#define ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME (1U<<8) /* (?)(?) */ -#define ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY (1U<<9) /* a{n}?=(?:a{n})? */ -#define ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME_CALL (1U<<10) /* (?)(?)(?&x) */ - -/* syntax (behavior) in char class [...] */ -#define ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1U<<20) /* [^...] */ -#define ONIG_SYN_BACKSLASH_ESCAPE_IN_CC (1U<<21) /* [..\w..] etc.. */ -#define ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC (1U<<22) -#define ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC (1U<<23) /* [0-9-a]=[0-9\-a] */ -/* syntax (behavior) warning */ -#define ONIG_SYN_WARN_CC_OP_NOT_ESCAPED (1U<<24) /* [,-,] */ -#define ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT (1U<<25) /* (?:a*)+ */ -#define ONIG_SYN_WARN_CC_DUP (1U<<26) /* [aa] */ - -/* meta character specifiers (onig_set_meta_char()) */ -#define ONIG_META_CHAR_ESCAPE 0 -#define ONIG_META_CHAR_ANYCHAR 1 -#define ONIG_META_CHAR_ANYTIME 2 -#define ONIG_META_CHAR_ZERO_OR_ONE_TIME 3 -#define ONIG_META_CHAR_ONE_OR_MORE_TIME 4 -#define ONIG_META_CHAR_ANYCHAR_ANYTIME 5 - -#define ONIG_INEFFECTIVE_META_CHAR 0 - -/* error codes */ -#define ONIG_IS_PATTERN_ERROR(ecode) ((ecode) <= -100 && (ecode) > -1000) -/* normal return */ -#define ONIG_NORMAL 0 -#define ONIG_MISMATCH -1 -#define ONIG_NO_SUPPORT_CONFIG -2 - -/* internal error */ -#define ONIGERR_MEMORY -5 -#define ONIGERR_TYPE_BUG -6 -#define ONIGERR_PARSER_BUG -11 -#define ONIGERR_STACK_BUG -12 -#define ONIGERR_UNDEFINED_BYTECODE -13 -#define ONIGERR_UNEXPECTED_BYTECODE -14 -#define ONIGERR_MATCH_STACK_LIMIT_OVER -15 -#define ONIGERR_DEFAULT_ENCODING_IS_NOT_SET -21 -#define ONIGERR_SPECIFIED_ENCODING_CANT_CONVERT_TO_WIDE_CHAR -22 -/* general error */ -#define 
ONIGERR_INVALID_ARGUMENT -30 -/* syntax error */ -#define ONIGERR_END_PATTERN_AT_LEFT_BRACE -100 -#define ONIGERR_END_PATTERN_AT_LEFT_BRACKET -101 -#define ONIGERR_EMPTY_CHAR_CLASS -102 -#define ONIGERR_PREMATURE_END_OF_CHAR_CLASS -103 -#define ONIGERR_END_PATTERN_AT_ESCAPE -104 -#define ONIGERR_END_PATTERN_AT_META -105 -#define ONIGERR_END_PATTERN_AT_CONTROL -106 -#define ONIGERR_META_CODE_SYNTAX -108 -#define ONIGERR_CONTROL_CODE_SYNTAX -109 -#define ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE -110 -#define ONIGERR_CHAR_CLASS_VALUE_AT_START_OF_RANGE -111 -#define ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS -112 -#define ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED -113 -#define ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID -114 -#define ONIGERR_NESTED_REPEAT_OPERATOR -115 -#define ONIGERR_UNMATCHED_CLOSE_PARENTHESIS -116 -#define ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS -117 -#define ONIGERR_END_PATTERN_IN_GROUP -118 -#define ONIGERR_UNDEFINED_GROUP_OPTION -119 -#define ONIGERR_INVALID_POSIX_BRACKET_TYPE -121 -#define ONIGERR_INVALID_LOOK_BEHIND_PATTERN -122 -#define ONIGERR_INVALID_REPEAT_RANGE_PATTERN -123 -#define ONIGERR_INVALID_CONDITION_PATTERN -124 -/* values error (syntax error) */ -#define ONIGERR_TOO_BIG_NUMBER -200 -#define ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE -201 -#define ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE -202 -#define ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS -203 -#define ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE -204 -#define ONIGERR_TOO_MANY_MULTI_BYTE_RANGES -205 -#define ONIGERR_TOO_SHORT_MULTI_BYTE_STRING -206 -#define ONIGERR_TOO_BIG_BACKREF_NUMBER -207 -#define ONIGERR_INVALID_BACKREF -208 -#define ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED -209 -#define ONIGERR_TOO_SHORT_DIGITS -210 -#define ONIGERR_TOO_LONG_WIDE_CHAR_VALUE -212 -#define ONIGERR_EMPTY_GROUP_NAME -214 -#define ONIGERR_INVALID_GROUP_NAME -215 -#define ONIGERR_INVALID_CHAR_IN_GROUP_NAME -216 -#define ONIGERR_UNDEFINED_NAME_REFERENCE -217 -#define 
ONIGERR_UNDEFINED_GROUP_REFERENCE -218 -#define ONIGERR_MULTIPLEX_DEFINED_NAME -219 -#define ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL -220 -#define ONIGERR_NEVER_ENDING_RECURSION -221 -#define ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY -222 -#define ONIGERR_INVALID_CHAR_PROPERTY_NAME -223 -#define ONIGERR_TOO_MANY_CAPTURE_GROUPS -224 -#define ONIGERR_INVALID_CODE_POINT_VALUE -400 -#define ONIGERR_INVALID_WIDE_CHAR_VALUE -400 -#define ONIGERR_TOO_BIG_WIDE_CHAR_VALUE -401 -#define ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION -402 -#define ONIGERR_INVALID_COMBINATION_OF_OPTIONS -403 - -/* errors related to thread */ -#define ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT -1001 - - -/* must be smaller than BIT_STATUS_BITS_NUM (unsigned int * 8) */ -#define ONIG_MAX_CAPTURE_HISTORY_GROUP 31 -#define ONIG_IS_CAPTURE_HISTORY_GROUP(r, i) \ - ((i) <= ONIG_MAX_CAPTURE_HISTORY_GROUP && (r)->list && (r)->list[i]) - -typedef struct OnigCaptureTreeNodeStruct { - int group; /* group number */ - OnigPosition beg; - OnigPosition end; - int allocated; - int num_childs; - struct OnigCaptureTreeNodeStruct** childs; -} OnigCaptureTreeNode; - -/* match result region type */ -struct re_registers { - int allocated; - int num_regs; - OnigPosition* beg; - OnigPosition* end; - /* extended */ - OnigCaptureTreeNode* history_root; /* capture history tree root */ -}; - -/* capture tree traverse */ -#define ONIG_TRAVERSE_CALLBACK_AT_FIRST 1 -#define ONIG_TRAVERSE_CALLBACK_AT_LAST 2 -#define ONIG_TRAVERSE_CALLBACK_AT_BOTH \ - ( ONIG_TRAVERSE_CALLBACK_AT_FIRST | ONIG_TRAVERSE_CALLBACK_AT_LAST ) - - -#define ONIG_REGION_NOTPOS -1 - -typedef struct re_registers OnigRegion; - -typedef struct { - OnigEncoding enc; - OnigUChar* par; - OnigUChar* par_end; -} OnigErrorInfo; - -typedef struct { - int lower; - int upper; -} OnigRepeatRange; - -typedef void (*OnigWarnFunc) P_((const char* s)); -extern void onig_null_warn P_((const char* s)); -#define ONIG_NULL_WARN onig_null_warn - -#define ONIG_CHAR_TABLE_SIZE 256 - 
-/* regex_t state */ -#define ONIG_STATE_NORMAL 0 -#define ONIG_STATE_SEARCHING 1 -#define ONIG_STATE_COMPILING -1 -#define ONIG_STATE_MODIFY -2 - -#define ONIG_STATE(reg) \ - ((reg)->state > 0 ? ONIG_STATE_SEARCHING : (reg)->state) - -typedef struct re_pattern_buffer { - /* common members of BBuf(bytes-buffer) */ - unsigned char* p; /* compiled pattern */ - unsigned int used; /* used space for p */ - unsigned int alloc; /* allocated space for p */ - - int state; /* normal, searching, compiling */ - int num_mem; /* used memory(...) num counted from 1 */ - int num_repeat; /* OP_REPEAT/OP_REPEAT_NG id-counter */ - int num_null_check; /* OP_NULL_CHECK_START/END id counter */ - int num_comb_exp_check; /* combination explosion check */ - int num_call; /* number of subexp call */ - unsigned int capture_history; /* (?@...) flag (1-31) */ - unsigned int bt_mem_start; /* need backtrack flag */ - unsigned int bt_mem_end; /* need backtrack flag */ - int stack_pop_level; - int repeat_range_alloc; - - OnigOptionType options; - - OnigRepeatRange* repeat_range; - - OnigEncoding enc; - const OnigSyntaxType* syntax; - void* name_table; - OnigCaseFoldType case_fold_flag; - - /* optimization info (string search, char-map and anchors) */ - int optimize; /* optimize flag */ - int threshold_len; /* search str-length for apply optimize */ - int anchor; /* BEGIN_BUF, BEGIN_POS, (SEMI_)END_BUF */ - OnigDistance anchor_dmin; /* (SEMI_)END_BUF anchor distance */ - OnigDistance anchor_dmax; /* (SEMI_)END_BUF anchor distance */ - int sub_anchor; /* start-anchor for exact or map */ - unsigned char *exact; - unsigned char *exact_end; - unsigned char map[ONIG_CHAR_TABLE_SIZE]; /* used as BM skip or char-map */ - int *int_map; /* BM skip for exact_len > 255 */ - int *int_map_backward; /* BM skip for backward search */ - OnigDistance dmin; /* min-distance of exact or map */ - OnigDistance dmax; /* max-distance of exact or map */ - - /* regex_t link chain */ - struct re_pattern_buffer* chain; /* 
escape compile-conflict */ -} OnigRegexType; - -typedef OnigRegexType* OnigRegex; - -#ifndef ONIG_ESCAPE_REGEX_T_COLLISION - typedef OnigRegexType regex_t; -#endif - - -typedef struct { - int num_of_elements; - OnigEncoding pattern_enc; - OnigEncoding target_enc; - const OnigSyntaxType* syntax; - OnigOptionType option; - OnigCaseFoldType case_fold_flag; -} OnigCompileInfo; - -/* Oniguruma Native API */ -ONIG_EXTERN -int onig_init P_((void)); -ONIG_EXTERN -int onig_error_code_to_str PV_((OnigUChar* s, OnigPosition err_code, ...)); -ONIG_EXTERN -void onig_set_warn_func P_((OnigWarnFunc f)); -ONIG_EXTERN -void onig_set_verb_warn_func P_((OnigWarnFunc f)); -ONIG_EXTERN -int onig_new P_((OnigRegex*, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax, OnigErrorInfo* einfo)); -ONIG_EXTERN -int onig_reg_init P_((OnigRegex reg, OnigOptionType option, OnigCaseFoldType case_fold_flag, OnigEncoding enc, const OnigSyntaxType* syntax)); -ONIG_EXTERN -int onig_new_without_alloc P_((OnigRegex, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo)); -ONIG_EXTERN -int onig_new_deluxe P_((OnigRegex* reg, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo)); -ONIG_EXTERN -void onig_free P_((OnigRegex)); -ONIG_EXTERN -void onig_free_body P_((OnigRegex)); -ONIG_EXTERN -int onig_recompile P_((OnigRegex, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo)); -ONIG_EXTERN -int onig_recompile_deluxe P_((OnigRegex reg, const OnigUChar* pattern, const OnigUChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo)); -ONIG_EXTERN -OnigPosition onig_search P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* start, const OnigUChar* range, OnigRegion* region, OnigOptionType 
option)); -ONIG_EXTERN -OnigPosition onig_search_gpos P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* global_pos, const OnigUChar* start, const OnigUChar* range, OnigRegion* region, OnigOptionType option)); -ONIG_EXTERN -OnigPosition onig_match P_((OnigRegex, const OnigUChar* str, const OnigUChar* end, const OnigUChar* at, OnigRegion* region, OnigOptionType option)); -ONIG_EXTERN -OnigRegion* onig_region_new P_((void)); -ONIG_EXTERN -void onig_region_init P_((OnigRegion* region)); -ONIG_EXTERN -void onig_region_free P_((OnigRegion* region, int free_self)); -ONIG_EXTERN -void onig_region_copy P_((OnigRegion* to, OnigRegion* from)); -ONIG_EXTERN -void onig_region_clear P_((OnigRegion* region)); -ONIG_EXTERN -int onig_region_resize P_((OnigRegion* region, int n)); -ONIG_EXTERN -int onig_region_set P_((OnigRegion* region, int at, int beg, int end)); -ONIG_EXTERN -int onig_name_to_group_numbers P_((OnigRegex reg, const OnigUChar* name, const OnigUChar* name_end, int** nums)); -ONIG_EXTERN -int onig_name_to_backref_number P_((OnigRegex reg, const OnigUChar* name, const OnigUChar* name_end, OnigRegion *region)); -ONIG_EXTERN -int onig_foreach_name P_((OnigRegex reg, int (*func)(const OnigUChar*, const OnigUChar*,int,int*,OnigRegex,void*), void* arg)); -ONIG_EXTERN -int onig_number_of_names P_((OnigRegex reg)); -ONIG_EXTERN -int onig_number_of_captures P_((OnigRegex reg)); -ONIG_EXTERN -int onig_number_of_capture_histories P_((OnigRegex reg)); -ONIG_EXTERN -OnigCaptureTreeNode* onig_get_capture_tree P_((OnigRegion* region)); -ONIG_EXTERN -int onig_capture_tree_traverse P_((OnigRegion* region, int at, int(*callback_func)(int,OnigPosition,OnigPosition,int,int,void*), void* arg)); -ONIG_EXTERN -int onig_noname_group_capture_is_active P_((OnigRegex reg)); -ONIG_EXTERN -OnigEncoding onig_get_encoding P_((OnigRegex reg)); -ONIG_EXTERN -OnigOptionType onig_get_options P_((OnigRegex reg)); -ONIG_EXTERN -OnigCaseFoldType onig_get_case_fold_flag 
P_((OnigRegex reg)); -ONIG_EXTERN -const OnigSyntaxType* onig_get_syntax P_((OnigRegex reg)); -ONIG_EXTERN -int onig_set_default_syntax P_((const OnigSyntaxType* syntax)); -ONIG_EXTERN -void onig_copy_syntax P_((OnigSyntaxType* to, const OnigSyntaxType* from)); -ONIG_EXTERN -unsigned int onig_get_syntax_op P_((OnigSyntaxType* syntax)); -ONIG_EXTERN -unsigned int onig_get_syntax_op2 P_((OnigSyntaxType* syntax)); -ONIG_EXTERN -unsigned int onig_get_syntax_behavior P_((OnigSyntaxType* syntax)); -ONIG_EXTERN -OnigOptionType onig_get_syntax_options P_((OnigSyntaxType* syntax)); -ONIG_EXTERN -void onig_set_syntax_op P_((OnigSyntaxType* syntax, unsigned int op)); -ONIG_EXTERN -void onig_set_syntax_op2 P_((OnigSyntaxType* syntax, unsigned int op2)); -ONIG_EXTERN -void onig_set_syntax_behavior P_((OnigSyntaxType* syntax, unsigned int behavior)); -ONIG_EXTERN -void onig_set_syntax_options P_((OnigSyntaxType* syntax, OnigOptionType options)); -ONIG_EXTERN -int onig_set_meta_char P_((OnigSyntaxType* syntax, unsigned int what, OnigCodePoint code)); -ONIG_EXTERN -void onig_copy_encoding P_((OnigEncodingType *to, OnigEncoding from)); -ONIG_EXTERN -OnigCaseFoldType onig_get_default_case_fold_flag P_((void)); -ONIG_EXTERN -int onig_set_default_case_fold_flag P_((OnigCaseFoldType case_fold_flag)); -ONIG_EXTERN -unsigned int onig_get_match_stack_limit_size P_((void)); -ONIG_EXTERN -int onig_set_match_stack_limit_size P_((unsigned int size)); -ONIG_EXTERN -int onig_end P_((void)); -ONIG_EXTERN -const char* onig_version P_((void)); -ONIG_EXTERN -const char* onig_copyright P_((void)); - -RUBY_SYMBOL_EXPORT_END - -#ifdef __cplusplus -#if 0 -{ /* satisfy cc-mode */ -#endif -} -#endif - +#define ONIGURUMA_VERSION_MAJOR ONIGMO_VERSION_MAJOR +#define ONIGURUMA_VERSION_MINOR ONIGMO_VERSION_MINOR +#define ONIGURUMA_VERSION_TEENY ONIGMO_VERSION_TEENY #endif /* ONIGURUMA_H */ diff --git a/re.c b/re.c index 069a9bc15d..2d786f5cef 100644 --- a/re.c +++ b/re.c @@ -847,7 +847,7 @@ 
onig_new_with_source(regex_t** reg, const UChar* pattern, const UChar* pattern_e r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax); if (r) goto err; - r = onig_compile(*reg, pattern, pattern_end, einfo, sourcefile, sourceline); + r = onig_compile_ruby(*reg, pattern, pattern_end, einfo, sourcefile, sourceline); if (r) { err: onig_free(*reg); @@ -3908,7 +3908,6 @@ Init_Regexp(void) { rb_eRegexpError = rb_define_class("RegexpError", rb_eStandardError); - onigenc_set_default_caseconv_table((UChar*)casetable); onigenc_set_default_encoding(ONIG_ENCODING_ASCII); onig_set_warn_func(re_warn); onig_set_verb_warn_func(re_warn); diff --git a/regcomp.c b/regcomp.c index 222d1d6c9e..49103afea1 100644 --- a/regcomp.c +++ b/regcomp.c @@ -3,7 +3,7 @@ **********************************************************************/ /*- * Copyright (c) 2002-2013 K.Kosako - * Copyright (c) 2011-2014 K.Takata + * Copyright (c) 2011-2016 K.Takata * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -30,15 +30,6 @@ #include "regparse.h" -#if defined(USE_MULTI_THREAD_SYSTEM) \ - && defined(USE_DEFAULT_MULTI_THREAD_SYSTEM) -#ifdef _WIN32 -CRITICAL_SECTION gOnigMutex; -#else -pthread_mutex_t gOnigMutex; -#endif -#endif - OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN; extern OnigCaseFoldType @@ -263,6 +254,7 @@ add_mem_num(regex_t* reg, int num) return 0; } +#if 0 static int add_pointer(regex_t* reg, void* addr) { @@ -271,6 +263,7 @@ add_pointer(regex_t* reg, void* addr) BBUF_ADD(reg, &ptr, SIZE_POINTER); return 0; } +#endif static int add_option(regex_t* reg, OnigOptionType option) @@ -591,11 +584,6 @@ compile_length_cclass_node(CClassNode* cc, regex_t* reg) { int len; - if (IS_NCCLASS_SHARE(cc)) { - len = SIZE_OPCODE + SIZE_POINTER; - return len; - } - if (IS_NULL(cc->mbuf)) { len = SIZE_OPCODE + SIZE_BITSET; } @@ -621,12 +609,6 @@ compile_cclass_node(CClassNode* cc, regex_t* reg) { int r; - if 
(IS_NCCLASS_SHARE(cc)) { - add_opcode(reg, OP_CCLASS_NODE); - r = add_pointer(reg, cc); - return r; - } - if (IS_NULL(cc->mbuf)) { if (IS_NCCLASS_NOT(cc)) add_opcode(reg, OP_CCLASS_NOT); @@ -638,17 +620,17 @@ compile_cclass_node(CClassNode* cc, regex_t* reg) else { if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) { if (IS_NCCLASS_NOT(cc)) - add_opcode(reg, OP_CCLASS_MB_NOT); + add_opcode(reg, OP_CCLASS_MB_NOT); else - add_opcode(reg, OP_CCLASS_MB); + add_opcode(reg, OP_CCLASS_MB); r = add_multi_byte_cclass(cc->mbuf, reg); } else { if (IS_NCCLASS_NOT(cc)) - add_opcode(reg, OP_CCLASS_MIX_NOT); + add_opcode(reg, OP_CCLASS_MIX_NOT); else - add_opcode(reg, OP_CCLASS_MIX); + add_opcode(reg, OP_CCLASS_MIX); r = add_bitset(reg, cc->bs); if (r) return r; @@ -760,9 +742,9 @@ compile_length_quantifier_node(QtfrNode* qn, regex_t* reg) if (NTYPE(qn->target) == NT_CANY) { if (qn->greedy && infinite) { if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) - return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen; + return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen; else - return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen; + return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen; } } @@ -989,9 +971,9 @@ compile_length_quantifier_node(QtfrNode* qn, regex_t* reg) if (NTYPE(qn->target) == NT_CANY) { if (qn->greedy && infinite) { if (IS_NOT_NULL(qn->next_head_exact)) - return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; + return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; else - return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower; + return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower; } } @@ -1010,9 +992,12 @@ compile_length_quantifier_node(QtfrNode* qn, regex_t* reg) } if (qn->greedy) { +#ifdef USE_OP_PUSH_OR_JUMP_EXACT if (IS_NOT_NULL(qn->head_exact)) len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP; - else if (IS_NOT_NULL(qn->next_head_exact)) + else +#endif + if (IS_NOT_NULL(qn->next_head_exact)) len += 
SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP; else len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP; @@ -1078,9 +1063,12 @@ compile_quantifier_node(QtfrNode* qn, regex_t* reg) (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) { if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) { if (qn->greedy) { +#ifdef USE_OP_PUSH_OR_JUMP_EXACT if (IS_NOT_NULL(qn->head_exact)) r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1); - else if (IS_NOT_NULL(qn->next_head_exact)) + else +#endif + if (IS_NOT_NULL(qn->next_head_exact)) r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT); else r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH); @@ -1096,6 +1084,7 @@ compile_quantifier_node(QtfrNode* qn, regex_t* reg) } if (qn->greedy) { +#ifdef USE_OP_PUSH_OR_JUMP_EXACT if (IS_NOT_NULL(qn->head_exact)) { r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1, mod_tlen + SIZE_OP_JUMP); @@ -1106,7 +1095,9 @@ compile_quantifier_node(QtfrNode* qn, regex_t* reg) r = add_opcode_rel_addr(reg, OP_JUMP, -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1)); } - else if (IS_NOT_NULL(qn->next_head_exact)) { + else +#endif + if (IS_NOT_NULL(qn->next_head_exact)) { r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT, mod_tlen + SIZE_OP_JUMP); if (r) return r; @@ -1243,6 +1234,11 @@ compile_length_enclose_node(EncloseNode* node, regex_t* reg) len += (IS_ENCLOSE_RECURSION(node) ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END); } + else if (IS_ENCLOSE_RECURSION(node)) { + len = SIZE_OP_MEMORY_START_PUSH; + len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum) + ? 
SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_REC); + } else #endif { @@ -1354,6 +1350,14 @@ compile_enclose_node(EncloseNode* node, regex_t* reg) if (r) return r; r = add_opcode(reg, OP_RETURN); } + else if (IS_ENCLOSE_RECURSION(node)) { + if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)) + r = add_opcode(reg, OP_MEMORY_END_PUSH_REC); + else + r = add_opcode(reg, OP_MEMORY_END_REC); + if (r) return r; + r = add_mem_num(reg, node->regnum); + } else #endif { @@ -1589,10 +1593,10 @@ compile_length_tree(Node* node, regex_t* reg) int n = 0; len = 0; do { - r = compile_length_tree(NCAR(node), reg); - if (r < 0) return r; - len += r; - n++; + r = compile_length_tree(NCAR(node), reg); + if (r < 0) return r; + len += r; + n++; } while (IS_NOT_NULL(node = NCDR(node))); r = len; r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1); @@ -1621,7 +1625,7 @@ compile_length_tree(Node* node, regex_t* reg) #ifdef USE_BACKREF_WITH_LEVEL if (IS_BACKREF_NEST_LEVEL(br)) { - r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH + + r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num); } else @@ -1785,12 +1789,12 @@ compile_tree(Node* node, regex_t* reg) int i; int* p; - if (IS_IGNORECASE(reg->options)) { - r = add_opcode(reg, OP_BACKREF_MULTI_IC); - } - else { - r = add_opcode(reg, OP_BACKREF_MULTI); - } + if (IS_IGNORECASE(reg->options)) { + r = add_opcode(reg, OP_BACKREF_MULTI_IC); + } + else { + r = add_opcode(reg, OP_BACKREF_MULTI); + } if (r) return r; #ifdef USE_BACKREF_WITH_LEVEL @@ -1884,17 +1888,8 @@ noname_disable_map(Node** plink, GroupNumRemap* map, int* counter) break; case NT_ANCHOR: - { - AnchorNode* an = NANCHOR(node); - switch (an->type) { - case ANCHOR_PREC_READ: - case ANCHOR_PREC_READ_NOT: - case ANCHOR_LOOK_BEHIND: - case ANCHOR_LOOK_BEHIND_NOT: - r = noname_disable_map(&(an->target), map, counter); - break; - } - } + if (NANCHOR(node)->target) + r = noname_disable_map(&(NANCHOR(node)->target), map, counter); break; default: @@ -1951,7 +1946,7 @@ 
renumber_by_map(Node* node, GroupNumRemap* map) { EncloseNode* en = NENCLOSE(node); if (en->type == ENCLOSE_CONDITION) - en->regnum = map[en->regnum].new_val; + en->regnum = map[en->regnum].new_val; r = renumber_by_map(en->target, map); } break; @@ -1961,17 +1956,8 @@ renumber_by_map(Node* node, GroupNumRemap* map) break; case NT_ANCHOR: - { - AnchorNode* an = NANCHOR(node); - switch (an->type) { - case ANCHOR_PREC_READ: - case ANCHOR_PREC_READ_NOT: - case ANCHOR_LOOK_BEHIND: - case ANCHOR_LOOK_BEHIND_NOT: - r = renumber_by_map(an->target, map); - break; - } - } + if (NANCHOR(node)->target) + r = renumber_by_map(NANCHOR(node)->target, map); break; default: @@ -2005,6 +1991,11 @@ numbered_ref_check(Node* node) return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED; break; + case NT_ANCHOR: + if (NANCHOR(node)->target) + r = numbered_ref_check(NANCHOR(node)->target); + break; + default: break; } @@ -2091,7 +2082,7 @@ quantifiers_memory_node_info(Node* node) } break; -#ifdef USE_SUBEXP_CALL +# ifdef USE_SUBEXP_CALL case NT_CALL: if (IS_CALL_RECURSION(NCALL(node))) { return NQ_TARGET_IS_EMPTY_REC; /* tiny version */ @@ -2099,7 +2090,7 @@ quantifiers_memory_node_info(Node* node) else r = quantifiers_memory_node_info(NCALL(node)->target); break; -#endif +# endif case NT_QTFR: { @@ -2238,18 +2229,23 @@ get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env) EncloseNode* en = NENCLOSE(node); switch (en->type) { case ENCLOSE_MEMORY: -#ifdef USE_SUBEXP_CALL - if (IS_ENCLOSE_MIN_FIXED(en)) - *min = en->min_len; - else { - r = get_min_match_length(en->target, min, env); - if (r == 0) { - en->min_len = *min; - SET_ENCLOSE_STATUS(node, NST_MIN_FIXED); + if (IS_ENCLOSE_MIN_FIXED(en)) + *min = en->min_len; + else { + if (IS_ENCLOSE_MARK1(NENCLOSE(node))) + *min = 0; // recursive + else { + SET_ENCLOSE_STATUS(node, NST_MARK1); + r = get_min_match_length(en->target, min, env); + CLEAR_ENCLOSE_STATUS(node, NST_MARK1); + if (r == 0) { + en->min_len = *min; + 
SET_ENCLOSE_STATUS(node, NST_MIN_FIXED); + } } - } - break; -#endif + } + break; + case ENCLOSE_OPTION: case ENCLOSE_STOP_BACKTRACK: case ENCLOSE_CONDITION: @@ -2356,18 +2352,23 @@ get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env) EncloseNode* en = NENCLOSE(node); switch (en->type) { case ENCLOSE_MEMORY: -#ifdef USE_SUBEXP_CALL if (IS_ENCLOSE_MAX_FIXED(en)) *max = en->max_len; else { - r = get_max_match_length(en->target, max, env); - if (r == 0) { - en->max_len = *max; - SET_ENCLOSE_STATUS(node, NST_MAX_FIXED); + if (IS_ENCLOSE_MARK1(NENCLOSE(node))) + *max = ONIG_INFINITE_DISTANCE; + else { + SET_ENCLOSE_STATUS(node, NST_MARK1); + r = get_max_match_length(en->target, max, env); + CLEAR_ENCLOSE_STATUS(node, NST_MARK1); + if (r == 0) { + en->max_len = *max; + SET_ENCLOSE_STATUS(node, NST_MAX_FIXED); + } } } break; -#endif + case ENCLOSE_OPTION: case ENCLOSE_STOP_BACKTRACK: case ENCLOSE_CONDITION: @@ -2622,10 +2623,10 @@ is_not_included(Node* x, Node* y, regex_t* reg) for (i = 0; i < SINGLE_BYTE_SIZE; i++) { v = BITSET_AT(xc->bs, i); if ((v != 0 && !IS_NCCLASS_NOT(xc)) || - (v == 0 && IS_NCCLASS_NOT(xc))) { + (v == 0 && IS_NCCLASS_NOT(xc))) { v = BITSET_AT(yc->bs, i); if ((v != 0 && !IS_NCCLASS_NOT(yc)) || - (v == 0 && IS_NCCLASS_NOT(yc))) + (v == 0 && IS_NCCLASS_NOT(yc))) return 0; } } @@ -2675,24 +2676,24 @@ is_not_included(Node* x, Node* y, regex_t* reg) break; case NT_CCLASS: - { - CClassNode* cc = NCCLASS(y); + { + CClassNode* cc = NCCLASS(y); - code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s, - xs->s + ONIGENC_MBC_MAXLEN(reg->enc)); - return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1); - } - break; + code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s, + xs->s + ONIGENC_MBC_MAXLEN(reg->enc)); + return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 
0 : 1); + } + break; case NT_STR: - { - UChar *q; - StrNode* ys = NSTR(y); - len = NSTRING_LEN(x); - if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y); - if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) { - /* tiny version */ - return 0; + { + UChar *q; + StrNode* ys = NSTR(y); + len = NSTRING_LEN(x); + if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y); + if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) { + /* tiny version */ + return 0; } else { for (i = 0, p = ys->s, q = xs->s; (OnigDistance )i < len; i++, p++, q++) { @@ -2703,7 +2704,7 @@ is_not_included(Node* x, Node* y, regex_t* reg) break; default: - break; + break; } } break; @@ -2760,9 +2761,11 @@ get_head_value_node(Node* node, int exact, regex_t* reg) { QtfrNode* qn = NQTFR(node); if (qn->lower > 0) { +#ifdef USE_OP_PUSH_OR_JUMP_EXACT if (IS_NOT_NULL(qn->head_exact)) n = qn->head_exact; else +#endif n = get_head_value_node(qn->target, exact, reg); } } @@ -2854,8 +2857,8 @@ check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask) #ifdef USE_SUBEXP_CALL -#define RECURSION_EXIST 1 -#define RECURSION_INFINITE 2 +# define RECURSION_EXIST 1 +# define RECURSION_INFINITE 2 static int subexp_inf_recursive_check(Node* node, ScanEnv* env, int head) @@ -3055,7 +3058,7 @@ subexp_recursive_check(Node* node) static int subexp_recursive_check_trav(Node* node, ScanEnv* env) { -#define FOUND_CALLED_NODE 1 +# define FOUND_CALLED_NODE 1 int type; int r = 0; @@ -3156,22 +3159,22 @@ setup_subexp_call(Node* node, ScanEnv* env) if (cn->group_num != 0) { int gnum = cn->group_num; -#ifdef USE_NAMED_GROUP +# ifdef USE_NAMED_GROUP if (env->num_named > 0 && IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) { return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED; } -#endif +# endif if (gnum > env->num_mem) { onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end); return ONIGERR_UNDEFINED_GROUP_REFERENCE; } 
-#ifdef USE_NAMED_GROUP +# ifdef USE_NAMED_GROUP set_call_attr: -#endif +# endif cn->target = nodes[cn->group_num]; if (IS_NULL(cn->target)) { onig_scan_env_set_error_string(env, @@ -3182,12 +3185,12 @@ setup_subexp_call(Node* node, ScanEnv* env) BIT_STATUS_ON_AT(env->bt_mem_start, cn->group_num); cn->unset_addr_list = env->unset_addr_list; } -#ifdef USE_NAMED_GROUP -#ifdef USE_PERL_SUBEXP_CALL +# ifdef USE_NAMED_GROUP +# ifdef USE_PERL_SUBEXP_CALL else if (cn->name == cn->name_end) { goto set_call_attr; } -#endif +# endif else { int *refs; @@ -3209,7 +3212,7 @@ setup_subexp_call(Node* node, ScanEnv* env) goto set_call_attr; } } -#endif +# endif } break; @@ -3398,13 +3401,9 @@ update_string_node_case_fold(regex_t* reg, Node *node) } r = onig_node_str_set(node, sbuf, sp); - if (r != 0) { - xfree(sbuf); - return r; - } xfree(sbuf); - return 0; + return r; } static int @@ -3512,29 +3511,29 @@ expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[], UChar *q = p + items[i].byte_len; if (q < end) { - r = expand_case_fold_make_rem_string(&rem, q, end, reg); - if (r != 0) { - onig_node_free(an); - goto mem_err2; - } + r = expand_case_fold_make_rem_string(&rem, q, end, reg); + if (r != 0) { + onig_node_free(an); + goto mem_err2; + } - xnode = onig_node_list_add(NULL_NODE, snode); - if (IS_NULL(xnode)) { - onig_node_free(an); - onig_node_free(rem); - goto mem_err2; - } - if (IS_NULL(onig_node_list_add(xnode, rem))) { - onig_node_free(an); - onig_node_free(xnode); - onig_node_free(rem); - goto mem_err; - } + xnode = onig_node_list_add(NULL_NODE, snode); + if (IS_NULL(xnode)) { + onig_node_free(an); + onig_node_free(rem); + goto mem_err2; + } + if (IS_NULL(onig_node_list_add(xnode, rem))) { + onig_node_free(an); + onig_node_free(xnode); + onig_node_free(rem); + goto mem_err; + } - NCAR(an) = xnode; + NCAR(an) = xnode; } else { - NCAR(an) = snode; + NCAR(an) = snode; } NCDR(var_anode) = an; @@ -3711,12 +3710,12 @@ expand_case_fold_string(Node* node, regex_t* 
reg) #ifdef USE_COMBINATION_EXPLOSION_CHECK -#define CEC_THRES_NUM_BIG_REPEAT 512 -#define CEC_INFINITE_NUM 0x7fffffff +# define CEC_THRES_NUM_BIG_REPEAT 512 +# define CEC_INFINITE_NUM 0x7fffffff -#define CEC_IN_INFINITE_REPEAT (1<<0) -#define CEC_IN_FINITE_REPEAT (1<<1) -#define CEC_CONT_BIG_REPEAT (1<<2) +# define CEC_IN_INFINITE_REPEAT (1<<0) +# define CEC_IN_FINITE_REPEAT (1<<1) +# define CEC_CONT_BIG_REPEAT (1<<2) static int setup_comb_exp_check(Node* node, int state, ScanEnv* env) @@ -3832,14 +3831,14 @@ setup_comb_exp_check(Node* node, int state, ScanEnv* env) } break; -#ifdef USE_SUBEXP_CALL +# ifdef USE_SUBEXP_CALL case NT_CALL: if (IS_CALL_RECURSION(NCALL(node))) env->has_recursion = 1; else r = setup_comb_exp_check(NCALL(node)->target, state, env); break; -#endif +# endif default: break; @@ -3854,6 +3853,8 @@ setup_comb_exp_check(Node* node, int state, ScanEnv* env) #define IN_REPEAT (1<<2) #define IN_VAR_REPEAT (1<<3) #define IN_ROOT (1<<4) +#define IN_CALL (1<<5) +#define IN_RECCALL (1<<6) /* setup_tree does the following work. 1. check empty loop. 
(set qn->target_empty_info) @@ -3943,7 +3944,7 @@ restart: Node* target = qn->target; if ((state & IN_REPEAT) != 0) { - qn->state |= NST_IN_REPEAT; + qn->state |= NST_IN_REPEAT; } if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) { @@ -4058,12 +4059,18 @@ restart: break; case ENCLOSE_MEMORY: - if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT)) != 0) { + if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_CALL)) != 0) { BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum); /* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */ } - r = setup_tree(en->target, reg, state, env); - break; + if (IS_ENCLOSE_CALLED(en)) + state |= IN_CALL; + if (IS_ENCLOSE_RECURSION(en)) + state |= IN_RECCALL; + else if ((state & IN_RECCALL) != 0) + SET_CALL_RECURSION(node); + r = setup_tree(en->target, reg, state, env); + break; case ENCLOSE_STOP_BACKTRACK: { @@ -4090,6 +4097,8 @@ restart: return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED; } #endif + if (NENCLOSE(node)->regnum > env->num_mem) + return ONIGERR_INVALID_BACKREF; r = setup_tree(NENCLOSE(node)->target, reg, state, env); break; } @@ -4133,10 +4142,10 @@ restart: ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB); if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - r = setup_look_behind(node, reg, env); - if (r != 0) return r; if (NTYPE(node) != NT_ANCHOR) goto restart; r = setup_tree(an->target, reg, state, env); + if (r != 0) return r; + r = setup_look_behind(node, reg, env); } break; @@ -4146,10 +4155,10 @@ restart: ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT); if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - r = setup_look_behind(node, reg, env); - if (r != 0) return r; if (NTYPE(node) != NT_ANCHOR) goto restart; r = setup_tree(an->target, reg, (state | IN_NOT), env); + if (r != 0) return r; + r = setup_look_behind(node, reg, env); } break; } @@ -4203,6 +4212,10 @@ set_bm_skip(UChar* s, UChar* end, regex_t* reg, } } else { +# if OPT_EXACT_MAXLEN < ONIG_CHAR_TABLE_SIZE + /* 
This should not happen. */ + return ONIGERR_TYPE_BUG; +# else if (IS_NULL(*int_skip)) { *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE); if (IS_NULL(*int_skip)) return ONIGERR_MEMORY; @@ -4231,6 +4244,7 @@ set_bm_skip(UChar* s, UChar* end, regex_t* reg, } } } +# endif } return 0; } @@ -4276,6 +4290,10 @@ set_bm_skip(UChar* s, UChar* end, regex_t* reg, } } else { +# if OPT_EXACT_MAXLEN < ONIG_CHAR_TABLE_SIZE + /* This should not happen. */ + return ONIGERR_TYPE_BUG; +# else if (IS_NULL(*int_skip)) { *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE); if (IS_NULL(*int_skip)) return ONIGERR_MEMORY; @@ -4304,13 +4322,12 @@ set_bm_skip(UChar* s, UChar* end, regex_t* reg, } } } +# endif } return 0; } #endif /* USE_SUNDAY_QUICK_SEARCH */ -#define OPT_EXACT_MAXLEN 24 - typedef struct { OnigDistance min; /* min byte length */ OnigDistance max; /* max byte length */ @@ -4980,14 +4997,14 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) if (slen > 0) { add_char_opt_map_info(&opt->map, *(sn->s), env->enc); } - set_mml(&opt->len, slen, slen); + set_mml(&opt->len, slen, slen); } else { - OnigDistance max; + OnigDistance max; if (NSTRING_IS_DONT_GET_OPT_INFO(node)) { - int n = onigenc_strlen(env->enc, sn->s, sn->end); - max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n; + int n = onigenc_strlen(env->enc, sn->s, sn->end); + max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n; } else { concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, @@ -5003,7 +5020,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) max = slen; } - set_mml(&opt->len, slen, max); + set_mml(&opt->len, slen, max); } if ((OnigDistance )opt->exb.len == slen) @@ -5019,18 +5036,18 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) /* no need to check ignore case. 
(set in setup_tree()) */ if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) { - OnigDistance min = ONIGENC_MBC_MINLEN(env->enc); + OnigDistance min = ONIGENC_MBC_MINLEN(env->enc); OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc); set_mml(&opt->len, min, max); } else { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - z = BITSET_AT(cc->bs, i); - if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) { - add_char_opt_map_info(&opt->map, (UChar )i, env->enc); - } - } + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + z = BITSET_AT(cc->bs, i); + if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) { + add_char_opt_map_info(&opt->map, (UChar )i, env->enc); + } + } set_mml(&opt->len, 1, 1); } } @@ -5044,7 +5061,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) max = ONIGENC_MBC_MAXLEN_DIST(env->enc); if (max == 1) { - min = 1; + min = 1; maxcode = NCTYPE(node)->ascii_range ? 0x80 : SINGLE_BYTE_SIZE; switch (NCTYPE(node)->ctype) { @@ -5067,7 +5084,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) } } else { - min = ONIGENC_MBC_MINLEN(env->enc); + min = ONIGENC_MBC_MINLEN(env->enc); } set_mml(&opt->len, min, max); } @@ -5186,7 +5203,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) if (nopt.exb.len > 0) { if (nopt.exb.reach_end) { for (i = 2; i <= qn->lower && - ! is_full_opt_exact_info(&opt->exb); i++) { + ! is_full_opt_exact_info(&opt->exb); i++) { concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc); } if (i < qn->lower) { @@ -5308,11 +5325,14 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e) else { if (e->len >= 3 || (e->len >= 2 && allow_reverse)) { r = set_bm_skip(reg->exact, reg->exact_end, reg, - reg->map, &(reg->int_map), 0); - if (r) return r; - - reg->optimize = (allow_reverse != 0 + reg->map, &(reg->int_map), 0); + if (r == 0) { + reg->optimize = (allow_reverse != 0 ? 
ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV); + } + else { + reg->optimize = ONIG_OPTIMIZE_EXACT; + } } else { reg->optimize = ONIG_OPTIMIZE_EXACT; @@ -5378,6 +5398,9 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML | ANCHOR_LOOK_BEHIND); + if ((opt.anc.left_anchor & (ANCHOR_LOOK_BEHIND | ANCHOR_PREC_READ_NOT)) != 0) + reg->anchor &= ~ANCHOR_ANYCHAR_STAR_ML; + reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF | ANCHOR_PREC_READ_NOT); @@ -5570,14 +5593,14 @@ print_optimize_info(FILE* f, regex_t* reg) fputc('[', f); for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) { if (reg->map[i] != 0) { - if (c > 0) fputs(", ", f); - c++; - if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 && - ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i)) - fputc(i, f); - else - fprintf(f, "%d", i); - } + if (c > 0) fputs(", ", f); + c++; + if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 && + ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i)) + fputc(i, f); + else + fprintf(f, "%d", i); + } } fprintf(f, "]\n"); } @@ -5612,6 +5635,7 @@ onig_free(regex_t* reg) } } +#ifdef RUBY size_t onig_memsize(const regex_t *reg) { @@ -5635,65 +5659,47 @@ onig_region_memsize(const OnigRegion *regs) size += regs->allocated * (sizeof(*regs->beg) + sizeof(*regs->end)); return size; } +#endif #define REGEX_TRANSFER(to,from) do {\ - (to)->state = ONIG_STATE_MODIFY;\ onig_free_body(to);\ xmemcpy(to, from, sizeof(regex_t));\ xfree(from);\ } while (0) +#if 0 extern void onig_transfer(regex_t* to, regex_t* from) { - THREAD_ATOMIC_START; REGEX_TRANSFER(to, from); - THREAD_ATOMIC_END; -} - -#define REGEX_CHAIN_HEAD(reg) do {\ - while (IS_NOT_NULL((reg)->chain)) {\ - (reg) = (reg)->chain;\ - }\ -} while (0) - -extern void -onig_chain_link_add(regex_t* to, regex_t* add) -{ - THREAD_ATOMIC_START; - REGEX_CHAIN_HEAD(to); - to->chain = add; - THREAD_ATOMIC_END; -} - -extern void -onig_chain_reduce(regex_t* reg) -{ - 
regex_t *head, *prev; - - prev = reg; - head = prev->chain; - if (IS_NOT_NULL(head)) { - reg->state = ONIG_STATE_MODIFY; - while (IS_NOT_NULL(head->chain)) { - prev = head; - head = head->chain; - } - prev->chain = (regex_t* )NULL; - REGEX_TRANSFER(reg, head); - } } +#endif #ifdef ONIG_DEBUG_COMPILE -static void print_compiled_byte_code_list P_((FILE* f, regex_t* reg)); +static void print_compiled_byte_code_list(FILE* f, regex_t* reg); #endif #ifdef ONIG_DEBUG_PARSE_TREE -static void print_tree P_((FILE* f, Node* node)); +static void print_tree(FILE* f, Node* node); #endif +#ifdef RUBY extern int onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, + OnigErrorInfo* einfo) +{ + return onig_compile_ruby(reg, pattern, pattern_end, einfo, NULL, 0); +} +#endif + +#ifdef RUBY +extern int +onig_compile_ruby(regex_t* reg, const UChar* pattern, const UChar* pattern_end, OnigErrorInfo* einfo, const char *sourcefile, int sourceline) +#else +extern int +onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, + OnigErrorInfo* einfo) +#endif { #define COMPILE_INIT_SIZE 20 @@ -5707,9 +5713,10 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL; +#ifdef RUBY scan_env.sourcefile = sourcefile; scan_env.sourceline = sourceline; - reg->state = ONIG_STATE_COMPILING; +#endif #ifdef ONIG_DEBUG print_enc_string(stderr, reg->enc, pattern, pattern_end); @@ -5794,17 +5801,17 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, #ifdef USE_COMBINATION_EXPLOSION_CHECK if (scan_env.backrefed_mem == 0 -#ifdef USE_SUBEXP_CALL +# ifdef USE_SUBEXP_CALL || scan_env.num_call == 0 -#endif +# endif ) { setup_comb_exp_check(root, 0, &scan_env); -#ifdef USE_SUBEXP_CALL +# ifdef USE_SUBEXP_CALL if (scan_env.has_recursion != 0) { scan_env.num_comb_exp_check = 0; } else -#endif +# endif if (scan_env.comb_exp_max_regnum > 0) { int i; for (i = 1; i <= 
scan_env.comb_exp_max_regnum; i++) { @@ -5858,14 +5865,13 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, onig_node_free(root); #ifdef ONIG_DEBUG_COMPILE -#ifdef USE_NAMED_GROUP +# ifdef USE_NAMED_GROUP onig_print_names(stderr, reg); -#endif +# endif print_compiled_byte_code_list(stderr, reg); #endif end: - reg->state = ONIG_STATE_NORMAL; return r; err_unset: @@ -5889,27 +5895,6 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, return r; } -#ifdef USE_RECOMPILE_API -extern int -onig_recompile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, - OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, - OnigErrorInfo* einfo) -{ - int r; - regex_t *new_reg; - - r = onig_new(&new_reg, pattern, pattern_end, option, enc, syntax, einfo); - if (r) return r; - if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) { - onig_transfer(reg, new_reg); - } - else { - onig_chain_link_add(reg, new_reg); - } - return 0; -} -#endif - static int onig_inited = 0; extern int @@ -5931,8 +5916,6 @@ onig_reg_init(regex_t* reg, OnigOptionType option, return ONIGERR_INVALID_COMBINATION_OF_OPTIONS; } - (reg)->state = ONIG_STATE_MODIFY; - if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) { option |= syntax->options; option &= ~ONIG_OPTION_SINGLELINE; @@ -5968,7 +5951,7 @@ onig_new_without_alloc(regex_t* reg, const UChar* pattern, r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax); if (r) return r; - r = onig_compile(reg, pattern, pattern_end, einfo, NULL, 0); + r = onig_compile(reg, pattern, pattern_end, einfo); return r; } @@ -5985,7 +5968,7 @@ onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end, r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax); if (r) goto err; - r = onig_compile(*reg, pattern, pattern_end, einfo, NULL, 0); + r = onig_compile(*reg, pattern, pattern_end, einfo); if (r) { err: onig_free(*reg); @@ -5994,6 +5977,11 @@ onig_new(regex_t** reg, const 
UChar* pattern, const UChar* pattern_end, return r; } +extern int +onig_initialize(OnigEncoding encodings[] ARG_UNUSED, int n ARG_UNUSED) +{ + return onig_init(); +} extern int onig_init(void) @@ -6001,11 +5989,12 @@ onig_init(void) if (onig_inited != 0) return 0; - THREAD_SYSTEM_INIT; - THREAD_ATOMIC_START; - onig_inited = 1; +#if defined(ONIG_DEBUG_MEMLEAK) && defined(_MSC_VER) + _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF); +#endif + onigenc_init(); /* onigenc_set_default_caseconv_table((UChar* )0); */ @@ -6013,7 +6002,6 @@ onig_init(void) onig_statistics_init(); #endif - THREAD_ATOMIC_END; return 0; } @@ -6052,26 +6040,18 @@ exec_end_call_list(void) extern int onig_end(void) { - THREAD_ATOMIC_START; - exec_end_call_list(); #ifdef ONIG_DEBUG_STATISTICS onig_print_statistics(stderr); #endif -#ifdef USE_SHARED_CCLASS_TABLE - onig_free_shared_cclass_table(); -#endif - -#ifdef USE_PARSE_TREE_NODE_RECYCLE - onig_free_node_list(); +#if defined(ONIG_DEBUG_MEMLEAK) && defined(_MSC_VER) + _CrtDumpMemoryLeaks(); #endif onig_inited = 0; - THREAD_ATOMIC_END; - THREAD_SYSTEM_END; return 0; } @@ -6137,14 +6117,14 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) #ifdef ONIG_DEBUG /* arguments type */ -#define ARG_SPECIAL -1 -#define ARG_NON 0 -#define ARG_RELADDR 1 -#define ARG_ABSADDR 2 -#define ARG_LENGTH 3 -#define ARG_MEMNUM 4 -#define ARG_OPTION 5 -#define ARG_STATE_CHECK 6 +# define ARG_SPECIAL -1 +# define ARG_NON 0 +# define ARG_RELADDR 1 +# define ARG_ABSADDR 2 +# define ARG_LENGTH 3 +# define ARG_MEMNUM 4 +# define ARG_OPTION 5 +# define ARG_STATE_CHECK 6 OnigOpInfoType OnigOpInfo[] = { { OP_FINISH, "finish", ARG_NON }, @@ -6169,7 +6149,6 @@ OnigOpInfoType OnigOpInfo[] = { { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL }, { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL }, { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL }, - { OP_CCLASS_NODE, "cclass-node", ARG_SPECIAL }, { OP_ANYCHAR, "anychar", ARG_NON }, { OP_ANYCHAR_ML, 
"anychar-ml", ARG_NON }, { OP_ANYCHAR_STAR, "anychar*", ARG_NON }, @@ -6272,14 +6251,14 @@ op2arg_type(int opcode) return ARG_SPECIAL; } -#ifdef ONIG_DEBUG_PARSE_TREE +# ifdef ONIG_DEBUG_PARSE_TREE static void Indent(FILE* f, int indent) { int i; for (i = 0; i < indent; i++) putc(' ', f); } -#endif /* ONIG_DEBUG_PARSE_TREE */ +# endif /* ONIG_DEBUG_PARSE_TREE */ static void p_string(FILE* f, ptrdiff_t len, UChar* s) @@ -6318,7 +6297,7 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar* bpend, UChar** nextp, break; case ARG_RELADDR: GET_RELADDR_INC(addr, bp); - fprintf(f, ":(+%d)", addr); + fprintf(f, ":(%s%d)", (addr >= 0) ? "+" : "", addr); break; case ARG_ABSADDR: GET_ABSADDR_INC(addr, bp); @@ -6423,9 +6402,9 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar* bpend, UChar** nextp, case OP_CCLASS_MB_NOT: GET_LENGTH_INC(len, bp); q = bp; -#ifndef PLATFORM_UNALIGNED_WORD_ACCESS +# ifndef PLATFORM_UNALIGNED_WORD_ACCESS ALIGNMENT_RIGHT(q); -#endif +# endif GET_CODE_POINT(code, q); bp += len; fprintf(f, ":%d:%d", (int )code, len); @@ -6437,24 +6416,14 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar* bpend, UChar** nextp, bp += SIZE_BITSET; GET_LENGTH_INC(len, bp); q = bp; -#ifndef PLATFORM_UNALIGNED_WORD_ACCESS +# ifndef PLATFORM_UNALIGNED_WORD_ACCESS ALIGNMENT_RIGHT(q); -#endif +# endif GET_CODE_POINT(code, q); bp += len; fprintf(f, ":%d:%d:%d", n, (int )code, len); break; - case OP_CCLASS_NODE: - { - CClassNode *cc; - - GET_POINTER_INC(cc, bp); - n = bitset_on_num(cc->bs); - fprintf(f, ":%"PRIuPTR":%d", (uintptr_t )cc, n); - } - break; - case OP_BACKREFN_IC: mem = *((MemNumType* )bp); bp += SIZE_MEMNUM; @@ -6507,7 +6476,7 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar* bpend, UChar** nextp, case OP_PUSH_IF_PEEK_NEXT: addr = *((RelAddrType* )bp); bp += SIZE_RELADDR; - fprintf(f, ":(%d)", addr); + fprintf(f, ":(%s%d)", (addr >= 0) ? 
"+" : "", addr); p_string(f, 1, bp); bp += 1; break; @@ -6520,7 +6489,7 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar* bpend, UChar** nextp, case OP_PUSH_LOOK_BEHIND_NOT: GET_RELADDR_INC(addr, bp); GET_LENGTH_INC(len, bp); - fprintf(f, ":%d:(%d)", len, addr); + fprintf(f, ":%d:(%s%d)", len, (addr >= 0) ? "+" : "", addr); break; case OP_STATE_CHECK_PUSH: @@ -6529,13 +6498,13 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar* bpend, UChar** nextp, bp += SIZE_STATE_CHECK_NUM; addr = *((RelAddrType* )bp); bp += SIZE_RELADDR; - fprintf(f, ":%d:(%d)", scn, addr); + fprintf(f, ":%d:(%s%d)", scn, (addr >= 0) ? "+" : "", addr); break; case OP_CONDITION: GET_MEMNUM_INC(mem, bp); GET_RELADDR_INC(addr, bp); - fprintf(f, ":%d:(%d)", mem, addr); + fprintf(f, ":%d:(%s%d)", mem, (addr >= 0) ? "+" : "", addr); break; default: @@ -6547,7 +6516,7 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar* bpend, UChar** nextp, if (nextp) *nextp = bp; } -#ifdef ONIG_DEBUG_COMPILE +# ifdef ONIG_DEBUG_COMPILE static void print_compiled_byte_code_list(FILE* f, regex_t* reg) { @@ -6569,9 +6538,9 @@ print_compiled_byte_code_list(FILE* f, regex_t* reg) fprintf(f, "\n"); } -#endif /* ONIG_DEBUG_COMPILE */ +# endif /* ONIG_DEBUG_COMPILE */ -#ifdef ONIG_DEBUG_PARSE_TREE +# ifdef ONIG_DEBUG_PARSE_TREE void print_indent_tree(FILE* f, Node* node, int indent) { @@ -6621,8 +6590,8 @@ print_indent_tree(FILE* f, Node* node, int indent) if (IS_NCCLASS_NOT(NCCLASS(node))) fputs("not ", f); if (NCCLASS(node)->mbuf) { BBuf* bbuf = NCCLASS(node)->mbuf; - OnigCodePoint* data = (OnigCodePoint*)bbuf->p; - OnigCodePoint* end = (OnigCodePoint*)(bbuf->p + bbuf->used); + OnigCodePoint* data = (OnigCodePoint* )bbuf->p; + OnigCodePoint* end = (OnigCodePoint* )(bbuf->p + bbuf->used); fprintf(f, "%d", *data++); for (; data < end; data+=2) { fprintf(f, ","); @@ -6664,10 +6633,10 @@ print_indent_tree(FILE* f, Node* node, int indent) case ANCHOR_WORD_BOUND: fputs("word bound", f); break; case 
ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break; -#ifdef USE_WORD_BEGIN_END +# ifdef USE_WORD_BEGIN_END case ANCHOR_WORD_BEGIN: fputs("word begin", f); break; case ANCHOR_WORD_END: fputs("word end", f); break; -#endif +# endif case ANCHOR_PREC_READ: fputs("prec read", f); container_p = TRUE; break; case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); container_p = TRUE; break; case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); container_p = TRUE; break; @@ -6693,7 +6662,7 @@ print_indent_tree(FILE* f, Node* node, int indent) } break; -#ifdef USE_SUBEXP_CALL +# ifdef USE_SUBEXP_CALL case NT_CALL: { CallNode* cn = NCALL(node); @@ -6701,7 +6670,7 @@ print_indent_tree(FILE* f, Node* node, int indent) p_string(f, cn->name_end - cn->name, cn->name); } break; -#endif +# endif case NT_QTFR: fprintf(f, "{%d,%d}%s\n", (intptr_t )node, @@ -6752,5 +6721,5 @@ print_tree(FILE* f, Node* node) { print_indent_tree(f, node, 0); } -#endif /* ONIG_DEBUG_PARSE_TREE */ +# endif /* ONIG_DEBUG_PARSE_TREE */ #endif /* ONIG_DEBUG */ diff --git a/regenc.c b/regenc.c index 5cacbdfaa4..ca09a7fcb3 100644 --- a/regenc.c +++ b/regenc.c @@ -3,7 +3,7 @@ **********************************************************************/ /*- * Copyright (c) 2002-2007 K.Kosako - * Copyright (c) 2011 K.Takata + * Copyright (c) 2011-2016 K.Takata * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -364,12 +364,14 @@ const UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = { }; #endif +#if 0 extern void onigenc_set_default_caseconv_table(const UChar* table ARG_UNUSED) { /* nothing */ /* obsoleted. 
*/ } +#endif extern UChar* onigenc_get_left_adjust_char_head(OnigEncoding enc, const UChar* start, const UChar* s, const UChar* end) @@ -631,8 +633,10 @@ onigenc_single_byte_code_to_mbclen(OnigCodePoint code ARG_UNUSED, OnigEncoding e extern int onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc ARG_UNUSED) { +#ifdef RUBY if (code > 0xff) rb_raise(rb_eRangeError, "%u out of char range", code); +#endif *buf = (UChar )(code & 0xff); return 1; } @@ -892,6 +896,7 @@ onigenc_with_ascii_strnicmp(OnigEncoding enc, const UChar* p, const UChar* end, return 0; } +#if 0 /* Property management */ static int resize_property_list(int new_size, const OnigCodePoint*** plist, int* psize) @@ -944,68 +949,64 @@ onigenc_property_list_add_property(UChar* name, const OnigCodePoint* prop, (hash_data_type )(*pnum + ONIGENC_MAX_STD_CTYPE)); return 0; } +#endif extern int -onigenc_property_list_init(int (*f)(void)) -{ - int r; - - THREAD_ATOMIC_START; - - r = f(); - - THREAD_ATOMIC_END; - return r; -} - -extern int -onigenc_ascii_only_case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, const OnigUChar* end, - OnigUChar* to, OnigUChar* to_end, const struct OnigEncodingTypeST* enc) +onigenc_ascii_only_case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, const OnigUChar* end, + OnigUChar* to, OnigUChar* to_end, const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; int codepoint_length; - while (*pp='a' && code<='z' && (flags&ONIGENC_CASE_UPCASE)) - flags |= ONIGENC_CASE_MODIFIED, code += 'A'-'a'; - else if (code>='A' && code<='Z' && (flags&(ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD))) - flags |= ONIGENC_CASE_MODIFIED, code += 'a'-'A'; + if (code >= 'a' && code <= 'z' && (flags & ONIGENC_CASE_UPCASE)) { + flags |= ONIGENC_CASE_MODIFIED; + code += 'A' - 'a'; + } else if (code >= 'A' && code <= 'Z' && + (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD))) { + flags |= ONIGENC_CASE_MODIFIED; + 
code += 'a' - 'A'; + } to += ONIGENC_CODE_TO_MBC(enc, code, to); if (flags & ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); + flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); + return (int )(to - to_start); } extern int -onigenc_single_byte_ascii_only_case_map (OnigCaseFoldType* flagP, const OnigUChar** pp, - const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, - const struct OnigEncodingTypeST* enc) +onigenc_single_byte_ascii_only_case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, + const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, + const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; OnigCaseFoldType flags = *flagP; - while (*pp='a' && code<='z' && (flags&ONIGENC_CASE_UPCASE)) - flags |= ONIGENC_CASE_MODIFIED, code += 'A'-'a'; - else if (code>='A' && code<='Z' && (flags&(ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_FOLD))) - flags |= ONIGENC_CASE_MODIFIED, code += 'a'-'A'; + if (code >= 'a' && code <= 'z' && (flags & ONIGENC_CASE_UPCASE)) { + flags |= ONIGENC_CASE_MODIFIED; + code += 'A' - 'a'; + } else if (code >= 'A' && code <= 'Z' && + (flags & (ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_FOLD))) { + flags |= ONIGENC_CASE_MODIFIED; + code += 'a' - 'A'; + } *to++ = code; if (flags & ONIGENC_CASE_TITLECASE) /* switch from titlecase to lowercase for capitalize */ - flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE|ONIGENC_CASE_TITLECASE); + flags ^= (ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE | ONIGENC_CASE_TITLECASE); } *flagP = flags; - return (int)(to-to_start); + return (int )(to - to_start); } diff --git a/regenc.h b/regenc.h index 2c4c9343c5..10ca18c2a4 100644 --- a/regenc.h +++ b/regenc.h @@ -1,11 +1,11 @@ -#ifndef ONIGURUMA_REGENC_H -#define ONIGURUMA_REGENC_H +#ifndef ONIGMO_REGENC_H +#define ONIGMO_REGENC_H 
/********************************************************************** regenc.h - Onigmo (Oniguruma-mod) (regular expression library) **********************************************************************/ /*- * Copyright (c) 2002-2008 K.Kosako - * Copyright (c) 2011 K.Takata + * Copyright (c) 2011-2016 K.Takata * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -30,18 +30,32 @@ * SUCH DAMAGE. */ -#ifndef REGINT_H -#ifndef RUBY_EXTERN -#include "ruby/config.h" -#include "ruby/defines.h" -#endif +#if !defined(RUBY) && (defined(RUBY_EXPORT) || defined(ONIG_ENC_REGISTER)) +# define RUBY #endif +#ifdef RUBY +# ifndef ONIGMO_REGINT_H +# ifndef RUBY_EXTERN +# include "ruby/config.h" +# include "ruby/defines.h" +# endif +# endif +#else /* RUBY */ +# ifndef PACKAGE +/* PACKAGE is defined in config.h */ +# include "config.h" +# endif +#endif /* RUBY */ #ifdef ONIG_ESCAPE_UCHAR_COLLISION -#undef ONIG_ESCAPE_UCHAR_COLLISION +# undef ONIG_ESCAPE_UCHAR_COLLISION #endif -#include "ruby/oniguruma.h" +#ifdef RUBY +# include "ruby/onigmo.h" +#else +# include "onigmo.h" +#endif RUBY_SYMBOL_EXPORT_BEGIN @@ -52,23 +66,23 @@ typedef struct { #ifndef NULL -#define NULL ((void* )0) +# define NULL ((void* )0) #endif #ifndef TRUE -#define TRUE 1 +# define TRUE 1 #endif #ifndef FALSE -#define FALSE 0 +# define FALSE 0 #endif #ifndef ARG_UNUSED -#if defined(__GNUC__) +# if defined(__GNUC__) # define ARG_UNUSED __attribute__ ((unused)) -#else +# else # define ARG_UNUSED -#endif +# endif #endif #define ONIG_IS_NULL(p) (((void*)(p)) == (void*)0) @@ -111,7 +125,7 @@ typedef struct { {(short int )(sizeof(name) - 1), (name), (ctype)} #ifndef numberof -#define numberof(array) (int )(sizeof(array) / sizeof((array)[0])) +# define numberof(array) (int )(sizeof(array) / sizeof((array)[0])) #endif @@ -125,48 +139,48 @@ typedef struct { #define ONIG_ENCODING_INIT_DEFAULT ONIG_ENCODING_ASCII /* for encoding system implementation (internal) */ -ONIG_EXTERN int 
onigenc_ascii_apply_all_case_fold P_((OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void* arg, OnigEncoding enc)); -ONIG_EXTERN int onigenc_ascii_get_case_fold_codes_by_str P_((OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[], OnigEncoding enc)); -ONIG_EXTERN int onigenc_apply_all_case_fold_with_map P_((int map_size, const OnigPairCaseFoldCodes map[], int ess_tsett_flag, OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void* arg)); -ONIG_EXTERN int onigenc_get_case_fold_codes_by_str_with_map P_((int map_size, const OnigPairCaseFoldCodes map[], int ess_tsett_flag, OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])); -CONSTFUNC(ONIG_EXTERN int onigenc_not_support_get_ctype_code_range P_((OnigCtype ctype, OnigCodePoint* sb_out, const OnigCodePoint* ranges[], OnigEncoding enc))); -PUREFUNC(ONIG_EXTERN int onigenc_is_mbc_newline_0x0a P_((const UChar* p, const UChar* end, OnigEncoding enc))); -ONIG_EXTERN int onigenc_single_byte_ascii_only_case_map P_((OnigCaseFoldType* flagP, const OnigUChar** pp, const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, const struct OnigEncodingTypeST* enc)); +ONIG_EXTERN int onigenc_ascii_apply_all_case_fold(OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void* arg, OnigEncoding enc); +ONIG_EXTERN int onigenc_ascii_get_case_fold_codes_by_str(OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[], OnigEncoding enc); +ONIG_EXTERN int onigenc_apply_all_case_fold_with_map(int map_size, const OnigPairCaseFoldCodes map[], int ess_tsett_flag, OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void* arg); +ONIG_EXTERN int onigenc_get_case_fold_codes_by_str_with_map(int map_size, const OnigPairCaseFoldCodes map[], int ess_tsett_flag, OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]); +ONIG_EXTERN int onigenc_not_support_get_ctype_code_range(OnigCtype ctype, 
OnigCodePoint* sb_out, const OnigCodePoint* ranges[], OnigEncoding enc); +ONIG_EXTERN int onigenc_is_mbc_newline_0x0a(const UChar* p, const UChar* end, OnigEncoding enc); +ONIG_EXTERN int onigenc_single_byte_ascii_only_case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, const struct OnigEncodingTypeST* enc); /* methods for single byte encoding */ -ONIG_EXTERN int onigenc_ascii_mbc_case_fold P_((OnigCaseFoldType flag, const UChar** p, const UChar* end, UChar* lower, OnigEncoding enc)); -CONSTFUNC(ONIG_EXTERN int onigenc_single_byte_mbc_enc_len P_((const UChar* p, const UChar* e, OnigEncoding enc))); -PUREFUNC(ONIG_EXTERN OnigCodePoint onigenc_single_byte_mbc_to_code P_((const UChar* p, const UChar* end, OnigEncoding enc))); -CONSTFUNC(ONIG_EXTERN int onigenc_single_byte_code_to_mbclen P_((OnigCodePoint code, OnigEncoding enc))); -ONIG_EXTERN int onigenc_single_byte_code_to_mbc P_((OnigCodePoint code, UChar *buf, OnigEncoding enc)); -CONSTFUNC(ONIG_EXTERN UChar* onigenc_single_byte_left_adjust_char_head P_((const UChar* start, const UChar* s, const OnigUChar* end, OnigEncoding enc))); -CONSTFUNC(ONIG_EXTERN int onigenc_always_true_is_allowed_reverse_match P_((const UChar* s, const UChar* end, OnigEncoding enc))); -CONSTFUNC(ONIG_EXTERN int onigenc_always_false_is_allowed_reverse_match P_((const UChar* s, const UChar* end, OnigEncoding enc))); -CONSTFUNC(ONIG_EXTERN int onigenc_ascii_is_code_ctype P_((OnigCodePoint code, unsigned int ctype, OnigEncoding enc))); +ONIG_EXTERN int onigenc_ascii_mbc_case_fold(OnigCaseFoldType flag, const UChar** p, const UChar* end, UChar* lower, OnigEncoding enc); +ONIG_EXTERN int onigenc_single_byte_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc); +ONIG_EXTERN OnigCodePoint onigenc_single_byte_mbc_to_code(const UChar* p, const UChar* end, OnigEncoding enc); +ONIG_EXTERN int onigenc_single_byte_code_to_mbclen(OnigCodePoint code, OnigEncoding enc); +ONIG_EXTERN int 
onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf, OnigEncoding enc); +ONIG_EXTERN UChar* onigenc_single_byte_left_adjust_char_head(const UChar* start, const UChar* s, const OnigUChar* end, OnigEncoding enc); +ONIG_EXTERN int onigenc_always_true_is_allowed_reverse_match(const UChar* s, const UChar* end, OnigEncoding enc); +ONIG_EXTERN int onigenc_always_false_is_allowed_reverse_match(const UChar* s, const UChar* end, OnigEncoding enc); +ONIG_EXTERN int onigenc_ascii_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc); /* methods for multi byte encoding */ -ONIG_EXTERN OnigCodePoint onigenc_mbn_mbc_to_code P_((OnigEncoding enc, const UChar* p, const UChar* end)); -ONIG_EXTERN int onigenc_mbn_mbc_case_fold P_((OnigEncoding enc, OnigCaseFoldType flag, const UChar** p, const UChar* end, UChar* lower)); -CONSTFUNC(ONIG_EXTERN int onigenc_mb2_code_to_mbclen P_((OnigCodePoint code, OnigEncoding enc))); -ONIG_EXTERN int onigenc_mb2_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); -ONIG_EXTERN int onigenc_minimum_property_name_to_ctype P_((OnigEncoding enc, const UChar* p, const UChar* end)); -ONIG_EXTERN int onigenc_unicode_property_name_to_ctype P_((OnigEncoding enc, const UChar* p, const UChar* end)); -ONIG_EXTERN int onigenc_mb2_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); -CONSTFUNC(ONIG_EXTERN int onigenc_mb4_code_to_mbclen P_((OnigCodePoint code, OnigEncoding enc))); -ONIG_EXTERN int onigenc_mb4_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); -ONIG_EXTERN int onigenc_mb4_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); +ONIG_EXTERN OnigCodePoint onigenc_mbn_mbc_to_code(OnigEncoding enc, const UChar* p, const UChar* end); +ONIG_EXTERN int onigenc_mbn_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag, const UChar** p, const UChar* end, UChar* lower); +ONIG_EXTERN int onigenc_mb2_code_to_mbclen(OnigCodePoint code, OnigEncoding enc); 
+ONIG_EXTERN int onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf); +ONIG_EXTERN int onigenc_minimum_property_name_to_ctype(OnigEncoding enc, const UChar* p, const UChar* end); +ONIG_EXTERN int onigenc_unicode_property_name_to_ctype(OnigEncoding enc, const UChar* p, const UChar* end); +ONIG_EXTERN int onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code, unsigned int ctype); +ONIG_EXTERN int onigenc_mb4_code_to_mbclen(OnigCodePoint code, OnigEncoding enc); +ONIG_EXTERN int onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf); +ONIG_EXTERN int onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code, unsigned int ctype); -ONIG_EXTERN int onigenc_unicode_case_map P_((OnigCaseFoldType* flagP, const OnigUChar** pp, const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, const struct OnigEncodingTypeST* enc)); +ONIG_EXTERN int onigenc_unicode_case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, const struct OnigEncodingTypeST* enc); /* in enc/unicode.c */ -ONIG_EXTERN int onigenc_unicode_is_code_ctype P_((OnigCodePoint code, unsigned int ctype, OnigEncoding enc)); -ONIG_EXTERN int onigenc_utf16_32_get_ctype_code_range P_((OnigCtype ctype, OnigCodePoint *sb_out, const OnigCodePoint* ranges[], OnigEncoding enc)); -ONIG_EXTERN int onigenc_unicode_ctype_code_range P_((int ctype, const OnigCodePoint* ranges[])); -ONIG_EXTERN int onigenc_unicode_get_case_fold_codes_by_str P_((OnigEncoding enc, OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])); -ONIG_EXTERN int onigenc_unicode_mbc_case_fold P_((OnigEncoding enc, OnigCaseFoldType flag, const UChar** pp, const UChar* end, UChar* fold)); -ONIG_EXTERN int onigenc_unicode_apply_all_case_fold P_((OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void* arg, OnigEncoding enc)); +ONIG_EXTERN int onigenc_unicode_is_code_ctype(OnigCodePoint code, unsigned int ctype, 
OnigEncoding enc); +ONIG_EXTERN int onigenc_utf16_32_get_ctype_code_range(OnigCtype ctype, OnigCodePoint *sb_out, const OnigCodePoint* ranges[], OnigEncoding enc); +ONIG_EXTERN int onigenc_unicode_ctype_code_range(int ctype, const OnigCodePoint* ranges[]); +ONIG_EXTERN int onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, OnigCaseFoldType flag, const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[]); +ONIG_EXTERN int onigenc_unicode_mbc_case_fold(OnigEncoding enc, OnigCaseFoldType flag, const UChar** pp, const UChar* end, UChar* fold); +ONIG_EXTERN int onigenc_unicode_apply_all_case_fold(OnigCaseFoldType flag, OnigApplyAllCaseFoldFunc f, void* arg, OnigEncoding enc); #define UTF16_IS_SURROGATE_FIRST(c) (((c) & 0xfc) == 0xd8) @@ -182,14 +196,14 @@ ONIG_EXTERN const UChar OnigEncISO_8859_1_ToLowerCaseTable[]; ONIG_EXTERN const UChar OnigEncISO_8859_1_ToUpperCaseTable[]; ONIG_EXTERN int -onigenc_with_ascii_strncmp P_((OnigEncoding enc, const UChar* p, const UChar* end, const UChar* sascii /* ascii */, int n)); +onigenc_with_ascii_strncmp(OnigEncoding enc, const UChar* p, const UChar* end, const UChar* sascii /* ascii */, int n); ONIG_EXTERN int -onigenc_with_ascii_strnicmp P_((OnigEncoding enc, const UChar* p, const UChar* end, const UChar* sascii /* ascii */, int n)); +onigenc_with_ascii_strnicmp(OnigEncoding enc, const UChar* p, const UChar* end, const UChar* sascii /* ascii */, int n); ONIG_EXTERN UChar* -onigenc_step P_((OnigEncoding enc, const UChar* p, const UChar* end, int n)); +onigenc_step(OnigEncoding enc, const UChar* p, const UChar* end, int n); /* defined in regexec.c, but used in enc/xxx.c */ -extern int onig_is_in_code_range P_((const UChar* p, OnigCodePoint code)); +extern int onig_is_in_code_range(const UChar* p, OnigCodePoint code); ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding; ONIG_EXTERN const UChar OnigEncAsciiToLowerCaseTable[]; @@ -212,9 +226,9 @@ ONIG_EXTERN const unsigned short OnigEncAsciiCtypeTable[]; #ifdef 
ONIG_ENC_REGISTER extern int ONIG_ENC_REGISTER(const char *, OnigEncoding); -#define OnigEncodingName(n) encoding_##n -#define OnigEncodingDeclare(n) static const OnigEncodingType OnigEncodingName(n) -#define OnigEncodingDefine(f,n) \ +# define OnigEncodingName(n) encoding_##n +# define OnigEncodingDeclare(n) static const OnigEncodingType OnigEncodingName(n) +# define OnigEncodingDefine(f,n) \ OnigEncodingDeclare(n); \ void Init_##f(void) { \ ONIG_ENC_REGISTER(OnigEncodingName(n).name, \ @@ -222,9 +236,9 @@ extern int ONIG_ENC_REGISTER(const char *, OnigEncoding); } \ OnigEncodingDeclare(n) #else -#define OnigEncodingName(n) OnigEncoding##n -#define OnigEncodingDeclare(n) const OnigEncodingType OnigEncodingName(n) -#define OnigEncodingDefine(f,n) OnigEncodingDeclare(n) +# define OnigEncodingName(n) OnigEncoding##n +# define OnigEncodingDeclare(n) const OnigEncodingType OnigEncodingName(n) +# define OnigEncodingDefine(f,n) OnigEncodingDeclare(n) #endif /* macros for define replica encoding and encoding alias */ @@ -234,4 +248,4 @@ extern int ONIG_ENC_REGISTER(const char *, OnigEncoding); RUBY_SYMBOL_EXPORT_END -#endif /* ONIGURUMA_REGENC_H */ +#endif /* ONIGMO_REGENC_H */ diff --git a/regerror.c b/regerror.c index 9ec3f65f4c..dbe3ee4094 100644 --- a/regerror.c +++ b/regerror.c @@ -3,7 +3,7 @@ **********************************************************************/ /*- * Copyright (c) 2002-2007 K.Kosako - * Copyright (c) 2011-2014 K.Takata + * Copyright (c) 2011-2016 K.Takata * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -31,13 +31,7 @@ #include "regint.h" #include /* for vsnprintf() */ -#ifdef HAVE_STDARG_PROTOTYPES #include -#define va_init_list(a,b) va_start(a,b) -#else -#include -#define va_init_list(a,b) va_start(a) -#endif extern UChar* onig_error_code_to_format(OnigPosition code) @@ -65,6 +59,8 @@ onig_error_code_to_format(OnigPosition code) p = "unexpected bytecode (bug)"; break; case ONIGERR_MATCH_STACK_LIMIT_OVER: p = "match-stack limit over"; break; + case ONIGERR_PARSE_DEPTH_LIMIT_OVER: + p = "parse depth limit over"; break; case ONIGERR_DEFAULT_ENCODING_IS_NOT_SET: p = "default multibyte-encoding is not set"; break; case ONIGERR_SPECIFIED_ENCODING_CANT_CONVERT_TO_WIDE_CHAR: @@ -179,8 +175,6 @@ onig_error_code_to_format(OnigPosition code) p = "not supported encoding combination"; break; case ONIGERR_INVALID_COMBINATION_OF_OPTIONS: p = "invalid combination of options"; break; - case ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT: - p = "over thread pass limit count"; break; default: p = "undefined error code"; break; @@ -191,12 +185,12 @@ onig_error_code_to_format(OnigPosition code) static void sprint_byte(char* s, unsigned int v) { - sprintf(s, "%02x", (v & 0377)); + xsnprintf(s, 3, "%02x", (v & 0377)); } static void sprint_byte_with_x(char* s, unsigned int v) { - sprintf(s, "\\x%02x", (v & 0377)); + xsnprintf(s, 5, "\\x%02x", (v & 0377)); } static int to_ascii(OnigEncoding enc, UChar *s, UChar *end, @@ -252,14 +246,7 @@ static int to_ascii(OnigEncoding enc, UChar *s, UChar *end, #define MAX_ERROR_PAR_LEN 30 extern int -#ifdef HAVE_STDARG_PROTOTYPES onig_error_code_to_str(UChar* s, OnigPosition code, ...) 
-#else -onig_error_code_to_str(s, code, va_alist) - UChar* s; - OnigPosition code; - va_dcl -#endif { UChar *p, *q; OnigErrorInfo* einfo; @@ -268,7 +255,7 @@ onig_error_code_to_str(s, code, va_alist) UChar parbuf[MAX_ERROR_PAR_LEN]; va_list vargs; - va_init_list(vargs, code); + va_start(vargs, code); switch (code) { case ONIGERR_UNDEFINED_NAME_REFERENCE: @@ -337,26 +324,17 @@ onig_vsnprintf_with_pattern(UChar buf[], int bufsize, OnigEncoding enc, need = (pat_end - pat) * 4 + 4; if (n + need < (size_t )bufsize) { - strcat((char* )buf, ": /"); + xstrcat((char* )buf, ": /", bufsize); s = buf + onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, buf); p = pat; while (p < pat_end) { - if (*p == '\\') { - *s++ = *p++; - len = enclen(enc, p, pat_end); - while (len-- > 0) *s++ = *p++; - } - else if (*p == '/') { - *s++ = (unsigned char )'\\'; - *s++ = *p++; - } - else if (ONIGENC_IS_MBC_HEAD(enc, p, pat_end)) { + if (ONIGENC_IS_MBC_HEAD(enc, p, pat_end)) { len = enclen(enc, p, pat_end); if (ONIGENC_MBC_MINLEN(enc) == 1) { while (len-- > 0) *s++ = *p++; } - else { /* for UTF16 */ + else { /* for UTF16/32 */ int blen; while (len-- > 0) { @@ -367,6 +345,15 @@ onig_vsnprintf_with_pattern(UChar buf[], int bufsize, OnigEncoding enc, } } } + else if (*p == '\\') { + *s++ = *p++; + len = enclen(enc, p, pat_end); + while (len-- > 0) *s++ = *p++; + } + else if (*p == '/') { + *s++ = (unsigned char )'\\'; + *s++ = *p++; + } else if (!ONIGENC_IS_CODE_PRINT(enc, *p) && !ONIGENC_IS_CODE_SPACE(enc, *p)) { sprint_byte_with_x((char* )bs, (unsigned int )(*p++)); @@ -384,25 +371,15 @@ onig_vsnprintf_with_pattern(UChar buf[], int bufsize, OnigEncoding enc, } } +#if 0 /* unused */ void -#ifdef HAVE_STDARG_PROTOTYPES onig_snprintf_with_pattern(UChar buf[], int bufsize, OnigEncoding enc, UChar* pat, UChar* pat_end, const UChar *fmt, ...) 
-#else -onig_snprintf_with_pattern(buf, bufsize, enc, pat, pat_end, fmt, va_alist) - UChar buf[]; - int bufsize; - OnigEncoding enc; - UChar* pat; - UChar* pat_end; - const UChar *fmt; - va_dcl -#endif { va_list args; - va_init_list(args, fmt); + va_start(args, fmt); onig_vsnprintf_with_pattern(buf, bufsize, enc, pat, pat_end, fmt, args); va_end(args); } - +#endif diff --git a/regexec.c b/regexec.c index f8813875dc..b27884b32c 100644 --- a/regexec.c +++ b/regexec.c @@ -3,7 +3,7 @@ **********************************************************************/ /*- * Copyright (c) 2002-2008 K.Kosako - * Copyright (c) 2011-2014 K.Takata + * Copyright (c) 2011-2016 K.Takata * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -30,33 +30,39 @@ #include "regint.h" -/* #define USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE */ +#ifdef RUBY +# undef USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE +#else +# define USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE +#endif -#ifndef USE_DIRECT_THREADED_VM +#ifndef USE_TOKEN_THREADED_VM # ifdef __GNUC__ -# define USE_DIRECT_THREADED_VM 1 +# define USE_TOKEN_THREADED_VM 1 # else -# define USE_DIRECT_THREADED_VM 0 +# define USE_TOKEN_THREADED_VM 0 # endif #endif -#define ENC_DUMMY_FLAG (1<<24) +#ifdef RUBY +# define ENC_DUMMY_FLAG (1<<24) static inline int rb_enc_asciicompat(OnigEncoding enc) { - return ONIGENC_MBC_MINLEN(enc)==1 && !((enc)->ruby_encoding_index & ENC_DUMMY_FLAG); + return ONIGENC_MBC_MINLEN(enc)==1 && !((enc)->ruby_encoding_index & ENC_DUMMY_FLAG); } -#undef ONIGENC_IS_MBC_ASCII_WORD -#define ONIGENC_IS_MBC_ASCII_WORD(enc,s,end) \ +# undef ONIGENC_IS_MBC_ASCII_WORD +# define ONIGENC_IS_MBC_ASCII_WORD(enc,s,end) \ (rb_enc_asciicompat(enc) ? 
(ISALNUM(*s) || *s=='_') : \ onigenc_ascii_is_code_ctype( \ ONIGENC_MBC_TO_CODE(enc,s,end),ONIGENC_CTYPE_WORD,enc)) +#endif /* RUBY */ #ifdef USE_CRNL_AS_LINE_TERMINATOR -#define ONIGENC_IS_MBC_CRNL(enc,p,end) \ +# define ONIGENC_IS_MBC_CRNL(enc,p,end) \ (ONIGENC_MBC_TO_CODE(enc,p,end) == 13 && \ ONIGENC_MBC_TO_CODE(enc,(p+enclen(enc,p,end)),end) == 10) -#define ONIGENC_IS_MBC_NEWLINE_EX(enc,p,start,end,option,check_prev) \ +# define ONIGENC_IS_MBC_NEWLINE_EX(enc,p,start,end,option,check_prev) \ is_mbc_newline_ex((enc),(p),(start),(end),(option),(check_prev)) static int is_mbc_newline_ex(OnigEncoding enc, const UChar *p, const UChar *start, @@ -90,7 +96,7 @@ is_mbc_newline_ex(OnigEncoding enc, const UChar *p, const UChar *start, } } #else /* USE_CRNL_AS_LINE_TERMINATOR */ -#define ONIGENC_IS_MBC_NEWLINE_EX(enc,p,start,end,option,check_prev) \ +# define ONIGENC_IS_MBC_NEWLINE_EX(enc,p,start,end,option,check_prev) \ ONIGENC_IS_MBC_NEWLINE((enc), (p), (end)) #endif /* USE_CRNL_AS_LINE_TERMINATOR */ @@ -105,7 +111,7 @@ history_tree_clear(OnigCaptureTreeNode* node) if (IS_NOT_NULL(node)) { for (i = 0; i < node->num_childs; i++) { if (IS_NOT_NULL(node->childs[i])) { - history_tree_free(node->childs[i]); + history_tree_free(node->childs[i]); } } for (i = 0; i < node->allocated; i++) { @@ -156,7 +162,7 @@ history_node_new(void) static int history_tree_add_child(OnigCaptureTreeNode* parent, OnigCaptureTreeNode* child) { -#define HISTORY_TREE_INIT_ALLOC_SIZE 8 +# define HISTORY_TREE_INIT_ALLOC_SIZE 8 if (parent->num_childs >= parent->allocated) { int n, i; @@ -164,15 +170,15 @@ history_tree_add_child(OnigCaptureTreeNode* parent, OnigCaptureTreeNode* child) if (IS_NULL(parent->childs)) { n = HISTORY_TREE_INIT_ALLOC_SIZE; parent->childs = - (OnigCaptureTreeNode** )xmalloc(sizeof(OnigCaptureTreeNode*) * n); + (OnigCaptureTreeNode** )xmalloc(sizeof(OnigCaptureTreeNode*) * n); CHECK_NULL_RETURN_MEMERR(parent->childs); } else { OnigCaptureTreeNode** tmp; n = parent->allocated * 2; 
tmp = - (OnigCaptureTreeNode** )xrealloc(parent->childs, - sizeof(OnigCaptureTreeNode*) * n); + (OnigCaptureTreeNode** )xrealloc(parent->childs, + sizeof(OnigCaptureTreeNode*) * n); if (tmp == 0) { history_tree_clear(parent); return ONIGERR_MEMORY; @@ -348,7 +354,7 @@ onig_region_free(OnigRegion* r, int free_self) } extern void -onig_region_copy(OnigRegion* to, OnigRegion* from) +onig_region_copy(OnigRegion* to, const OnigRegion* from) { #define RREGC_SIZE (sizeof(int) * from->num_regs) int i, r; @@ -404,7 +410,7 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) #define STK_MASK_MEM_END_OR_MARK 0x8000 /* MEM_END or MEM_END_MARK */ #ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE -#define MATCH_ARG_INIT(msa, arg_option, arg_region, arg_start, arg_gpos) do {\ +# define MATCH_ARG_INIT(msa, arg_option, arg_region, arg_start, arg_gpos) do {\ (msa).stack_p = (void* )0;\ (msa).options = (arg_option);\ (msa).region = (arg_region);\ @@ -413,7 +419,7 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) (msa).best_len = ONIG_MISMATCH;\ } while(0) #else -#define MATCH_ARG_INIT(msa, arg_option, arg_region, arg_start, arg_gpos) do {\ +# define MATCH_ARG_INIT(msa, arg_option, arg_region, arg_start, arg_gpos) do {\ (msa).stack_p = (void* )0;\ (msa).options = (arg_option);\ (msa).region = (arg_region);\ @@ -424,9 +430,9 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) #ifdef USE_COMBINATION_EXPLOSION_CHECK -#define STATE_CHECK_BUFF_MALLOC_THRESHOLD_SIZE 16 +# define STATE_CHECK_BUFF_MALLOC_THRESHOLD_SIZE 16 -#define STATE_CHECK_BUFF_INIT(msa, str_len, offset, state_num) do { \ +# define STATE_CHECK_BUFF_INIT(msa, str_len, offset, state_num) do { \ if ((state_num) > 0 && str_len >= STATE_CHECK_STRING_THRESHOLD_LEN) {\ unsigned int size = (unsigned int )(((str_len) + 1) * (state_num) + 7) >> 3;\ offset = ((offset) * (state_num)) >> 3;\ @@ -452,14 +458,14 @@ onig_region_copy(OnigRegion* to, OnigRegion* from) }\ } while(0) -#define MATCH_ARG_FREE(msa) do {\ +# define 
MATCH_ARG_FREE(msa) do {\ if ((msa).stack_p) xfree((msa).stack_p);\ if ((msa).state_check_buff_size >= STATE_CHECK_BUFF_MALLOC_THRESHOLD_SIZE) { \ if ((msa).state_check_buff) xfree((msa).state_check_buff);\ }\ } while(0) #else /* USE_COMBINATION_EXPLOSION_CHECK */ -#define MATCH_ARG_FREE(msa) if ((msa).stack_p) xfree((msa).stack_p) +# define MATCH_ARG_FREE(msa) if ((msa).stack_p) xfree((msa).stack_p) #endif /* USE_COMBINATION_EXPLOSION_CHECK */ @@ -548,9 +554,9 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, n *= 2; if (limit_size != 0 && n > limit_size) { if ((unsigned int )(stk_end - stk_base) == limit_size) - return ONIGERR_MATCH_STACK_LIMIT_OVER; + return ONIGERR_MATCH_STACK_LIMIT_OVER; else - n = limit_size; + n = limit_size; } x = (OnigStackType* )xrealloc(stk_base, sizeof(OnigStackType) * n); if (IS_NULL(x)) { @@ -587,9 +593,9 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, #define IS_TO_VOID_TARGET(stk) (((stk)->type & STK_MASK_TO_VOID_TARGET) != 0) #ifdef USE_COMBINATION_EXPLOSION_CHECK -#define STATE_CHECK_POS(s,snum) \ +# define STATE_CHECK_POS(s,snum) \ (((s) - str) * num_comb_exp_check + ((snum) - 1)) -#define STATE_CHECK_VAL(v,snum) do {\ +# define STATE_CHECK_VAL(v,snum) do {\ if (state_check_buff != NULL) {\ int x = STATE_CHECK_POS(s,snum);\ (v) = state_check_buff[x/8] & (1<<(x%8));\ @@ -598,13 +604,13 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, } while(0) -#define ELSE_IF_STATE_CHECK_MARK(stk) \ +# define ELSE_IF_STATE_CHECK_MARK(stk) \ else if ((stk)->type == STK_STATE_CHECK_MARK) { \ int x = STATE_CHECK_POS(stk->u.state.pstr, stk->u.state.state_check);\ state_check_buff[x/8] |= (1<<(x%8)); \ } -#define STACK_PUSH(stack_type,pat,s,sprev,keep) do {\ +# define STACK_PUSH(stack_type,pat,s,sprev,keep) do {\ STACK_ENSURE(1);\ stk->type = (stack_type);\ stk->u.state.pcode = (pat);\ @@ -615,14 +621,14 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** 
arg_stk_end, STACK_INC;\ } while(0) -#define STACK_PUSH_ENSURED(stack_type,pat) do {\ +# define STACK_PUSH_ENSURED(stack_type,pat) do {\ stk->type = (stack_type);\ stk->u.state.pcode = (pat);\ stk->u.state.state_check = 0;\ STACK_INC;\ } while(0) -#define STACK_PUSH_ALT_WITH_STATE_CHECK(pat,s,sprev,snum,keep) do {\ +# define STACK_PUSH_ALT_WITH_STATE_CHECK(pat,s,sprev,snum,keep) do {\ STACK_ENSURE(1);\ stk->type = STK_ALT;\ stk->u.state.pcode = (pat);\ @@ -633,7 +639,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, STACK_INC;\ } while(0) -#define STACK_PUSH_STATE_CHECK(s,snum) do {\ +# define STACK_PUSH_STATE_CHECK(s,snum) do {\ if (state_check_buff != NULL) {\ STACK_ENSURE(1);\ stk->type = STK_STATE_CHECK_MARK;\ @@ -645,9 +651,9 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, #else /* USE_COMBINATION_EXPLOSION_CHECK */ -#define ELSE_IF_STATE_CHECK_MARK(stk) +# define ELSE_IF_STATE_CHECK_MARK(stk) -#define STACK_PUSH(stack_type,pat,s,sprev,keep) do {\ +# define STACK_PUSH(stack_type,pat,s,sprev,keep) do {\ STACK_ENSURE(1);\ stk->type = (stack_type);\ stk->u.state.pcode = (pat);\ @@ -657,7 +663,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, STACK_INC;\ } while(0) -#define STACK_PUSH_ENSURED(stack_type,pat) do {\ +# define STACK_PUSH_ENSURED(stack_type,pat) do {\ stk->type = (stack_type);\ stk->u.state.pcode = (pat);\ STACK_INC;\ @@ -781,13 +787,13 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end, #ifdef ONIG_DEBUG -#define STACK_BASE_CHECK(p, at) \ +# define STACK_BASE_CHECK(p, at) \ if ((p) < stk_base) {\ fprintf(stderr, "at %s\n", at);\ goto stack_error;\ } #else -#define STACK_BASE_CHECK(p, at) +# define STACK_BASE_CHECK(p, at) #endif #define STACK_POP_ONE do {\ @@ -1124,16 +1130,16 @@ static int string_cmp_ic(OnigEncoding enc, int case_fold_flag, #define IS_EMPTY_STR (str == end) -#define ON_STR_BEGIN(s) ((s) == str) -#define ON_STR_END(s) ((s) == end) 
+#define ON_STR_BEGIN(s) ((s) == str) +#define ON_STR_END(s) ((s) == end) #ifdef USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE -#define DATA_ENSURE_CHECK1 (s < right_range) -#define DATA_ENSURE_CHECK(n) (s + (n) <= right_range) -#define DATA_ENSURE(n) if (s + (n) > right_range) goto fail +# define DATA_ENSURE_CHECK1 (s < right_range) +# define DATA_ENSURE_CHECK(n) (s + (n) <= right_range) +# define DATA_ENSURE(n) if (s + (n) > right_range) goto fail #else -#define DATA_ENSURE_CHECK1 (s < end) -#define DATA_ENSURE_CHECK(n) (s + (n) <= end) -#define DATA_ENSURE(n) if (s + (n) > end) goto fail +# define DATA_ENSURE_CHECK1 (s < end) +# define DATA_ENSURE_CHECK(n) (s + (n) <= end) +# define DATA_ENSURE(n) if (s + (n) > end) goto fail #endif /* USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE */ @@ -1150,29 +1156,29 @@ make_capture_history_tree(OnigCaptureTreeNode* node, OnigStackType** kp, if (k->type == STK_MEM_START) { n = k->u.mem.num; if (n <= ONIG_MAX_CAPTURE_HISTORY_GROUP && - BIT_STATUS_AT(reg->capture_history, n) != 0) { - child = history_node_new(); - CHECK_NULL_RETURN_MEMERR(child); - child->group = n; - child->beg = k->u.mem.pstr - str; - r = history_tree_add_child(node, child); - if (r != 0) { - history_tree_free(child); - return r; - } - *kp = (k + 1); - r = make_capture_history_tree(child, kp, stk_top, str, reg); - if (r != 0) return r; - - k = *kp; - child->end = k->u.mem.pstr - str; + BIT_STATUS_AT(reg->capture_history, n) != 0) { + child = history_node_new(); + CHECK_NULL_RETURN_MEMERR(child); + child->group = n; + child->beg = k->u.mem.pstr - str; + r = history_tree_add_child(node, child); + if (r != 0) { + history_tree_free(child); + return r; + } + *kp = (k + 1); + r = make_capture_history_tree(child, kp, stk_top, str, reg); + if (r != 0) return r; + + k = *kp; + child->end = k->u.mem.pstr - str; } } else if (k->type == STK_MEM_END) { if (k->u.mem.num == node->group) { - node->end = k->u.mem.pstr - str; - *kp = k; - return 0; + node->end = 
k->u.mem.pstr - str; + *kp = k; + return 0; } } k++; @@ -1195,10 +1201,10 @@ static int mem_is_in_memp(int mem, int num, UChar* memp) return 0; } -static int backref_match_at_nested_level(regex_t* reg - , OnigStackType* top, OnigStackType* stk_base - , int ignore_case, int case_fold_flag - , int nest, int mem_num, UChar* memp, UChar** s, const UChar* send) +static int backref_match_at_nested_level(regex_t* reg, + OnigStackType* top, OnigStackType* stk_base, + int ignore_case, int case_fold_flag, + int nest, int mem_num, UChar* memp, UChar** s, const UChar* send) { UChar *ss, *p, *pstart, *pend = NULL_UCHARP; int level; @@ -1255,27 +1261,37 @@ static int backref_match_at_nested_level(regex_t* reg #ifdef ONIG_DEBUG_STATISTICS -#define USE_TIMEOFDAY - -#ifdef USE_TIMEOFDAY -#ifdef HAVE_SYS_TIME_H -#include -#endif -#ifdef HAVE_UNISTD_H -#include -#endif +# ifdef _WIN32 +# include +static LARGE_INTEGER ts, te, freq; +# define GETTIME(t) QueryPerformanceCounter(&(t)) +# define TIMEDIFF(te,ts) (unsigned long )(((te).QuadPart - (ts).QuadPart) \ + * 1000000 / freq.QuadPart) +# else /* _WIN32 */ + +# define USE_TIMEOFDAY + +# ifdef USE_TIMEOFDAY +# ifdef HAVE_SYS_TIME_H +# include +# endif +# ifdef HAVE_UNISTD_H +# include +# endif static struct timeval ts, te; -#define GETTIME(t) gettimeofday(&(t), (struct timezone* )0) -#define TIMEDIFF(te,ts) (((te).tv_usec - (ts).tv_usec) + \ - (((te).tv_sec - (ts).tv_sec)*1000000)) -#else /* USE_TIMEOFDAY */ -#ifdef HAVE_SYS_TIMES_H -#include -#endif +# define GETTIME(t) gettimeofday(&(t), (struct timezone* )0) +# define TIMEDIFF(te,ts) (((te).tv_usec - (ts).tv_usec) + \ + (((te).tv_sec - (ts).tv_sec)*1000000)) +# else /* USE_TIMEOFDAY */ +# ifdef HAVE_SYS_TIMES_H +# include +# endif static struct tms ts, te; -#define GETTIME(t) times(&(t)) -#define TIMEDIFF(te,ts) ((te).tms_utime - (ts).tms_utime) -#endif /* USE_TIMEOFDAY */ +# define GETTIME(t) times(&(t)) +# define TIMEDIFF(te,ts) ((te).tms_utime - (ts).tms_utime) +# endif /* 
USE_TIMEOFDAY */ + +# endif /* _WIN32 */ static int OpCounter[256]; static int OpPrevCounter[256]; @@ -1284,14 +1300,14 @@ static int OpCurr = OP_FINISH; static int OpPrevTarget = OP_FAIL; static int MaxStackDepth = 0; -#define MOP_IN(opcode) do {\ +# define MOP_IN(opcode) do {\ if (opcode == OpPrevTarget) OpPrevCounter[OpCurr]++;\ OpCurr = opcode;\ OpCounter[opcode]++;\ GETTIME(ts);\ } while(0) -#define MOP_OUT do {\ +# define MOP_OUT do {\ GETTIME(te);\ OpTime[OpCurr] += TIMEDIFF(te, ts);\ } while(0) @@ -1304,6 +1320,9 @@ onig_statistics_init(void) OpCounter[i] = OpPrevCounter[i] = 0; OpTime[i] = 0; } MaxStackDepth = 0; +# ifdef _WIN32 + QueryPerformanceFrequency(&freq); +# endif } extern void @@ -1318,28 +1337,45 @@ onig_print_statistics(FILE* f) fprintf(f, "\nmax stack depth: %d\n", MaxStackDepth); } -#define STACK_INC do {\ +# define STACK_INC do {\ stk++;\ if (stk - stk_base > MaxStackDepth) \ MaxStackDepth = stk - stk_base;\ } while(0) #else /* ONIG_DEBUG_STATISTICS */ -#define STACK_INC stk++ +# define STACK_INC stk++ -#define MOP_IN(opcode) -#define MOP_OUT +# define MOP_IN(opcode) +# define MOP_OUT #endif /* ONIG_DEBUG_STATISTICS */ - -/* matching region of POSIX API */ -typedef int regoff_t; - -typedef struct { - regoff_t rm_so; - regoff_t rm_eo; -} posix_regmatch_t; +#ifdef ONIG_DEBUG_MATCH +static char * +stack_type_str(int stack_type) +{ + switch (stack_type) { + case STK_ALT: return "Alt "; + case STK_LOOK_BEHIND_NOT: return "LBNot "; + case STK_POS_NOT: return "PosNot"; + case STK_MEM_START: return "MemS "; + case STK_MEM_END: return "MemE "; + case STK_REPEAT_INC: return "RepInc"; + case STK_STATE_CHECK_MARK: return "StChMk"; + case STK_NULL_CHECK_START: return "NulChS"; + case STK_NULL_CHECK_END: return "NulChE"; + case STK_MEM_END_MARK: return "MemEMk"; + case STK_POS: return "Pos "; + case STK_STOP_BT: return "StopBt"; + case STK_REPEAT: return "Rep "; + case STK_CALL_FRAME: return "Call "; + case STK_RETURN: return "Ret "; + case STK_VOID: 
return "Void "; + default: return " "; + } +} +#endif /* match data(str - end) from position (sstart). */ /* if sstart == str then set sprev to NULL. */ @@ -1376,13 +1412,14 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, int num_comb_exp_check = reg->num_comb_exp_check; #endif -#if USE_DIRECT_THREADED_VM -#define VM_LOOP JUMP; -#define VM_LOOP_END -#define CASE(x) L_##x: sbegin = s; OPCODE_EXEC_HOOK; -#define DEFAULT L_DEFAULT: -#define NEXT sprev = sbegin; JUMP -#define JUMP goto *oplabels[*p++] +#if USE_TOKEN_THREADED_VM +# define OP_OFFSET 1 +# define VM_LOOP JUMP; +# define VM_LOOP_END +# define CASE(x) L_##x: sbegin = s; OPCODE_EXEC_HOOK; +# define DEFAULT L_DEFAULT: +# define NEXT sprev = sbegin; JUMP +# define JUMP goto *oplabels[*p++] static const void *oplabels[] = { &&L_OP_FINISH, /* matching process terminator (no more alternative) */ @@ -1410,7 +1447,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_OP_CCLASS_NOT, &&L_OP_CCLASS_MB_NOT, &&L_OP_CCLASS_MIX_NOT, - &&L_OP_CCLASS_NODE, /* pointer to CClassNode node */ &&L_OP_ANYCHAR, /* "." */ &&L_OP_ANYCHAR_ML, /* "." 
multi-line */ @@ -1423,24 +1459,24 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_OP_NOT_WORD, &&L_OP_WORD_BOUND, &&L_OP_NOT_WORD_BOUND, -#ifdef USE_WORD_BEGIN_END +# ifdef USE_WORD_BEGIN_END &&L_OP_WORD_BEGIN, &&L_OP_WORD_END, -#else +# else &&L_DEFAULT, &&L_DEFAULT, -#endif +# endif &&L_OP_ASCII_WORD, &&L_OP_NOT_ASCII_WORD, &&L_OP_ASCII_WORD_BOUND, &&L_OP_NOT_ASCII_WORD_BOUND, -#ifdef USE_WORD_BEGIN_END +# ifdef USE_WORD_BEGIN_END &&L_OP_ASCII_WORD_BEGIN, &&L_OP_ASCII_WORD_END, -#else +# else &&L_DEFAULT, &&L_DEFAULT, -#endif +# endif &&L_OP_BEGIN_BUF, &&L_OP_END_BUF, @@ -1456,25 +1492,25 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_OP_BACKREFN_IC, &&L_OP_BACKREF_MULTI, &&L_OP_BACKREF_MULTI_IC, -#ifdef USE_BACKREF_WITH_LEVEL +# ifdef USE_BACKREF_WITH_LEVEL &&L_OP_BACKREF_WITH_LEVEL, /* \k, \k */ -#else +# else &&L_DEFAULT, -#endif +# endif &&L_OP_MEMORY_START, &&L_OP_MEMORY_START_PUSH, /* push back-tracker to stack */ &&L_OP_MEMORY_END_PUSH, /* push back-tracker to stack */ -#ifdef USE_SUBEXP_CALL +# ifdef USE_SUBEXP_CALL &&L_OP_MEMORY_END_PUSH_REC, /* push back-tracker to stack */ -#else +# else &&L_DEFAULT, -#endif +# endif &&L_OP_MEMORY_END, -#ifdef USE_SUBEXP_CALL +# ifdef USE_SUBEXP_CALL &&L_OP_MEMORY_END_REC, /* push marker to stack */ -#else +# else &&L_DEFAULT, -#endif +# endif &&L_OP_KEEP, @@ -1482,7 +1518,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_OP_JUMP, &&L_OP_PUSH, &&L_OP_POP, +# ifdef USE_OP_PUSH_OR_JUMP_EXACT &&L_OP_PUSH_OR_JUMP_EXACT1, /* if match exact then push, else jump. */ +# else + &&L_DEFAULT, +# endif &&L_OP_PUSH_IF_PEEK_NEXT, /* if match exact then push, else none. */ &&L_OP_REPEAT, /* {n,m} */ &&L_OP_REPEAT_NG, /* {n,m}? 
(non greedy) */ @@ -1492,16 +1532,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_OP_REPEAT_INC_NG_SG, /* search and get in stack (non greedy) */ &&L_OP_NULL_CHECK_START, /* null loop checker start */ &&L_OP_NULL_CHECK_END, /* null loop checker end */ -#ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT +# ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT &&L_OP_NULL_CHECK_END_MEMST, /* null loop checker end (with capture status) */ -#else +# else &&L_DEFAULT, -#endif -#ifdef USE_SUBEXP_CALL +# endif +# ifdef USE_SUBEXP_CALL &&L_OP_NULL_CHECK_END_MEMST_PUSH, /* with capture status and push check-end */ -#else +# else &&L_DEFAULT, -#endif +# endif &&L_OP_PUSH_POS, /* (?=...) start */ &&L_OP_POP_POS, /* (?=...) end */ @@ -1513,69 +1553,66 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, &&L_OP_PUSH_LOOK_BEHIND_NOT, /* (? */ &&L_OP_RETURN, -#else +# else &&L_DEFAULT, &&L_DEFAULT, -#endif +# endif &&L_OP_CONDITION, -#ifdef USE_COMBINATION_EXPLOSION_CHECK +# ifdef USE_COMBINATION_EXPLOSION_CHECK &&L_OP_STATE_CHECK_PUSH, /* combination explosion check and push */ &&L_OP_STATE_CHECK_PUSH_OR_JUMP, /* check ok -> push, else jump */ &&L_OP_STATE_CHECK, /* check only */ -#else +# else &&L_DEFAULT, &&L_DEFAULT, &&L_DEFAULT, -#endif -#ifdef USE_COMBINATION_EXPLOSION_CHECK +# endif +# ifdef USE_COMBINATION_EXPLOSION_CHECK &&L_OP_STATE_CHECK_ANYCHAR_STAR, &&L_OP_STATE_CHECK_ANYCHAR_ML_STAR, -#else +# else &&L_DEFAULT, &&L_DEFAULT, -#endif +# endif /* no need: IS_DYNAMIC_OPTION() == 0 */ -#if 0 /* no need: IS_DYNAMIC_OPTION() == 0 */ +# if 0 /* no need: IS_DYNAMIC_OPTION() == 0 */ &&L_OP_SET_OPTION_PUSH, /* set option and push recover option */ &&L_OP_SET_OPTION /* set option */ -#else +# else &&L_DEFAULT, &&L_DEFAULT -#endif +# endif }; -#else +#else /* USE_TOKEN_THREADED_VM */ -#define VM_LOOP \ +# define OP_OFFSET 0 +# define VM_LOOP \ while (1) { \ OPCODE_EXEC_HOOK; \ sbegin = s; \ switch (*p++) { -#define VM_LOOP_END } sprev = sbegin; 
} -#define CASE(x) case x: -#define DEFAULT default: -#define NEXT break -#define JUMP continue; break -#endif +# define VM_LOOP_END } sprev = sbegin; } +# define CASE(x) case x: +# define DEFAULT default: +# define NEXT break +# define JUMP continue; break +#endif /* USE_TOKEN_THREADED_VM */ #ifdef USE_SUBEXP_CALL - /* Stack #0 is used to store the pattern itself and used for (?R), \g<0>, etc. */ - n = reg->num_repeat + (reg->num_mem + 1) * 2; - - STACK_INIT(alloca_base, xmalloc_base, n, INIT_MATCH_STACK_SIZE); - pop_level = reg->stack_pop_level; - num_mem = reg->num_mem; - repeat_stk = (OnigStackIndex* )alloca_base; +/* Stack #0 is used to store the pattern itself and used for (?R), \g<0>, + etc. Additional space is required. */ +# define ADD_NUMMEM 1 +#else +/* Stack #0 not is used. */ +# define ADD_NUMMEM 0 +#endif - mem_start_stk = (OnigStackIndex* )(repeat_stk + reg->num_repeat); - mem_end_stk = mem_start_stk + (num_mem + 1); -#else /* USE_SUBEXP_CALL */ - /* Stack #0 not is used. 
*/ - n = reg->num_repeat + reg->num_mem * 2; + n = reg->num_repeat + (reg->num_mem + ADD_NUMMEM) * 2; STACK_INIT(alloca_base, xmalloc_base, n, INIT_MATCH_STACK_SIZE); pop_level = reg->stack_pop_level; @@ -1583,25 +1620,27 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, repeat_stk = (OnigStackIndex* )alloca_base; mem_start_stk = (OnigStackIndex* )(repeat_stk + reg->num_repeat); - mem_end_stk = mem_start_stk + num_mem; + mem_end_stk = mem_start_stk + (num_mem + ADD_NUMMEM); + { + OnigStackIndex *pp = mem_start_stk; + for (; pp < repeat_stk + n; pp += 2) { + pp[0] = INVALID_STACK_INDEX; + pp[1] = INVALID_STACK_INDEX; + } + } +#ifndef USE_SUBEXP_CALL mem_start_stk--; /* for index start from 1, mem_start_stk[1]..mem_start_stk[num_mem] */ mem_end_stk--; /* for index start from 1, mem_end_stk[1]..mem_end_stk[num_mem] */ -#endif /* USE_SUBEXP_CALL */ - { - OnigStackIndex *pp = mem_start_stk; - for (; pp < (repeat_stk + n); pp+=2) { - pp[0] = INVALID_STACK_INDEX; - pp[1] = INVALID_STACK_INDEX; - } - } +#endif #ifdef ONIG_DEBUG_MATCH fprintf(stderr, "match_at: str: %"PRIdPTR" (%p), end: %"PRIdPTR" (%p), start: %"PRIdPTR" (%p), sprev: %"PRIdPTR" (%p)\n", (intptr_t )str, str, (intptr_t )end, end, (intptr_t )sstart, sstart, (intptr_t )sprev, sprev); fprintf(stderr, "size: %d, start offset: %d\n", (int )(end - str), (int )(sstart - str)); + fprintf(stderr, "\n ofs> str stk:type addr:opcode\n"); #endif STACK_PUSH_ENSURED(STK_ALT, (UChar* )FinishCode); /* bottom stack */ @@ -1611,31 +1650,34 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #ifdef ONIG_DEBUG_MATCH -#define OPCODE_EXEC_HOOK \ +# define OPCODE_EXEC_HOOK \ if (s) { \ UChar *op, *q, *bp, buf[50]; \ int len; \ - op = p - 1; \ + op = p - OP_OFFSET; \ fprintf(stderr, "%4"PRIdPTR"> \"", (*op == OP_FINISH) ? (ptrdiff_t )-1 : s - str); \ bp = buf; \ q = s; \ if (*op != OP_FINISH) { /* s may not be a valid pointer if OP_FINISH. 
*/ \ for (i = 0; i < 7 && q < end; i++) { \ - len = enclen(encode, q, end); \ + len = enclen(encode, q, end); \ while (len-- > 0) *bp++ = *q++; \ } \ + if (q < end) { xmemcpy(bp, "...", 3); bp += 3; } \ } \ - if (q < end) { xmemcpy(bp, "...\"", 4); bp += 4; } \ - else { xmemcpy(bp, "\"", 1); bp += 1; } \ + xmemcpy(bp, "\"", 1); bp += 1; \ *bp = 0; \ fputs((char* )buf, stderr); \ for (i = 0; i < 20 - (bp - buf); i++) fputc(' ', stderr); \ - fprintf(stderr, "%4"PRIdPTR":", (op == FinishCode) ? (ptrdiff_t )-1 : op - reg->p); \ + fprintf(stderr, "%4"PRIdPTR":%s %4"PRIdPTR":", \ + stk - stk_base - 1, \ + (stk > stk_base) ? stack_type_str(stk[-1].type) : " ", \ + (op == FinishCode) ? (ptrdiff_t )-1 : op - reg->p); \ onig_print_compiled_byte_code(stderr, op, reg->p+reg->used, NULL, encode); \ fprintf(stderr, "\n"); \ } #else -#define OPCODE_EXEC_HOOK ((void) 0) +# define OPCODE_EXEC_HOOK ((void) 0) #endif @@ -1652,83 +1694,56 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, } else goto end_best_len; - } + } #endif best_len = n; region = msa->region; if (region) { -#ifdef USE_POSIX_API_REGION_OPTION - if (IS_POSIX_REGION(msa->options)) { - posix_regmatch_t* rmt = (posix_regmatch_t* )region; - - rmt[0].rm_so = (regoff_t )(((pkeep > s) ? s : pkeep) - str); - rmt[0].rm_eo = (regoff_t )(s - str); - for (i = 1; i <= num_mem; i++) { - if (mem_end_stk[i] != INVALID_STACK_INDEX) { - if (BIT_STATUS_AT(reg->bt_mem_start, i)) - rmt[i].rm_so = (regoff_t )(STACK_AT(mem_start_stk[i])->u.mem.pstr - str); - else - rmt[i].rm_so = (regoff_t )((UChar* )((void* )(mem_start_stk[i])) - str); - - rmt[i].rm_eo = (regoff_t )((BIT_STATUS_AT(reg->bt_mem_end, i) + region->beg[0] = ((pkeep > s) ? 
s : pkeep) - str; + region->end[0] = s - str; + for (i = 1; i <= num_mem; i++) { + if (mem_end_stk[i] != INVALID_STACK_INDEX) { + if (BIT_STATUS_AT(reg->bt_mem_start, i)) + region->beg[i] = STACK_AT(mem_start_stk[i])->u.mem.pstr - str; + else + region->beg[i] = (UChar* )((void* )mem_start_stk[i]) - str; + + region->end[i] = (BIT_STATUS_AT(reg->bt_mem_end, i) ? STACK_AT(mem_end_stk[i])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[i])) - str); - } - else { - rmt[i].rm_so = rmt[i].rm_eo = ONIG_REGION_NOTPOS; - } + : (UChar* )((void* )mem_end_stk[i])) - str; } - } - else { -#endif /* USE_POSIX_API_REGION_OPTION */ - region->beg[0] = ((pkeep > s) ? s : pkeep) - str; - region->end[0] = s - str; - for (i = 1; i <= num_mem; i++) { - if (mem_end_stk[i] != INVALID_STACK_INDEX) { - if (BIT_STATUS_AT(reg->bt_mem_start, i)) - region->beg[i] = STACK_AT(mem_start_stk[i])->u.mem.pstr - str; - else - region->beg[i] = (UChar* )((void* )mem_start_stk[i]) - str; - - region->end[i] = (BIT_STATUS_AT(reg->bt_mem_end, i) - ? STACK_AT(mem_end_stk[i])->u.mem.pstr - : (UChar* )((void* )mem_end_stk[i])) - str; - } - else { - region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; - } + else { + region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; } + } #ifdef USE_CAPTURE_HISTORY - if (reg->capture_history != 0) { - int r; - OnigCaptureTreeNode* node; - - if (IS_NULL(region->history_root)) { - region->history_root = node = history_node_new(); - CHECK_NULL_RETURN_MEMERR(node); - } - else { - node = region->history_root; - history_tree_clear(node); - } - - node->group = 0; - node->beg = ((pkeep > s) ? 
s : pkeep) - str; - node->end = s - str; - - stkp = stk_base; - r = make_capture_history_tree(region->history_root, &stkp, - stk, (UChar* )str, reg); - if (r < 0) { - best_len = r; /* error code */ - goto finish; - } + if (reg->capture_history != 0) { + int r; + OnigCaptureTreeNode* node; + + if (IS_NULL(region->history_root)) { + region->history_root = node = history_node_new(); + CHECK_NULL_RETURN_MEMERR(node); } + else { + node = region->history_root; + history_tree_clear(node); + } + + node->group = 0; + node->beg = ((pkeep > s) ? s : pkeep) - str; + node->end = s - str; + + stkp = stk_base; + r = make_capture_history_tree(region->history_root, &stkp, + stk, (UChar* )str, reg); + if (r < 0) { + best_len = r; /* error code */ + goto finish; + } + } #endif /* USE_CAPTURE_HISTORY */ -#ifdef USE_POSIX_API_REGION_OPTION - } /* else IS_POSIX_REGION() */ -#endif } /* if (region) */ } /* n > best_len */ @@ -1777,8 +1792,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, q = lowbuf; while (len-- > 0) { if (*p != *q) { - goto fail; - } + goto fail; + } p++; q++; } } @@ -2036,7 +2051,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, int mb_len = enclen(encode, s, end); if (! 
DATA_ENSURE_CHECK(mb_len)) { - DATA_ENSURE(1); + DATA_ENSURE(1); s = (UChar* )end; p += tlen; goto cc_mb_not_success; @@ -2078,25 +2093,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, MOP_OUT; NEXT; - CASE(OP_CCLASS_NODE) MOP_IN(OP_CCLASS_NODE); - { - OnigCodePoint code; - void *node; - int mb_len; - UChar *ss; - - DATA_ENSURE(1); - GET_POINTER_INC(node, p); - mb_len = enclen(encode, s, end); - ss = s; - s += mb_len; - DATA_ENSURE(0); - code = ONIGENC_MBC_TO_CODE(encode, ss, s); - if (onig_is_code_in_cc_len(mb_len, code, node) == 0) goto fail; - } - MOP_OUT; - NEXT; - CASE(OP_ANYCHAR) MOP_IN(OP_ANYCHAR); DATA_ENSURE(1); n = enclen(encode, s, end); @@ -2118,10 +2114,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, while (DATA_ENSURE_CHECK1) { STACK_PUSH_ALT(p, s, sprev, pkeep); n = enclen(encode, s, end); - DATA_ENSURE(n); - if (ONIGENC_IS_MBC_NEWLINE_EX(encode, s, str, end, option, 0)) goto fail; - sprev = s; - s += n; + DATA_ENSURE(n); + if (ONIGENC_IS_MBC_NEWLINE_EX(encode, s, str, end, option, 0)) goto fail; + sprev = s; + s += n; } MOP_OUT; NEXT; @@ -2149,10 +2145,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, STACK_PUSH_ALT(p + 1, s, sprev, pkeep); } n = enclen(encode, s, end); - DATA_ENSURE(n); - if (ONIGENC_IS_MBC_NEWLINE_EX(encode, s, str, end, option, 0)) goto fail; - sprev = s; - s += n; + DATA_ENSURE(n); + if (ONIGENC_IS_MBC_NEWLINE_EX(encode, s, str, end, option, 0)) goto fail; + sprev = s; + s += n; } p++; MOP_OUT; @@ -2187,10 +2183,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem, pkeep); n = enclen(encode, s, end); - DATA_ENSURE(n); - if (ONIGENC_IS_MBC_NEWLINE_EX(encode, s, str, end, option, 0)) goto fail; - sprev = s; - s += n; + DATA_ENSURE(n); + if (ONIGENC_IS_MBC_NEWLINE_EX(encode, s, str, end, option, 0)) goto fail; + sprev = s; + s += n; } MOP_OUT; NEXT; @@ -2681,8 +2677,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* 
end, GET_LENGTH_INC(tlen, p); sprev = s; - if (backref_match_at_nested_level(reg, stk, stk_base, ic - , case_fold_flag, (int )level, (int )tlen, p, &s, end)) { + if (backref_match_at_nested_level(reg, stk, stk_base, ic, + case_fold_flag, (int )level, (int )tlen, p, &s, end)) { while (sprev + (len = enclen(encode, sprev, end)) < s) sprev += len; @@ -2758,10 +2754,10 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, GET_MEMNUM_INC(mem, p); /* mem: null check id */ STACK_NULL_CHECK_MEMST(isnull, mem, s, reg); if (isnull) { -#ifdef ONIG_DEBUG_MATCH +# ifdef ONIG_DEBUG_MATCH fprintf(stderr, "NULL_CHECK_END_MEMST: skip id:%d, s:%"PRIdPTR" (%p)\n", (int )mem, (intptr_t )s, s); -#endif +# endif if (isnull == -1) goto fail; goto null_check_found; } @@ -2777,16 +2773,16 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, int isnull; GET_MEMNUM_INC(mem, p); /* mem: null check id */ -#ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT +# ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT STACK_NULL_CHECK_MEMST_REC(isnull, mem, s, reg); -#else +# else STACK_NULL_CHECK_REC(isnull, mem, s); -#endif +# endif if (isnull) { -#ifdef ONIG_DEBUG_MATCH +# ifdef ONIG_DEBUG_MATCH fprintf(stderr, "NULL_CHECK_END_MEMST_PUSH: skip id:%d, s:%"PRIdPTR" (%p)\n", (int )mem, (intptr_t )s, s); -#endif +# endif if (isnull == -1) goto fail; goto null_check_found; } @@ -2850,6 +2846,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, MOP_OUT; JUMP; +#ifdef USE_OP_PUSH_OR_JUMP_EXACT CASE(OP_PUSH_OR_JUMP_EXACT1) MOP_IN(OP_PUSH_OR_JUMP_EXACT1); GET_RELADDR_INC(addr, p); if (*p == *s && DATA_ENSURE_CHECK1) { @@ -2861,6 +2858,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, p += (addr + 1); MOP_OUT; JUMP; +#endif CASE(OP_PUSH_IF_PEEK_NEXT) MOP_IN(OP_PUSH_IF_PEEK_NEXT); GET_RELADDR_INC(addr, p); @@ -2915,14 +2913,14 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, repeat_inc: stkp->u.repeat.count++; if (stkp->u.repeat.count >= 
reg->repeat_range[mem].upper) { - /* end of repeat. Nothing to do. */ + /* end of repeat. Nothing to do. */ } else if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { - STACK_PUSH_ALT(p, s, sprev, pkeep); - p = STACK_AT(si)->u.repeat.pcode; /* Don't use stkp after PUSH. */ + STACK_PUSH_ALT(p, s, sprev, pkeep); + p = STACK_AT(si)->u.repeat.pcode; /* Don't use stkp after PUSH. */ } else { - p = stkp->u.repeat.pcode; + p = stkp->u.repeat.pcode; } STACK_PUSH_REPEAT_INC(si); MOP_OUT; @@ -2944,19 +2942,19 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, repeat_inc_ng: stkp->u.repeat.count++; if (stkp->u.repeat.count < reg->repeat_range[mem].upper) { - if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { - UChar* pcode = stkp->u.repeat.pcode; - - STACK_PUSH_REPEAT_INC(si); - STACK_PUSH_ALT(pcode, s, sprev, pkeep); - } - else { - p = stkp->u.repeat.pcode; - STACK_PUSH_REPEAT_INC(si); - } + if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { + UChar* pcode = stkp->u.repeat.pcode; + + STACK_PUSH_REPEAT_INC(si); + STACK_PUSH_ALT(pcode, s, sprev, pkeep); + } + else { + p = stkp->u.repeat.pcode; + STACK_PUSH_REPEAT_INC(si); + } } else if (stkp->u.repeat.count == reg->repeat_range[mem].upper) { - STACK_PUSH_REPEAT_INC(si); + STACK_PUSH_REPEAT_INC(si); } MOP_OUT; CHECK_INTERRUPT_IN_MATCH_AT; @@ -3067,9 +3065,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, CASE(OP_FAIL) if (0) { - /* fall */ + /* fall */ fail: - MOP_OUT; + MOP_OUT; } MOP_IN(OP_FAIL); STACK_POP; @@ -3080,8 +3078,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, #ifdef USE_COMBINATION_EXPLOSION_CHECK if (stk->u.state.state_check != 0) { - stk->type = STK_STATE_CHECK_MARK; - stk++; + stk->type = STK_STATE_CHECK_MARK; + stk++; } #endif @@ -3158,7 +3156,7 @@ slow_search(OnigEncoding enc, UChar* target, UChar* target_end, static int str_lower_case_match(OnigEncoding enc, int case_fold_flag, - const UChar* t, const UChar* tend, + const UChar* t, const 
UChar* tend, const UChar* p, const UChar* end) { int lowlen; @@ -3250,7 +3248,7 @@ slow_search_backward_ic(OnigEncoding enc, int case_fold_flag, while (s >= text) { if (str_lower_case_match(enc, case_fold_flag, - target, target_end, s, text_end)) + target, target_end, s, text_end)) return s; s = (UChar* )onigenc_get_prev_char_head(enc, adjust_text, s, text_end); @@ -3270,10 +3268,10 @@ bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, const UChar *tail; ptrdiff_t skip, tlen1; -#ifdef ONIG_DEBUG_SEARCH +# ifdef ONIG_DEBUG_SEARCH fprintf(stderr, "bm_search_notrev: text: %"PRIuPTR" (%p), text_end: %"PRIuPTR" (%p), text_range: %"PRIuPTR" (%p)\n", text, text, text_end, text_end, text_range, text_range); -#endif +# endif tail = target_end - 1; tlen1 = tail - target; @@ -3294,11 +3292,12 @@ bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, skip = reg->map[*se]; t = s; do { - s += enclen(reg->enc, s, end); + s += enclen(reg->enc, s, end); } while ((s - t) < skip && s < end); } } else { +# if OPT_EXACT_MAXLEN >= ONIG_CHAR_TABLE_SIZE while (s < end) { p = se = s + tlen1; t = tail; @@ -3309,9 +3308,10 @@ bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, skip = reg->int_map[*se]; t = s; do { - s += enclen(reg->enc, s, end); + s += enclen(reg->enc, s, end); } while ((s - t) < skip && s < end); } +# endif } return (UChar* )NULL; @@ -3325,10 +3325,10 @@ bm_search(regex_t* reg, const UChar* target, const UChar* target_end, const UChar *s, *t, *p, *end; const UChar *tail; -#ifdef ONIG_DEBUG_SEARCH +# ifdef ONIG_DEBUG_SEARCH fprintf(stderr, "bm_search: text: %"PRIuPTR", text_end: %"PRIuPTR", text_range: %"PRIuPTR"\n", text, text_end, text_range); -#endif +# endif end = text_range + (target_end - target) - 1; if (end > text_end) @@ -3340,10 +3340,10 @@ bm_search(regex_t* reg, const UChar* target, const UChar* target_end, while (s < end) { p = s; t = tail; -#ifdef ONIG_DEBUG_SEARCH +# ifdef 
ONIG_DEBUG_SEARCH fprintf(stderr, "bm_search_loop: pos: %"PRIdPTR" %s\n", (intptr_t )(s - text), s); -#endif +# endif while (*p == *t) { if (t == target) return (UChar* )p; p--; t--; @@ -3352,6 +3352,7 @@ bm_search(regex_t* reg, const UChar* target, const UChar* target_end, } } else { /* see int_map[] */ +# if OPT_EXACT_MAXLEN >= ONIG_CHAR_TABLE_SIZE while (s < end) { p = s; t = tail; @@ -3361,6 +3362,7 @@ bm_search(regex_t* reg, const UChar* target, const UChar* target_end, } s += reg->int_map[*s]; } +# endif } return (UChar* )NULL; } @@ -3377,10 +3379,10 @@ bm_search_notrev_ic(regex_t* reg, const UChar* target, const UChar* target_end, OnigEncoding enc = reg->enc; int case_fold_flag = reg->case_fold_flag; -#ifdef ONIG_DEBUG_SEARCH +# ifdef ONIG_DEBUG_SEARCH fprintf(stderr, "bm_search_notrev_ic: text: %d (%p), text_end: %d (%p), text_range: %d (%p)\n", (int )text, text, (int )text_end, text_end, (int )text_range, text_range); -#endif +# endif tail = target_end - 1; tlen1 = tail - target; @@ -3399,11 +3401,12 @@ bm_search_notrev_ic(regex_t* reg, const UChar* target, const UChar* target_end, skip = reg->map[*se]; t = s; do { - s += enclen(reg->enc, s, end); + s += enclen(reg->enc, s, end); } while ((s - t) < skip && s < end); } } else { +# if OPT_EXACT_MAXLEN >= ONIG_CHAR_TABLE_SIZE while (s < end) { se = s + tlen1; if (str_lower_case_match(enc, case_fold_flag, target, target_end, @@ -3412,9 +3415,10 @@ bm_search_notrev_ic(regex_t* reg, const UChar* target, const UChar* target_end, skip = reg->int_map[*se]; t = s; do { - s += enclen(reg->enc, s, end); + s += enclen(reg->enc, s, end); } while ((s - t) < skip && s < end); } +# endif } return (UChar* )NULL; @@ -3430,10 +3434,10 @@ bm_search_ic(regex_t* reg, const UChar* target, const UChar* target_end, OnigEncoding enc = reg->enc; int case_fold_flag = reg->case_fold_flag; -#ifdef ONIG_DEBUG_SEARCH +# ifdef ONIG_DEBUG_SEARCH fprintf(stderr, "bm_search_ic: text: %d (%p), text_end: %d (%p), text_range: %d (%p)\n", (int 
)text, text, (int )text_end, text_end, (int )text_range, text_range); -#endif +# endif end = text_range + (target_end - target) - 1; if (end > text_end) @@ -3451,6 +3455,7 @@ bm_search_ic(regex_t* reg, const UChar* target, const UChar* target_end, } } else { /* see int_map[] */ +# if OPT_EXACT_MAXLEN >= ONIG_CHAR_TABLE_SIZE while (s < end) { p = s - (target_end - target) + 1; if (str_lower_case_match(enc, case_fold_flag, target, target_end, @@ -3458,6 +3463,7 @@ bm_search_ic(regex_t* reg, const UChar* target, const UChar* target_end, return (UChar* )p; s += reg->int_map[*s]; } +# endif } return (UChar* )NULL; } @@ -3475,10 +3481,10 @@ bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, ptrdiff_t skip, tlen1; OnigEncoding enc = reg->enc; -#ifdef ONIG_DEBUG_SEARCH +# ifdef ONIG_DEBUG_SEARCH fprintf(stderr, "bm_search_notrev: text: %"PRIdPTR" (%p), text_end: %"PRIdPTR" (%p), text_range: %"PRIdPTR" (%p)\n", (intptr_t )text, text, (intptr_t )text_end, text_end, (intptr_t )text_range, text_range); -#endif +# endif tail = target_end - 1; tlen1 = tail - target; @@ -3500,11 +3506,12 @@ bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, skip = reg->map[se[1]]; t = s; do { - s += enclen(enc, s, end); + s += enclen(enc, s, end); } while ((s - t) < skip && s < end); } } else { +# if OPT_EXACT_MAXLEN >= ONIG_CHAR_TABLE_SIZE while (s < end) { p = se = s + tlen1; t = tail; @@ -3516,9 +3523,10 @@ bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, skip = reg->int_map[se[1]]; t = s; do { - s += enclen(enc, s, end); + s += enclen(enc, s, end); } while ((s - t) < skip && s < end); } +# endif } return (UChar* )NULL; @@ -3533,6 +3541,11 @@ bm_search(regex_t* reg, const UChar* target, const UChar* target_end, const UChar *tail; ptrdiff_t tlen1; +# ifdef ONIG_DEBUG_SEARCH + fprintf(stderr, "bm_search: text: %"PRIuPTR", text_end: %"PRIuPTR", text_range: %"PRIuPTR"\n", + text, text_end, text_range); +# endif + tail 
= target_end - 1; tlen1 = tail - target; end = text_range + tlen1; @@ -3553,6 +3566,7 @@ bm_search(regex_t* reg, const UChar* target, const UChar* target_end, } } else { /* see int_map[] */ +# if OPT_EXACT_MAXLEN >= ONIG_CHAR_TABLE_SIZE while (s < end) { p = s; t = tail; @@ -3563,6 +3577,7 @@ bm_search(regex_t* reg, const UChar* target, const UChar* target_end, if (s + 1 >= end) break; s += reg->int_map[s[1]]; } +# endif } return (UChar* )NULL; } @@ -3579,10 +3594,10 @@ bm_search_notrev_ic(regex_t* reg, const UChar* target, const UChar* target_end, OnigEncoding enc = reg->enc; int case_fold_flag = reg->case_fold_flag; -#ifdef ONIG_DEBUG_SEARCH +# ifdef ONIG_DEBUG_SEARCH fprintf(stderr, "bm_search_notrev_ic: text: %"PRIdPTR" (%p), text_end: %"PRIdPTR" (%p), text_range: %"PRIdPTR" (%p)\n", (intptr_t )text, text, (intptr_t )text_end, text_end, (intptr_t )text_range, text_range); -#endif +# endif tail = target_end - 1; tlen1 = tail - target; @@ -3602,11 +3617,12 @@ bm_search_notrev_ic(regex_t* reg, const UChar* target, const UChar* target_end, skip = reg->map[se[1]]; t = s; do { - s += enclen(enc, s, end); + s += enclen(enc, s, end); } while ((s - t) < skip && s < end); } } else { +# if OPT_EXACT_MAXLEN >= ONIG_CHAR_TABLE_SIZE while (s < end) { se = s + tlen1; if (str_lower_case_match(enc, case_fold_flag, target, target_end, @@ -3616,9 +3632,10 @@ bm_search_notrev_ic(regex_t* reg, const UChar* target, const UChar* target_end, skip = reg->int_map[se[1]]; t = s; do { - s += enclen(enc, s, end); + s += enclen(enc, s, end); } while ((s - t) < skip && s < end); } +# endif } return (UChar* )NULL; @@ -3635,10 +3652,10 @@ bm_search_ic(regex_t* reg, const UChar* target, const UChar* target_end, OnigEncoding enc = reg->enc; int case_fold_flag = reg->case_fold_flag; -#ifdef ONIG_DEBUG_SEARCH +# ifdef ONIG_DEBUG_SEARCH fprintf(stderr, "bm_search_ic: text: %"PRIdPTR" (%p), text_end: %"PRIdPTR" (%p), text_range: %"PRIdPTR" (%p)\n", (intptr_t )text, text, (intptr_t )text_end, 
text_end, (intptr_t )text_range, text_range); -#endif +# endif tail = target_end - 1; tlen1 = tail - target; @@ -3658,6 +3675,7 @@ bm_search_ic(regex_t* reg, const UChar* target, const UChar* target_end, } } else { /* see int_map[] */ +# if OPT_EXACT_MAXLEN >= ONIG_CHAR_TABLE_SIZE while (s < end) { p = s - tlen1; if (str_lower_case_match(enc, case_fold_flag, target, target_end, @@ -3666,11 +3684,13 @@ bm_search_ic(regex_t* reg, const UChar* target, const UChar* target_end, if (s + 1 >= end) break; s += reg->int_map[s[1]]; } +# endif } return (UChar* )NULL; } #endif /* USE_SUNDAY_QUICK_SEARCH */ +#ifdef USE_INT_MAP_BACKWARD static int set_bm_backward_skip(UChar* s, UChar* end, OnigEncoding enc ARG_UNUSED, int** skip) @@ -3720,6 +3740,7 @@ bm_search_backward(regex_t* reg, const UChar* target, const UChar* target_end, return (UChar* )NULL; } +#endif static UChar* map_search(OnigEncoding enc, UChar map[], @@ -3758,31 +3779,6 @@ onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, On UChar *prev; OnigMatchArg msa; -#if defined(USE_RECOMPILE_API) && defined(USE_MULTI_THREAD_SYSTEM) - start: - THREAD_ATOMIC_START; - if (ONIG_STATE(reg) >= ONIG_STATE_NORMAL) { - ONIG_STATE_INC(reg); - if (IS_NOT_NULL(reg->chain) && ONIG_STATE(reg) == ONIG_STATE_NORMAL) { - onig_chain_reduce(reg); - ONIG_STATE_INC(reg); - } - } - else { - int n; - - THREAD_ATOMIC_END; - n = 0; - while (ONIG_STATE(reg) < ONIG_STATE_NORMAL) { - if (++n > THREAD_PASS_LIMIT_COUNT) - return ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT; - THREAD_PASS; - } - goto start; - } - THREAD_ATOMIC_END; -#endif /* USE_RECOMPILE_API && USE_MULTI_THREAD_SYSTEM */ - MATCH_ARG_INIT(msa, option, region, at, at); #ifdef USE_COMBINATION_EXPLOSION_CHECK { @@ -3791,11 +3787,7 @@ onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, On } #endif - if (region -#ifdef USE_POSIX_API_REGION_OPTION - && !IS_POSIX_REGION(option) -#endif - ) { + if (region) { r = onig_region_resize_clear(region, 
reg->num_mem + 1); } else @@ -3811,7 +3803,6 @@ onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, On } MATCH_ARG_FREE(msa); - ONIG_STATE_DEC_THREAD(reg); return r; } @@ -3844,7 +3835,7 @@ forward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, break; case ONIG_OPTIMIZE_EXACT_IC: p = slow_search_ic(reg->enc, reg->case_fold_flag, - reg->exact, reg->exact_end, p, end, range); + reg->exact, reg->exact_end, p, end, range); break; case ONIG_OPTIMIZE_EXACT_BM: @@ -3952,7 +3943,6 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, UChar* s, const UChar* range, UChar* adjrange, UChar** low, UChar** high) { - int r; UChar *p; range += reg->dmin; @@ -3970,13 +3960,15 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, case ONIG_OPTIMIZE_EXACT_BM_IC: case ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC: p = slow_search_backward_ic(reg->enc, reg->case_fold_flag, - reg->exact, reg->exact_end, - range, adjrange, end, p); + reg->exact, reg->exact_end, + range, adjrange, end, p); break; case ONIG_OPTIMIZE_EXACT_BM: case ONIG_OPTIMIZE_EXACT_BM_NOT_REV: +#ifdef USE_INT_MAP_BACKWARD if (IS_NULL(reg->int_map_backward)) { + int r; if (s - range < BM_BACKWARD_SEARCH_LENGTH_THRESHOLD) goto exact_method; @@ -3986,6 +3978,9 @@ backward_search_range(regex_t* reg, const UChar* str, const UChar* end, } p = bm_search_backward(reg, reg->exact, reg->exact_end, range, adjrange, end, p); +#else + goto exact_method; +#endif break; case ONIG_OPTIMIZE_MAP: @@ -4070,42 +4065,13 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, const UChar *orig_range = range; #endif -#if defined(USE_RECOMPILE_API) && defined(USE_MULTI_THREAD_SYSTEM) - start: - THREAD_ATOMIC_START; - if (ONIG_STATE(reg) >= ONIG_STATE_NORMAL) { - ONIG_STATE_INC(reg); - if (IS_NOT_NULL(reg->chain) && ONIG_STATE(reg) == ONIG_STATE_NORMAL) { - onig_chain_reduce(reg); - ONIG_STATE_INC(reg); - } - } - else { - int n; - - THREAD_ATOMIC_END; 
- n = 0; - while (ONIG_STATE(reg) < ONIG_STATE_NORMAL) { - if (++n > THREAD_PASS_LIMIT_COUNT) - return ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT; - THREAD_PASS; - } - goto start; - } - THREAD_ATOMIC_END; -#endif /* USE_RECOMPILE_API && USE_MULTI_THREAD_SYSTEM */ - #ifdef ONIG_DEBUG_SEARCH fprintf(stderr, "onig_search (entry point): str: %"PRIuPTR" (%p), end: %"PRIuPTR", start: %"PRIuPTR", range: %"PRIuPTR"\n", (intptr_t )str, str, end - str, start - str, range - str); #endif - if (region -#ifdef USE_POSIX_API_REGION_OPTION - && !IS_POSIX_REGION(option) -#endif - ) { + if (region) { r = onig_region_resize_clear(region, reg->num_mem + 1); if (r) goto finish_no_msa; } @@ -4114,8 +4080,8 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, #ifdef USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE -#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE -#define MATCH_AND_RETURN_CHECK(upper_range) \ +# ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE +# define MATCH_AND_RETURN_CHECK(upper_range) \ r = match_at(reg, str, end, (upper_range), s, prev, &msa); \ if (r != ONIG_MISMATCH) {\ if (r >= 0) {\ @@ -4125,8 +4091,8 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, }\ else goto finish; /* error */ \ } -#else -#define MATCH_AND_RETURN_CHECK(upper_range) \ +# else +# define MATCH_AND_RETURN_CHECK(upper_range) \ r = match_at(reg, str, end, (upper_range), s, prev, &msa); \ if (r != ONIG_MISMATCH) {\ if (r >= 0) {\ @@ -4134,10 +4100,10 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, }\ else goto finish; /* error */ \ } -#endif /* USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE */ +# endif /* USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE */ #else -#ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE -#define MATCH_AND_RETURN_CHECK(none) \ +# ifdef USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE +# define MATCH_AND_RETURN_CHECK(none) \ r = match_at(reg, str, end, s, prev, &msa);\ if (r != ONIG_MISMATCH) {\ if (r >= 0) {\ @@ -4147,8 +4113,8 @@ onig_search_gpos(regex_t* reg, const 
UChar* str, const UChar* end, }\ else goto finish; /* error */ \ } -#else -#define MATCH_AND_RETURN_CHECK(none) \ +# else +# define MATCH_AND_RETURN_CHECK(none) \ r = match_at(reg, str, end, s, prev, &msa);\ if (r != ONIG_MISMATCH) {\ if (r >= 0) {\ @@ -4156,7 +4122,7 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, }\ else goto finish; /* error */ \ } -#endif /* USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE */ +# endif /* USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE */ #endif /* USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE */ @@ -4168,7 +4134,15 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, /* search start-position only */ begin_position: if (range > start) - range = start + 1; + { + if (global_pos > start) + { + if (global_pos < range) + range = global_pos + 1; + } + else + range = start + 1; + } else range = start; } @@ -4244,9 +4218,7 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, } } else if ((reg->anchor & ANCHOR_ANYCHAR_STAR_ML)) { - if (! (reg->anchor & ANCHOR_LOOK_BEHIND)) { - goto begin_position; - } + goto begin_position; } } else if (str == end) { /* empty string */ @@ -4306,7 +4278,7 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, } if ((end - start) < reg->threshold_len) - goto mismatch; + goto mismatch; if (reg->dmax != ONIG_INFINITE_DISTANCE) { do { @@ -4328,24 +4300,24 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, if (! 
forward_search_range(reg, str, end, s, sch_range, &low, &high, (UChar** )NULL)) goto mismatch; - if ((reg->anchor & ANCHOR_ANYCHAR_STAR) != 0) { - do { - if ((reg->anchor & ANCHOR_BEGIN_POSITION) == 0) - msa.gpos = s; /* move \G position */ - MATCH_AND_RETURN_CHECK(orig_range); - prev = s; - s += enclen(reg->enc, s, end); - - if ((reg->anchor & (ANCHOR_LOOK_BEHIND | ANCHOR_PREC_READ_NOT)) == 0) { - while (!ONIGENC_IS_MBC_NEWLINE_EX(reg->enc, prev, str, end, reg->options, 0) - && s < range) { - prev = s; - s += enclen(reg->enc, s, end); - } - } - } while (s < range); - goto mismatch; - } + if ((reg->anchor & ANCHOR_ANYCHAR_STAR) != 0) { + do { + if ((reg->anchor & ANCHOR_BEGIN_POSITION) == 0) + msa.gpos = s; /* move \G position */ + MATCH_AND_RETURN_CHECK(orig_range); + prev = s; + s += enclen(reg->enc, s, end); + + if ((reg->anchor & (ANCHOR_LOOK_BEHIND | ANCHOR_PREC_READ_NOT)) == 0) { + while (!ONIGENC_IS_MBC_NEWLINE_EX(reg->enc, prev, str, end, reg->options, 0) + && s < range) { + prev = s; + s += enclen(reg->enc, s, end); + } + } + } while (s < range); + goto mismatch; + } } } @@ -4428,15 +4400,10 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, finish: MATCH_ARG_FREE(msa); - ONIG_STATE_DEC_THREAD(reg); /* If result is mismatch and no FIND_NOT_EMPTY option, then the region is not set in match_at(). 
*/ - if (IS_FIND_NOT_EMPTY(reg->options) && region -#ifdef USE_POSIX_API_REGION_OPTION - && !IS_POSIX_REGION(option) -#endif - ) { + if (IS_FIND_NOT_EMPTY(reg->options) && region) { onig_region_clear(region); } @@ -4449,7 +4416,6 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, mismatch_no_msa: r = ONIG_MISMATCH; finish_no_msa: - ONIG_STATE_DEC_THREAD(reg); #ifdef ONIG_DEBUG if (r != ONIG_MISMATCH) fprintf(stderr, "onig_search: error %"PRIdPTRDIFF"\n", r); @@ -4457,43 +4423,82 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end, return r; match: - ONIG_STATE_DEC_THREAD(reg); MATCH_ARG_FREE(msa); return s - str; } +extern OnigPosition +onig_scan(regex_t* reg, const UChar* str, const UChar* end, + OnigRegion* region, OnigOptionType option, + int (*scan_callback)(OnigPosition, OnigPosition, OnigRegion*, void*), + void* callback_arg) +{ + OnigPosition r; + OnigPosition n; + int rs; + const UChar* start; + + n = 0; + start = str; + while (1) { + r = onig_search(reg, str, end, start, end, region, option); + if (r >= 0) { + rs = scan_callback(n, r, region, callback_arg); + n++; + if (rs != 0) + return rs; + + if (region->end[0] == start - str) + start++; + else + start = str + region->end[0]; + + if (start > end) + break; + } + else if (r == ONIG_MISMATCH) { + break; + } + else { /* error */ + return r; + } + } + + return n; +} + extern OnigEncoding -onig_get_encoding(regex_t* reg) +onig_get_encoding(const regex_t* reg) { return reg->enc; } extern OnigOptionType -onig_get_options(regex_t* reg) +onig_get_options(const regex_t* reg) { return reg->options; } extern OnigCaseFoldType -onig_get_case_fold_flag(regex_t* reg) +onig_get_case_fold_flag(const regex_t* reg) { return reg->case_fold_flag; } extern const OnigSyntaxType* -onig_get_syntax(regex_t* reg) +onig_get_syntax(const regex_t* reg) { return reg->syntax; } extern int -onig_number_of_captures(regex_t* reg) +onig_number_of_captures(const regex_t* reg) { return reg->num_mem; } 
extern int -onig_number_of_capture_histories(regex_t* reg) +onig_number_of_capture_histories(const regex_t* reg) { #ifdef USE_CAPTURE_HISTORY int i, n; diff --git a/regint.h b/regint.h index 80d3523126..344ece4ef1 100644 --- a/regint.h +++ b/regint.h @@ -1,11 +1,11 @@ -#ifndef ONIGURUMA_REGINT_H -#define ONIGURUMA_REGINT_H +#ifndef ONIGMO_REGINT_H +#define ONIGMO_REGINT_H /********************************************************************** regint.h - Onigmo (Oniguruma-mod) (regular expression library) **********************************************************************/ /*- * Copyright (c) 2002-2013 K.Kosako - * Copyright (c) 2011-2014 K.Takata + * Copyright (c) 2011-2016 K.Takata * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -35,6 +35,7 @@ /* #define ONIG_DEBUG_COMPILE */ /* #define ONIG_DEBUG_SEARCH */ /* #define ONIG_DEBUG_MATCH */ +/* #define ONIG_DEBUG_MEMLEAK */ /* #define ONIG_DONT_OPTIMIZE */ /* for byte-code statistical data. 
*/ @@ -42,25 +43,25 @@ #if defined(ONIG_DEBUG_PARSE_TREE) || defined(ONIG_DEBUG_MATCH) || \ defined(ONIG_DEBUG_SEARCH) || defined(ONIG_DEBUG_COMPILE) || \ - defined(ONIG_DEBUG_STATISTICS) -#ifndef ONIG_DEBUG -#define ONIG_DEBUG -#endif + defined(ONIG_DEBUG_STATISTICS) || defined(ONIG_DEBUG_MEMLEAK) +# ifndef ONIG_DEBUG +# define ONIG_DEBUG +# endif #endif #ifndef UNALIGNED_WORD_ACCESS -#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ - defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || \ - defined(__powerpc64__) || \ - defined(__mc68020__) -#define UNALIGNED_WORD_ACCESS 1 -#else -#define UNALIGNED_WORD_ACCESS 0 -#endif +# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ + defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || \ + defined(__powerpc64__) || \ + defined(__mc68020__) +# define UNALIGNED_WORD_ACCESS 1 +# else +# define UNALIGNED_WORD_ACCESS 0 +# endif #endif #if UNALIGNED_WORD_ACCESS -#define PLATFORM_UNALIGNED_WORD_ACCESS +# define PLATFORM_UNALIGNED_WORD_ACCESS #endif /* config */ @@ -73,213 +74,163 @@ #define USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT /* /(?:()|())*\2/ */ #define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */ #define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR -/* #define USE_RECOMPILE_API */ /* !!! moved to regenc.h. 
*/ /* #define USE_CRNL_AS_LINE_TERMINATOR */ #define USE_NO_INVALID_QUANTIFIER /* internal config */ -#define USE_PARSE_TREE_NODE_RECYCLE -#define USE_OP_PUSH_OR_JUMP_EXACT +/* #define USE_OP_PUSH_OR_JUMP_EXACT */ #define USE_QTFR_PEEK_NEXT #define USE_ST_LIBRARY -#define USE_SHARED_CCLASS_TABLE #define USE_SUNDAY_QUICK_SEARCH #define INIT_MATCH_STACK_SIZE 160 #define DEFAULT_MATCH_STACK_LIMIT_SIZE 0 /* unlimited */ +#define DEFAULT_PARSE_DEPTH_LIMIT 4096 + +#define OPT_EXACT_MAXLEN 24 /* check config */ #if defined(USE_PERL_SUBEXP_CALL) || defined(USE_CAPITAL_P_NAMED_GROUP) -#if !defined(USE_NAMED_GROUP) || !defined(USE_SUBEXP_CALL) -#error USE_NAMED_GROUP and USE_SUBEXP_CALL must be defined. -#endif +# if !defined(USE_NAMED_GROUP) || !defined(USE_SUBEXP_CALL) +# error USE_NAMED_GROUP and USE_SUBEXP_CALL must be defined. +# endif #endif #if defined(__GNUC__) -# define ARG_UNUSED __attribute__ ((unused)) +# define ARG_UNUSED __attribute__ ((unused)) #else -# define ARG_UNUSED +# define ARG_UNUSED #endif -#ifndef RUBY_DEFINES_H -#include "ruby/ruby.h" -#undef xmalloc -#undef xrealloc -#undef xcalloc -#undef xfree +#if !defined(RUBY) && defined(RUBY_EXPORT) +# define RUBY #endif +#ifdef RUBY +# ifndef RUBY_DEFINES_H +# include "ruby/ruby.h" +# undef xmalloc +# undef xrealloc +# undef xcalloc +# undef xfree +# endif +#else /* RUBY */ +# include "config.h" +# if SIZEOF_LONG_LONG > 0 +# define LONG_LONG long long +# endif +#endif /* RUBY */ + +#include /* */ /* escape other system UChar definition */ #ifdef ONIG_ESCAPE_UCHAR_COLLISION -#undef ONIG_ESCAPE_UCHAR_COLLISION +# undef ONIG_ESCAPE_UCHAR_COLLISION #endif #define USE_WORD_BEGIN_END /* "\<": word-begin, "\>": word-end */ -#undef USE_CAPTURE_HISTORY +#ifdef RUBY +# undef USE_CAPTURE_HISTORY +#else +# define USE_CAPTURE_HISTORY +#endif #define USE_VARIABLE_META_CHARS -#define USE_POSIX_API_REGION_OPTION /* needed for POSIX API support */ #define USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE /* #define 
USE_COMBINATION_EXPLOSION_CHECK */ /* (X*)* */ -/* multithread config */ -/* #define USE_MULTI_THREAD_SYSTEM */ -/* #define USE_DEFAULT_MULTI_THREAD_SYSTEM */ - -#if defined(USE_MULTI_THREAD_SYSTEM) \ - && defined(USE_DEFAULT_MULTI_THREAD_SYSTEM) - -#ifdef _WIN32 -#define WIN32_LEAN_AND_MEAN -#include -extern CRITICAL_SECTION gOnigMutex; -#define THREAD_SYSTEM_INIT InitializeCriticalSection(&gOnigMutex) -#define THREAD_SYSTEM_END DeleteCriticalSection(&gOnigMutex) -#define THREAD_ATOMIC_START EnterCriticalSection(&gOnigMutex) -#define THREAD_ATOMIC_END LeaveCriticalSection(&gOnigMutex) -#define THREAD_PASS Sleep(0) -#else /* _WIN32 */ -#include -#include -extern pthread_mutex_t gOnigMutex; -#define THREAD_SYSTEM_INIT pthread_mutex_init(&gOnigMutex, NULL) -#define THREAD_SYSTEM_END pthread_mutex_destroy(&gOnigMutex) -#define THREAD_ATOMIC_START pthread_mutex_lock(&gOnigMutex) -#define THREAD_ATOMIC_END pthread_mutex_unlock(&gOnigMutex) -#define THREAD_PASS sched_yield() -#endif /* _WIN32 */ - -#else /* USE_DEFAULT_MULTI_THREAD_SYSTEM */ - -#ifndef THREAD_SYSTEM_INIT -#define THREAD_SYSTEM_INIT /* depend on thread system */ -#endif -#ifndef THREAD_SYSTEM_END -#define THREAD_SYSTEM_END /* depend on thread system */ -#endif -#ifndef THREAD_ATOMIC_START -#define THREAD_ATOMIC_START /* depend on thread system */ -#endif -#ifndef THREAD_ATOMIC_END -#define THREAD_ATOMIC_END /* depend on thread system */ -#endif -#ifndef THREAD_PASS -#define THREAD_PASS /* depend on thread system */ -#endif - -#endif /* USE_DEFAULT_MULTI_THREAD_SYSTEM */ #ifndef xmalloc -#define xmalloc malloc -#define xrealloc realloc -#define xcalloc calloc -#define xfree free +# define xmalloc malloc +# define xrealloc realloc +# define xcalloc calloc +# define xfree free #endif #ifdef RUBY -#define CHECK_INTERRUPT_IN_MATCH_AT rb_thread_check_ints() -#define onig_st_init_table st_init_table -#define onig_st_init_table_with_size st_init_table_with_size -#define onig_st_init_numtable st_init_numtable 
-#define onig_st_init_numtable_with_size st_init_numtable_with_size -#define onig_st_init_strtable st_init_strtable -#define onig_st_init_strtable_with_size st_init_strtable_with_size -#define onig_st_delete st_delete -#define onig_st_delete_safe st_delete_safe -#define onig_st_insert st_insert -#define onig_st_lookup st_lookup -#define onig_st_foreach st_foreach -#define onig_st_add_direct st_add_direct -#define onig_st_free_table st_free_table -#define onig_st_cleanup_safe st_cleanup_safe -#define onig_st_copy st_copy -#define onig_st_nothing_key_clone st_nothing_key_clone -#define onig_st_nothing_key_free st_nothing_key_free -#define onig_st_is_member st_is_member - -#define USE_UPPER_CASE_TABLE -#else - -#define CHECK_INTERRUPT_IN_MATCH_AT - -#define st_init_table onig_st_init_table -#define st_init_table_with_size onig_st_init_table_with_size -#define st_init_numtable onig_st_init_numtable -#define st_init_numtable_with_size onig_st_init_numtable_with_size -#define st_init_strtable onig_st_init_strtable -#define st_init_strtable_with_size onig_st_init_strtable_with_size -#define st_delete onig_st_delete -#define st_delete_safe onig_st_delete_safe -#define st_insert onig_st_insert -#define st_lookup onig_st_lookup -#define st_foreach onig_st_foreach -#define st_add_direct onig_st_add_direct -#define st_free_table onig_st_free_table -#define st_cleanup_safe onig_st_cleanup_safe -#define st_copy onig_st_copy -#define st_nothing_key_clone onig_st_nothing_key_clone -#define st_nothing_key_free onig_st_nothing_key_free +# define CHECK_INTERRUPT_IN_MATCH_AT rb_thread_check_ints() +# define onig_st_init_table st_init_table +# define onig_st_init_table_with_size st_init_table_with_size +# define onig_st_init_numtable st_init_numtable +# define onig_st_init_numtable_with_size st_init_numtable_with_size +# define onig_st_init_strtable st_init_strtable +# define onig_st_init_strtable_with_size st_init_strtable_with_size +# define onig_st_delete st_delete +# define 
onig_st_delete_safe st_delete_safe +# define onig_st_insert st_insert +# define onig_st_lookup st_lookup +# define onig_st_foreach st_foreach +# define onig_st_add_direct st_add_direct +# define onig_st_free_table st_free_table +# define onig_st_cleanup_safe st_cleanup_safe +# define onig_st_copy st_copy +# define onig_st_nothing_key_clone st_nothing_key_clone +# define onig_st_nothing_key_free st_nothing_key_free +# define onig_st_is_member st_is_member + +# define USE_UPPER_CASE_TABLE +#else /* RUBY */ + +# define CHECK_INTERRUPT_IN_MATCH_AT + +# define st_init_table onig_st_init_table +# define st_init_table_with_size onig_st_init_table_with_size +# define st_init_numtable onig_st_init_numtable +# define st_init_numtable_with_size onig_st_init_numtable_with_size +# define st_init_strtable onig_st_init_strtable +# define st_init_strtable_with_size onig_st_init_strtable_with_size +# define st_delete onig_st_delete +# define st_delete_safe onig_st_delete_safe +# define st_insert onig_st_insert +# define st_lookup onig_st_lookup +# define st_foreach onig_st_foreach +# define st_add_direct onig_st_add_direct +# define st_free_table onig_st_free_table +# define st_cleanup_safe onig_st_cleanup_safe +# define st_copy onig_st_copy +# define st_nothing_key_clone onig_st_nothing_key_clone +# define st_nothing_key_free onig_st_nothing_key_free /* */ -#define onig_st_is_member st_is_member +# define onig_st_is_member st_is_member -#endif +#endif /* RUBY */ #define STATE_CHECK_STRING_THRESHOLD_LEN 7 #define STATE_CHECK_BUFF_MAX_SIZE 0x4000 -#define THREAD_PASS_LIMIT_COUNT 8 #define xmemset memset #define xmemcpy memcpy #define xmemmove memmove #if defined(_WIN32) && !defined(__GNUC__) -#define xalloca _alloca -#define xvsnprintf _vsnprintf +# define xalloca _alloca +# define xvsnprintf(buf,size,fmt,args) _vsnprintf_s(buf,size,_TRUNCATE,fmt,args) +# define xsnprintf sprintf_s +# define xstrcat(dest,src,size) strcat_s(dest,size,src) #else -#define xalloca alloca -#define 
xvsnprintf vsnprintf +# define xalloca alloca +# define xvsnprintf vsnprintf +# define xsnprintf snprintf +# define xstrcat(dest,src,size) strcat(dest,src) #endif +#if defined(ONIG_DEBUG_MEMLEAK) && defined(_MSC_VER) +# define _CRTDBG_MAP_ALLOC +# include +# include +#endif -#if defined(USE_RECOMPILE_API) && defined(USE_MULTI_THREAD_SYSTEM) -#define ONIG_STATE_INC(reg) (reg)->state++ -#define ONIG_STATE_DEC(reg) (reg)->state-- - -#define ONIG_STATE_INC_THREAD(reg) do {\ - THREAD_ATOMIC_START;\ - (reg)->state++;\ - THREAD_ATOMIC_END;\ -} while(0) -#define ONIG_STATE_DEC_THREAD(reg) do {\ - THREAD_ATOMIC_START;\ - (reg)->state--;\ - THREAD_ATOMIC_END;\ -} while(0) -#else -#define ONIG_STATE_INC(reg) /* Nothing */ -#define ONIG_STATE_DEC(reg) /* Nothing */ -#define ONIG_STATE_INC_THREAD(reg) /* Nothing */ -#define ONIG_STATE_DEC_THREAD(reg) /* Nothing */ -#endif /* USE_RECOMPILE_API && USE_MULTI_THREAD_SYSTEM */ - -#ifdef HAVE_STDLIB_H #include -#endif #if defined(HAVE_ALLOCA_H) && (defined(_AIX) || !defined(__GNUC__)) -#include +# include #endif -#ifdef HAVE_STRING_H -# include -#else -# include -#endif +#include #include #ifdef HAVE_SYS_TYPES_H -#include +# include #endif #ifdef HAVE_STDINT_H @@ -290,12 +241,10 @@ extern pthread_mutex_t gOnigMutex; # include #endif -#ifdef STDC_HEADERS -# include -#endif +#include #ifdef _WIN32 -#include /* for alloca() */ +# include /* for alloca() */ #endif #ifdef ONIG_DEBUG @@ -303,28 +252,32 @@ extern pthread_mutex_t gOnigMutex; #endif #ifdef _WIN32 -#if defined(_MSC_VER) && (_MSC_VER < 1300) -#ifndef _INTPTR_T_DEFINED -#define _INTPTR_T_DEFINED +# if defined(_MSC_VER) && (_MSC_VER < 1300) +# ifndef _INTPTR_T_DEFINED +# define _INTPTR_T_DEFINED typedef int intptr_t; -#endif -#ifndef _UINTPTR_T_DEFINED -#define _UINTPTR_T_DEFINED +# endif +# ifndef _UINTPTR_T_DEFINED +# define _UINTPTR_T_DEFINED typedef unsigned int uintptr_t; -#endif -#endif +# endif +# endif #endif /* _WIN32 */ #ifndef PRIdPTR -#ifdef _WIN64 -#define PRIdPTR 
"I64d" -#define PRIuPTR "I64u" -#define PRIxPTR "I64x" -#else -#define PRIdPTR "ld" -#define PRIuPTR "lu" -#define PRIxPTR "lx" +# ifdef _WIN64 +# define PRIdPTR "I64d" +# define PRIuPTR "I64u" +# define PRIxPTR "I64x" +# else +# define PRIdPTR "ld" +# define PRIuPTR "lu" +# define PRIxPTR "lx" +# endif #endif + +#ifndef PRIdPTRDIFF +# define PRIdPTRDIFF PRIdPTR #endif #include "regenc.h" @@ -332,10 +285,10 @@ typedef unsigned int uintptr_t; RUBY_SYMBOL_EXPORT_BEGIN #ifdef MIN -#undef MIN +# undef MIN #endif #ifdef MAX -#undef MAX +# undef MAX #endif #define MIN(a,b) (((a)>(b))?(b):(a)) #define MAX(a,b) (((a)<(b))?(b):(a)) @@ -350,28 +303,28 @@ RUBY_SYMBOL_EXPORT_BEGIN #ifdef PLATFORM_UNALIGNED_WORD_ACCESS -#define PLATFORM_GET_INC(val,p,type) do{\ +# define PLATFORM_GET_INC(val,p,type) do{\ val = *(type* )p;\ (p) += sizeof(type);\ } while(0) #else -#define PLATFORM_GET_INC(val,p,type) do{\ +# define PLATFORM_GET_INC(val,p,type) do{\ xmemcpy(&val, (p), sizeof(type));\ (p) += sizeof(type);\ } while(0) /* sizeof(OnigCodePoint) */ -#define WORD_ALIGNMENT_SIZE SIZEOF_LONG +# define WORD_ALIGNMENT_SIZE SIZEOF_LONG -#define GET_ALIGNMENT_PAD_SIZE(addr,pad_size) do {\ +# define GET_ALIGNMENT_PAD_SIZE(addr,pad_size) do {\ (pad_size) = WORD_ALIGNMENT_SIZE \ - ((uintptr_t )(addr) % WORD_ALIGNMENT_SIZE);\ if ((pad_size) == WORD_ALIGNMENT_SIZE) (pad_size) = 0;\ } while (0) -#define ALIGNMENT_RIGHT(addr) do {\ +# define ALIGNMENT_RIGHT(addr) do {\ (addr) += (WORD_ALIGNMENT_SIZE - 1);\ (addr) -= ((uintptr_t )(addr) % WORD_ALIGNMENT_SIZE);\ } while (0) @@ -435,7 +388,6 @@ typedef unsigned int BitStatusType; #define IS_NOTEOL(option) ((option) & ONIG_OPTION_NOTEOL) #define IS_NOTBOS(option) ((option) & ONIG_OPTION_NOTBOS) #define IS_NOTEOS(option) ((option) & ONIG_OPTION_NOTEOS) -#define IS_POSIX_REGION(option) ((option) & ONIG_OPTION_POSIX_REGION) #define IS_ASCII_RANGE(option) ((option) & ONIG_OPTION_ASCII_RANGE) #define IS_POSIX_BRACKET_ALL_RANGE(option) ((option) & 
ONIG_OPTION_POSIX_BRACKET_ALL_RANGE) #define IS_WORD_BOUND_ALL_RANGE(option) ((option) & ONIG_OPTION_WORD_BOUND_ALL_RANGE) @@ -618,7 +570,6 @@ enum OpCode { OP_CCLASS_NOT, OP_CCLASS_MB_NOT, OP_CCLASS_MIX_NOT, - OP_CCLASS_NODE, /* pointer to CClassNode node */ OP_ANYCHAR, /* "." */ OP_ANYCHAR_ML, /* "." multi-line */ @@ -781,10 +732,10 @@ typedef void* PointerType; #define SIZE_OP_CONDITION (SIZE_OPCODE + SIZE_MEMNUM + SIZE_RELADDR) #ifdef USE_COMBINATION_EXPLOSION_CHECK -#define SIZE_OP_STATE_CHECK (SIZE_OPCODE + SIZE_STATE_CHECK_NUM) -#define SIZE_OP_STATE_CHECK_PUSH (SIZE_OPCODE + SIZE_STATE_CHECK_NUM + SIZE_RELADDR) -#define SIZE_OP_STATE_CHECK_PUSH_OR_JUMP (SIZE_OPCODE + SIZE_STATE_CHECK_NUM + SIZE_RELADDR) -#define SIZE_OP_STATE_CHECK_ANYCHAR_STAR (SIZE_OPCODE + SIZE_STATE_CHECK_NUM) +# define SIZE_OP_STATE_CHECK (SIZE_OPCODE + SIZE_STATE_CHECK_NUM) +# define SIZE_OP_STATE_CHECK_PUSH (SIZE_OPCODE + SIZE_STATE_CHECK_NUM + SIZE_RELADDR) +# define SIZE_OP_STATE_CHECK_PUSH_OR_JUMP (SIZE_OPCODE + SIZE_STATE_CHECK_NUM + SIZE_RELADDR) +# define SIZE_OP_STATE_CHECK_ANYCHAR_STAR (SIZE_OPCODE + SIZE_STATE_CHECK_NUM) #endif #define MC_ESC(syn) (syn)->meta_char_table.esc @@ -832,13 +783,10 @@ typedef void* PointerType; /* cclass node */ #define FLAG_NCCLASS_NOT (1<<0) -#define FLAG_NCCLASS_SHARE (1<<1) #define NCCLASS_SET_NOT(nd) NCCLASS_FLAG_SET(nd, FLAG_NCCLASS_NOT) -#define NCCLASS_SET_SHARE(nd) NCCLASS_FLAG_SET(nd, FLAG_NCCLASS_SHARE) #define NCCLASS_CLEAR_NOT(nd) NCCLASS_FLAG_CLEAR(nd, FLAG_NCCLASS_NOT) #define IS_NCCLASS_NOT(nd) IS_NCCLASS_FLAG_ON(nd, FLAG_NCCLASS_NOT) -#define IS_NCCLASS_SHARE(nd) IS_NCCLASS_FLAG_ON(nd, FLAG_NCCLASS_SHARE) typedef struct { int type; @@ -936,60 +884,44 @@ typedef struct { extern OnigOpInfoType OnigOpInfo[]; -extern void onig_print_compiled_byte_code P_((FILE* f, UChar* bp, UChar* bpend, UChar** nextp, OnigEncoding enc)); +extern void onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar* bpend, UChar** nextp, OnigEncoding enc); 
-#ifdef ONIG_DEBUG_STATISTICS -extern void onig_statistics_init P_((void)); -extern void onig_print_statistics P_((FILE* f)); -#endif +# ifdef ONIG_DEBUG_STATISTICS +extern void onig_statistics_init(void); +extern void onig_print_statistics(FILE* f); +# endif #endif -extern UChar* onig_error_code_to_format P_((OnigPosition code)); -extern void onig_snprintf_with_pattern PV_((UChar buf[], int bufsize, OnigEncoding enc, UChar* pat, UChar* pat_end, const UChar *fmt, ...)); -extern int onig_bbuf_init P_((BBuf* buf, OnigDistance size)); -extern int onig_compile P_((regex_t* reg, const UChar* pattern, const UChar* pattern_end, OnigErrorInfo* einfo, const char *sourcefile, int sourceline)); -extern void onig_chain_reduce P_((regex_t* reg)); -extern void onig_chain_link_add P_((regex_t* to, regex_t* add)); -extern void onig_transfer P_((regex_t* to, regex_t* from)); -extern int onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc)); -extern int onig_is_code_in_cc_len P_((int enclen, OnigCodePoint code, CClassNode* cc)); +extern UChar* onig_error_code_to_format(OnigPosition code); +extern void onig_vsnprintf_with_pattern(UChar buf[], int bufsize, OnigEncoding enc, UChar* pat, UChar* pat_end, const UChar *fmt, va_list args); +extern void onig_snprintf_with_pattern(UChar buf[], int bufsize, OnigEncoding enc, UChar* pat, UChar* pat_end, const UChar *fmt, ...); +extern int onig_bbuf_init(BBuf* buf, OnigDistance size); +extern int onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, OnigErrorInfo* einfo); +#ifdef RUBY +extern int onig_compile_ruby(regex_t* reg, const UChar* pattern, const UChar* pattern_end, OnigErrorInfo* einfo, const char *sourcefile, int sourceline); +#endif +extern void onig_transfer(regex_t* to, regex_t* from); +extern int onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc); +extern int onig_is_code_in_cc_len(int enclen, OnigCodePoint code, CClassNode* cc); /* strend hash */ typedef void 
hash_table_type; #ifdef RUBY -#include "ruby/st.h" -typedef st_data_t hash_data_type; +# include "ruby/st.h" #else -#include "st.h" -typedef uintptr_t hash_data_type; +# include "st.h" #endif +typedef st_data_t hash_data_type; -extern hash_table_type* onig_st_init_strend_table_with_size P_((st_index_t size)); -extern int onig_st_lookup_strend P_((hash_table_type* table, const UChar* str_key, const UChar* end_key, hash_data_type *value)); -extern int onig_st_insert_strend P_((hash_table_type* table, const UChar* str_key, const UChar* end_key, hash_data_type value)); - -/* encoding property management */ -#define PROPERTY_LIST_ADD_PROP(Name, CR) \ - r = onigenc_property_list_add_property((UChar* )Name, CR,\ - &PropertyNameTable, &PropertyList, &PropertyListNum,\ - &PropertyListSize);\ - if (r != 0) goto end - -#define PROPERTY_LIST_INIT_CHECK \ - if (PropertyInited == 0) {\ - int r = onigenc_property_list_init(init_property_list);\ - if (r != 0) return r;\ - } - -extern int onigenc_property_list_add_property P_((UChar* name, const OnigCodePoint* prop, hash_table_type **table, const OnigCodePoint*** plist, int *pnum, int *psize)); - -typedef int (*ONIGENC_INIT_PROPERTY_LIST_FUNC_TYPE)(void); - -extern int onigenc_property_list_init P_((ONIGENC_INIT_PROPERTY_LIST_FUNC_TYPE)); +extern hash_table_type* onig_st_init_strend_table_with_size(st_index_t size); +extern int onig_st_lookup_strend(hash_table_type* table, const UChar* str_key, const UChar* end_key, hash_data_type *value); +extern int onig_st_insert_strend(hash_table_type* table, const UChar* str_key, const UChar* end_key, hash_data_type value); -extern size_t onig_memsize P_((const regex_t *reg)); -extern size_t onig_region_memsize P_((const struct re_registers *regs)); +#ifdef RUBY +extern size_t onig_memsize(const regex_t *reg); +extern size_t onig_region_memsize(const struct re_registers *regs); +#endif RUBY_SYMBOL_EXPORT_END -#endif /* ONIGURUMA_REGINT_H */ +#endif /* ONIGMO_REGINT_H */ diff --git a/regparse.c 
b/regparse.c index fba0a34c42..204aa46ce9 100644 --- a/regparse.c +++ b/regparse.c @@ -3,7 +3,7 @@ **********************************************************************/ /*- * Copyright (c) 2002-2008 K.Kosako - * Copyright (c) 2011-2014 K.Takata + * Copyright (c) 2011-2016 K.Takata * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -29,6 +29,7 @@ */ #include "regparse.h" +#include #define WARN_BUFSIZE 256 @@ -51,6 +52,9 @@ const OnigSyntaxType OnigSyntaxRuby = { ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL | ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB | ONIG_SYN_OP2_ESC_H_XDIGIT | +#ifndef RUBY + ONIG_SYN_OP2_ESC_U_HEX4 | +#endif ONIG_SYN_OP2_ESC_CAPITAL_X_EXTENDED_GRAPHEME_CLUSTER | ONIG_SYN_OP2_QMARK_LPAREN_CONDITION | ONIG_SYN_OP2_ESC_CAPITAL_R_LINEBREAK | @@ -105,6 +109,26 @@ extern void onig_set_verb_warn_func(OnigWarnFunc f) static void CC_DUP_WARN(ScanEnv *env); + +static unsigned int ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT; + +extern unsigned int +onig_get_parse_depth_limit(void) +{ + return ParseDepthLimit; +} + +extern int +onig_set_parse_depth_limit(unsigned int depth) +{ + if (depth == 0) + ParseDepthLimit = DEFAULT_PARSE_DEPTH_LIMIT; + else + ParseDepthLimit = depth; + return 0; +} + + static void bbuf_free(BBuf* bbuf) { @@ -216,6 +240,7 @@ bitset_copy(BitSetRef dest, BitSetRef bs) for (i = 0; i < BITSET_SIZE; i++) { dest[i] = bs[i]; } } +#if defined(USE_NAMED_GROUP) && !defined(USE_ST_LIBRARY) extern int onig_strncmp(const UChar* s1, const UChar* s2, int n) { @@ -227,6 +252,7 @@ onig_strncmp(const UChar* s1, const UChar* s2, int n) } return 0; } +#endif extern void onig_strcpy(UChar* dest, const UChar* src, const UChar* end) @@ -265,9 +291,9 @@ strdup_with_null(OnigEncoding enc, UChar* s, UChar* end) #ifdef __GNUC__ /* get rid of Wunused-but-set-variable and Wuninitialized */ -#define PFETCH_READY UChar* pfetch_prev = NULL; (void)pfetch_prev +# define 
PFETCH_READY UChar* pfetch_prev = NULL; (void)pfetch_prev #else -#define PFETCH_READY UChar* pfetch_prev +# define PFETCH_READY UChar* pfetch_prev #endif #define PEND (p < end ? 0 : 1) #define PUNFETCH p = pfetch_prev @@ -325,7 +351,11 @@ strcat_capa_from_static(UChar* dest, UChar* dest_end, #ifdef USE_ST_LIBRARY -#include "ruby/st.h" +# ifdef RUBY +# include "ruby/st.h" +# else +# include "st.h" +# endif typedef struct { const UChar* s; @@ -417,7 +447,7 @@ onig_st_insert_strend(hash_table_type* table, const UChar* str_key, #ifdef USE_NAMED_GROUP -#define INIT_NAME_BACKREFS_ALLOC_NUM 8 +# define INIT_NAME_BACKREFS_ALLOC_NUM 8 typedef struct { UChar* name; @@ -428,12 +458,12 @@ typedef struct { int* back_refs; } NameEntry; -#ifdef USE_ST_LIBRARY +# ifdef USE_ST_LIBRARY typedef st_table NameTable; typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */ -#ifdef ONIG_DEBUG +# ifdef ONIG_DEBUG static int i_print_name_entry(UChar* key, NameEntry* e, void* arg) { @@ -467,7 +497,7 @@ onig_print_names(FILE* fp, regex_t* reg) } return 0; } -#endif /* ONIG_DEBUG */ +# endif /* ONIG_DEBUG */ static int i_free_name_entry(UChar* key, NameEntry* e, void* arg ARG_UNUSED) @@ -530,8 +560,8 @@ static int i_names(UChar* key ARG_UNUSED, NameEntry* e, INamesArg* arg) { int r = (*(arg->func))(e->name, - e->name + e->name_len, - e->back_num, + e->name + e->name_len, + e->back_num, (e->back_num > 1 ? 
e->back_refs : &(e->back_ref1)), arg->reg, arg->arg); if (r != 0) { @@ -589,7 +619,7 @@ onig_renumber_name_table(regex_t* reg, GroupNumRemap* map) extern int -onig_number_of_names(regex_t* reg) +onig_number_of_names(const regex_t* reg) { NameTable* t = (NameTable* )reg->name_table; @@ -599,9 +629,9 @@ onig_number_of_names(regex_t* reg) return 0; } -#else /* USE_ST_LIBRARY */ +# else /* USE_ST_LIBRARY */ -#define INIT_NAMES_ALLOC_NUM 8 +# define INIT_NAMES_ALLOC_NUM 8 typedef struct { NameEntry* e; @@ -609,7 +639,7 @@ typedef struct { int alloc; } NameTable; -#ifdef ONIG_DEBUG +# ifdef ONIG_DEBUG extern int onig_print_names(FILE* fp, regex_t* reg) { @@ -640,7 +670,7 @@ onig_print_names(FILE* fp, regex_t* reg) } return 0; } -#endif +# endif static int names_clear(regex_t* reg) @@ -725,7 +755,7 @@ onig_foreach_name(regex_t* reg, } extern int -onig_number_of_names(regex_t* reg) +onig_number_of_names(const regex_t* reg) { NameTable* t = (NameTable* )reg->name_table; @@ -735,7 +765,7 @@ onig_number_of_names(regex_t* reg) return 0; } -#endif /* else USE_ST_LIBRARY */ +# endif /* else USE_ST_LIBRARY */ static int name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) @@ -749,7 +779,7 @@ name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) e = name_find(reg, name, name_end); if (IS_NULL(e)) { -#ifdef USE_ST_LIBRARY +# ifdef USE_ST_LIBRARY if (IS_NULL(t)) { t = onig_st_init_strend_table_with_size(5); reg->name_table = (void* )t; @@ -770,7 +800,7 @@ name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) e->back_alloc = 0; e->back_refs = (int* )NULL; -#else +# else if (IS_NULL(t)) { alloc = INIT_NAMES_ALLOC_NUM; @@ -813,7 +843,7 @@ name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) e->name = strdup_with_null(reg->enc, name, name_end); if (IS_NULL(e->name)) return ONIGERR_MEMORY; e->name_len = name_end - name; -#endif +# endif } if (e->back_num >= 1 && @@ -876,7 +906,7 @@ 
onig_name_to_group_numbers(regex_t* reg, const UChar* name, extern int onig_name_to_backref_number(regex_t* reg, const UChar* name, - const UChar* name_end, OnigRegion *region) + const UChar* name_end, const OnigRegion *region) { int i, n, *nums; @@ -909,7 +939,7 @@ onig_name_to_group_numbers(regex_t* reg, const UChar* name, extern int onig_name_to_backref_number(regex_t* reg, const UChar* name, - const UChar* name_end, OnigRegion* region) + const UChar* name_end, const OnigRegion* region) { return ONIG_NO_SUPPORT_CONFIG; } @@ -922,14 +952,14 @@ onig_foreach_name(regex_t* reg, } extern int -onig_number_of_names(regex_t* reg) +onig_number_of_names(const regex_t* reg) { return 0; } #endif /* else USE_NAMED_GROUP */ extern int -onig_noname_group_capture_is_active(regex_t* reg) +onig_noname_group_capture_is_active(const regex_t* reg) { if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_DONT_CAPTURE_GROUP)) return 0; @@ -976,6 +1006,7 @@ scan_env_clear(ScanEnv* env) env->curr_max_regnum = 0; env->has_recursion = 0; #endif + env->parse_depth = 0; env->warnings_flag = 0; } @@ -1025,14 +1056,6 @@ scan_env_set_mem_node(ScanEnv* env, int num, Node* node) } -#ifdef USE_PARSE_TREE_NODE_RECYCLE -typedef struct _FreeNode { - struct _FreeNode* next; -} FreeNode; - -static FreeNode* FreeNodeList = (FreeNode* )NULL; -#endif - extern void onig_node_free(Node* node) { @@ -1053,18 +1076,7 @@ onig_node_free(Node* node) { Node* next_node = NCDR(node); -#ifdef USE_PARSE_TREE_NODE_RECYCLE - { - FreeNode* n = (FreeNode* )node; - - THREAD_ATOMIC_START; - n->next = FreeNodeList; - FreeNodeList = n; - THREAD_ATOMIC_END; - } -#else xfree(node); -#endif node = next_node; goto start; } @@ -1074,9 +1086,8 @@ onig_node_free(Node* node) { CClassNode* cc = NCCLASS(node); - if (IS_NCCLASS_SHARE(cc)) return ; if (cc->mbuf) - bbuf_free(cc->mbuf); + bbuf_free(cc->mbuf); } break; @@ -1101,77 +1112,18 @@ onig_node_free(Node* node) break; } -#ifdef USE_PARSE_TREE_NODE_RECYCLE - { - FreeNode* n = (FreeNode* 
)node; - - THREAD_ATOMIC_START; - n->next = FreeNodeList; - FreeNodeList = n; - THREAD_ATOMIC_END; - } -#else xfree(node); -#endif } -#ifdef USE_PARSE_TREE_NODE_RECYCLE -extern int -onig_free_node_list(void) -{ - FreeNode* n; - - /* THREAD_ATOMIC_START; */ - while (IS_NOT_NULL(FreeNodeList)) { - n = FreeNodeList; - FreeNodeList = FreeNodeList->next; - xfree(n); - } - /* THREAD_ATOMIC_END; */ - return 0; -} -#endif - static Node* node_new(void) { Node* node; -#ifdef USE_PARSE_TREE_NODE_RECYCLE - THREAD_ATOMIC_START; - if (IS_NOT_NULL(FreeNodeList)) { - node = (Node* )FreeNodeList; - FreeNodeList = FreeNodeList->next; - THREAD_ATOMIC_END; - return node; - } - THREAD_ATOMIC_END; -#endif - - node = (Node* )xmalloc(sizeof(Node)); - /* xmemset(node, 0, sizeof(Node)); */ - return node; -} - -#if defined(USE_MULTI_THREAD_SYSTEM) && \ - defined(USE_SHARED_CCLASS_TABLE) && \ - defined(USE_PARSE_TREE_NODE_RECYCLE) -static Node* -node_new_locked(void) -{ - Node* node; - - if (IS_NOT_NULL(FreeNodeList)) { - node = (Node* )FreeNodeList; - FreeNodeList = FreeNodeList->next; - return node; - } - node = (Node* )xmalloc(sizeof(Node)); /* xmemset(node, 0, sizeof(Node)); */ return node; } -#endif static void initialize_cclass(CClassNode* cc) @@ -1193,75 +1145,6 @@ node_new_cclass(void) return node; } -#if defined(USE_MULTI_THREAD_SYSTEM) && \ - defined(USE_SHARED_CCLASS_TABLE) && \ - defined(USE_PARSE_TREE_NODE_RECYCLE) -static Node* -node_new_cclass_locked(void) -{ - Node* node = node_new_locked(); - CHECK_NULL_RETURN(node); - - SET_NTYPE(node, NT_CCLASS); - initialize_cclass(NCCLASS(node)); - return node; -} -#else -#define node_new_cclass_locked() node_new_cclass() -#endif - -#ifdef USE_SHARED_CCLASS_TABLE -static Node* -node_new_cclass_by_codepoint_range(int not, OnigCodePoint sb_out, - const OnigCodePoint ranges[]) -{ - int n, i; - CClassNode* cc; - OnigCodePoint j; - - Node* node = node_new_cclass_locked(); - CHECK_NULL_RETURN(node); - - cc = NCCLASS(node); - if (not != 0) 
NCCLASS_SET_NOT(cc); - - BITSET_CLEAR(cc->bs); - if (sb_out > 0 && IS_NOT_NULL(ranges)) { - n = ONIGENC_CODE_RANGE_NUM(ranges); - for (i = 0; i < n; i++) { - for (j = ONIGENC_CODE_RANGE_FROM(ranges, i); - j <= (OnigCodePoint )ONIGENC_CODE_RANGE_TO(ranges, i); j++) { - if (j >= sb_out) goto sb_end; - - BITSET_SET_BIT(cc->bs, j); - } - } - } - - sb_end: - if (IS_NULL(ranges)) { - is_null: - cc->mbuf = NULL; - } - else { - BBuf* bbuf; - - n = ONIGENC_CODE_RANGE_NUM(ranges); - if (n == 0) goto is_null; - - bbuf = (BBuf* )xmalloc(sizeof(BBuf)); - CHECK_NULL_RETURN(bbuf); - bbuf->alloc = n + 1; - bbuf->used = n + 1; - bbuf->p = (UChar* )((void* )ranges); - - cc->mbuf = bbuf; - } - - return node; -} -#endif /* USE_SHARED_CCLASS_TABLE */ - static Node* node_new_ctype(int type, int not, int ascii_range) { @@ -1548,6 +1431,7 @@ node_str_cat_codepoint(Node* node, OnigEncoding enc, OnigCodePoint c) return onig_node_str_cat(node, buf, buf + num); } +#if 0 extern void onig_node_conv_to_str_node(Node* node, int flag) { @@ -1557,6 +1441,7 @@ onig_node_conv_to_str_node(Node* node, int flag) NSTR(node)->s = NSTR(node)->buf; NSTR(node)->end = NSTR(node)->buf; } +#endif extern void onig_node_str_clear(Node* node) @@ -1715,6 +1600,7 @@ scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int minlen, } else { PUNFETCH; + maxlen++; break; } } @@ -1886,7 +1772,7 @@ add_code_range0(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to, static int add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to) { - return add_code_range0(pbuf, env, from, to, 1); + return add_code_range0(pbuf, env, from, to, 1); } static int @@ -1990,7 +1876,7 @@ or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1, static int and_code_range1(BBuf** pbuf, ScanEnv* env, OnigCodePoint from1, OnigCodePoint to1, - OnigCodePoint* data, int n) + OnigCodePoint* data, int n) { int i, r; OnigCodePoint from2, to2; @@ -2204,8 +2090,8 @@ or_cclass(CClassNode* dest, CClassNode* cc, 
ScanEnv* env) static void UNKNOWN_ESC_WARN(ScanEnv *env, int c); -static int -conv_backslash_value(int c, ScanEnv* env) +static OnigCodePoint +conv_backslash_value(OnigCodePoint c, ScanEnv* env) { if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) { switch (c) { @@ -2231,7 +2117,7 @@ conv_backslash_value(int c, ScanEnv* env) } #ifdef USE_NO_INVALID_QUANTIFIER -#define is_invalid_quantifier_target(node) 0 +# define is_invalid_quantifier_target(node) 0 #else static int is_invalid_quantifier_target(Node* node) @@ -2303,6 +2189,7 @@ enum ReduceType { }; static enum ReduceType const ReduceTypeTable[6][6] = { +/* '?', '*', '+', '??', '*?', '+?' p / c */ {RQ_DEL, RQ_A, RQ_A, RQ_QQ, RQ_AQ, RQ_ASIS}, /* '?' */ {RQ_DEL, RQ_DEL, RQ_DEL, RQ_P_QQ, RQ_P_QQ, RQ_DEL}, /* '*' */ {RQ_A, RQ_A, RQ_DEL, RQ_ASIS, RQ_P_QQ, RQ_DEL}, /* '+' */ @@ -2505,6 +2392,7 @@ fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) PFETCH(c); if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) { if (c != MC_ESC(env->syntax)) goto invalid; + if (PEND) goto invalid; PFETCH(c); } if (c != '}') goto invalid; @@ -2528,7 +2416,7 @@ fetch_range_quantifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env) /* \M-, \C-, \c, or \... 
*/ static int -fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) +fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env, OnigCodePoint* val) { int v; OnigCodePoint c; @@ -2547,9 +2435,8 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) if (PEND) return ONIGERR_END_PATTERN_AT_META; PFETCH_S(c); if (c == MC_ESC(env->syntax)) { - v = fetch_escaped_value(&p, end, env); - if (v < 0) return v; - c = (OnigCodePoint )v; + v = fetch_escaped_value(&p, end, env, &c); + if (v < 0) return v; } c = ((c & 0xff) | 0x80); } @@ -2573,15 +2460,14 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL; PFETCH_S(c); if (c == '?') { - c = 0177; + c = 0177; } else { - if (c == MC_ESC(env->syntax)) { - v = fetch_escaped_value(&p, end, env); - if (v < 0) return v; - c = (OnigCodePoint )v; - } - c &= 0x9f; + if (c == MC_ESC(env->syntax)) { + v = fetch_escaped_value(&p, end, env, &c); + if (v < 0) return v; + } + c &= 0x9f; } break; } @@ -2596,7 +2482,8 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) } *src = p; - return c; + *val = c; + return 0; } static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env); @@ -2617,8 +2504,13 @@ get_name_end_code_point(OnigCodePoint start) } #ifdef USE_NAMED_GROUP -#define ONIGENC_IS_CODE_NAME(enc, c) TRUE -#ifdef USE_BACKREF_WITH_LEVEL +# ifdef RUBY +# define ONIGENC_IS_CODE_NAME(enc, c) TRUE +# else +# define ONIGENC_IS_CODE_NAME(enc, c) ONIGENC_IS_CODE_WORD(enc, c) +# endif + +# ifdef USE_BACKREF_WITH_LEVEL /* \k, \k \k, \k @@ -2678,11 +2570,11 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, if (is_num != 0) { if (ONIGENC_IS_CODE_DIGIT(enc, c)) { - is_num = 1; + is_num = 1; } else { - r = ONIGERR_INVALID_GROUP_NAME; - is_num = 0; + r = ONIGERR_INVALID_GROUP_NAME; + is_num = 0; } } else if (!ONIGENC_IS_CODE_NAME(enc, c)) { @@ -2695,6 +2587,10 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* 
end, int level; int flag = (c == '-' ? -1 : 1); + if (PEND) { + r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + goto end; + } PFETCH(c); if (! ONIGENC_IS_CODE_DIGIT(enc, c)) goto err; PUNFETCH; @@ -2703,9 +2599,11 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, *rlevel = (level * flag); exist_level = 1; - PFETCH(c); - if (c == end_code) - goto end; + if (!PEND) { + PFETCH(c); + if (c == end_code) + goto end; + } } err: @@ -2732,7 +2630,7 @@ fetch_name_with_level(OnigCodePoint start_code, UChar** src, UChar* end, return r; } } -#endif /* USE_BACKREF_WITH_LEVEL */ +# endif /* USE_BACKREF_WITH_LEVEL */ /* ref: 0 -> define name (don't allow number name) @@ -2769,17 +2667,17 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, if (ONIGENC_IS_CODE_DIGIT(enc, c)) { if (ref == 1) - is_num = 1; + is_num = 1; else { - r = ONIGERR_INVALID_GROUP_NAME; - is_num = 0; + r = ONIGERR_INVALID_GROUP_NAME; + is_num = 0; } } else if (c == '-') { if (ref == 1) { - is_num = 2; - sign = -1; - pnum_head = p; + is_num = 2; + sign = -1; + pnum_head = p; } else { r = ONIGERR_INVALID_GROUP_NAME; @@ -2796,30 +2694,30 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, name_end = p; PFETCH_S(c); if (c == end_code || c == ')') { - if (is_num == 2) { - r = ONIGERR_INVALID_GROUP_NAME; - goto teardown; - } - break; + if (is_num == 2) { + r = ONIGERR_INVALID_GROUP_NAME; + goto teardown; + } + break; } if (is_num != 0) { - if (ONIGENC_IS_CODE_DIGIT(enc, c)) { - is_num = 1; - } - else { - if (!ONIGENC_IS_CODE_WORD(enc, c)) - r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; - else - r = ONIGERR_INVALID_GROUP_NAME; - goto teardown; - } + if (ONIGENC_IS_CODE_DIGIT(enc, c)) { + is_num = 1; + } + else { + if (!ONIGENC_IS_CODE_WORD(enc, c)) + r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + else + r = ONIGERR_INVALID_GROUP_NAME; + goto teardown; + } } else { - if (!ONIGENC_IS_CODE_NAME(enc, c)) { - r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; - goto teardown; - } + if 
(!ONIGENC_IS_CODE_NAME(enc, c)) { + r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + goto teardown; + } } } @@ -2833,8 +2731,8 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, *rback_num = onig_scan_unsigned_number(&pnum_head, name_end, enc); if (*rback_num < 0) return ONIGERR_TOO_BIG_NUMBER; else if (*rback_num == 0) { - r = ONIGERR_INVALID_GROUP_NAME; - goto err; + r = ONIGERR_INVALID_GROUP_NAME; + goto err; } *rback_num *= sign; @@ -2845,12 +2743,12 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, return 0; } else { - teardown: +teardown: while (!PEND) { name_end = p; PFETCH_S(c); if (c == end_code || c == ')') - break; + break; } if (PEND) name_end = end; @@ -2939,8 +2837,6 @@ fetch_name(OnigCodePoint start_code, UChar** src, UChar* end, } #endif /* USE_NAMED_GROUP */ -void onig_vsnprintf_with_pattern(UChar buf[], int bufsize, OnigEncoding enc, - UChar* pat, UChar* pat_end, const UChar *fmt, va_list args); static void onig_syntax_warn(ScanEnv *env, const char *fmt, ...) @@ -2952,10 +2848,14 @@ onig_syntax_warn(ScanEnv *env, const char *fmt, ...) env->pattern, env->pattern_end, (const UChar *)fmt, args); va_end(args); +#ifdef RUBY if (env->sourcefile == NULL) rb_warn("%s", (char *)buf); else rb_compile_warn(env->sourcefile, env->sourceline, "%s", (char *)buf); +#else + (*onig_warn)((char* )buf); +#endif } static void @@ -2979,6 +2879,10 @@ CLOSE_BRACKET_WITHOUT_ESC_WARN(ScanEnv* env, UChar* c) } } +#ifndef RTEST +# define RTEST(v) 1 +#endif + static void CC_DUP_WARN(ScanEnv *env) { @@ -3148,6 +3052,8 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case 'p': case 'P': + if (PEND) break; + c2 = PPEEK; if (c2 == '{' && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) { @@ -3155,7 +3061,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->type = TK_CHAR_PROPERTY; tok->u.prop.not = (c == 'P' ? 
1 : 0); - if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { + if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { PFETCH(c2); if (c2 == '^') { tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0); @@ -3178,10 +3084,10 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc); if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; if (!PEND) { - c2 = PPEEK; - if (ONIGENC_IS_CODE_XDIGIT(enc, c2)) - return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; - } + c2 = PPEEK; + if (ONIGENC_IS_CODE_XDIGIT(enc, c2)) + return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + } if (p > prev + enclen(enc, prev, end) && !PEND && (PPEEK_IS('}'))) { PINC; @@ -3223,6 +3129,33 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } break; + case 'o': + if (PEND) break; + + prev = p; + if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) { + PINC; + num = scan_unsigned_octal_number(&p, end, 11, enc); + if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; + if (!PEND) { + c2 = PPEEK; + if (ONIGENC_IS_CODE_DIGIT(enc, c2) && c2 < '8') + return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + } + + if (p > prev + enclen(enc, prev, end) && !PEND && (PPEEK_IS('}'))) { + PINC; + tok->type = TK_CODE_POINT; + tok->base = 8; + tok->u.code = (OnigCodePoint )num; + } + else { + /* can't read nothing or invalid format */ + p = prev; + } + } + break; + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { @@ -3241,10 +3174,10 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) default: PUNFETCH; - num = fetch_escaped_value(&p, end, env); + num = fetch_escaped_value(&p, end, env, &c2); if (num < 0) return num; - if (tok->u.c != num) { - tok->u.code = (OnigCodePoint )num; + if ((OnigCodePoint)tok->u.c != c2) { + tok->u.code = (OnigCodePoint )c2; tok->type = TK_CODE_POINT; } break; @@ -3302,15 
+3235,15 @@ fetch_named_backref_token(OnigCodePoint c, OnigToken* tok, UChar** src, prev = p; -#ifdef USE_BACKREF_WITH_LEVEL +# ifdef USE_BACKREF_WITH_LEVEL name_end = NULL_UCHARP; /* no need. escape gcc warning. */ r = fetch_name_with_level(c, &p, end, &name_end, env, &back_num, &tok->u.backref.level); if (r == 1) tok->u.backref.exist_level = 1; else tok->u.backref.exist_level = 0; -#else +# else r = fetch_name(&p, end, &name_end, env, &back_num, 1); -#endif +# endif if (r < 0) return r; if (back_num != 0) { @@ -3348,7 +3281,7 @@ fetch_named_backref_token(OnigCodePoint c, OnigToken* tok, UChar** src, tok->type = TK_BACKREF; tok->u.backref.by_name = 1; - if (num == 1) { + if (num == 1 || IS_SYNTAX_BV(syn, ONIG_SYN_USE_LEFT_MOST_NAMED_GROUP)) { tok->u.backref.num = 1; tok->u.backref.ref1 = backs[0]; } @@ -3601,9 +3534,9 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) num = scan_unsigned_hexadecimal_number(&p, end, 0, 8, enc); if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; if (!PEND) { - if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK)) - return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; - } + if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK)) + return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + } if ((p > prev + enclen(enc, prev, end)) && !PEND && PPEEK_IS('}')) { PINC; @@ -3644,13 +3577,39 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) } break; + case 'o': + if (PEND) break; + + prev = p; + if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_O_BRACE_OCTAL)) { + PINC; + num = scan_unsigned_octal_number(&p, end, 11, enc); + if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; + if (!PEND) { + OnigCodePoint c = PPEEK; + if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') + return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + } + + if ((p > prev + enclen(enc, prev, end)) && !PEND && PPEEK_IS('}')) { + PINC; + tok->type = TK_CODE_POINT; + tok->u.code = (OnigCodePoint )num; + } + else { + /* can't read nothing or invalid format */ + p = prev; + } + } + break; + case '1': 
case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': PUNFETCH; prev = p; num = onig_scan_unsigned_number(&p, end, enc); if (num < 0 || num > ONIG_MAX_BACKREF_NUM) { - goto skip_backref; + goto skip_backref; } if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) && @@ -3698,7 +3657,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) #ifdef USE_NAMED_GROUP case 'k': - if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) { + if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) { PFETCH(c); if (c == '<' || c == '\'') { r = fetch_named_backref_token(c, tok, &p, end, env); @@ -3714,8 +3673,8 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) #if defined(USE_SUBEXP_CALL) || defined(USE_NAMED_GROUP) case 'g': -#ifdef USE_NAMED_GROUP - if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_BRACE_BACKREF)) { +# ifdef USE_NAMED_GROUP + if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_BRACE_BACKREF)) { PFETCH(c); if (c == '{') { r = fetch_named_backref_token(c, tok, &p, end, env); @@ -3724,9 +3683,9 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) else PUNFETCH; } -#endif -#ifdef USE_SUBEXP_CALL - if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) { +# endif +# ifdef USE_SUBEXP_CALL + if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) { PFETCH(c); if (c == '<' || c == '\'') { int gnum = -1, rel = 0; @@ -3763,7 +3722,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) PUNFETCH; } } -#endif +# endif break; #endif @@ -3781,7 +3740,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->type = TK_CHAR_PROPERTY; tok->u.prop.not = (c == 'P' ? 1 : 0); - if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { + if (!PEND && IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) { PFETCH(c); if (c == '^') { tok->u.prop.not = (tok->u.prop.not == 0 ? 
1 : 0); @@ -3814,16 +3773,20 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) break; default: - PUNFETCH; - num = fetch_escaped_value(&p, end, env); - if (num < 0) return num; - /* set_raw: */ - if (tok->u.c != num) { - tok->type = TK_CODE_POINT; - tok->u.code = (OnigCodePoint )num; - } - else { /* string */ - p = tok->backp + enclen(enc, tok->backp, end); + { + OnigCodePoint c2; + + PUNFETCH; + num = fetch_escaped_value(&p, end, env, &c2); + if (num < 0) return num; + /* set_raw: */ + if ((OnigCodePoint)tok->u.c != c2) { + tok->type = TK_CODE_POINT; + tok->u.code = (OnigCodePoint )c2; + } + else { /* string */ + p = tok->backp + enclen(enc, tok->backp, end); + } } break; } @@ -3913,22 +3876,22 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) case '(': if (PPEEK_IS('?') && - IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { - PINC; - if (PPEEK_IS('#')) { - PFETCH(c); - while (1) { - if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; - PFETCH(c); - if (c == MC_ESC(syn)) { - if (!PEND) PFETCH(c); - } - else { - if (c == ')') break; - } - } - goto start; - } + IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { + PINC; + if (PPEEK_IS('#')) { + PFETCH(c); + while (1) { + if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; + PFETCH(c); + if (c == MC_ESC(syn)) { + if (!PEND) PFETCH(c); + } + else { + if (c == ')') break; + } + } + goto start; + } #ifdef USE_PERL_SUBEXP_CALL /* (?&name), (?n), (?R), (?0), (?+n), (?-n) */ c = PPEEK; @@ -3999,6 +3962,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) PFETCH_READY; PINC; /* skip 'P' */ + if (PEND) return ONIGERR_UNDEFINED_GROUP_OPTION; PFETCH(c); if (c == '=') { /* (?P=name): backref */ r = fetch_named_backref_token((OnigCodePoint )'(', tok, &p, end, env); @@ -4017,10 +3981,9 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) tok->u.call.rel = 0; break; } - PUNFETCH; } #endif /* USE_CAPITAL_P_NAMED_GROUP */ - PUNFETCH; + PUNFETCH; } if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; @@ -4098,8 +4061,8 @@ add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not, if (not == 0) { for (i = 0; i < n; i++) { - for (j = ONIGENC_CODE_RANGE_FROM(mbr, i); - j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) { + for (j = ONIGENC_CODE_RANGE_FROM(mbr, i); + j <= ONIGENC_CODE_RANGE_TO(mbr, i); j++) { if (j >= sb_out) { if (j > ONIGENC_CODE_RANGE_FROM(mbr, i)) { r = add_code_range_to_buf(&(cc->mbuf), env, j, @@ -4110,7 +4073,7 @@ add_ctype_to_cc_by_range(CClassNode* cc, int ctype ARG_UNUSED, int not, goto sb_end; } - BITSET_SET_BIT_CHKDUP(cc->bs, j); + BITSET_SET_BIT_CHKDUP(cc->bs, j); } } @@ -4183,12 +4146,15 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, int ascii_range, ScanEnv* en CClassNode ccascii; initialize_cclass(&ccascii); if (ONIGENC_MBC_MINLEN(env->enc) > 1) { - add_code_range(&(ccascii.mbuf), env, 0x00, 0x7F); + r = add_code_range(&(ccascii.mbuf), env, 0x00, 0x7F); } else { bitset_set_range(env, ccascii.bs, 0x00, 0x7F); + r = 0; + } + if (r == 0) { + r = and_cclass(&ccwork, &ccascii, env); } - r = and_cclass(&ccwork, &ccascii, env); if (IS_NOT_NULL(ccascii.mbuf)) bbuf_free(ccascii.mbuf); } if (r == 0) { @@ -4244,7 +4210,7 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, int ascii_range, ScanEnv* en BITSET_SET_BIT_CHKDUP(cc->bs, c); } if (ascii_range) - ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } else { for (c = 0; c < maxcode; c++) { @@ -4252,7 +4218,7 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, int ascii_range, ScanEnv* en BITSET_SET_BIT_CHKDUP(cc->bs, c); } if (! ascii_range) - ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } break; @@ -4262,16 +4228,16 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, int ascii_range, ScanEnv* en if (ONIGENC_IS_CODE_WORD(enc, c)) BITSET_SET_BIT_CHKDUP(cc->bs, c); } if (! 
ascii_range) - ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } else { for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */ + if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) /* check invalid code point */ && (! ONIGENC_IS_CODE_WORD(enc, c) || c >= maxcode)) BITSET_SET_BIT_CHKDUP(cc->bs, c); } if (ascii_range) - ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); } break; @@ -4330,7 +4296,7 @@ parse_posix_bracket(CClassNode* cc, CClassNode* asc_cc, if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) { p = (UChar* )onigenc_step(enc, p, end, pb->len); if (onigenc_with_ascii_strncmp(enc, p, end, (UChar* )":]", 2) != 0) - return ONIGERR_INVALID_POSIX_BRACKET_TYPE; + return ONIGERR_INVALID_POSIX_BRACKET_TYPE; r = add_ctype_to_cc(cc, pb->ctype, not, ascii_range, env); if (r != 0) return r; @@ -4361,7 +4327,7 @@ parse_posix_bracket(CClassNode* cc, CClassNode* asc_cc, if (! 
PEND) { PFETCH_S(c); if (c == ']') - return ONIGERR_INVALID_POSIX_BRACKET_TYPE; + return ONIGERR_INVALID_POSIX_BRACKET_TYPE; } } @@ -4441,7 +4407,7 @@ enum CCVALTYPE { static int next_state_class(CClassNode* cc, CClassNode* asc_cc, - OnigCodePoint* vs, enum CCVALTYPE* type, + OnigCodePoint* vs, enum CCVALTYPE* type, enum CCSTATE* state, ScanEnv* env) { int r; @@ -4499,8 +4465,8 @@ next_state_val(CClassNode* cc, CClassNode* asc_cc, case CCS_RANGE: if (intype == *type) { if (intype == CCV_SB) { - if (*vs > 0xff || v > 0xff) - return ONIGERR_INVALID_CODE_POINT_VALUE; + if (*vs > 0xff || v > 0xff) + return ONIGERR_INVALID_CODE_POINT_VALUE; if (*vs > v) { if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) @@ -4604,6 +4570,9 @@ parse_char_class(Node** np, Node** asc_np, OnigToken* tok, UChar** src, UChar* e enum CCVALTYPE val_type, in_type; int val_israw, in_israw; + env->parse_depth++; + if (env->parse_depth > ParseDepthLimit) + return ONIGERR_PARSE_DEPTH_LIMIT_OVER; prev_cc = asc_prev_cc = (CClassNode* )NULL; *np = *asc_np = NULL_NODE; r = fetch_token_in_cc(tok, src, end, env); @@ -4687,7 +4656,7 @@ parse_char_class(Node** np, Node** asc_np, OnigToken* tok, UChar** src, UChar* e goto err; } - len = enclen(env->enc, buf, buf+i); + len = enclen(env->enc, buf, buf + i); if (i < len) { r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; goto err; @@ -4695,7 +4664,8 @@ parse_char_class(Node** np, Node** asc_np, OnigToken* tok, UChar** src, UChar* e else if (i > len) { /* fetch back */ p = psave; for (i = 1; i < len; i++) { - r = fetch_token_in_cc(tok, &p, end, env); + (void)fetch_token_in_cc(tok, &p, end, env); + /* no need to check the retun value (already checked above) */ } fetched = 0; } @@ -4948,16 +4918,17 @@ parse_char_class(Node** np, Node** asc_np, OnigToken* tok, UChar** src, UChar* e #define NEWLINE_CODE 0x0a if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) { - if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1) - BITSET_SET_BIT_CHKDUP(cc->bs, 
NEWLINE_CODE); - else { - r = add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE); - if (r < 0) goto err; - } + if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1) + BITSET_SET_BIT_CHKDUP(cc->bs, NEWLINE_CODE); + else { + r = add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE); + if (r < 0) goto err; + } } } } *src = p; + env->parse_depth--; return 0; err: @@ -5028,18 +4999,20 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, return ONIGERR_UNDEFINED_GROUP_OPTION; break; -#ifdef USE_CAPITAL_P_NAMED_GROUP +# ifdef USE_CAPITAL_P_NAMED_GROUP case 'P': /* (?P...) */ - if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) { + if (!PEND && + IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_CAPITAL_P_NAMED_GROUP)) { PFETCH(c); if (c == '<') goto named_group1; } return ONIGERR_UNDEFINED_GROUP_OPTION; break; -#endif +# endif #endif case '<': /* look behind (?<=...), (?syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) { #ifdef USE_NAMED_GROUP - if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { + if (!PEND && + IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { PFETCH(c); if (c == '<' || c == '\'') { list_capture = 1; @@ -5115,7 +5089,8 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, break; case '(': /* conditional expression: (?(cond)yes), (?(cond)yes|no) */ - if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LPAREN_CONDITION)) { + if (!PEND && + IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LPAREN_CONDITION)) { UChar *name = NULL; UChar *name_end; PFETCH(c); @@ -5133,36 +5108,29 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, #endif if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { if (num > env->num_mem || - IS_NULL(SCANENV_MEM_NODES(env)[num])) + IS_NULL(SCANENV_MEM_NODES(env)[num])) return ONIGERR_INVALID_BACKREF; } } #ifdef USE_NAMED_GROUP else if (c == '<' || c == '\'') { /* (), ('name') */ - int 
nums; - int *backs; - name = p; - r = fetch_name((OnigCodePoint )c, &p, end, &name_end, env, &num, 0); + r = fetch_named_backref_token(c, tok, &p, end, env); if (r < 0) return r; - PFETCH(c); - if (c != ')') return ONIGERR_UNDEFINED_GROUP_OPTION; + if (!PPEEK_IS(')')) return ONIGERR_UNDEFINED_GROUP_OPTION; + PINC; - nums = onig_name_to_group_numbers(env->reg, name, name_end, &backs); - if (nums <= 0) { - onig_scan_env_set_error_string(env, - ONIGERR_UNDEFINED_NAME_REFERENCE, name, name_end); - return ONIGERR_UNDEFINED_NAME_REFERENCE; + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_USE_LEFT_MOST_NAMED_GROUP)) { + num = tok->u.backref.ref1; } - if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_STRICT_CHECK_BACKREF)) { - int i; - for (i = 0; i < nums; i++) { - if (backs[i] > env->num_mem || - IS_NULL(SCANENV_MEM_NODES(env)[backs[i]])) - return ONIGERR_INVALID_BACKREF; - } + else { + /* FIXME: + * Use left most named group for now. This is the same as Perl. + * However this should use the same strategy as normal back- + * references on Ruby syntax; search right to left. */ + int len = tok->u.backref.num; + num = len > 1 ? 
tok->u.backref.refs[0] : tok->u.backref.ref1; } - num = backs[0]; /* XXX: use left most named group as Perl */ } #endif else @@ -5187,7 +5155,7 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, #endif case '^': /* loads default options */ - if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { + if (!PEND && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { /* d-imsx */ ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); ONOFF(option, ONIG_OPTION_IGNORECASE, 1); @@ -5197,7 +5165,7 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, PFETCH(c); } #if 0 - else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) { + else if (!PEND && IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) { /* d-imx */ ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0); ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0); @@ -5255,8 +5223,8 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, case 'a': /* limits \d, \s, \w and POSIX brackets to ASCII range */ if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) || - IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) && - (neg == 0)) { + IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) && + (neg == 0)) { ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0); ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1); ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1); @@ -5267,8 +5235,8 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, case 'u': if ((IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) || - IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) && - (neg == 0)) { + IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) && + (neg == 0)) { ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 1); ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 1); @@ -5279,11 +5247,11 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, case 'd': if 
(IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL) && - (neg == 0)) { + (neg == 0)) { ONOFF(option, ONIG_OPTION_ASCII_RANGE, 1); } else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY) && - (neg == 0)) { + (neg == 0)) { ONOFF(option, ONIG_OPTION_ASCII_RANGE, 0); ONOFF(option, ONIG_OPTION_POSIX_BRACKET_ALL_RANGE, 0); ONOFF(option, ONIG_OPTION_WORD_BOUND_ALL_RANGE, 0); @@ -5313,9 +5281,12 @@ parse_enclose(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, else if (c == ':') { OnigOptionType prev = env->option; - env->option = option; + env->option = option; r = fetch_token(tok, &p, end, env); - if (r < 0) return r; + if (r < 0) { + env->option = prev; + return r; + } r = parse_subexp(&target, tok, term, &p, end, env); env->option = prev; if (r < 0) return r; @@ -5430,29 +5401,29 @@ set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) int targetq_num = popular_quantifier_num(qnt); #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR - if (!IS_QUANTIFIER_BY_NUMBER(qn) && !IS_QUANTIFIER_BY_NUMBER(qnt) && + if (nestq_num >= 0 && targetq_num >= 0 && IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) { - switch (ReduceTypeTable[targetq_num][nestq_num]) { - case RQ_ASIS: - break; - - case RQ_DEL: - if (onig_warn != onig_null_warn) { - onig_syntax_warn(env, "regular expression has redundant nested repeat operator '%s'", - PopularQStr[targetq_num]); - } - goto warn_exit; - break; - - default: - if (onig_warn != onig_null_warn) { - onig_syntax_warn(env, "nested repeat operator '%s' and '%s' was replaced with '%s' in regular expression", - PopularQStr[targetq_num], PopularQStr[nestq_num], - ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]); - } - goto warn_exit; - break; - } + switch (ReduceTypeTable[targetq_num][nestq_num]) { + case RQ_ASIS: + break; + + case RQ_DEL: + if (onig_warn != onig_null_warn) { + onig_syntax_warn(env, "regular expression has redundant nested repeat operator '%s'", + PopularQStr[targetq_num]); + } + 
goto warn_exit; + break; + + default: + if (onig_warn != onig_null_warn) { + onig_syntax_warn(env, "nested repeat operator '%s' and '%s' was replaced with '%s' in regular expression", + PopularQStr[targetq_num], PopularQStr[nestq_num], + ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]); + } + goto warn_exit; + break; + } } warn_exit: @@ -5482,85 +5453,6 @@ set_quantifier(Node* qnode, Node* target, int group, ScanEnv* env) } -#ifdef USE_SHARED_CCLASS_TABLE - -#define THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS 8 - -/* for ctype node hash table */ - -typedef struct { - OnigEncoding enc; - int not; - int type; -} type_cclass_key; - -static int type_cclass_cmp(type_cclass_key* x, type_cclass_key* y) -{ - if (x->type != y->type) return 1; - if (x->enc != y->enc) return 1; - if (x->not != y->not) return 1; - return 0; -} - -static st_index_t type_cclass_hash(type_cclass_key* key) -{ - int i, val; - UChar *p; - - val = 0; - - p = (UChar* )&(key->enc); - for (i = 0; i < (int )sizeof(key->enc); i++) { - val = val * 997 + (int )*p++; - } - - p = (UChar* )(&key->type); - for (i = 0; i < (int )sizeof(key->type); i++) { - val = val * 997 + (int )*p++; - } - - val += key->not; - return val + (val >> 5); -} - -static const struct st_hash_type type_type_cclass_hash = { - type_cclass_cmp, - type_cclass_hash, -}; - -static st_table* OnigTypeCClassTable; - - -static int -i_free_shared_class(type_cclass_key* key, Node* node, void* arg ARG_UNUSED) -{ - if (IS_NOT_NULL(node)) { - CClassNode* cc = NCCLASS(node); - if (IS_NOT_NULL(cc->mbuf)) xfree(cc->mbuf); - xfree(node); - } - - if (IS_NOT_NULL(key)) xfree(key); - return ST_DELETE; -} - -extern int -onig_free_shared_cclass_table(void) -{ - /* THREAD_ATOMIC_START; */ - if (IS_NOT_NULL(OnigTypeCClassTable)) { - onig_st_foreach(OnigTypeCClassTable, i_free_shared_class, 0); - onig_st_free_table(OnigTypeCClassTable); - OnigTypeCClassTable = NULL; - } - /* THREAD_ATOMIC_END; */ - - return 0; -} - -#endif /* USE_SHARED_CCLASS_TABLE */ - - #ifndef 
CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS static int clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc) @@ -5603,7 +5495,7 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], CClassNode* cc; CClassNode* asc_cc; BitSetRef bs; - int add_flag; + int add_flag, r; iarg = (IApplyCaseFoldArg* )arg; env = iarg->env; @@ -5630,7 +5522,8 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], (is_in == 0 && IS_NCCLASS_NOT(cc))) { if (add_flag) { if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { - add_code_range0(&(cc->mbuf), env, *to, *to, 0); + r = add_code_range0(&(cc->mbuf), env, *to, *to, 0); + if (r < 0) return r; } else { BITSET_SET_BIT(bs, *to); @@ -5642,7 +5535,8 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[], if (add_flag) { if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) { if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc); - add_code_range0(&(cc->mbuf), env, *to, *to, 0); + r = add_code_range0(&(cc->mbuf), env, *to, *to, 0); + if (r < 0) return r; } else { if (IS_NCCLASS_NOT(cc)) { @@ -5732,7 +5626,7 @@ node_linebreak(Node** np, ScanEnv* env) Node* target1 = NULL; Node* target2 = NULL; CClassNode* cc; - int num1, num2; + int num1, num2, r; UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2]; /* \x0D\x0A */ @@ -5748,7 +5642,8 @@ node_linebreak(Node** np, ScanEnv* env) if (IS_NULL(right)) goto err; cc = NCCLASS(right); if (ONIGENC_MBC_MINLEN(env->enc) > 1) { - add_code_range(&(cc->mbuf), env, 0x0A, 0x0D); + r = add_code_range(&(cc->mbuf), env, 0x0A, 0x0D); + if (r != 0) goto err; } else { bitset_set_range(env, cc->bs, 0x0A, 0x0D); @@ -5757,8 +5652,10 @@ node_linebreak(Node** np, ScanEnv* env) /* TODO: move this block to enc/unicode.c */ if (ONIGENC_IS_UNICODE(env->enc)) { /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */ - add_code_range(&(cc->mbuf), env, 0x85, 0x85); - add_code_range(&(cc->mbuf), env, 0x2028, 0x2029); + r = add_code_range(&(cc->mbuf), env, 0x85, 0x85); + if (r != 0) goto err; + r = 
add_code_range(&(cc->mbuf), env, 0x2028, 0x2029); + if (r != 0) goto err; } /* ...|... */ @@ -5787,7 +5684,7 @@ node_linebreak(Node** np, ScanEnv* env) static int propname2ctype(ScanEnv* env, const char* propname) { - UChar* name = (UChar*)propname; + UChar* name = (UChar* )propname; int ctype = env->enc->property_name_to_ctype(ONIG_ENCODING_ASCII, name, name + strlen(propname)); return ctype; @@ -5796,21 +5693,23 @@ propname2ctype(ScanEnv* env, const char* propname) static int node_extended_grapheme_cluster(Node** np, ScanEnv* env) { + Node* tmp = NULL; Node* np1 = NULL; Node* list = NULL; Node* list2 = NULL; Node* alt = NULL; Node* alt2 = NULL; + BBuf *pbuf1 = NULL; int r = 0; + int num1; + UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2]; + OnigOptionType option; #ifdef USE_UNICODE_PROPERTIES if (ONIGENC_IS_UNICODE(env->enc)) { /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */ - Node* tmp = NULL; - int num1, num2; - UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN * 2]; CClassNode* cc; - OnigOptionType option; + OnigCodePoint sb_out = (ONIGENC_MBC_MINLEN(env->enc) > 1) ? 
0x00 : 0x80; int extend = propname2ctype(env, "Grapheme_Cluster_Break=Extend"); /* Prepend* @@ -5845,8 +5744,26 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env) cc = NCCLASS(np1); r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=Control"), 1, 0, env); if (r != 0) goto err; - BITSET_CLEAR_BIT(cc->bs, 0x0a); - BITSET_CLEAR_BIT(cc->bs, 0x0d); + if (ONIGENC_MBC_MINLEN(env->enc) > 1) { + BBuf *pbuf2 = NULL; + r = add_code_range(&pbuf1, env, 0x0a, 0x0a); + if (r != 0) goto err; + r = add_code_range(&pbuf1, env, 0x0d, 0x0d); + if (r != 0) goto err; + r = and_code_range_buf(cc->mbuf, 0, pbuf1, 1, &pbuf2, env); + if (r != 0) { + bbuf_free(pbuf2); + goto err; + } + bbuf_free(pbuf1); + pbuf1 = NULL; + bbuf_free(cc->mbuf); + cc->mbuf = pbuf2; + } + else { + BITSET_CLEAR_BIT(cc->bs, 0x0a); + BITSET_CLEAR_BIT(cc->bs, 0x0d); + } tmp = onig_node_new_alt(np1, NULL_NODE); if (IS_NULL(tmp)) goto err; @@ -6134,32 +6051,26 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env) np1 = node_new_cclass(); if (IS_NULL(np1)) goto err; cc = NCCLASS(np1); - r = add_code_range(&(cc->mbuf), env, 0x1F308, 0x1F308); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x1F33E, 0x1F33E); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x1F373, 0x1F373); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x1F393, 0x1F393); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x1F3A4, 0x1F3A4); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x1F3A8, 0x1F3A8); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x1F3EB, 0x1F3EB); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x1F3ED, 0x1F3ED); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x1F4BB, 0x1F4BC); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x1F527, 0x1F527); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x1F52C, 0x1F52C); - if (r != 0) goto err; - r = 
add_code_range(&(cc->mbuf), env, 0x1F680, 0x1F680); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x1F692, 0x1F692); - if (r != 0) goto err; + { + static const OnigCodePoint ranges[] = { + 13, + 0x1F308, 0x1F308, + 0x1F33E, 0x1F33E, + 0x1F373, 0x1F373, + 0x1F393, 0x1F393, + 0x1F3A4, 0x1F3A4, + 0x1F3A8, 0x1F3A8, + 0x1F3EB, 0x1F3EB, + 0x1F3ED, 0x1F3ED, + 0x1F4BB, 0x1F4BC, + 0x1F527, 0x1F527, + 0x1F52C, 0x1F52C, + 0x1F680, 0x1F680, + 0x1F692, 0x1F692, + }; + r = add_ctype_to_cc_by_range(cc, -1, 0, env, sb_out, ranges); + if (r != 0) goto err; + } r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=Glue_After_Zwj"), 0, 0, env); if (r != 0) goto err; @@ -6176,11 +6087,10 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env) /* Emoji variation sequence * http://unicode.org/Public/emoji/4.0/emoji-zwj-sequences.txt */ - np1 = node_new_cclass(); + r = ONIGENC_CODE_TO_MBC(env->enc, 0xfe0f, buf); + if (r < 0) goto err; + np1 = node_new_str_raw(buf, buf + r); if (IS_NULL(np1)) goto err; - cc = NCCLASS(np1); - r = add_code_range(&(cc->mbuf), env, 0xfe0f, 0xfe0f); - if (r != 0) goto err; tmp = node_new_quantifier(0, 1, 0); if (IS_NULL(tmp)) goto err; @@ -6195,14 +6105,17 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env) np1 = node_new_cclass(); if (IS_NULL(np1)) goto err; cc = NCCLASS(np1); - r = add_code_range(&(cc->mbuf), env, 0x2640, 0x2640); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x2642, 0x2642); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x2695, 0x2696); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x2708, 0x2708); - if (r != 0) goto err; + { + static const OnigCodePoint ranges[] = { + 4, + 0x2640, 0x2640, + 0x2642, 0x2642, + 0x2695, 0x2696, + 0x2708, 0x2708, + }; + r = add_ctype_to_cc_by_range(cc, -1, 0, env, sb_out, ranges); + if (r != 0) goto err; + } tmp = node_new_list(np1, list2); if (IS_NULL(tmp)) goto err; @@ -6220,11 +6133,10 @@ 
node_extended_grapheme_cluster(Node** np, ScanEnv* env) alt2 = NULL; /* ZWJ */ - np1 = node_new_cclass(); + r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf); + if (r < 0) goto err; + np1 = node_new_str_raw(buf, buf + r); if (IS_NULL(np1)) goto err; - cc = NCCLASS(np1); - r = add_code_range(&(cc->mbuf), env, 0x200D, 0x200D); - if (r != 0) goto err; tmp = node_new_list(np1, list2); if (IS_NULL(tmp)) goto err; @@ -6280,21 +6192,21 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env) np1 = node_new_cclass(); if (IS_NULL(np1)) goto err; cc = NCCLASS(np1); - r = add_code_range(&(cc->mbuf), env, 0x1F3C2, 0x1F3C2); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x1F3C7, 0x1F3C7); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x1F3CC, 0x1F3CC); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x1F3F3, 0x1F3F3); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x1F441, 0x1F441); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x1F46F, 0x1F46F); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x1F574, 0x1F574); - if (r != 0) goto err; - r = add_code_range(&(cc->mbuf), env, 0x1F6CC, 0x1F6CC); + { + static const OnigCodePoint ranges[] = { + 8, + 0x1F3C2, 0x1F3C2, + 0x1F3C7, 0x1F3C7, + 0x1F3CC, 0x1F3CC, + 0x1F3F3, 0x1F3F3, + 0x1F441, 0x1F441, + 0x1F46F, 0x1F46F, + 0x1F574, 0x1F574, + 0x1F6CC, 0x1F6CC, + }; + r = add_ctype_to_cc_by_range(cc, -1, 0, env, sb_out, ranges); + if (r != 0) goto err; + } r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=E_Base"), 0, 0, env); if (r != 0) goto err; r = add_ctype_to_cc(cc, propname2ctype(env, "Grapheme_Cluster_Break=E_Base_GAZ"), 0, 0, env); @@ -6344,11 +6256,10 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env) list2 = tmp; np1 = NULL; - np1 = node_new_cclass(); + r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf); + if (r < 0) goto err; + np1 = node_new_str_raw(buf, buf + r); if (IS_NULL(np1)) goto err; - cc = 
NCCLASS(np1); - r = add_code_range(&(cc->mbuf), env, 0x200D, 0x200D); - if (r != 0) goto err; tmp = node_new_list(np1, list2); if (IS_NULL(tmp)) goto err; @@ -6421,11 +6332,10 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env) np1 = NULL; /* Prepend+ */ - np1 = node_new_cclass(); + r = ONIGENC_CODE_TO_MBC(env->enc, 0x200D, buf); + if (r < 0) goto err; + np1 = node_new_str_raw(buf, buf + r); if (IS_NULL(np1)) goto err; - cc = NCCLASS(np1); - r = add_code_range(&(cc->mbuf), env, 0x200D, 0x200D); - if (r != 0) goto err; tmp = node_new_quantifier(0, 1, 0); if (IS_NULL(tmp)) goto err; @@ -6462,39 +6372,60 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env) if (IS_NULL(tmp)) goto err; alt = tmp; list = NULL; - - /* \x0D\x0A */ - num1 = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf); - if (num1 < 0) return num1; - num2 = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1); - if (num2 < 0) return num2; - np1 = node_new_str_raw(buf, buf + num1 + num2); - if (IS_NULL(np1)) goto err; - - tmp = onig_node_new_alt(np1, alt); - if (IS_NULL(tmp)) goto err; - alt = tmp; - np1 = NULL; - - /* (?>...) */ - *np = node_new_enclose(ENCLOSE_STOP_BACKTRACK); - if (IS_NULL(*np)) goto err; - NENCLOSE(*np)->target = alt; - return ONIG_NORMAL; } + else #endif /* USE_UNICODE_PROPERTIES */ - if (IS_NULL(*np)) { + { /* PerlSyntax: (?s:.), RubySyntax: (?m:.) 
*/ - OnigOptionType option; np1 = node_new_anychar(); if (IS_NULL(np1)) goto err; option = env->option; ONOFF(option, ONIG_OPTION_MULTILINE, 0); + tmp = node_new_option(option); + if (IS_NULL(tmp)) goto err; + NENCLOSE(tmp)->target = np1; + np1 = tmp; + + alt = onig_node_new_alt(np1, NULL_NODE); + if (IS_NULL(alt)) goto err; + np1 = NULL; + } + + /* \x0D\x0A */ + r = ONIGENC_CODE_TO_MBC(env->enc, 0x0D, buf); + if (r < 0) goto err; + num1 = r; + r = ONIGENC_CODE_TO_MBC(env->enc, 0x0A, buf + num1); + if (r < 0) goto err; + np1 = node_new_str_raw(buf, buf + num1 + r); + if (IS_NULL(np1)) goto err; + + tmp = onig_node_new_alt(np1, alt); + if (IS_NULL(tmp)) goto err; + alt = tmp; + np1 = NULL; + + /* (?>\x0D\x0A|...) */ + tmp = node_new_enclose(ENCLOSE_STOP_BACKTRACK); + if (IS_NULL(tmp)) goto err; + NENCLOSE(tmp)->target = alt; + np1 = tmp; + +#ifdef USE_UNICODE_PROPERTIES + if (ONIGENC_IS_UNICODE(env->enc)) { + /* Don't ignore case. */ + option = env->option; + ONOFF(option, ONIG_OPTION_IGNORECASE, 1); *np = node_new_option(option); if (IS_NULL(*np)) goto err; NENCLOSE(*np)->target = np1; } + else +#endif + { + *np = np1; + } return ONIG_NORMAL; err: @@ -6503,6 +6434,7 @@ node_extended_grapheme_cluster(Node** np, ScanEnv* env) onig_node_free(list2); onig_node_free(alt); onig_node_free(alt2); + bbuf_free(pbuf1); return (r == 0) ? ONIGERR_MEMORY : r; } @@ -6535,7 +6467,7 @@ is_onechar_cclass(CClassNode* cc, OnigCodePoint* code) /* only one char found in the bbuf, save the code point. 
*/ c = data[0]; if (((c < SINGLE_BYTE_SIZE) && BITSET_AT(cc->bs, c))) { - /* skip if c is included in the bitset */ + /* skip if c is included in the bitset */ c = not_found; } } @@ -6549,9 +6481,9 @@ is_onechar_cclass(CClassNode* cc, OnigCodePoint* code) Bits b1 = cc->bs[i]; if (b1 != 0) { if (((b1 & (b1 - 1)) == 0) && (c == not_found)) { - c = BITS_IN_ROOM * i + countbits(b1 - 1); + c = BITS_IN_ROOM * i + countbits(b1 - 1); } else { - return 0; /* the character class contains multiple chars */ + return 0; /* the character class contains multiple chars */ } } } @@ -6596,7 +6528,10 @@ parse_exp(Node** np, OnigToken* tok, int term, env->option = NENCLOSE(*np)->option; r = fetch_token(tok, src, end, env); - if (r < 0) return r; + if (r < 0) { + env->option = prev; + return r; + } r = parse_subexp(&target, tok, term, src, end, env); env->option = prev; if (r < 0) { @@ -6749,69 +6684,13 @@ parse_exp(Node** np, OnigToken* tok, int term, { CClassNode* cc; -#ifdef USE_SHARED_CCLASS_TABLE - const OnigCodePoint *mbr; - OnigCodePoint sb_out; - - r = ONIGENC_GET_CTYPE_CODE_RANGE(env->enc, tok->u.prop.ctype, - &sb_out, &mbr); - if (r == 0 && - ! 
IS_ASCII_RANGE(env->option) && - ONIGENC_CODE_RANGE_NUM(mbr) - >= THRESHOLD_RANGE_NUM_FOR_SHARE_CCLASS) { - type_cclass_key key; - type_cclass_key* new_key; - - key.enc = env->enc; - key.not = tok->u.prop.not; - key.type = tok->u.prop.ctype; - - THREAD_ATOMIC_START; - - if (IS_NULL(OnigTypeCClassTable)) { - OnigTypeCClassTable - = onig_st_init_table_with_size(&type_type_cclass_hash, 10); - if (IS_NULL(OnigTypeCClassTable)) { - THREAD_ATOMIC_END; - return ONIGERR_MEMORY; - } - } - else { - if (onig_st_lookup(OnigTypeCClassTable, (st_data_t )&key, - (st_data_t* )np)) { - THREAD_ATOMIC_END; - break; - } - } - - *np = node_new_cclass_by_codepoint_range(tok->u.prop.not, - sb_out, mbr); - if (IS_NULL(*np)) { - THREAD_ATOMIC_END; - return ONIGERR_MEMORY; - } - - cc = NCCLASS(*np); - NCCLASS_SET_SHARE(cc); - new_key = (type_cclass_key* )xmalloc(sizeof(type_cclass_key)); - xmemcpy(new_key, &key, sizeof(type_cclass_key)); - onig_st_add_direct(OnigTypeCClassTable, (st_data_t )new_key, - (st_data_t )*np); - - THREAD_ATOMIC_END; - } - else { -#endif - *np = node_new_cclass(); - CHECK_NULL_RETURN_MEMERR(*np); - cc = NCCLASS(*np); - r = add_ctype_to_cc(cc, tok->u.prop.ctype, 0, - IS_ASCII_RANGE(env->option), env); - if (r != 0) return r; - if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); -#ifdef USE_SHARED_CCLASS_TABLE - } -#endif + *np = node_new_cclass(); + CHECK_NULL_RETURN_MEMERR(*np); + cc = NCCLASS(*np); + r = add_ctype_to_cc(cc, tok->u.prop.ctype, 0, + IS_ASCII_RANGE(env->option), env); + if (r != 0) return r; + if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc); } break; @@ -7041,6 +6920,9 @@ parse_subexp(Node** top, OnigToken* tok, int term, Node *node, **headp; *top = NULL; + env->parse_depth++; + if (env->parse_depth > ParseDepthLimit) + return ONIGERR_PARSE_DEPTH_LIMIT_OVER; r = parse_branch(&node, tok, term, src, end, env); if (r < 0) { onig_node_free(node); @@ -7078,6 +6960,7 @@ parse_subexp(Node** top, OnigToken* tok, int term, return ONIGERR_PARSER_BUG; } + 
env->parse_depth--; return r; } diff --git a/regparse.h b/regparse.h index caf0790b1c..111a840b84 100644 --- a/regparse.h +++ b/regparse.h @@ -1,11 +1,11 @@ -#ifndef ONIGURUMA_REGPARSE_H -#define ONIGURUMA_REGPARSE_H +#ifndef ONIGMO_REGPARSE_H +#define ONIGMO_REGPARSE_H /********************************************************************** regparse.h - Onigmo (Oniguruma-mod) (regular expression library) **********************************************************************/ /*- * Copyright (c) 2002-2007 K.Kosako - * Copyright (c) 2011 K.Takata + * Copyright (c) 2011-2016 K.Takata * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -317,9 +317,12 @@ typedef struct { int curr_max_regnum; int has_recursion; #endif + unsigned int parse_depth; int warnings_flag; +#ifdef RUBY const char* sourcefile; int sourceline; +#endif } ScanEnv; @@ -332,36 +335,35 @@ typedef struct { int new_val; } GroupNumRemap; -extern int onig_renumber_name_table P_((regex_t* reg, GroupNumRemap* map)); +extern int onig_renumber_name_table(regex_t* reg, GroupNumRemap* map); #endif -extern int onig_strncmp P_((const UChar* s1, const UChar* s2, int n)); -extern void onig_strcpy P_((UChar* dest, const UChar* src, const UChar* end)); -extern void onig_scan_env_set_error_string P_((ScanEnv* env, int ecode, UChar* arg, UChar* arg_end)); -extern int onig_scan_unsigned_number P_((UChar** src, const UChar* end, OnigEncoding enc)); -extern void onig_reduce_nested_quantifier P_((Node* pnode, Node* cnode)); -extern void onig_node_conv_to_str_node P_((Node* node, int raw)); -extern int onig_node_str_cat P_((Node* node, const UChar* s, const UChar* end)); -extern int onig_node_str_set P_((Node* node, const UChar* s, const UChar* end)); -extern void onig_node_free P_((Node* node)); -extern Node* onig_node_new_enclose P_((int type)); -extern Node* onig_node_new_anchor P_((int type)); -extern Node* onig_node_new_str P_((const UChar* s, const UChar* end)); -extern Node* 
onig_node_new_list P_((Node* left, Node* right)); -extern Node* onig_node_list_add P_((Node* list, Node* x)); -extern Node* onig_node_new_alt P_((Node* left, Node* right)); -extern void onig_node_str_clear P_((Node* node)); -extern int onig_free_node_list P_((void)); -extern int onig_names_free P_((regex_t* reg)); -extern int onig_parse_make_tree P_((Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ScanEnv* env)); -extern int onig_free_shared_cclass_table P_((void)); +extern int onig_strncmp(const UChar* s1, const UChar* s2, int n); +extern void onig_strcpy(UChar* dest, const UChar* src, const UChar* end); +extern void onig_scan_env_set_error_string(ScanEnv* env, int ecode, UChar* arg, UChar* arg_end); +extern int onig_scan_unsigned_number(UChar** src, const UChar* end, OnigEncoding enc); +extern void onig_reduce_nested_quantifier(Node* pnode, Node* cnode); +extern void onig_node_conv_to_str_node(Node* node, int raw); +extern int onig_node_str_cat(Node* node, const UChar* s, const UChar* end); +extern int onig_node_str_set(Node* node, const UChar* s, const UChar* end); +extern void onig_node_free(Node* node); +extern Node* onig_node_new_enclose(int type); +extern Node* onig_node_new_anchor(int type); +extern Node* onig_node_new_str(const UChar* s, const UChar* end); +extern Node* onig_node_new_list(Node* left, Node* right); +extern Node* onig_node_list_add(Node* list, Node* x); +extern Node* onig_node_new_alt(Node* left, Node* right); +extern void onig_node_str_clear(Node* node); +extern int onig_names_free(regex_t* reg); +extern int onig_parse_make_tree(Node** root, const UChar* pattern, const UChar* end, regex_t* reg, ScanEnv* env); +extern int onig_free_shared_cclass_table(void); #ifdef ONIG_DEBUG -#ifdef USE_NAMED_GROUP +# ifdef USE_NAMED_GROUP extern int onig_print_names(FILE*, regex_t*); -#endif +# endif #endif RUBY_SYMBOL_EXPORT_END -#endif /* ONIGURUMA_REGPARSE_H */ +#endif /* ONIGMO_REGPARSE_H */ diff --git a/regsyntax.c b/regsyntax.c 
index 7cb98f2d46..657ffcd0f3 100644 --- a/regsyntax.c +++ b/regsyntax.c @@ -3,7 +3,7 @@ **********************************************************************/ /*- * Copyright (c) 2002-2006 K.Kosako - * Copyright (c) 2011-2012 K.Takata + * Copyright (c) 2011-2016 K.Takata * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -229,7 +229,7 @@ const OnigSyntaxType OnigSyntaxPerl = { (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS | - ONIG_SYN_OP_ESC_C_CONTROL ) + ONIG_SYN_OP_ESC_O_BRACE_OCTAL | ONIG_SYN_OP_ESC_C_CONTROL ) & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) , ( ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE | ONIG_SYN_OP2_QMARK_GROUP_EFFECT | ONIG_SYN_OP2_OPTION_PERL | @@ -248,7 +248,8 @@ const OnigSyntaxType OnigSyntaxPerl = { ONIG_SYN_OP2_ESC_K_NAMED_BACKREF ) , ( SYN_GNU_REGEX_BV | ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME | - ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME_CALL ) + ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME_CALL | + ONIG_SYN_USE_LEFT_MOST_NAMED_GROUP ) , ( ONIG_OPTION_SINGLELINE | ONIG_OPTION_CAPTURE_GROUP ) , { @@ -332,25 +333,25 @@ onig_set_syntax_options(OnigSyntaxType* syntax, OnigOptionType options) } extern unsigned int -onig_get_syntax_op(OnigSyntaxType* syntax) +onig_get_syntax_op(const OnigSyntaxType* syntax) { return syntax->op; } extern unsigned int -onig_get_syntax_op2(OnigSyntaxType* syntax) +onig_get_syntax_op2(const OnigSyntaxType* syntax) { return syntax->op2; } extern unsigned int -onig_get_syntax_behavior(OnigSyntaxType* syntax) +onig_get_syntax_behavior(const OnigSyntaxType* syntax) { return syntax->behavior; } extern OnigOptionType -onig_get_syntax_options(OnigSyntaxType* syntax) +onig_get_syntax_options(const OnigSyntaxType* syntax) { return syntax->options; } diff --git a/template/encdb.h.tmpl b/template/encdb.h.tmpl index 9cbb1f0083..9de29bebde 100644 --- a/template/encdb.h.tmpl +++ 
b/template/encdb.h.tmpl @@ -41,7 +41,8 @@ encdirs.each do |encdir| open(File.join(encdir,fn)) do |f| name = nil f.each_line do |line| - if (/^OnigEncodingDefine/ =~ line)..(/"(.*?)"/ =~ line) + if (/^#ifndef RUBY/ =~ line)..(/^#endif/ =~ line) + elsif (/^OnigEncodingDefine/ =~ line)..(/"(.*?)"/ =~ line) if $1 if name lines << %[ENC_SET_BASE("#$1", "#{name}");] diff --git a/tool/enc-unicode.rb b/tool/enc-unicode.rb index feb94c3e61..eddd39747a 100755 --- a/tool/enc-unicode.rb +++ b/tool/enc-unicode.rb @@ -20,7 +20,7 @@ end $unicode_version = File.basename(ARGV[0])[/\A[.\d]+\z/] -POSIX_NAMES = %w[NEWLINE Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit Word Alnum ASCII] +POSIX_NAMES = %w[NEWLINE Alpha Blank Cntrl Digit Graph Lower Print XPosixPunct Space Upper XDigit Word Alnum ASCII Punct] def pair_codepoints(codepoints) @@ -115,6 +115,7 @@ def define_posix_props(data) data['Upper'] = data['Uppercase'] data['Lower'] = data['Lowercase'] data['Punct'] = data['Punctuation'] + data['XPosixPunct'] = data['Punctuation'] + [0x24, 0x2b, 0x3c, 0x3d, 0x3e, 0x5e, 0x60, 0x7c, 0x7e] data['Digit'] = data['Decimal_Number'] data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a + (0x0061..0x0066).to_a @@ -260,7 +261,11 @@ $const_cache = {} # given property, group of paired codepoints, and a human-friendly name for # the group def make_const(prop, data, name) - puts "\n/* '#{prop}': #{name} */" + if name.empty? 
+ puts "\n/* '#{prop}' */" + else + puts "\n/* '#{prop}': #{name} */" + end if origprop = $const_cache.key(data) puts "#define CR_#{prop} CR_#{origprop}" else @@ -387,7 +392,13 @@ props.concat parse_scripts(data, categories) aliases = parse_aliases(data) define_posix_props(data) POSIX_NAMES.each do |name| - make_const(name, data[name], "[[:#{name}:]]") + if name == 'XPosixPunct' + make_const(name, data[name], "[[:Punct:]]") + elsif name == 'Punct' + make_const(name, data[name], "") + else + make_const(name, data[name], "[[:#{name}:]]") + end end output.ifdef :USE_UNICODE_PROPERTIES props.each do |name| -- cgit v1.2.3