From 5770336f8be4ac6dbdff43587fda2b508d3786de Mon Sep 17 00:00:00 2001 From: ksaito Date: Fri, 5 Mar 2004 15:31:51 +0000 Subject: Initial revision git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@5892 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ascii.c | 54 + euc_jp.c | 191 +++ oniggnu.h | 77 + oniguruma.h | 715 ++++++++ regcomp.c | 5440 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ regenc.c | 586 +++++++ regenc.h | 96 ++ regerror.c | 291 ++++ regexec.c | 3299 ++++++++++++++++++++++++++++++++++++ reggnu.c | 256 +++ regint.h | 685 ++++++++ regparse.c | 4815 ++++++++++++++++++++++++++++++++++++++++++++++++++++ regparse.h | 277 +++ sjis.c | 174 ++ utf8.c | 566 +++++++ 15 files changed, 17522 insertions(+) create mode 100644 ascii.c create mode 100644 euc_jp.c create mode 100644 oniggnu.h create mode 100644 oniguruma.h create mode 100644 regcomp.c create mode 100644 regenc.c create mode 100644 regenc.h create mode 100644 regerror.c create mode 100644 regexec.c create mode 100644 reggnu.c create mode 100644 regint.h create mode 100644 regparse.c create mode 100644 regparse.h create mode 100644 sjis.c create mode 100644 utf8.c diff --git a/ascii.c b/ascii.c new file mode 100644 index 0000000000..44cc78f77c --- /dev/null +++ b/ascii.c @@ -0,0 +1,54 @@ +/********************************************************************** + + ascii.c - Oniguruma (regular expression library) + + Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#include "regenc.h" + +static int +ascii_code_is_ctype(OnigCodePoint code, unsigned int ctype) +{ + if (code < 128) + return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); + else + return FALSE; +} + +OnigEncodingType OnigEncodingASCII = { + { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + }, + "US-ASCII", /* name */ + 1, /* max byte length */ + FALSE, /* is_fold_match */ + ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ + TRUE, /* is continuous sb mb codepoint */ + onigenc_single_byte_mbc_to_code, + onigenc_single_byte_code_to_mbclen, + onigenc_single_byte_code_to_mbc, + onigenc_ascii_mbc_to_lower, + onigenc_ascii_mbc_is_case_ambig, + ascii_code_is_ctype, + onigenc_nothing_get_ctype_code_range, + onigenc_single_byte_left_adjust_char_head, + onigenc_single_byte_is_allowed_reverse_match, + onigenc_nothing_get_all_fold_match_code, + onigenc_nothing_get_fold_match_info +}; diff --git a/euc_jp.c b/euc_jp.c new file mode 100644 index 0000000000..848016ba5a --- /dev/null +++ b/euc_jp.c @@ -0,0 +1,191 @@ +/********************************************************************** + + euc_jp.c - Oniguruma (regular expression library) + + Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#include "regenc.h" + +#define eucjp_islead(c) ((UChar )((c) - 0xa1) > 0xfe - 0xa1) + +static OnigCodePoint +eucjp_mbc_to_code(UChar* p, UChar* end) +{ + int c, i, len; + OnigCodePoint n; + + c = *p++; + len = enc_len(ONIG_ENCODING_EUC_JP, c); + n = c; + if (len == 1) return n; + + for (i = 1; i < len; i++) { + if (p >= end) break; + c = *p++; + n <<= 8; n += c; + } + return n; +} + +static int 
+eucjp_code_to_mbclen(OnigCodePoint code) +{ + if ((code & 0xff0000) != 0) return 3; + else if ((code & 0xff00) != 0) return 2; + else return 1; +} + +static int +eucjp_code_to_mbc_first(OnigCodePoint code) +{ + int first; + + if ((code & 0xff0000) != 0) { + first = (code >> 16) & 0xff; + /* + if (enc_len(ONIG_ENCODING_EUC_JP, first) != 3) + return ONIGERR_INVALID_WIDE_CHAR_VALUE; + */ + } + else if ((code & 0xff00) != 0) { + first = (code >> 8) & 0xff; + /* + if (enc_len(ONIG_ENCODING_EUC_JP, first) != 2) + return ONIGERR_INVALID_WIDE_CHAR_VALUE; + */ + } + else { + /* + if (enc_len(ONIG_ENCODING_EUC_JP, code) != 1) + return ONIGERR_INVALID_WIDE_CHAR_VALUE; + */ + return (int )code; + } + return first; +} + +static int +eucjp_code_to_mbc(OnigCodePoint code, UChar *buf) +{ + UChar *p = buf; + + if ((code & 0xff0000) != 0) *p++ = (UChar )(((code >> 16) & 0xff)); + if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff)); + *p++ = (UChar )(code & 0xff); + +#if 1 + if (enc_len(ONIG_ENCODING_EUC_JP, buf[0]) != (p - buf)) + return ONIGERR_INVALID_WIDE_CHAR_VALUE; +#endif + return p - buf; +} + +static int +eucjp_mbc_to_lower(UChar* p, UChar* lower) +{ + int len; + + if (ONIGENC_IS_MBC_ASCII(p)) { + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + return 1; + } + else { + len = enc_len(ONIG_ENCODING_EUC_JP, *p); + if (lower != p) { + /* memcpy(lower, p, len); */ + int i; + for (i = 0; i < len; i++) { + *lower++ = *p++; + } + } + return len; /* return byte length of converted char to lower */ + } +} + +static int +eucjp_code_is_ctype(OnigCodePoint code, unsigned int ctype) +{ + if ((ctype & ONIGENC_CTYPE_WORD) != 0) { + if (code < 128) + return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); + else { + int first = eucjp_code_to_mbc_first(code); + return (enc_len(ONIG_ENCODING_EUC_JP, first) > 1 ? 
TRUE : FALSE); + } + + ctype &= ~ONIGENC_CTYPE_WORD; /* NOTE(review): this statement and the next look unreachable -- both arms of the if/else just above return; confirm the intended fall-through */ + if (ctype == 0) return FALSE; + } + + if (code < 128) + return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); + else + return FALSE; +} + +/* Returns s unchanged when s <= start. Otherwise steps p back from s while the byte at p lies in 0xa1..0xfe (stopping at start), looks up the table length of the byte it stops on, and returns p if that character extends past s. If it does not, that character is skipped and the result is the position at the largest even offset from there that is not past s. Presumably the result is the head of the character containing s -- this relies on the assumption stated in the body. */ +static UChar* +eucjp_left_adjust_char_head(UChar* start, UChar* s) +{ + /* Assumed in this encoding, + mb-trail bytes don't mix with single bytes. + */ + UChar *p; + int len; + + if (s <= start) return s; + p = s; + + while (!eucjp_islead(*p) && p > start) p--; + len = enc_len(ONIG_ENCODING_EUC_JP, *p); + if (p + len > s) return p; + p += len; + return p + ((s - p) & ~1); /* round the remaining distance down to an even byte count */ +} + +/* Returns TRUE when the byte at s is <= 0x7e, 0x8e or 0x8f, and FALSE otherwise. The end argument is not used. */ +static int +eucjp_is_allowed_reverse_match(UChar* s, UChar* end) +{ + UChar c = *s; + if (c <= 0x7e || c == 0x8e || c == 0x8f) + return TRUE; + else + return FALSE; +} + +OnigEncodingType OnigEncodingEUC_JP = { + { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 + }, + "EUC-JP", /* name */ + 3, /* max byte length */ + FALSE, /* is_fold_match */ + ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ + FALSE, /* is continuous sb mb codepoint */ + eucjp_mbc_to_code, + eucjp_code_to_mbclen, + eucjp_code_to_mbc, + eucjp_mbc_to_lower, + onigenc_mbn_mbc_is_case_ambig, + eucjp_code_is_ctype, + onigenc_nothing_get_ctype_code_range, + eucjp_left_adjust_char_head, + 
eucjp_is_allowed_reverse_match, + onigenc_nothing_get_all_fold_match_code, + onigenc_nothing_get_fold_match_info +}; diff --git a/oniggnu.h b/oniggnu.h new file mode 100644 index 0000000000..d78dc18b11 --- /dev/null +++ b/oniggnu.h @@ -0,0 +1,77 @@ +/********************************************************************** + + oniggnu.h - Oniguruma (regular expression library) + + Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#ifndef ONIGGNU_H +#define ONIGGNU_H + +#include "oniguruma.h" + +#define MBCTYPE_ASCII 0 +#define MBCTYPE_EUC 1 +#define MBCTYPE_SJIS 2 +#define MBCTYPE_UTF8 3 + +/* GNU regex options */ +#ifndef RE_NREGS +#define RE_NREGS ONIG_NREGION +#endif +#define RE_OPTION_IGNORECASE ONIG_OPTION_IGNORECASE +#define RE_OPTION_EXTENDED ONIG_OPTION_EXTEND +#define RE_OPTION_MULTILINE ONIG_OPTION_MULTILINE +#define RE_OPTION_SINGLELINE ONIG_OPTION_SINGLELINE +#define RE_OPTION_LONGEST ONIG_OPTION_FIND_LONGEST +#define RE_OPTION_POSIXLINE (RE_OPTION_MULTILINE|RE_OPTION_SINGLELINE) + +#ifdef RUBY_PLATFORM +#define re_mbcinit ruby_re_mbcinit +#define re_compile_pattern ruby_re_compile_pattern +#define re_recompile_pattern ruby_re_recompile_pattern +#define re_free_pattern ruby_re_free_pattern +#define re_adjust_startpos ruby_re_adjust_startpos +#define re_search ruby_re_search +#define re_match ruby_re_match +#define re_set_casetable ruby_re_set_casetable +#define re_copy_registers ruby_re_copy_registers +#define re_free_registers ruby_re_free_registers +#define register_info_type ruby_register_info_type +#define re_error_code_to_str ruby_error_code_to_str + +#define ruby_error_code_to_str onig_error_code_to_str +#define ruby_re_copy_registers onig_region_copy +#else +#define re_error_code_to_str onig_error_code_to_str +#define re_copy_registers onig_region_copy +#endif + +#ifdef ONIG_RUBY_M17N +ONIG_EXTERN +void re_mbcinit P_((OnigEncoding)); +#else +ONIG_EXTERN +void re_mbcinit 
P_((int)); +#endif + +ONIG_EXTERN +int re_compile_pattern P_((const char*, int, struct re_pattern_buffer*, char* err_buf)); +ONIG_EXTERN +int re_recompile_pattern P_((const char*, int, struct re_pattern_buffer*, char* err_buf)); +ONIG_EXTERN +void re_free_pattern P_((struct re_pattern_buffer*)); +ONIG_EXTERN +int re_adjust_startpos P_((struct re_pattern_buffer*, const char*, int, int, int)); +ONIG_EXTERN +int re_search P_((struct re_pattern_buffer*, const char*, int, int, int, struct re_registers*)); +ONIG_EXTERN +int re_match P_((struct re_pattern_buffer*, const char *, int, int, struct re_registers*)); +ONIG_EXTERN +void re_set_casetable P_((const char*)); +ONIG_EXTERN +void re_free_registers P_((struct re_registers*)); +ONIG_EXTERN +int re_alloc_pattern P_((struct re_pattern_buffer**)); /* added */ + +#endif /* ONIGGNU_H */ diff --git a/oniguruma.h b/oniguruma.h new file mode 100644 index 0000000000..8654bff188 --- /dev/null +++ b/oniguruma.h @@ -0,0 +1,715 @@ +/********************************************************************** + + oniguruma.h - Oniguruma (regular expression library) + + Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#ifndef ONIGURUMA_H +#define ONIGURUMA_H + +#define ONIGURUMA +#define ONIGURUMA_VERSION_MAJOR 2 +#define ONIGURUMA_VERSION_MINOR 2 +#define ONIGURUMA_VERSION_TEENY 4 + +#ifndef P_ +#if defined(__STDC__) || defined(_WIN32) +# define P_(args) args +#else +# define P_(args) () +#endif +#endif + +#ifndef PV_ +#ifdef HAVE_STDARG_PROTOTYPES +# define PV_(args) args +#else +# define PV_(args) () +#endif +#endif + +#ifndef ONIG_EXTERN +#if defined(_WIN32) && !defined(__CYGWIN__) +#if defined(EXPORT) || defined(RUBY_EXPORT) +#define ONIG_EXTERN extern __declspec(dllexport) +#else +#define ONIG_EXTERN extern __declspec(dllimport) +#endif +#endif +#endif + +#ifndef ONIG_EXTERN +#define ONIG_EXTERN extern +#endif + +/* PART: character encoding */ + +typedef 
unsigned char UChar; +typedef unsigned long OnigCodePoint; +typedef unsigned int OnigDistance; + +#define ONIG_INFINITE_DISTANCE ~((OnigDistance )0) + +typedef struct { + OnigCodePoint from; + OnigCodePoint to; +} OnigCodePointRange; + +#define ONIGENC_FOLD_MATCH_MAX_TARGET_NUM_SIZE 16 +typedef struct { + int target_num; + int target_byte_len[ONIGENC_FOLD_MATCH_MAX_TARGET_NUM_SIZE]; + UChar* target_str[ONIGENC_FOLD_MATCH_MAX_TARGET_NUM_SIZE]; +} OnigEncFoldMatchInfo; + + +#if defined(RUBY_PLATFORM) && defined(M17N_H) + +#define ONIG_RUBY_M17N +typedef m17n_encoding* OnigEncoding; + +#else + +typedef struct { + const char len_table[256]; + const char* name; + int max_enc_len; + int is_fold_match; + int ctype_support_level; /* sb-only/full */ + int is_continuous_sb_mb; /* code point is continuous from sb to mb */ + OnigCodePoint (*mbc_to_code)(UChar* p, UChar* end); + int (*code_to_mbclen)(OnigCodePoint code); + int (*code_to_mbc)(OnigCodePoint code, UChar *buf); + int (*mbc_to_lower)(UChar* p, UChar* lower); + int (*mbc_is_case_ambig)(UChar* p); + int (*code_is_ctype)(OnigCodePoint code, unsigned int ctype); + int (*get_ctype_code_range)(int ctype, int* nsb, int* nmb, OnigCodePointRange* sbr[], OnigCodePointRange* mbr[]); + UChar* (*left_adjust_char_head)(UChar* start, UChar* s); + int (*is_allowed_reverse_match)(UChar* p, UChar* e); + int (*get_all_fold_match_code)(OnigCodePoint** codes); + int (*get_fold_match_info)(UChar* p, UChar* end, OnigEncFoldMatchInfo** info); +} OnigEncodingType; + +typedef OnigEncodingType* OnigEncoding; + +ONIG_EXTERN OnigEncodingType OnigEncodingASCII; +ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_1; +ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_2; +ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_3; +ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_4; +ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_5; +ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_6; +ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_7; +ONIG_EXTERN 
OnigEncodingType OnigEncodingISO_8859_8; +ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_9; +ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_10; +ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_11; +ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_13; +ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_14; +ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_15; +ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_16; +ONIG_EXTERN OnigEncodingType OnigEncodingUTF8; +ONIG_EXTERN OnigEncodingType OnigEncodingEUC_JP; +ONIG_EXTERN OnigEncodingType OnigEncodingEUC_TW; +ONIG_EXTERN OnigEncodingType OnigEncodingEUC_KR; +ONIG_EXTERN OnigEncodingType OnigEncodingEUC_CN; +ONIG_EXTERN OnigEncodingType OnigEncodingSJIS; +ONIG_EXTERN OnigEncodingType OnigEncodingKOI8; +ONIG_EXTERN OnigEncodingType OnigEncodingKOI8_R; +ONIG_EXTERN OnigEncodingType OnigEncodingBIG5; + +#define ONIG_ENCODING_ASCII (&OnigEncodingASCII) +#define ONIG_ENCODING_ISO_8859_1 (&OnigEncodingISO_8859_1) +#define ONIG_ENCODING_ISO_8859_2 (&OnigEncodingISO_8859_2) +#define ONIG_ENCODING_ISO_8859_3 (&OnigEncodingISO_8859_3) +#define ONIG_ENCODING_ISO_8859_4 (&OnigEncodingISO_8859_4) +#define ONIG_ENCODING_ISO_8859_5 (&OnigEncodingISO_8859_5) +#define ONIG_ENCODING_ISO_8859_6 (&OnigEncodingISO_8859_6) +#define ONIG_ENCODING_ISO_8859_7 (&OnigEncodingISO_8859_7) +#define ONIG_ENCODING_ISO_8859_8 (&OnigEncodingISO_8859_8) +#define ONIG_ENCODING_ISO_8859_9 (&OnigEncodingISO_8859_9) +#define ONIG_ENCODING_ISO_8859_10 (&OnigEncodingISO_8859_10) +#define ONIG_ENCODING_ISO_8859_11 (&OnigEncodingISO_8859_11) +#define ONIG_ENCODING_ISO_8859_13 (&OnigEncodingISO_8859_13) +#define ONIG_ENCODING_ISO_8859_14 (&OnigEncodingISO_8859_14) +#define ONIG_ENCODING_ISO_8859_15 (&OnigEncodingISO_8859_15) +#define ONIG_ENCODING_ISO_8859_16 (&OnigEncodingISO_8859_16) +#define ONIG_ENCODING_UTF8 (&OnigEncodingUTF8) +#define ONIG_ENCODING_EUC_JP (&OnigEncodingEUC_JP) +#define ONIG_ENCODING_EUC_TW (&OnigEncodingEUC_TW) +#define 
ONIG_ENCODING_EUC_KR (&OnigEncodingEUC_KR) +#define ONIG_ENCODING_EUC_CN (&OnigEncodingEUC_CN) +#define ONIG_ENCODING_SJIS (&OnigEncodingSJIS) +#define ONIG_ENCODING_KOI8 (&OnigEncodingKOI8) +#define ONIG_ENCODING_KOI8_R (&OnigEncodingKOI8_R) +#define ONIG_ENCODING_BIG5 (&OnigEncodingBIG5) + +#endif /* else RUBY && M17N */ + +#define ONIG_ENCODING_UNDEF ((OnigEncoding )0) + + +/* work size */ +#define ONIGENC_CODE_TO_MBC_MAXLEN 7 +#define ONIGENC_MBC_TO_LOWER_MAXLEN ONIGENC_CODE_TO_MBC_MAXLEN + +/* character types */ +#define ONIGENC_CTYPE_ALPHA (1<< 0) +#define ONIGENC_CTYPE_BLANK (1<< 1) +#define ONIGENC_CTYPE_CNTRL (1<< 2) +#define ONIGENC_CTYPE_DIGIT (1<< 3) +#define ONIGENC_CTYPE_GRAPH (1<< 4) +#define ONIGENC_CTYPE_LOWER (1<< 5) +#define ONIGENC_CTYPE_PRINT (1<< 6) +#define ONIGENC_CTYPE_PUNCT (1<< 7) +#define ONIGENC_CTYPE_SPACE (1<< 8) +#define ONIGENC_CTYPE_UPPER (1<< 9) +#define ONIGENC_CTYPE_XDIGIT (1<<10) +#define ONIGENC_CTYPE_WORD (1<<11) +#define ONIGENC_CTYPE_ASCII (1<<12) +#define ONIGENC_CTYPE_ALNUM (ONIGENC_CTYPE_ALPHA | ONIGENC_CTYPE_DIGIT) + +/* ctype support level */ +#define ONIGENC_CTYPE_SUPPORT_LEVEL_SB 0 +#define ONIGENC_CTYPE_SUPPORT_LEVEL_FULL 1 + + +#define enc_len(enc,byte) ONIGENC_MBC_LEN_BY_HEAD(enc,byte) + +#define ONIGENC_IS_UNDEF(enc) ((enc) == ONIG_ENCODING_UNDEF) +#define ONIGENC_IS_SINGLEBYTE(enc) (ONIGENC_MBC_MAXLEN(enc) == 1) +#define ONIGENC_IS_MBC_HEAD(enc,byte) (ONIGENC_MBC_LEN_BY_HEAD(enc,byte) != 1) +#define ONIGENC_IS_MBC_ASCII(p) (*(p) < 128) +#define ONIGENC_IS_CODE_ASCII(code) ((code) < 128) +#define ONIGENC_IS_CODE_SB_WORD(enc,code) \ + (ONIGENC_IS_CODE_ASCII(code) && ONIGENC_IS_CODE_WORD(enc,code)) +#define ONIGENC_IS_MBC_WORD(enc,s,end) \ + ONIGENC_IS_CODE_WORD(enc,ONIGENC_MBC_TO_CODE(enc,s,end)) + + +#ifdef ONIG_RUBY_M17N + +#include <ctype.h> /* for isblank(), isgraph() */ + +#define ONIGENC_MBC_TO_LOWER(enc,p,buf) onigenc_mbc_to_lower(enc,p,buf) +#define ONIGENC_IS_MBC_CASE_AMBIG(enc,p) onigenc_mbc_is_case_ambig(enc,p) + 
+#define ONIGENC_IS_FOLD_MATCH(enc) FALSE +#define ONIGENC_IS_CONTINUOUS_SB_MB(enc) FALSE +#define ONIGENC_CTYPE_SUPPORT_LEVEL(enc) ONIGENC_CTYPE_SUPPORT_LEVEL_SB +#define ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc,s,end) \ + onigenc_is_allowed_reverse_match(enc, s, end) +#define ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc,start,s) \ + onigenc_get_left_adjust_char_head(enc, start, s) +#define ONIGENC_GET_ALL_FOLD_MATCH_CODE(enc,codes) 0 +#define ONIGENC_GET_FOLD_MATCH_INFO(enc,p,end,info) ONIG_NO_SUPPORT_CONFIG +#define ONIGENC_GET_CTYPE_CODE_RANGE(enc,ctype,nsb,nmb,sbr,mbr) \ + ONIG_NO_SUPPORT_CONFIG +#define ONIGENC_MBC_LEN_BY_HEAD(enc,b) m17n_mbclen(enc,(int )b) +#define ONIGENC_MBC_MAXLEN(enc) m17n_mbmaxlen(enc) +#define ONIGENC_MBC_MAXLEN_DIST(enc) \ + (ONIGENC_MBC_MAXLEN(enc) > 0 ? ONIGENC_MBC_MAXLEN(enc) \ + : ONIG_INFINITE_DISTANCE) +#define ONIGENC_MBC_TO_CODE(enc,p,e) m17n_codepoint((enc),(p),(e)) +#define ONIGENC_CODE_TO_MBCLEN(enc,code) m17n_codelen((enc),(code)) +#define ONIGENC_CODE_TO_MBC(enc,code,buf) onigenc_code_to_mbc(enc, code, buf) + +#if 0 +#define ONIGENC_STEP_BACK(enc,start,s,n) /* !! not supported !! 
*/ +#endif + +#define ONIGENC_IS_CODE_CTYPE(enc,code,ctype) \ + onigenc_is_code_ctype(enc,code,ctype) + +#ifdef isblank +# define ONIGENC_IS_CODE_BLANK(enc,code) isblank((int )code) +#else +# define ONIGENC_IS_CODE_BLANK(enc,code) ((code) == ' ' || (code) == '\t') +#endif +#ifdef isgraph +# define ONIGENC_IS_CODE_GRAPH(enc,code) isgraph((int )code) +#else +# define ONIGENC_IS_CODE_GRAPH(enc,code) \ + (isprint((int )code) && !isspace((int )code)) +#endif + +#define ONIGENC_IS_CODE_PRINT(enc,code) m17n_isprint(enc,code) +#define ONIGENC_IS_CODE_ALNUM(enc,code) m17n_isalnum(enc,code) +#define ONIGENC_IS_CODE_ALPHA(enc,code) m17n_isalpha(enc,code) +#define ONIGENC_IS_CODE_LOWER(enc,code) m17n_islower(enc,code) +#define ONIGENC_IS_CODE_UPPER(enc,code) m17n_isupper(enc,code) +#define ONIGENC_IS_CODE_CNTRL(enc,code) m17n_iscntrl(enc,code) +#define ONIGENC_IS_CODE_PUNCT(enc,code) m17n_ispunct(enc,code) +#define ONIGENC_IS_CODE_SPACE(enc,code) m17n_isspace(enc,code) +#define ONIGENC_IS_CODE_DIGIT(enc,code) m17n_isdigit(enc,code) +#define ONIGENC_IS_CODE_XDIGIT(enc,code) m17n_isxdigit(enc,code) +#define ONIGENC_IS_CODE_WORD(enc,code) m17n_iswchar(enc,code) + +ONIG_EXTERN +int onigenc_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, int ctype)); +ONIG_EXTERN +int onigenc_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); +ONIG_EXTERN +int onigenc_mbc_to_lower P_((OnigEncoding enc, UChar* p, UChar* buf)); +ONIG_EXTERN +int onigenc_mbc_is_case_ambig P_((OnigEncoding enc, UChar* p)); +ONIG_EXTERN +int onigenc_is_allowed_reverse_match P_((OnigEncoding enc, UChar* s, UChar* end)); + +#else /* ONIG_RUBY_M17N */ + +#define ONIGENC_NAME(enc) ((enc)->name) + +#define ONIGENC_MBC_TO_LOWER(enc,p,buf) (enc)->mbc_to_lower(p,buf) +#define ONIGENC_IS_MBC_CASE_AMBIG(enc,p) (enc)->mbc_is_case_ambig(p) + +#define ONIGENC_IS_FOLD_MATCH(enc) ((enc)->is_fold_match) +#define ONIGENC_IS_CONTINUOUS_SB_MB(enc) ((enc)->is_continuous_sb_mb) +#define 
ONIGENC_CTYPE_SUPPORT_LEVEL(enc) ((enc)->ctype_support_level) +#define ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc,s,end) \ + (enc)->is_allowed_reverse_match(s,end) +#define ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc,start,s) \ + (enc)->left_adjust_char_head(start, s) +#define ONIGENC_GET_ALL_FOLD_MATCH_CODE(enc,codes) \ + (enc)->get_all_fold_match_code(codes) +#define ONIGENC_GET_FOLD_MATCH_INFO(enc,p,end,info) \ + (enc)->get_fold_match_info(p,end,info) +#define ONIGENC_STEP_BACK(enc,start,s,n) \ + onigenc_step_back((enc),(start),(s),(n)) + +#define ONIGENC_MBC_LEN_BY_HEAD(enc,byte) ((enc)->len_table[(unsigned char )(byte)]) /* unsigned cast keeps the subscript in 0..255 even for a high-bit plain char */ +#define ONIGENC_MBC_MAXLEN(enc) ((enc)->max_enc_len) +#define ONIGENC_MBC_MAXLEN_DIST(enc) ONIGENC_MBC_MAXLEN(enc) +#define ONIGENC_MBC_TO_CODE(enc,p,e) (enc)->mbc_to_code((p),(e)) +#define ONIGENC_CODE_TO_MBCLEN(enc,code) (enc)->code_to_mbclen(code) +#define ONIGENC_CODE_TO_MBC(enc,code,buf) (enc)->code_to_mbc(code,buf) + +#define ONIGENC_IS_CODE_CTYPE(enc,code,ctype) (enc)->code_is_ctype(code,ctype) + +#define ONIGENC_IS_CODE_GRAPH(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_GRAPH) +#define ONIGENC_IS_CODE_PRINT(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_PRINT) +#define ONIGENC_IS_CODE_ALNUM(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_ALNUM) +#define ONIGENC_IS_CODE_ALPHA(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_ALPHA) +#define ONIGENC_IS_CODE_LOWER(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_LOWER) +#define ONIGENC_IS_CODE_UPPER(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_UPPER) +#define ONIGENC_IS_CODE_CNTRL(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_CNTRL) +#define ONIGENC_IS_CODE_PUNCT(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_PUNCT) +#define ONIGENC_IS_CODE_SPACE(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_SPACE) +#define ONIGENC_IS_CODE_BLANK(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_BLANK) +#define 
ONIGENC_IS_CODE_DIGIT(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_DIGIT) +#define ONIGENC_IS_CODE_XDIGIT(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_XDIGIT) +#define ONIGENC_IS_CODE_WORD(enc,code) \ + ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_WORD) + +#define ONIGENC_GET_CTYPE_CODE_RANGE(enc,ctype,nsb,nmb,sbr,mbr) \ + (enc)->get_ctype_code_range(ctype,nsb,nmb,sbr,mbr) + +ONIG_EXTERN +UChar* onigenc_step_back P_((OnigEncoding enc, UChar* start, UChar* s, int n)); + +#endif /* is not ONIG_RUBY_M17N */ + + +/* encoding API */ +ONIG_EXTERN +int onigenc_init P_(()); +ONIG_EXTERN +int onigenc_set_default_encoding P_((OnigEncoding enc)); +ONIG_EXTERN +OnigEncoding onigenc_get_default_encoding P_(()); +ONIG_EXTERN +void onigenc_set_default_caseconv_table P_((UChar* table)); +ONIG_EXTERN +UChar* onigenc_get_right_adjust_char_head_with_prev P_((OnigEncoding enc, UChar* start, UChar* s, UChar** prev)); +ONIG_EXTERN +UChar* onigenc_get_prev_char_head P_((OnigEncoding enc, UChar* start, UChar* s)); +ONIG_EXTERN +UChar* onigenc_get_left_adjust_char_head P_((OnigEncoding enc, UChar* start, UChar* s)); +ONIG_EXTERN +UChar* onigenc_get_right_adjust_char_head P_((OnigEncoding enc, UChar* start, UChar* s)); + + + +/* PART: regular expression */ + +/* config parameters */ +#define ONIG_NREGION 10 +#define ONIG_MAX_BACKREF_NUM 1000 +#define ONIG_MAX_REPEAT_NUM 100000 +#define ONIG_MAX_MULTI_BYTE_RANGES_NUM 1000 +/* constants */ +#define ONIG_MAX_ERROR_MESSAGE_LEN 90 + +#if defined(RUBY_PLATFORM) && !defined(ONIG_RUBY_M17N) +ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding; +#undef ismbchar +#define ismbchar(c) (mbclen((c)) != 1) +#define mbclen(c) (OnigEncDefaultCharEncoding->len_table[(unsigned char )(c)]) +#endif + +typedef unsigned int OnigOptionType; + +#define ONIG_OPTION_DEFAULT ONIG_OPTION_NONE + +/* options */ +#define ONIG_OPTION_NONE 0 +#define ONIG_OPTION_IGNORECASE 1L +#define ONIG_OPTION_EXTEND (ONIG_OPTION_IGNORECASE << 1) +#define 
ONIG_OPTION_MULTILINE (ONIG_OPTION_EXTEND << 1) +#define ONIG_OPTION_SINGLELINE (ONIG_OPTION_MULTILINE << 1) +#define ONIG_OPTION_FIND_LONGEST (ONIG_OPTION_SINGLELINE << 1) +#define ONIG_OPTION_FIND_NOT_EMPTY (ONIG_OPTION_FIND_LONGEST << 1) +#define ONIG_OPTION_NEGATE_SINGLELINE (ONIG_OPTION_FIND_NOT_EMPTY << 1) +#define ONIG_OPTION_DONT_CAPTURE_GROUP (ONIG_OPTION_NEGATE_SINGLELINE << 1) +#define ONIG_OPTION_CAPTURE_GROUP (ONIG_OPTION_DONT_CAPTURE_GROUP << 1) +/* options (search time) */ +#define ONIG_OPTION_NOTBOL (ONIG_OPTION_CAPTURE_GROUP << 1) +#define ONIG_OPTION_NOTEOL (ONIG_OPTION_NOTBOL << 1) +#define ONIG_OPTION_POSIX_REGION (ONIG_OPTION_NOTEOL << 1) + +#define ONIG_OPTION_ON(options,regopt) ((options) |= (regopt)) +#define ONIG_OPTION_OFF(options,regopt) ((options) &= ~(regopt)) +#define ONIG_IS_OPTION_ON(options,option) ((options) & (option)) + +/* syntax */ +typedef struct { + unsigned int op; + unsigned int op2; + unsigned int behavior; + OnigOptionType options; /* default option */ +} OnigSyntaxType; + +ONIG_EXTERN OnigSyntaxType OnigSyntaxPosixBasic; +ONIG_EXTERN OnigSyntaxType OnigSyntaxPosixExtended; +ONIG_EXTERN OnigSyntaxType OnigSyntaxEmacs; +ONIG_EXTERN OnigSyntaxType OnigSyntaxGrep; +ONIG_EXTERN OnigSyntaxType OnigSyntaxGnuRegex; +ONIG_EXTERN OnigSyntaxType OnigSyntaxJava; +ONIG_EXTERN OnigSyntaxType OnigSyntaxPerl; +ONIG_EXTERN OnigSyntaxType OnigSyntaxRuby; + +/* predefined syntaxes (see regparse.c) */ +#define ONIG_SYNTAX_POSIX_BASIC (&OnigSyntaxPosixBasic) +#define ONIG_SYNTAX_POSIX_EXTENDED (&OnigSyntaxPosixExtended) +#define ONIG_SYNTAX_EMACS (&OnigSyntaxEmacs) +#define ONIG_SYNTAX_GREP (&OnigSyntaxGrep) +#define ONIG_SYNTAX_GNU_REGEX (&OnigSyntaxGnuRegex) +#define ONIG_SYNTAX_JAVA (&OnigSyntaxJava) +#define ONIG_SYNTAX_PERL (&OnigSyntaxPerl) +#define ONIG_SYNTAX_RUBY (&OnigSyntaxRuby) + +/* default syntax */ +ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax; +#define ONIG_SYNTAX_DEFAULT OnigDefaultSyntax + +/* syntax (operators) */ 
+#define ONIG_SYN_OP_VARIABLE_META_CHARACTERS (1<<0) +#define ONIG_SYN_OP_DOT_ANYCHAR (1<<1) /* . */ +#define ONIG_SYN_OP_ASTERISK_ZERO_INF (1<<2) /* * */ +#define ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF (1<<3) +#define ONIG_SYN_OP_PLUS_ONE_INF (1<<4) /* + */ +#define ONIG_SYN_OP_ESC_PLUS_ONE_INF (1<<5) +#define ONIG_SYN_OP_QMARK_ZERO_ONE (1<<6) /* ? */ +#define ONIG_SYN_OP_ESC_QMARK_ZERO_ONE (1<<7) +#define ONIG_SYN_OP_BRACE_INTERVAL (1<<8) /* {lower,upper} */ +#define ONIG_SYN_OP_ESC_BRACE_INTERVAL (1<<9) /* \{lower,upper\} */ +#define ONIG_SYN_OP_VBAR_ALT (1<<10) /* | */ +#define ONIG_SYN_OP_ESC_VBAR_ALT (1<<11) /* \| */ +#define ONIG_SYN_OP_LPAREN_SUBEXP (1<<12) /* (...) */ +#define ONIG_SYN_OP_ESC_LPAREN_SUBEXP (1<<13) /* \(...\) */ +#define ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR (1<<14) /* \A, \Z, \z */ +#define ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR (1<<15) /* \G */ +#define ONIG_SYN_OP_DECIMAL_BACKREF (1<<16) /* \num */ +#define ONIG_SYN_OP_BRACKET_CC (1<<17) /* [...] */ +#define ONIG_SYN_OP_ESC_W_WORD (1<<18) /* \w, \W */ +#define ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END (1<<19) /* \<. \> */ +#define ONIG_SYN_OP_ESC_B_WORD_BOUND (1<<20) /* \b, \B */ +#define ONIG_SYN_OP_ESC_S_WHITE_SPACE (1<<21) /* \s, \S */ +#define ONIG_SYN_OP_ESC_D_DIGIT (1<<22) /* \d, \D */ +#define ONIG_SYN_OP_LINE_ANCHOR (1<<23) /* ^, $ */ +#define ONIG_SYN_OP_POSIX_BRACKET (1<<24) /* [:xxxx:] */ +#define ONIG_SYN_OP_QMARK_NON_GREEDY (1<<25) /* ??,*?,+?,{n,m}? */ +#define ONIG_SYN_OP_ESC_CONTROL_CHARS (1<<26) /* \n,\r,\t,\a ... */ +#define ONIG_SYN_OP_ESC_C_CONTROL (1<<27) /* \cx */ +#define ONIG_SYN_OP_ESC_OCTAL3 (1<<28) /* \OOO */ +#define ONIG_SYN_OP_ESC_X_HEX2 (1<<29) /* \xHH */ +#define ONIG_SYN_OP_ESC_X_BRACE_HEX8 (1<<30) /* \x{7HHHHHHH} */ + +#define ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE (1<<0) /* \Q...\E */ +#define ONIG_SYN_OP2_QMARK_GROUP_EFFECT (1<<1) /* (?...) 
*/ +#define ONIG_SYN_OP2_OPTION_PERL (1<<2) /* (?imsx),(?-imsx) */ +#define ONIG_SYN_OP2_OPTION_RUBY (1<<3) /* (?imx), (?-imx) */ +#define ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT (1<<4) /* ?+,*+,++ */ +#define ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL (1<<5) /* {n,m}+ */ +#define ONIG_SYN_OP2_CCLASS_SET_OP (1<<6) /* [...&&..[..]..] */ +#define ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP (1<<7) /* (?...) */ +#define ONIG_SYN_OP2_ESC_K_NAMED_BACKREF (1<<8) /* \k */ +#define ONIG_SYN_OP2_ESC_G_SUBEXP_CALL (1<<9) /* \g, \g */ +#define ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY (1<<10) /* (?@..),(?@..) */ +#define ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL (1<<11) /* \C-x */ +#define ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META (1<<12) /* \M-x */ +#define ONIG_SYN_OP2_ESC_V_VTAB (1<<13) /* \v as VTAB */ +#define ONIG_SYN_OP2_ESC_U_HEX4 (1<<14) /* \uHHHH */ +#define ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR (1<<15) /* \`, \' */ +#define ONIG_SYN_OP2_ESC_P_CHAR_PROPERTY (1<<16) /* \p{...}, \P{...} */ + +/* syntax (behavior) */ +#define ONIG_SYN_CONTEXT_INDEP_ANCHORS (1<<31) /* not implemented */ +#define ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS (1<<0) /* ?, *, +, {n,m} */ +#define ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS (1<<1) /* error or ignore */ +#define ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP (1<<2) /* ...)... */ +#define ONIG_SYN_ALLOW_INVALID_INTERVAL (1<<3) /* {??? */ +#define ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV (1<<4) /* {,n} => {0,n} */ +#define ONIG_SYN_STRICT_CHECK_BACKREF (1<<5) /* /(\1)/,/\1()/ ..*/ +#define ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND (1<<6) /* (?<=a|bc) */ +#define ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP (1<<7) /* see doc/RE */ +#define ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME (1<<8) /* (?)(?) */ + +/* syntax (behavior) in char class [...] */ +#define ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1<<20) /* [^...] */ +#define ONIG_SYN_BACKSLASH_ESCAPE_IN_CC (1<<21) /* [..\w..] etc.. 
*/ +#define ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC (1<<22) +#define ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC (1<<23) /* [0-9-a]=[0-9\-a] */ +/* syntax (behavior) warning */ +#define ONIG_SYN_WARN_CC_OP_NOT_ESCAPED (1<<24) /* [,-,] */ +#define ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT (1<<25) /* (?:a*)+ */ + +/* meta character specifiers (onig_set_meta_char()) */ +#define ONIG_META_CHAR_ESCAPE 0 +#define ONIG_META_CHAR_ANYCHAR 1 +#define ONIG_META_CHAR_ANYTIME 2 +#define ONIG_META_CHAR_ZERO_OR_ONE_TIME 3 +#define ONIG_META_CHAR_ONE_OR_MORE_TIME 4 +#define ONIG_META_CHAR_ANYCHAR_ANYTIME 5 + +#define ONIG_INEFFECTIVE_META_CHAR 0 + +/* error codes */ +#define ONIG_IS_PATTERN_ERROR(ecode) ((ecode) <= -100 && (ecode) > -1000) +/* normal return */ +#define ONIG_NORMAL 0 +#define ONIG_MISMATCH -1 +#define ONIG_NO_SUPPORT_CONFIG -2 +/* internal error */ +#define ONIGERR_PARSER_BUG -11 +#define ONIGERR_STACK_BUG -12 +#define ONIGERR_UNDEFINED_BYTECODE -13 +#define ONIGERR_UNEXPECTED_BYTECODE -14 +#define ONIGERR_MATCH_STACK_LIMIT_OVER -15 +#define ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED -21 +#define ONIGERR_SPECIFIED_ENCODING_CANT_CONVERT_TO_WIDE_CHAR -22 +/* general error */ +#define ONIGERR_INVALID_ARGUMENT -30 +/* syntax error */ +#define ONIGERR_END_PATTERN_AT_LEFT_BRACE -100 +#define ONIGERR_END_PATTERN_AT_LEFT_BRACKET -101 +#define ONIGERR_EMPTY_CHAR_CLASS -102 +#define ONIGERR_PREMATURE_END_OF_CHAR_CLASS -103 +#define ONIGERR_END_PATTERN_AT_BACKSLASH -104 +#define ONIGERR_END_PATTERN_AT_META -105 +#define ONIGERR_END_PATTERN_AT_CONTROL -106 +#define ONIGERR_META_CODE_SYNTAX -108 +#define ONIGERR_CONTROL_CODE_SYNTAX -109 +#define ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE -110 +#define ONIGERR_CHAR_CLASS_VALUE_AT_START_OF_RANGE -111 +#define ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS -112 +#define ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED -113 +#define ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID -114 +#define ONIGERR_NESTED_REPEAT_OPERATOR -115 +#define 
ONIGERR_UNMATCHED_CLOSE_PARENTHESIS -116 +#define ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS -117 +#define ONIGERR_END_PATTERN_IN_GROUP -118 +#define ONIGERR_UNDEFINED_GROUP_OPTION -119 +#define ONIGERR_INVALID_POSIX_BRACKET_TYPE -121 +#define ONIGERR_INVALID_LOOK_BEHIND_PATTERN -122 +#define ONIGERR_INVALID_REPEAT_RANGE_PATTERN -123 +/* values error (syntax error) */ +#define ONIGERR_TOO_BIG_NUMBER -200 +#define ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE -201 +#define ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE -202 +#define ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS -203 +#define ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE -204 +#define ONIGERR_TOO_MANY_MULTI_BYTE_RANGES -205 +#define ONIGERR_TOO_SHORT_MULTI_BYTE_STRING -206 +#define ONIGERR_TOO_BIG_BACKREF_NUMBER -207 +#define ONIGERR_INVALID_BACKREF -208 +#define ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED -209 +#define ONIGERR_TOO_LONG_WIDE_CHAR_VALUE -212 +#define ONIGERR_EMPTY_GROUP_NAME -214 +#define ONIGERR_INVALID_GROUP_NAME -215 +#define ONIGERR_INVALID_CHAR_IN_GROUP_NAME -216 +#define ONIGERR_UNDEFINED_NAME_REFERENCE -217 +#define ONIGERR_UNDEFINED_GROUP_REFERENCE -218 +#define ONIGERR_MULTIPLEX_DEFINED_NAME -219 +#define ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL -220 +#define ONIGERR_NEVER_ENDING_RECURSION -221 +#define ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY -222 +#define ONIGERR_INVALID_CHAR_PROPERTY_NAME -223 +/* errors related to thread */ +#define ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT -1001 + + +/* must be smaller than BIT_STATUS_BITS_NUM (unsigned int * 8) */ +#define ONIG_MAX_CAPTURE_HISTORY_GROUP 31 +#define ONIG_IS_CAPTURE_HISTORY_GROUP(r, i) \ + ((i) <= ONIG_MAX_CAPTURE_HISTORY_GROUP && (r)->list && (r)->list[i]) + +/* match result region type */ +struct re_registers { + int allocated; + int num_regs; + int* beg; + int* end; + /* extended */ + struct re_registers** list; /* capture history. 
list[1]-list[31] */ +}; + +#define ONIG_REGION_NOTPOS -1 + +typedef struct re_registers OnigRegion; + +typedef struct { + UChar* par; + UChar* par_end; +} OnigErrorInfo; + +typedef struct { + int lower; + int upper; +} OnigRepeatRange; + +typedef void (*OnigWarnFunc) P_((char* s)); +extern void onig_null_warn P_((char* s)); +#define ONIG_NULL_WARN onig_null_warn + +#define ONIG_CHAR_TABLE_SIZE 256 + +/* regex_t state */ +#define ONIG_STATE_NORMAL 0 +#define ONIG_STATE_SEARCHING 1 +#define ONIG_STATE_COMPILING -1 +#define ONIG_STATE_MODIFY -2 + +#define ONIG_STATE(reg) \ + ((reg)->state > 0 ? ONIG_STATE_SEARCHING : (reg)->state) + +typedef struct re_pattern_buffer { + /* common members of BBuf(bytes-buffer) */ + unsigned char* p; /* compiled pattern */ + unsigned int used; /* used space for p */ + unsigned int alloc; /* allocated space for p */ + + int state; /* normal, searching, compiling */ + int num_mem; /* used memory(...) num counted from 1 */ + int num_repeat; /* OP_REPEAT/OP_REPEAT_NG id-counter */ + int num_null_check; /* OP_NULL_CHECK_START/END id counter */ + int num_call; /* number of subexp call */ + unsigned int capture_history; /* (?@...) 
flag (1-31) */ + unsigned int bt_mem_start; /* need backtrack flag */ + unsigned int bt_mem_end; /* need backtrack flag */ + int stack_pop_level; + int repeat_range_alloc; + OnigRepeatRange* repeat_range; + + OnigEncoding enc; + OnigOptionType options; + OnigSyntaxType* syntax; + void* name_table; + + /* optimization info (string search, char-map and anchors) */ + int optimize; /* optimize flag */ + int threshold_len; /* search str-length for apply optimize */ + int anchor; /* BEGIN_BUF, BEGIN_POS, (SEMI_)END_BUF */ + OnigDistance anchor_dmin; /* (SEMI_)END_BUF anchor distance */ + OnigDistance anchor_dmax; /* (SEMI_)END_BUF anchor distance */ + int sub_anchor; /* start-anchor for exact or map */ + unsigned char *exact; + unsigned char *exact_end; + unsigned char map[ONIG_CHAR_TABLE_SIZE]; /* used as BM skip or char-map */ + int *int_map; /* BM skip for exact_len > 255 */ + int *int_map_backward; /* BM skip for backward search */ + OnigDistance dmin; /* min-distance of exact or map */ + OnigDistance dmax; /* max-distance of exact or map */ + + /* regex_t link chain */ + struct re_pattern_buffer* chain; /* escape compile-conflict */ +} regex_t; + + +/* Oniguruma Native API */ +ONIG_EXTERN +int onig_init P_((void)); +ONIG_EXTERN +int onig_error_code_to_str PV_((UChar* s, int err_code, ...)); +ONIG_EXTERN +void onig_set_warn_func P_((OnigWarnFunc f)); +ONIG_EXTERN +void onig_set_verb_warn_func P_((OnigWarnFunc f)); +ONIG_EXTERN +int onig_new P_((regex_t**, UChar* pattern, UChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo)); +ONIG_EXTERN +void onig_free P_((regex_t*)); +ONIG_EXTERN +int onig_recompile P_((regex_t*, UChar* pattern, UChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo)); +ONIG_EXTERN +int onig_search P_((regex_t*, UChar* str, UChar* end, UChar* start, UChar* range, OnigRegion* region, OnigOptionType option)); +ONIG_EXTERN +int onig_match 
P_((regex_t*, UChar* str, UChar* end, UChar* at, OnigRegion* region, OnigOptionType option)); +ONIG_EXTERN +OnigRegion* onig_region_new P_((void)); +ONIG_EXTERN +void onig_region_free P_((OnigRegion* region, int free_self)); +ONIG_EXTERN +void onig_region_copy P_((OnigRegion* to, OnigRegion* from)); +ONIG_EXTERN +void onig_region_clear P_((OnigRegion* region)); +ONIG_EXTERN +int onig_region_resize P_((OnigRegion* region, int n)); +ONIG_EXTERN +int onig_name_to_group_numbers P_((regex_t* reg, UChar* name, UChar* name_end, + int** nums)); +ONIG_EXTERN +int onig_name_to_backref_number P_((regex_t* reg, UChar* name, UChar* name_end, OnigRegion *region)); +ONIG_EXTERN +int onig_foreach_name P_((regex_t* reg, int (*func)(UChar*,UChar*,int,int*,regex_t*,void*), void* arg)); +ONIG_EXTERN +int onig_number_of_names P_((regex_t* reg)); +ONIG_EXTERN +OnigEncoding onig_get_encoding P_((regex_t* reg)); +ONIG_EXTERN +OnigOptionType onig_get_options P_((regex_t* reg)); +ONIG_EXTERN +OnigSyntaxType* onig_get_syntax P_((regex_t* reg)); +ONIG_EXTERN +int onig_set_default_syntax P_((OnigSyntaxType* syntax)); +ONIG_EXTERN +void onig_copy_syntax P_((OnigSyntaxType* to, OnigSyntaxType* from)); +ONIG_EXTERN +void onig_set_syntax_op P_((OnigSyntaxType* syntax, unsigned int op)); +ONIG_EXTERN +void onig_set_syntax_op2 P_((OnigSyntaxType* syntax, unsigned int op2)); +ONIG_EXTERN +void onig_set_syntax_behavior P_((OnigSyntaxType* syntax, unsigned int behavior)); +ONIG_EXTERN +void onig_set_syntax_options P_((OnigSyntaxType* syntax, OnigOptionType options)); +ONIG_EXTERN +int onig_set_meta_char P_((unsigned int what, unsigned int c)); +ONIG_EXTERN +int onig_end P_((void)); +ONIG_EXTERN +const char* onig_version P_((void)); + +#endif /* ONIGURUMA_H */ diff --git a/regcomp.c b/regcomp.c new file mode 100644 index 0000000000..24d44dd1b8 --- /dev/null +++ b/regcomp.c @@ -0,0 +1,5440 @@ +/********************************************************************** + + regcomp.c - Oniguruma (regular 
expression library) + + Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#include "regparse.h" + +#ifndef PLATFORM_UNALIGNED_WORD_ACCESS +static unsigned char PadBuf[WORD_ALIGNMENT_SIZE]; +#endif + +static void +swap_node(Node* a, Node* b) +{ + Node c; + c = *a; *a = *b; *b = c; +} + +static OnigDistance +distance_add(OnigDistance d1, OnigDistance d2) +{ + if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE) + return ONIG_INFINITE_DISTANCE; + else { + if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2; + else return ONIG_INFINITE_DISTANCE; + } +} + +static OnigDistance +distance_multiply(OnigDistance d, int m) +{ + if (m == 0) return 0; + + if (d < ONIG_INFINITE_DISTANCE / m) + return d * m; + else + return ONIG_INFINITE_DISTANCE; +} + +static int +bitset_is_empty(BitSetRef bs) +{ + int i; + for (i = 0; i < BITSET_SIZE; i++) { + if (bs[i] != 0) return 0; + } + return 1; +} + +#ifdef ONIG_DEBUG +static int +bitset_on_num(BitSetRef bs) +{ + int i, n; + + n = 0; + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (BITSET_AT(bs, i)) n++; + } + return n; +} +#endif + +extern int +onig_bbuf_init(BBuf* buf, int size) +{ + buf->p = (UChar* )xmalloc(size); + if (IS_NULL(buf->p)) return(ONIGERR_MEMORY); + + buf->alloc = size; + buf->used = 0; + return 0; +} + + +#ifdef USE_SUBEXP_CALL + +static int +unset_addr_list_init(UnsetAddrList* uslist, int size) +{ + UnsetAddr* p; + + p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size); + CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY); + uslist->num = 0; + uslist->alloc = size; + uslist->us = p; + return 0; +} + +static void +unset_addr_list_end(UnsetAddrList* uslist) +{ + if (IS_NOT_NULL(uslist->us)) + xfree(uslist->us); +} + +static int +unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node) +{ + UnsetAddr* p; + int size; + + if (uslist->num >= uslist->alloc) { + size = uslist->alloc * 2; + p = (UnsetAddr* )xrealloc(uslist->us, 
sizeof(UnsetAddr) * size); + CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY); + uslist->alloc = size; + uslist->us = p; + } + + uslist->us[uslist->num].offset = offset; + uslist->us[uslist->num].target = node; + uslist->num++; + return 0; +} +#endif /* USE_SUBEXP_CALL */ + + +#if 0 +static int +bitset_mbmaxlen(BitSetRef bs, int negative, OnigEncoding enc) +{ + int i; + int len, maxlen = 0; + + if (negative) { + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (! BITSET_AT(bs, i)) { + len = enc_len(enc, i); + if (len > maxlen) maxlen = len; + } + } + } + else { + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (BITSET_AT(bs, i)) { + len = enc_len(enc, i); + if (len > maxlen) maxlen = len; + } + } + } + return maxlen; +} +#endif + +static int +add_opcode(regex_t* reg, int opcode) +{ + BBUF_ADD1(reg, opcode); + return 0; +} + +static int +add_rel_addr(regex_t* reg, int addr) +{ + RelAddrType ra = (RelAddrType )addr; + +#ifdef PLATFORM_UNALIGNED_WORD_ACCESS + BBUF_ADD(reg, &ra, SIZE_RELADDR); +#else + UChar buf[SERIALIZE_BUFSIZE]; + SERIALIZE_RELADDR(ra, buf); + BBUF_ADD(reg, buf, SIZE_RELADDR); +#endif + return 0; +} + +static int +add_abs_addr(regex_t* reg, int addr) +{ + AbsAddrType ra = (AbsAddrType )addr; + +#ifdef PLATFORM_UNALIGNED_WORD_ACCESS + BBUF_ADD(reg, &ra, SIZE_ABSADDR); +#else + UChar buf[SERIALIZE_BUFSIZE]; + SERIALIZE_ABSADDR(ra, buf); + BBUF_ADD(reg, buf, SIZE_ABSADDR); +#endif + return 0; +} + +static int +add_length(regex_t* reg, int len) +{ + LengthType l = (LengthType )len; + +#ifdef PLATFORM_UNALIGNED_WORD_ACCESS + BBUF_ADD(reg, &l, SIZE_LENGTH); +#else + UChar buf[SERIALIZE_BUFSIZE]; + SERIALIZE_LENGTH(l, buf); + BBUF_ADD(reg, buf, SIZE_LENGTH); +#endif + return 0; +} + +static int +add_mem_num(regex_t* reg, int num) +{ + MemNumType n = (MemNumType )num; + +#ifdef PLATFORM_UNALIGNED_WORD_ACCESS + BBUF_ADD(reg, &n, SIZE_MEMNUM); +#else + UChar buf[SERIALIZE_BUFSIZE]; + SERIALIZE_MEMNUM(n, buf); + BBUF_ADD(reg, buf, SIZE_MEMNUM); +#endif + return 0; +} + 
+#if 0 +static int +add_repeat_num(regex_t* reg, int num) +{ + RepeatNumType n = (RepeatNumType )num; + +#ifdef PLATFORM_UNALIGNED_WORD_ACCESS + BBUF_ADD(reg, &n, SIZE_REPEATNUM); +#else + UChar buf[SERIALIZE_BUFSIZE]; + SERIALIZE_REPEATNUM(n, buf); + BBUF_ADD(reg, buf, SIZE_REPEATNUM); +#endif + return 0; +} +#endif + +static int +add_option(regex_t* reg, OnigOptionType option) +{ +#ifdef PLATFORM_UNALIGNED_WORD_ACCESS + BBUF_ADD(reg, &option, SIZE_OPTION); +#else + UChar buf[SERIALIZE_BUFSIZE]; + SERIALIZE_OPTION(option, buf); + BBUF_ADD(reg, buf, SIZE_OPTION); +#endif + return 0; +} + +static int +add_opcode_rel_addr(regex_t* reg, int opcode, int addr) +{ + int r; + + r = add_opcode(reg, opcode); + if (r) return r; + r = add_rel_addr(reg, addr); + return r; +} + +static int +add_bytes(regex_t* reg, UChar* bytes, int len) +{ + BBUF_ADD(reg, bytes, len); + return 0; +} + +static int +add_bitset(regex_t* reg, BitSetRef bs) +{ + BBUF_ADD(reg, bs, SIZE_BITSET); + return 0; +} + +static int +add_opcode_option(regex_t* reg, int opcode, OnigOptionType option) +{ + int r; + + r = add_opcode(reg, opcode); + if (r) return r; + r = add_option(reg, option); + return r; +} + +static int compile_length_tree(Node* node, regex_t* reg); +static int compile_tree(Node* node, regex_t* reg); + + +#define IS_NEED_STR_LEN_OP_EXACT(op) \ + ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\ + (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC) + +static int +select_str_opcode(int mb_len, int str_len, int ignore_case) +{ + int op; + + switch (mb_len) { + case 1: + if (ignore_case) { + switch (str_len) { + case 1: op = OP_EXACT1_IC; break; + default: op = OP_EXACTN_IC; break; + } + } + else { + switch (str_len) { + case 1: op = OP_EXACT1; break; + case 2: op = OP_EXACT2; break; + case 3: op = OP_EXACT3; break; + case 4: op = OP_EXACT4; break; + case 5: op = OP_EXACT5; break; + default: op = OP_EXACTN; break; + } + } + break; + + case 2: + switch (str_len) { + case 1: op = 
OP_EXACTMB2N1; break; + case 2: op = OP_EXACTMB2N2; break; + case 3: op = OP_EXACTMB2N3; break; + default: op = OP_EXACTMB2N; break; + } + break; + + case 3: + op = OP_EXACTMB3N; + break; + + default: + op = OP_EXACTMBN; + break; + } + return op; +} + +static int +compile_tree_empty_check(Node* node, regex_t* reg, int empty_info) +{ + int r; + int saved_num_null_check = reg->num_null_check; + + if (empty_info != 0) { + r = add_opcode(reg, OP_NULL_CHECK_START); + if (r) return r; + r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */ + if (r) return r; + reg->num_null_check++; + } + + r = compile_tree(node, reg); + if (r) return r; + + if (empty_info != 0) { + if (empty_info == NQ_TARGET_IS_EMPTY) + r = add_opcode(reg, OP_NULL_CHECK_END); + else if (empty_info == NQ_TARGET_IS_EMPTY_MEM) + r = add_opcode(reg, OP_NULL_CHECK_END_MEMST); + else if (empty_info == NQ_TARGET_IS_EMPTY_REC) + r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH); + + if (r) return r; + r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */ + } + return r; +} + +#ifdef USE_SUBEXP_CALL +static int +compile_call(CallNode* node, regex_t* reg) +{ + int r; + + r = add_opcode(reg, OP_CALL); + if (r) return r; + r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg), + node->target); + if (r) return r; + r = add_abs_addr(reg, 0 /*dummy addr.*/); + return r; +} +#endif + +static int +compile_tree_n_times(Node* node, int n, regex_t* reg) +{ + int i, r; + + for (i = 0; i < n; i++) { + r = compile_tree(node, reg); + if (r) return r; + } + return 0; +} + +static int +add_compile_string_length(UChar* s, int mb_len, int str_len, + regex_t* reg, int ignore_case) +{ + int len; + int op = select_str_opcode(mb_len, str_len, ignore_case); + + len = SIZE_OPCODE; + if (op == OP_EXACTMBN) + len += SIZE_LENGTH; + + if (IS_NEED_STR_LEN_OP_EXACT(op)) + len += SIZE_LENGTH; + + len += mb_len * str_len; + return len; +} + +static int +add_compile_string(UChar* s, int mb_len, int str_len, 
+		   regex_t* reg, int ignore_case)
+{
+  /* Emits one exact-string instruction: the opcode chosen by
+     select_str_opcode(), then mb_len (OP_EXACTMBN only), then str_len
+     (opcodes that carry a length), then the mb_len * str_len string bytes.
+     Returns 0 on success or the first non-zero code from an add_*() helper,
+     using the same "if (r) return r" convention as the other emitters. */
+  int r;
+  int op = select_str_opcode(mb_len, str_len, ignore_case);
+
+  r = add_opcode(reg, op);
+  if (r) return r;
+
+  if (op == OP_EXACTMBN) {
+    r = add_length(reg, mb_len);
+    if (r) return r;
+  }
+
+  if (IS_NEED_STR_LEN_OP_EXACT(op)) {
+    r = add_length(reg, str_len);
+    if (r) return r;
+  }
+
+  return add_bytes(reg, s, mb_len * str_len);
+}
+
+
+/* Size of the code compile_string_node() emits for sn.  The string is cut
+   into runs of characters with the same enc_len() value and
+   add_compile_string_length() is summed over the runs.  Under IGNORECASE a
+   single-byte run is flagged once any of its characters is case-ambiguous.
+   Returns 0 for an empty string. */
+static int
+compile_length_string_node(StrNode* sn, regex_t* reg)
+{
+  int rlen, r, len, prev_len, slen, ambig, ic;
+  OnigEncoding enc = reg->enc;
+  UChar *p, *prev;
+
+  if (sn->end <= sn->s)
+    return 0;
+
+  ic = IS_IGNORECASE(reg->options);
+
+  p = prev = sn->s;
+  prev_len = enc_len(enc, *p);
+  if (ic != 0 && prev_len == 1)
+    ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p);
+  else
+    ambig = 0;
+
+  p += prev_len;
+  slen = 1;
+  rlen = 0;
+
+  for (; p < sn->end; ) {
+    len = enc_len(enc, *p);
+    if (len == prev_len) {
+      slen++;
+      if (ic != 0 && ambig == 0 && len == 1)
+        ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p);
+    }
+    else {
+      /* character width changed: close the current run */
+      r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
+      rlen += r;
+
+      if (ic != 0 && len == 1)
+        ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p);
+      else
+        ambig = 0;
+
+      prev = p;
+      slen = 1;
+      prev_len = len;
+    }
+
+    p += len;
+  }
+  r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
+  rlen += r;
+  return rlen;
+}
+
+/* Size of the code for a raw string: a single run of one-byte units,
+   never case-insensitive.  Returns 0 for an empty string. */
+static int
+compile_length_string_raw_node(StrNode* sn, regex_t* reg)
+{
+  if (sn->end <= sn->s)
+    return 0;
+
+  return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0);
+}
+
+/* Emits the code for sn run by run, mirroring compile_length_string_node().
+   Under IGNORECASE, ONIGENC_MBC_TO_LOWER(enc, p, p) is applied to
+   single-byte characters of an ambiguous run (source and destination are
+   the same pointer, so presumably the node's bytes are folded in place). */
+static int
+compile_string_node(StrNode* sn, regex_t* reg)
+{
+  int r, len, prev_len, slen, ambig, ic;
+  OnigEncoding enc = reg->enc;
+  UChar *p, *prev;
+
+  if (sn->end <= sn->s)
+    return 0;
+
+  ic = IS_IGNORECASE(reg->options);
+
+  p = prev = sn->s;
+  prev_len = enc_len(enc, *p);
+  if (ic != 0 && prev_len == 1) {
+    ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p);
+    if (ambig != 0)
+      ONIGENC_MBC_TO_LOWER(reg->enc, p, p);
+  }
+  else
+    ambig = 0;
+
+  p += prev_len;
+  slen = 1;
+
+  for (; p < sn->end; ) {
+    len = 
enc_len(enc, *p); + if (len == prev_len) { + slen++; + if (ic != 0 && len == 1) { + if (ambig == 0) + ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); + if (ambig != 0) ONIGENC_MBC_TO_LOWER(reg->enc, p, p); + } + } + else { + r = add_compile_string(prev, prev_len, slen, reg, ambig); + if (r) return r; + if (ic != 0 && len == 1) { + ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); + if (ambig != 0) ONIGENC_MBC_TO_LOWER(reg->enc, p, p); + } + else + ambig = 0; + + prev = p; + slen = 1; + prev_len = len; + } + + p += len; + } + return add_compile_string(prev, prev_len, slen, reg, ambig); +} + +static int +compile_string_raw_node(StrNode* sn, regex_t* reg) +{ + if (sn->end <= sn->s) + return 0; + + return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0); +} + +static int +add_multi_byte_cclass(BBuf* mbuf, regex_t* reg) +{ +#ifdef PLATFORM_UNALIGNED_WORD_ACCESS + add_length(reg, mbuf->used); + return add_bytes(reg, mbuf->p, mbuf->used); +#else + int r, pad_size; + UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH; + + GET_ALIGNMENT_PAD_SIZE(p, pad_size); + add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1)); + if (pad_size != 0) add_bytes(reg, PadBuf, pad_size); + + r = add_bytes(reg, mbuf->p, mbuf->used); + + /* padding for return value from compile_length_cclass_node() to be fix. */ + pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size; + if (pad_size != 0) add_bytes(reg, PadBuf, pad_size); + return r; +#endif +} + +static int +compile_length_cclass_node(CClassNode* cc, regex_t* reg) +{ + int len; + + if (IS_NULL(cc->mbuf)) { + len = SIZE_OPCODE + SIZE_BITSET; + } + else { + if (bitset_is_empty(cc->bs)) { + /* SIZE_BITSET is included in mbuf->used. 
*/
+      len = SIZE_OPCODE;
+    }
+    else {
+      len = SIZE_OPCODE + SIZE_BITSET;
+    }
+#ifdef PLATFORM_UNALIGNED_WORD_ACCESS
+    len += SIZE_LENGTH + cc->mbuf->used;
+#else
+    len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1);
+#endif
+  }
+
+  return len;
+}
+
+/* Emits the code for a character class.  Three layouts, chosen from the
+   node's contents:
+     no mbuf               -> OP_CCLASS[_NOT]      + bitset
+     mbuf, empty bitset    -> OP_CCLASS_MB[_NOT]   + multi-byte data
+     mbuf, non-empty bitset-> OP_CCLASS_MIX[_NOT]  + bitset + multi-byte data
+   The _NOT opcode is used when cc->not is set.
+   Returns 0 on success or the first non-zero code from an emit helper. */
+static int
+compile_cclass_node(CClassNode* cc, regex_t* reg)
+{
+  int r;
+
+  if (IS_NULL(cc->mbuf)) {
+    r = add_opcode(reg, cc->not ? OP_CCLASS_NOT : OP_CCLASS);
+    if (r) return r;
+    r = add_bitset(reg, cc->bs);
+  }
+  else if (bitset_is_empty(cc->bs)) {
+    r = add_opcode(reg, cc->not ? OP_CCLASS_MB_NOT : OP_CCLASS_MB);
+    if (r) return r;
+    r = add_multi_byte_cclass(cc->mbuf, reg);
+  }
+  else {
+    r = add_opcode(reg, cc->not ? OP_CCLASS_MIX_NOT : OP_CCLASS_MIX);
+    if (r) return r;
+    r = add_bitset(reg, cc->bs);
+    if (r) return r;
+    r = add_multi_byte_cclass(cc->mbuf, reg);
+  }
+
+  return r;
+}
+
+/* Records the {lower, upper} bounds for repeat id in reg->repeat_range,
+   allocating the table on first use and growing it by REPEAT_RANGE_ALLOC
+   entries when id reaches the current capacity.
+   Returns 0, or ONIGERR_MEMORY if allocation fails. */
+static int
+entry_repeat_range(regex_t* reg, int id, int lower, int upper)
+{
+#define REPEAT_RANGE_ALLOC  4
+
+  OnigRepeatRange* p;
+
+  if (reg->repeat_range_alloc == 0) {
+    p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC);
+    CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
+    reg->repeat_range = p;
+    reg->repeat_range_alloc = REPEAT_RANGE_ALLOC;
+  }
+  else if (reg->repeat_range_alloc <= id) {
+    int n;
+    n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
+    p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
+                                    sizeof(OnigRepeatRange) * n);
+    CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
+    reg->repeat_range = p;
+    reg->repeat_range_alloc = n;
+  }
+  else {
+    p = reg->repeat_range;
+  }
+
+  p[id].lower = lower;
+  p[id].upper = upper;
+  return 0;
+}
+
+/* Emits a counted repeat: OP_REPEAT[_NG] id addr, the target (wrapped in a
+   null check when empty_info != 0), then OP_REPEAT_INC[_NG] id.  The
+   bounds are stored through entry_repeat_range() under the same id. */
+static int
+compile_range_repeat_node(QualifierNode* qn, int target_len, int empty_info,
+                          regex_t* reg)
+{
+  int r;
+  int num_repeat = reg->num_repeat;
+
+  r = add_opcode(reg, qn->greedy ? 
OP_REPEAT : OP_REPEAT_NG);
+  if (r) return r;
+  r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
+  reg->num_repeat++;   /* id is consumed even if the write above failed */
+  if (r) return r;
+  r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
+  if (r) return r;
+
+  r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper);
+  if (r) return r;
+
+  r = compile_tree_empty_check(qn->target, reg, empty_info);
+  if (r) return r;
+
+  r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
+  if (r) return r;
+  r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
+  return r;
+}
+
+#define QUALIFIER_EXPAND_LIMIT_SIZE   50  /* max code size for unrolling a repeat (compared with tlen * count) */
+/* Size of the code compile_qualifier_node() emits for qn, or a negative
+   error code from compile_length_tree().  The branches below select the
+   same strategies as the emitter: anychar-star opcodes, PUSH/JUMP loop for
+   an unbounded repeat, skipped body for {0} on a referenced group,
+   unrolled copies for a small bounded greedy repeat, PUSH/JUMP for '??',
+   and OP_REPEAT/OP_REPEAT_INC otherwise. */
+static int
+compile_length_qualifier_node(QualifierNode* qn, regex_t* reg)
+{
+  int len, mod_tlen;
+  int infinite = IS_REPEAT_INFINITE(qn->upper);
+  int empty_info = qn->target_empty_info;
+  int tlen = compile_length_tree(qn->target, reg);
+
+  if (tlen < 0) return tlen;
+
+  /* anychar repeat */
+  if (NTYPE(qn->target) == N_ANYCHAR) {
+    if (qn->greedy && infinite) {
+      if (IS_NOT_NULL(qn->next_head_exact))
+        return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
+      else
+        return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
+    }
+  }
+
+  /* mod_tlen: target size including the null-check wrapper, if any */
+  if (empty_info != 0)
+    mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END);
+  else
+    mod_tlen = tlen;
+
+  if (infinite &&
+      (qn->lower <= 1 || tlen * qn->lower <= QUALIFIER_EXPAND_LIMIT_SIZE)) {
+    if (qn->lower == 1 && tlen > QUALIFIER_EXPAND_LIMIT_SIZE) {
+      len = SIZE_OP_JUMP;
+    }
+    else {
+      len = tlen * qn->lower;
+    }
+
+    if (qn->greedy) {
+      if (IS_NOT_NULL(qn->head_exact))
+        len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP;
+      else if (IS_NOT_NULL(qn->next_head_exact))
+        len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP;
+      else
+        len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP;
+    }
+    else
+      len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH;
+  }
+  else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
+    len = SIZE_OP_JUMP + tlen;
+  }
+  else if (!infinite && qn->greedy &&
+           (tlen + SIZE_OP_PUSH) * qn->upper <= 
QUALIFIER_EXPAND_LIMIT_SIZE) { + len = tlen * qn->lower; + len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower); + } + else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */ + len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen; + } + else { + len = SIZE_OP_REPEAT_INC + + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM; + } + + return len; +} + +static int +is_anychar_star_qualifier(QualifierNode* qn) +{ + if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) && + NTYPE(qn->target) == N_ANYCHAR) + return 1; + else + return 0; +} + +static int +compile_qualifier_node(QualifierNode* qn, regex_t* reg) +{ + int i, r, mod_tlen; + int infinite = IS_REPEAT_INFINITE(qn->upper); + int empty_info = qn->target_empty_info; + int tlen = compile_length_tree(qn->target, reg); + + if (tlen < 0) return tlen; + + if (is_anychar_star_qualifier(qn)) { + r = compile_tree_n_times(qn->target, qn->lower, reg); + if (r) return r; + if (IS_NOT_NULL(qn->next_head_exact)) { + if (IS_MULTILINE(reg->options)) + r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT); + else + r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT); + if (r) return r; + return add_bytes(reg, NSTRING(qn->next_head_exact).s, 1); + } + else { + if (IS_MULTILINE(reg->options)) + return add_opcode(reg, OP_ANYCHAR_ML_STAR); + else + return add_opcode(reg, OP_ANYCHAR_STAR); + } + } + + if (empty_info != 0) + mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END); + else + mod_tlen = tlen; + + if (infinite && + (qn->lower <= 1 || tlen * qn->lower <= QUALIFIER_EXPAND_LIMIT_SIZE)) { + if (qn->lower == 1 && tlen > QUALIFIER_EXPAND_LIMIT_SIZE) { + if (qn->greedy) { + if (IS_NOT_NULL(qn->head_exact)) + r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1); + else if (IS_NOT_NULL(qn->next_head_exact)) + r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT); + else + r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH); + } + else { + r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP); + } + if (r) 
return r;
+    }
+    else {
+      r = compile_tree_n_times(qn->target, qn->lower, reg);
+      if (r) return r;
+    }
+
+    /* Loop part of an unbounded repeat.  Greedy: PUSH(exit) body JUMP(back);
+       the two *_EXACT1 / *_PEEK_NEXT variants carry one operand byte taken
+       from head_exact / next_head_exact.  Non-greedy: JUMP body PUSH(back). */
+    if (qn->greedy) {
+      if (IS_NOT_NULL(qn->head_exact)) {
+        r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1,
+                                mod_tlen + SIZE_OP_JUMP);
+        if (r) return r;
+        /* operand byte; a failed write must not be silently dropped */
+        r = add_bytes(reg, NSTRING(qn->head_exact).s, 1);
+        if (r) return r;
+        r = compile_tree_empty_check(qn->target, reg, empty_info);
+        if (r) return r;
+        r = add_opcode_rel_addr(reg, OP_JUMP,
+        -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1));
+      }
+      else if (IS_NOT_NULL(qn->next_head_exact)) {
+        r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT,
+                                mod_tlen + SIZE_OP_JUMP);
+        if (r) return r;
+        r = add_bytes(reg, NSTRING(qn->next_head_exact).s, 1);
+        if (r) return r;
+        r = compile_tree_empty_check(qn->target, reg, empty_info);
+        if (r) return r;
+        r = add_opcode_rel_addr(reg, OP_JUMP,
+          -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT));
+      }
+      else {
+        r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP);
+        if (r) return r;
+        r = compile_tree_empty_check(qn->target, reg, empty_info);
+        if (r) return r;
+        r = add_opcode_rel_addr(reg, OP_JUMP,
+                     -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH));
+      }
+    }
+    else {
+      r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen);
+      if (r) return r;
+      r = compile_tree_empty_check(qn->target, reg, empty_info);
+      if (r) return r;
+      r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH));
+    }
+  }
+  else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */
+    /* body is still emitted (it is referenced) but jumped over here */
+    r = add_opcode_rel_addr(reg, OP_JUMP, tlen);
+    if (r) return r;
+    r = compile_tree(qn->target, reg);
+  }
+  else if (!infinite && qn->greedy &&
+           (tlen + SIZE_OP_PUSH) * qn->upper <= QUALIFIER_EXPAND_LIMIT_SIZE) {
+    /* small bounded greedy repeat: unroll lower mandatory copies, then
+       (upper - lower) optional copies each guarded by a PUSH to the end */
+    int n = qn->upper - qn->lower;
+
+    r = compile_tree_n_times(qn->target, qn->lower, reg);
+    if (r) return r;
+
+    for (i = 0; i < n; i++) {
+      r = add_opcode_rel_addr(reg, OP_PUSH,
+                           (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH);
+      if (r) return r;
+      r = compile_tree(qn->target, reg);
+      if 
(r) return r; + } + } + else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */ + r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP); + if (r) return r; + r = add_opcode_rel_addr(reg, OP_JUMP, tlen); + if (r) return r; + r = compile_tree(qn->target, reg); + } + else { + r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg); + } + return r; +} + +static int +compile_length_option_node(EffectNode* node, regex_t* reg) +{ + int tlen; + OnigOptionType prev = reg->options; + + reg->options = node->option; + tlen = compile_length_tree(node->target, reg); + reg->options = prev; + + if (tlen < 0) return tlen; + + if (IS_DYNAMIC_OPTION(prev ^ node->option)) { + return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL + + tlen + SIZE_OP_SET_OPTION; + } + else + return tlen; +} + +static int +compile_option_node(EffectNode* node, regex_t* reg) +{ + int r; + OnigOptionType prev = reg->options; + + if (IS_DYNAMIC_OPTION(prev ^ node->option)) { + r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option); + if (r) return r; + r = add_opcode_option(reg, OP_SET_OPTION, prev); + if (r) return r; + r = add_opcode(reg, OP_FAIL); + if (r) return r; + + reg->options = node->option; + r = compile_tree(node->target, reg); + reg->options = prev; + if (r) return r; + r = add_opcode_option(reg, OP_SET_OPTION, prev); + } + else { + reg->options = node->option; + r = compile_tree(node->target, reg); + reg->options = prev; + } + return r; +} + +static int +compile_length_effect_node(EffectNode* node, regex_t* reg) +{ + int len; + int tlen; + + if (node->type == EFFECT_OPTION) + return compile_length_option_node(node, reg); + + if (node->target) { + tlen = compile_length_tree(node->target, reg); + if (tlen < 0) return tlen; + } + else + tlen = 0; + + switch (node->type) { + case EFFECT_MEMORY: +#ifdef USE_SUBEXP_CALL + if (IS_EFFECT_CALLED(node)) { + len = SIZE_OP_MEMORY_START_PUSH + tlen + + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN; + if 
(BIT_STATUS_AT(reg->bt_mem_end, node->regnum)) + len += (IS_EFFECT_RECURSION(node) + ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH); + else + len += (IS_EFFECT_RECURSION(node) + ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END); + } + else +#endif + { + if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum)) + len = SIZE_OP_MEMORY_START_PUSH; + else + len = SIZE_OP_MEMORY_START; + + len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum) + ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END); + } + break; + + case EFFECT_STOP_BACKTRACK: + if (IS_EFFECT_SIMPLE_REPEAT(node)) { + QualifierNode* qn = &NQUALIFIER(node->target); + tlen = compile_length_tree(qn->target, reg); + if (tlen < 0) return tlen; + + len = tlen * qn->lower + + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP; + } + else { + len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT; + } + break; + + default: + return ONIGERR_TYPE_BUG; + break; + } + + return len; +} + +static int get_char_length_tree(Node* node, regex_t* reg, int* len); + +static int +compile_effect_node(EffectNode* node, regex_t* reg) +{ + int r, len; + + if (node->type == EFFECT_OPTION) + return compile_option_node(node, reg); + + switch (node->type) { + case EFFECT_MEMORY: +#ifdef USE_SUBEXP_CALL + if (IS_EFFECT_CALLED(node)) { + r = add_opcode(reg, OP_CALL); + if (r) return r; + node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP; + node->state |= NST_ADDR_FIXED; + r = add_abs_addr(reg, (int )node->call_addr); + if (r) return r; + len = compile_length_tree(node->target, reg); + len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN); + if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)) + len += (IS_EFFECT_RECURSION(node) + ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH); + else + len += (IS_EFFECT_RECURSION(node) + ? 
SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END); + + r = add_opcode_rel_addr(reg, OP_JUMP, len); + if (r) return r; + } +#endif + if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum)) + r = add_opcode(reg, OP_MEMORY_START_PUSH); + else + r = add_opcode(reg, OP_MEMORY_START); + if (r) return r; + r = add_mem_num(reg, node->regnum); + if (r) return r; + r = compile_tree(node->target, reg); + if (r) return r; +#ifdef USE_SUBEXP_CALL + if (IS_EFFECT_CALLED(node)) { + if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)) + r = add_opcode(reg, (IS_EFFECT_RECURSION(node) + ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH)); + else + r = add_opcode(reg, (IS_EFFECT_RECURSION(node) + ? OP_MEMORY_END_REC : OP_MEMORY_END)); + + if (r) return r; + r = add_mem_num(reg, node->regnum); + if (r) return r; + r = add_opcode(reg, OP_RETURN); + } + else +#endif + { + if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)) + r = add_opcode(reg, OP_MEMORY_END_PUSH); + else + r = add_opcode(reg, OP_MEMORY_END); + if (r) return r; + r = add_mem_num(reg, node->regnum); + } + break; + + case EFFECT_STOP_BACKTRACK: + if (IS_EFFECT_SIMPLE_REPEAT(node)) { + QualifierNode* qn = &NQUALIFIER(node->target); + r = compile_tree_n_times(qn->target, qn->lower, reg); + if (r) return r; + + len = compile_length_tree(qn->target, reg); + if (len < 0) return len; + + r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP); + if (r) return r; + r = compile_tree(qn->target, reg); + if (r) return r; + r = add_opcode(reg, OP_POP); + if (r) return r; + r = add_opcode_rel_addr(reg, OP_JUMP, + -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP)); + } + else { + r = add_opcode(reg, OP_PUSH_STOP_BT); + if (r) return r; + r = compile_tree(node->target, reg); + if (r) return r; + r = add_opcode(reg, OP_POP_STOP_BT); + } + break; + + default: + return ONIGERR_TYPE_BUG; + break; + } + + return r; +} + +static int +compile_length_anchor_node(AnchorNode* node, regex_t* reg) +{ + int len; + int tlen = 0; + + 
if (node->target) { + tlen = compile_length_tree(node->target, reg); + if (tlen < 0) return tlen; + } + + switch (node->type) { + case ANCHOR_PREC_READ: + len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS; + break; + case ANCHOR_PREC_READ_NOT: + len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS; + break; + case ANCHOR_LOOK_BEHIND: + len = SIZE_OP_LOOK_BEHIND + tlen; + break; + case ANCHOR_LOOK_BEHIND_NOT: + len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT; + break; + + default: + len = SIZE_OPCODE; + break; + } + + return len; +} + +static int +compile_anchor_node(AnchorNode* node, regex_t* reg) +{ + int r, len; + + switch (node->type) { + case ANCHOR_BEGIN_BUF: r = add_opcode(reg, OP_BEGIN_BUF); break; + case ANCHOR_END_BUF: r = add_opcode(reg, OP_END_BUF); break; + case ANCHOR_BEGIN_LINE: r = add_opcode(reg, OP_BEGIN_LINE); break; + case ANCHOR_END_LINE: r = add_opcode(reg, OP_END_LINE); break; + case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break; + case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break; + + case ANCHOR_WORD_BOUND: r = add_opcode(reg, OP_WORD_BOUND); break; + case ANCHOR_NOT_WORD_BOUND: r = add_opcode(reg, OP_NOT_WORD_BOUND); break; +#ifdef USE_WORD_BEGIN_END + case ANCHOR_WORD_BEGIN: r = add_opcode(reg, OP_WORD_BEGIN); break; + case ANCHOR_WORD_END: r = add_opcode(reg, OP_WORD_END); break; +#endif + + case ANCHOR_PREC_READ: + r = add_opcode(reg, OP_PUSH_POS); + if (r) return r; + r = compile_tree(node->target, reg); + if (r) return r; + r = add_opcode(reg, OP_POP_POS); + break; + + case ANCHOR_PREC_READ_NOT: + len = compile_length_tree(node->target, reg); + if (len < 0) return len; + r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS); + if (r) return r; + r = compile_tree(node->target, reg); + if (r) return r; + r = add_opcode(reg, OP_FAIL_POS); + break; + + case ANCHOR_LOOK_BEHIND: + { + int n; + r = add_opcode(reg, OP_LOOK_BEHIND); + if (r) return r; + if 
(node->char_len < 0) { + r = get_char_length_tree(node->target, reg, &n); + if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + } + else + n = node->char_len; + r = add_length(reg, n); + if (r) return r; + r = compile_tree(node->target, reg); + } + break; + + case ANCHOR_LOOK_BEHIND_NOT: + { + int n; + len = compile_length_tree(node->target, reg); + r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT, + len + SIZE_OP_FAIL_LOOK_BEHIND_NOT); + if (r) return r; + if (node->char_len < 0) { + r = get_char_length_tree(node->target, reg, &n); + if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + } + else + n = node->char_len; + r = add_length(reg, n); + if (r) return r; + r = compile_tree(node->target, reg); + if (r) return r; + r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT); + } + break; + + default: + return ONIGERR_TYPE_BUG; + break; + } + + return r; +} + +static int +compile_length_tree(Node* node, regex_t* reg) +{ + int len, type, r; + + type = NTYPE(node); + switch (type) { + case N_LIST: + len = 0; + do { + r = compile_length_tree(NCONS(node).left, reg); + if (r < 0) return r; + len += r; + } while (IS_NOT_NULL(node = NCONS(node).right)); + r = len; + break; + + case N_ALT: + { + int n; + + n = r = 0; + do { + r += compile_length_tree(NCONS(node).left, reg); + n++; + } while (IS_NOT_NULL(node = NCONS(node).right)); + r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1); + } + break; + + case N_STRING: + if (NSTRING_IS_RAW(node)) + r = compile_length_string_raw_node(&(NSTRING(node)), reg); + else + r = compile_length_string_node(&(NSTRING(node)), reg); + break; + + case N_CCLASS: + r = compile_length_cclass_node(&(NCCLASS(node)), reg); + break; + + case N_CTYPE: + case N_ANYCHAR: + r = SIZE_OPCODE; + break; + + case N_BACKREF: + { + BackrefNode* br = &(NBACKREF(node)); + + if (br->back_num == 1) { + r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 3) + ? 
SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM)); + } + else { + r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num); + } + } + break; + +#ifdef USE_SUBEXP_CALL + case N_CALL: + r = SIZE_OP_CALL; + break; +#endif + + case N_QUALIFIER: + r = compile_length_qualifier_node(&(NQUALIFIER(node)), reg); + break; + + case N_EFFECT: + r = compile_length_effect_node(&NEFFECT(node), reg); + break; + + case N_ANCHOR: + r = compile_length_anchor_node(&(NANCHOR(node)), reg); + break; + + default: + return ONIGERR_TYPE_BUG; + break; + } + + return r; +} + +static int +compile_tree(Node* node, regex_t* reg) +{ + int n, type, len, pos, r = 0; + + type = NTYPE(node); + switch (type) { + case N_LIST: + do { + r = compile_tree(NCONS(node).left, reg); + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_ALT: + { + Node* x = node; + len = 0; + do { + len += compile_length_tree(NCONS(x).left, reg); + if (NCONS(x).right != NULL) { + len += SIZE_OP_PUSH + SIZE_OP_JUMP; + } + } while (IS_NOT_NULL(x = NCONS(x).right)); + pos = reg->used + len; /* goal position */ + + do { + len = compile_length_tree(NCONS(node).left, reg); + if (IS_NOT_NULL(NCONS(node).right)) { + r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP); + if (r) break; + } + r = compile_tree(NCONS(node).left, reg); + if (r) break; + if (IS_NOT_NULL(NCONS(node).right)) { + len = pos - (reg->used + SIZE_OP_JUMP); + r = add_opcode_rel_addr(reg, OP_JUMP, len); + if (r) break; + } + } while (IS_NOT_NULL(node = NCONS(node).right)); + } + break; + + case N_STRING: + if (NSTRING_IS_RAW(node)) + r = compile_string_raw_node(&(NSTRING(node)), reg); + else + r = compile_string_node(&(NSTRING(node)), reg); + break; + + case N_CCLASS: + r = compile_cclass_node(&(NCCLASS(node)), reg); + break; + + case N_CTYPE: + { + int op; + + switch (NCTYPE(node).type) { + case CTYPE_WORD: op = OP_WORD; break; + case CTYPE_NOT_WORD: op = OP_NOT_WORD; break; + default: + return ONIGERR_TYPE_BUG; + break; + } + r = 
add_opcode(reg, op); + } + break; + + case N_ANYCHAR: + if (IS_MULTILINE(reg->options)) + r = add_opcode(reg, OP_ANYCHAR_ML); + else + r = add_opcode(reg, OP_ANYCHAR); + break; + + case N_BACKREF: + { + int i; + BackrefNode* br = &(NBACKREF(node)); + + if (br->back_num == 1) { + n = br->back_static[0]; + if (IS_IGNORECASE(reg->options)) { + r = add_opcode(reg, OP_BACKREFN_IC); + if (r) return r; + r = add_mem_num(reg, n); + } + else { + switch (n) { + case 1: r = add_opcode(reg, OP_BACKREF1); break; + case 2: r = add_opcode(reg, OP_BACKREF2); break; + case 3: r = add_opcode(reg, OP_BACKREF3); break; + default: + r = add_opcode(reg, OP_BACKREFN); + if (r) return r; + r = add_mem_num(reg, n); + break; + } + } + } + else { + int* p; + add_opcode(reg, (IS_IGNORECASE(reg->options) ? + OP_BACKREF_MULTI_IC : OP_BACKREF_MULTI)); + if (r) return r; + add_length(reg, br->back_num); + if (r) return r; + p = BACKREFS_P(br); + for (i = br->back_num - 1; i >= 0; i--) { + r = add_mem_num(reg, p[i]); + if (r) return r; + } + } + } + break; + +#ifdef USE_SUBEXP_CALL + case N_CALL: + r = compile_call(&(NCALL(node)), reg); + break; +#endif + + case N_QUALIFIER: + r = compile_qualifier_node(&(NQUALIFIER(node)), reg); + break; + + case N_EFFECT: + r = compile_effect_node(&NEFFECT(node), reg); + break; + + case N_ANCHOR: + r = compile_anchor_node(&(NANCHOR(node)), reg); + break; + + default: +#ifdef ONIG_DEBUG + fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node)); +#endif + break; + } + + return r; +} + +#ifdef USE_NAMED_GROUP +typedef struct { + int new_val; +} NumMap; + +static int +noname_disable_map(Node** plink, NumMap* map, int* counter) +{ + int r = 0; + Node* node = *plink; + + switch (NTYPE(node)) { + case N_LIST: + case N_ALT: + do { + r = noname_disable_map(&(NCONS(node).left), map, counter); + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_QUALIFIER: + { + Node** ptarget = &(NQUALIFIER(node).target); + Node* old = *ptarget; 
+ r = noname_disable_map(ptarget, map, counter); + if (*ptarget != old && NTYPE(*ptarget) == N_QUALIFIER) { + onig_reduce_nested_qualifier(node, *ptarget); + } + } + break; + + case N_EFFECT: + { + EffectNode* en = &(NEFFECT(node)); + if (en->type == EFFECT_MEMORY) { + if (IS_EFFECT_NAMED_GROUP(en)) { + (*counter)++; + map[en->regnum].new_val = *counter; + en->regnum = *counter; + r = noname_disable_map(&(en->target), map, counter); + } + else { + *plink = en->target; + en->target = NULL_NODE; + onig_node_free(node); + r = noname_disable_map(plink, map, counter); + } + } + else + r = noname_disable_map(&(en->target), map, counter); + } + break; + + default: + break; + } + + return r; +} + +static int +renumber_node_backref(Node* node, NumMap* map) +{ + int i, pos, n, old_num; + int *backs; + BackrefNode* bn = &(NBACKREF(node)); + + if (! IS_BACKREF_NAME_REF(bn)) + return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED; + + old_num = bn->back_num; + if (IS_NULL(bn->back_dynamic)) + backs = bn->back_static; + else + backs = bn->back_dynamic; + + for (i = 0, pos = 0; i < old_num; i++) { + n = map[backs[i]].new_val; + if (n > 0) { + backs[pos] = n; + pos++; + } + } + + bn->back_num = pos; + return 0; +} + +static int +renumber_by_map(Node* node, NumMap* map) +{ + int r = 0; + + switch (NTYPE(node)) { + case N_LIST: + case N_ALT: + do { + r = renumber_by_map(NCONS(node).left, map); + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + case N_QUALIFIER: + r = renumber_by_map(NQUALIFIER(node).target, map); + break; + case N_EFFECT: + r = renumber_by_map(NEFFECT(node).target, map); + break; + + case N_BACKREF: + r = renumber_node_backref(node, map); + break; + + default: + break; + } + + return r; +} + +static int +numbered_ref_check(Node* node) +{ + int r = 0; + + switch (NTYPE(node)) { + case N_LIST: + case N_ALT: + do { + r = numbered_ref_check(NCONS(node).left); + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + case N_QUALIFIER: + r 
= numbered_ref_check(NQUALIFIER(node).target); + break; + case N_EFFECT: + r = numbered_ref_check(NEFFECT(node).target); + break; + + case N_BACKREF: + if (! IS_BACKREF_NAME_REF(&(NBACKREF(node)))) + return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED; + break; + + default: + break; + } + + return r; +} + +static int +disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) +{ + int r, i, pos, counter; + BitStatusType loc; + NumMap* map; + + map = (NumMap* )xalloca(sizeof(NumMap) * (env->num_mem + 1)); + CHECK_NULL_RETURN_VAL(map, ONIGERR_MEMORY); + for (i = 1; i <= env->num_mem; i++) { + map[i].new_val = 0; + } + counter = 0; + r = noname_disable_map(root, map, &counter); + if (r != 0) return r; + + r = renumber_by_map(*root, map); + if (r != 0) return r; + + for (i = 1, pos = 1; i <= env->num_mem; i++) { + if (map[i].new_val > 0) { + SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i]; + pos++; + } + } + + loc = env->capture_history; + BIT_STATUS_CLEAR(env->capture_history); + for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { + if (BIT_STATUS_AT(loc, i)) { + BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val); + } + } + + env->num_mem = env->num_named; + reg->num_mem = env->num_named; + return 0; +} +#endif /* USE_NAMED_GROUP */ + +#ifdef USE_SUBEXP_CALL +static int +unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg) +{ + int i, offset; + EffectNode* en; + AbsAddrType addr; +#ifndef PLATFORM_UNALIGNED_WORD_ACCESS + UChar buf[SERIALIZE_BUFSIZE]; +#endif + + for (i = 0; i < uslist->num; i++) { + en = &(NEFFECT(uslist->us[i].target)); + if (! 
IS_EFFECT_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG; + addr = en->call_addr; + offset = uslist->us[i].offset; + +#ifdef PLATFORM_UNALIGNED_WORD_ACCESS + BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR); +#else + SERIALIZE_ABSADDR(addr, buf); + BBUF_WRITE(reg, offset, buf, SIZE_ABSADDR); +#endif + } + return 0; +} +#endif + +#ifdef USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK +static int +qualifiers_memory_node_info(Node* node) +{ + int r = 0; + + switch (NTYPE(node)) { + case N_LIST: + case N_ALT: + { + int v; + do { + v = qualifiers_memory_node_info(NCONS(node).left); + if (v > r) r = v; + } while (v >= 0 && IS_NOT_NULL(node = NCONS(node).right)); + } + break; + +#ifdef USE_SUBEXP_CALL + case N_CALL: + if (IS_CALL_RECURSION(&NCALL(node))) { + return NQ_TARGET_IS_EMPTY_REC; /* tiny version */ + } + else + r = qualifiers_memory_node_info(NCALL(node).target); + break; +#endif + + case N_QUALIFIER: + { + QualifierNode* qn = &(NQUALIFIER(node)); + if (qn->upper != 0) { + r = qualifiers_memory_node_info(qn->target); + } + } + break; + + case N_EFFECT: + { + EffectNode* en = &(NEFFECT(node)); + switch (en->type) { + case EFFECT_MEMORY: + return NQ_TARGET_IS_EMPTY_MEM; + break; + + case EFFECT_OPTION: + case EFFECT_STOP_BACKTRACK: + r = qualifiers_memory_node_info(en->target); + break; + default: + break; + } + } + break; + + case N_BACKREF: + case N_STRING: + case N_CTYPE: + case N_CCLASS: + case N_ANYCHAR: + case N_ANCHOR: + default: + break; + } + + return r; +} +#endif /* USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK */ + +static int +get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env) +{ + OnigDistance tmin; + int r = 0; + + *min = 0; + switch (NTYPE(node)) { + case N_BACKREF: + { + int i; + int* backs; + Node** nodes = SCANENV_MEM_NODES(env); + BackrefNode* br = &(NBACKREF(node)); + if (br->state & NST_RECURSION) break; + + backs = BACKREFS_P(br); + if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF; + r = 
get_min_match_length(nodes[backs[0]], min, env); + if (r != 0) break; + for (i = 1; i < br->back_num; i++) { + if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; + r = get_min_match_length(nodes[backs[i]], &tmin, env); + if (r != 0) break; + if (*min > tmin) *min = tmin; + } + } + break; + +#ifdef USE_SUBEXP_CALL + case N_CALL: + if (IS_CALL_RECURSION(&NCALL(node))) { + EffectNode* en = &(NEFFECT(NCALL(node).target)); + if (IS_EFFECT_MIN_FIXED(en)) + *min = en->min_len; + } + else + r = get_min_match_length(NCALL(node).target, min, env); + break; +#endif + + case N_LIST: + do { + r = get_min_match_length(NCONS(node).left, &tmin, env); + if (r == 0) *min += tmin; + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_ALT: + { + Node *x, *y; + y = node; + do { + x = NCONS(y).left; + r = get_min_match_length(x, &tmin, env); + if (r != 0) break; + if (y == node) *min = tmin; + else if (*min > tmin) *min = tmin; + } while (r == 0 && IS_NOT_NULL(y = NCONS(y).right)); + } + break; + + case N_STRING: + { + StrNode* sn = &(NSTRING(node)); + *min = sn->end - sn->s; + } + break; + + case N_CTYPE: + switch (NCTYPE(node).type) { + case CTYPE_WORD: *min = 1; break; + case CTYPE_NOT_WORD: *min = 1; break; + default: + break; + } + break; + + case N_CCLASS: + case N_ANYCHAR: + *min = 1; + break; + + case N_QUALIFIER: + { + QualifierNode* qn = &(NQUALIFIER(node)); + + if (qn->lower > 0) { + r = get_min_match_length(qn->target, min, env); + if (r == 0) + *min = distance_multiply(*min, qn->lower); + } + } + break; + + case N_EFFECT: + { + EffectNode* en = &(NEFFECT(node)); + switch (en->type) { + case EFFECT_MEMORY: +#ifdef USE_SUBEXP_CALL + if (IS_EFFECT_MIN_FIXED(en)) + *min = en->min_len; + else { + r = get_min_match_length(en->target, min, env); + if (r == 0) { + en->min_len = *min; + SET_EFFECT_STATUS(node, NST_MIN_FIXED); + } + } + break; +#endif + case EFFECT_OPTION: + case EFFECT_STOP_BACKTRACK: + r = get_min_match_length(en->target, min, 
env); + break; + } + } + break; + + case N_ANCHOR: + default: + break; + } + + return r; +} + +static int +get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env) +{ + OnigDistance tmax; + int r = 0; + + *max = 0; + switch (NTYPE(node)) { + case N_LIST: + do { + r = get_max_match_length(NCONS(node).left, &tmax, env); + if (r == 0) + *max = distance_add(*max, tmax); + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_ALT: + do { + r = get_max_match_length(NCONS(node).left, &tmax, env); + if (r == 0 && *max < tmax) *max = tmax; + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_STRING: + { + StrNode* sn = &(NSTRING(node)); + *max = sn->end - sn->s; + } + break; + + case N_CTYPE: + switch (NCTYPE(node).type) { + case CTYPE_WORD: + case CTYPE_NOT_WORD: + *max = ONIGENC_MBC_MAXLEN_DIST(env->enc); + break; + + default: + break; + } + break; + + case N_CCLASS: + case N_ANYCHAR: + *max = ONIGENC_MBC_MAXLEN_DIST(env->enc); + break; + + case N_BACKREF: + { + int i; + int* backs; + Node** nodes = SCANENV_MEM_NODES(env); + BackrefNode* br = &(NBACKREF(node)); + if (br->state & NST_RECURSION) { + *max = ONIG_INFINITE_DISTANCE; + break; + } + backs = BACKREFS_P(br); + for (i = 0; i < br->back_num; i++) { + if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; + r = get_max_match_length(nodes[backs[i]], &tmax, env); + if (r != 0) break; + if (*max < tmax) *max = tmax; + } + } + break; + +#ifdef USE_SUBEXP_CALL + case N_CALL: + if (! IS_CALL_RECURSION(&(NCALL(node)))) + r = get_max_match_length(NCALL(node).target, max, env); + else + *max = ONIG_INFINITE_DISTANCE; + break; +#endif + + case N_QUALIFIER: + { + QualifierNode* qn = &(NQUALIFIER(node)); + + if (qn->upper != 0) { + r = get_max_match_length(qn->target, max, env); + if (r == 0 && *max != 0) { + if (! 
IS_REPEAT_INFINITE(qn->upper)) + *max = distance_multiply(*max, qn->upper); + else + *max = ONIG_INFINITE_DISTANCE; + } + } + } + break; + + case N_EFFECT: + { + EffectNode* en = &(NEFFECT(node)); + switch (en->type) { + case EFFECT_MEMORY: +#ifdef USE_SUBEXP_CALL + if (IS_EFFECT_MAX_FIXED(en)) + *max = en->max_len; + else { + r = get_max_match_length(en->target, max, env); + if (r == 0) { + en->max_len = *max; + SET_EFFECT_STATUS(node, NST_MAX_FIXED); + } + } + break; +#endif + case EFFECT_OPTION: + case EFFECT_STOP_BACKTRACK: + r = get_max_match_length(en->target, max, env); + break; + } + } + break; + + case N_ANCHOR: + default: + break; + } + + return r; +} + +#define GET_CHAR_LEN_VARLEN -1 +#define GET_CHAR_LEN_TOP_ALT_VARLEN -2 + +/* fixed size pattern node only */ +static int +get_char_length_tree1(Node* node, regex_t* reg, int* len, int level) +{ + int tlen; + int r = 0; + + level++; + *len = 0; + switch (NTYPE(node)) { + case N_LIST: + do { + r = get_char_length_tree1(NCONS(node).left, reg, &tlen, level); + if (r == 0) + *len = distance_add(*len, tlen); + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_ALT: + { + int tlen2; + int varlen = 0; + + r = get_char_length_tree1(NCONS(node).left, reg, &tlen, level); + while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)) { + r = get_char_length_tree1(NCONS(node).left, reg, &tlen2, level); + if (r == 0) { + if (tlen != tlen2) + varlen = 1; + } + } + if (r == 0) { + if (varlen != 0) { + if (level == 1) + r = GET_CHAR_LEN_TOP_ALT_VARLEN; + else + r = GET_CHAR_LEN_VARLEN; + } + else + *len = tlen; + } + } + break; + + case N_STRING: + { + StrNode* sn = &(NSTRING(node)); + UChar *s = sn->s; + while (s < sn->end) { + s += enc_len(reg->enc, *s); + (*len)++; + } + } + break; + + case N_QUALIFIER: + { + QualifierNode* qn = &(NQUALIFIER(node)); + if (qn->lower == qn->upper) { + r = get_char_length_tree1(qn->target, reg, &tlen, level); + if (r == 0) + *len = distance_multiply(tlen, 
qn->lower); + } + else + r = GET_CHAR_LEN_VARLEN; + } + break; + +#ifdef USE_SUBEXP_CALL + case N_CALL: + if (! IS_CALL_RECURSION(&(NCALL(node)))) + r = get_char_length_tree1(NCALL(node).target, reg, len, level); + else + r = GET_CHAR_LEN_VARLEN; + break; +#endif + + case N_CTYPE: + switch (NCTYPE(node).type) { + case CTYPE_WORD: + case CTYPE_NOT_WORD: + *len = 1; + break; + } + break; + + case N_CCLASS: + case N_ANYCHAR: + *len = 1; + break; + + case N_EFFECT: + { + EffectNode* en = &(NEFFECT(node)); + switch (en->type) { + case EFFECT_MEMORY: +#ifdef USE_SUBEXP_CALL + if (IS_EFFECT_CLEN_FIXED(en)) + *len = en->char_len; + else { + r = get_char_length_tree1(en->target, reg, len, level); + if (r == 0) { + en->char_len = *len; + SET_EFFECT_STATUS(node, NST_CLEN_FIXED); + } + } + break; +#endif + case EFFECT_OPTION: + case EFFECT_STOP_BACKTRACK: + r = get_char_length_tree1(en->target, reg, len, level); + break; + default: + break; + } + } + break; + + case N_ANCHOR: + break; + + default: + r = GET_CHAR_LEN_VARLEN; + break; + } + + return r; +} + +static int +get_char_length_tree(Node* node, regex_t* reg, int* len) +{ + return get_char_length_tree1(node, reg, len, 0); +} + +extern int +onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) +{ + int found; + + if (code >= SINGLE_BYTE_SIZE) { + if (IS_NULL(cc->mbuf)) { + found = 0; + } + else { + found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0); + } + } + else { + found = (BITSET_AT(cc->bs, code) == 0 ? 
0 : 1); + } + + if (cc->not == 0) + return found; + else + return !found; +} + +/* x is not included y ==> 1 : 0 */ +static int +is_not_included(Node* x, Node* y, regex_t* reg) +{ + int i, len; + OnigCodePoint code; + UChar *p, c; + int ytype; + + retry: + ytype = NTYPE(y); + switch (NTYPE(x)) { + case N_CTYPE: + { + switch (ytype) { + case N_CTYPE: + switch (NCTYPE(x).type) { + case CTYPE_WORD: + if (NCTYPE(y).type == CTYPE_NOT_WORD) + return 1; + else + return 0; + break; + case CTYPE_NOT_WORD: + if (NCTYPE(y).type == CTYPE_WORD) + return 1; + else + return 0; + break; + default: + break; + } + break; + + case N_CCLASS: + swap: + { + Node* tmp; + tmp = x; x = y; y = tmp; + goto retry; + } + break; + + case N_STRING: + goto swap; + break; + + default: + break; + } + } + break; + + case N_CCLASS: + { + CClassNode* xc = &(NCCLASS(x)); + switch (ytype) { + case N_CTYPE: + switch (NCTYPE(y).type) { + case CTYPE_WORD: + if (IS_NULL(xc->mbuf) && xc->not == 0) { + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (BITSET_AT(xc->bs, i)) { + if (ONIGENC_IS_CODE_SB_WORD(reg->enc, i)) return 0; + } + } + return 1; + } + return 0; + break; + case CTYPE_NOT_WORD: + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (! ONIGENC_IS_CODE_SB_WORD(reg->enc, i)) { + if (xc->not == 0) { + if (BITSET_AT(xc->bs, i)) + return 0; + } + else { + if (! 
BITSET_AT(xc->bs, i)) + return 0; + } + } + } + return 1; + break; + + default: + break; + } + break; + + case N_CCLASS: + { + int v; + CClassNode* yc = &(NCCLASS(y)); + + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + v = BITSET_AT(xc->bs, i); + if ((v != 0 && xc->not == 0) || (v == 0 && xc->not)) { + v = BITSET_AT(yc->bs, i); + if ((v != 0 && yc->not == 0) || (v == 0 && yc->not)) + return 0; + } + } + if ((IS_NULL(xc->mbuf) && xc->not == 0) || + (IS_NULL(yc->mbuf) && yc->not == 0)) + return 1; + return 0; + } + break; + + case N_STRING: + goto swap; + break; + + default: + break; + } + } + break; + + case N_STRING: + { + StrNode* xs = &(NSTRING(x)); + if (NSTRING_LEN(x) == 0) + break; + + c = *(xs->s); + switch (ytype) { + case N_CTYPE: + switch (NCTYPE(y).type) { + case CTYPE_WORD: + return (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end) ? 0 : 1); + break; + case CTYPE_NOT_WORD: + return (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end) ? 1 : 0); + break; + default: + break; + } + break; + + case N_CCLASS: + { + CClassNode* cc = &(NCCLASS(y)); + + code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s, + xs->s + enc_len(reg->enc, c)); + return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 
0 : 1); + } + break; + + case N_STRING: + { + UChar *q; + StrNode* ys = &(NSTRING(y)); + len = NSTRING_LEN(x); + if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y); + if (NSTRING_IS_CASE_AMBIG(x) || NSTRING_IS_CASE_AMBIG(y)) { + UChar plow[ONIGENC_MBC_TO_LOWER_MAXLEN]; + UChar qlow[ONIGENC_MBC_TO_LOWER_MAXLEN]; + int plen, qlen; + /* stop at the end of whichever string is shorter; bounding only q let p read past ys->end when y is shorter than x */ + for (p = ys->s, q = xs->s; p < ys->end && q < xs->end; ) { + plen = ONIGENC_MBC_TO_LOWER(reg->enc, p, plow); + qlen = ONIGENC_MBC_TO_LOWER(reg->enc, q, qlow); + if (plen != qlen || onig_strncmp(plow, qlow, plen) != 0) + return 1; + p += enc_len(reg->enc, *p); + q += enc_len(reg->enc, *q); + } + } + else { + for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) { + if (*p != *q) return 1; + } + } + } + break; + + default: + break; + } + } + break; + + default: + break; + } + + return 0; +} + +static Node* +get_head_value_node(Node* node, int exact, regex_t* reg) +{ + Node* n = NULL_NODE; + + switch (NTYPE(node)) { + case N_BACKREF: + case N_ALT: + case N_ANYCHAR: +#ifdef USE_SUBEXP_CALL + case N_CALL: +#endif + break; + + case N_CTYPE: + case N_CCLASS: + if (exact == 0) { + n = node; + } + break; + + case N_LIST: + n = get_head_value_node(NCONS(node).left, exact, reg); + break; + + case N_STRING: + { + StrNode* sn = &(NSTRING(node)); + + if (sn->end <= sn->s) + break; + + if (exact != 0 && + !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) { + if (! 
ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, sn->s)) + n = node; + } + else { + n = node; + } + } + break; + + case N_QUALIFIER: + { + QualifierNode* qn = &(NQUALIFIER(node)); + if (qn->lower > 0) { + if (IS_NOT_NULL(qn->head_exact)) + n = qn->head_exact; + else + n = get_head_value_node(qn->target, exact, reg); + } + } + break; + + case N_EFFECT: + { + EffectNode* en = &(NEFFECT(node)); + switch (en->type) { + case EFFECT_OPTION: + { + OnigOptionType options = reg->options; + + reg->options = NEFFECT(node).option; + n = get_head_value_node(NEFFECT(node).target, exact, reg); + reg->options = options; + } + break; + + case EFFECT_MEMORY: + case EFFECT_STOP_BACKTRACK: + n = get_head_value_node(en->target, exact, reg); + break; + } + } + break; + + case N_ANCHOR: + if (NANCHOR(node).type == ANCHOR_PREC_READ) + n = get_head_value_node(NANCHOR(node).target, exact, reg); + break; + + default: + break; + } + + return n; +} + +static int +check_type_tree(Node* node, int type_mask, int effect_mask, int anchor_mask) +{ + int type, r = 0; + + type = NTYPE(node); + if ((type & type_mask) == 0) + return 1; + + switch (type) { + case N_LIST: + case N_ALT: + do { + r = check_type_tree(NCONS(node).left, type_mask, effect_mask, anchor_mask); + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_QUALIFIER: + r = check_type_tree(NQUALIFIER(node).target, type_mask, effect_mask, + anchor_mask); + break; + + case N_EFFECT: + { + EffectNode* en = &(NEFFECT(node)); + if ((en->type & effect_mask) == 0) + return 1; + + r = check_type_tree(en->target, type_mask, effect_mask, anchor_mask); + } + break; + + case N_ANCHOR: + type = NANCHOR(node).type; + if ((type & anchor_mask) == 0) + return 1; + + if (NANCHOR(node).target) + r = check_type_tree(NANCHOR(node).target, + type_mask, effect_mask, anchor_mask); + break; + + default: + break; + } + return r; +} + +#ifdef USE_SUBEXP_CALL + +#define RECURSION_EXIST 1 +#define RECURSION_INFINITE 2 + +static int 
+subexp_inf_recursive_check(Node* node, ScanEnv* env, int head) +{ + int type; + int r = 0; + + type = NTYPE(node); + switch (type) { + case N_LIST: + { + Node *x; + OnigDistance min; + int ret; + + x = node; + do { + ret = subexp_inf_recursive_check(NCONS(x).left, env, head); + if (ret < 0 || ret == RECURSION_INFINITE) return ret; + r |= ret; + if (head) { + ret = get_min_match_length(NCONS(x).left, &min, env); + if (ret != 0) return ret; + if (min != 0) head = 0; + } + } while (IS_NOT_NULL(x = NCONS(x).right)); + } + break; + + case N_ALT: + { + int ret; + r = RECURSION_EXIST; + do { + ret = subexp_inf_recursive_check(NCONS(node).left, env, head); + if (ret < 0 || ret == RECURSION_INFINITE) return ret; + r &= ret; + } while (IS_NOT_NULL(node = NCONS(node).right)); + } + break; + + case N_QUALIFIER: + r = subexp_inf_recursive_check(NQUALIFIER(node).target, env, head); + break; + + case N_ANCHOR: + { + AnchorNode* an = &(NANCHOR(node)); + switch (an->type) { + case ANCHOR_PREC_READ: + case ANCHOR_PREC_READ_NOT: + case ANCHOR_LOOK_BEHIND: + case ANCHOR_LOOK_BEHIND_NOT: + r = subexp_inf_recursive_check(an->target, env, head); + break; + } + } + break; + + case N_CALL: + r = subexp_inf_recursive_check(NCALL(node).target, env, head); + break; + + case N_EFFECT: + if (IS_EFFECT_MARK2(&(NEFFECT(node)))) + return 0; + else if (IS_EFFECT_MARK1(&(NEFFECT(node)))) + return (head == 0 ? 
RECURSION_EXIST : RECURSION_INFINITE); + else { + SET_EFFECT_STATUS(node, NST_MARK2); + r = subexp_inf_recursive_check(NEFFECT(node).target, env, head); + CLEAR_EFFECT_STATUS(node, NST_MARK2); + } + break; + + default: + break; + } + + return r; +} + +static int +subexp_inf_recursive_check_trav(Node* node, ScanEnv* env) +{ + int type; + int r = 0; + + type = NTYPE(node); + switch (type) { + case N_LIST: + case N_ALT: + do { + r = subexp_inf_recursive_check_trav(NCONS(node).left, env); + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_QUALIFIER: + r = subexp_inf_recursive_check_trav(NQUALIFIER(node).target, env); + break; + + case N_ANCHOR: + { + AnchorNode* an = &(NANCHOR(node)); + switch (an->type) { + case ANCHOR_PREC_READ: + case ANCHOR_PREC_READ_NOT: + case ANCHOR_LOOK_BEHIND: + case ANCHOR_LOOK_BEHIND_NOT: + r = subexp_inf_recursive_check_trav(an->target, env); + break; + } + } + break; + + case N_EFFECT: + { + EffectNode* en = &(NEFFECT(node)); + + if (IS_EFFECT_RECURSION(en)) { + SET_EFFECT_STATUS(node, NST_MARK1); + r = subexp_inf_recursive_check(en->target, env, 1); + if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION; + CLEAR_EFFECT_STATUS(node, NST_MARK1); + } + r = subexp_inf_recursive_check_trav(en->target, env); + } + + break; + + default: + break; + } + + return r; +} + +static int +subexp_recursive_check(Node* node) +{ + int type; + int r = 0; + + type = NTYPE(node); + switch (type) { + case N_LIST: + case N_ALT: + do { + r |= subexp_recursive_check(NCONS(node).left); + } while (IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_QUALIFIER: + r = subexp_recursive_check(NQUALIFIER(node).target); + break; + + case N_ANCHOR: + { + AnchorNode* an = &(NANCHOR(node)); + switch (an->type) { + case ANCHOR_PREC_READ: + case ANCHOR_PREC_READ_NOT: + case ANCHOR_LOOK_BEHIND: + case ANCHOR_LOOK_BEHIND_NOT: + r = subexp_recursive_check(an->target); + break; + } + } + break; + + case N_CALL: + r = 
subexp_recursive_check(NCALL(node).target); + if (r != 0) SET_CALL_RECURSION(node); + break; + + case N_EFFECT: + if (IS_EFFECT_MARK2(&(NEFFECT(node)))) + return 0; + else if (IS_EFFECT_MARK1(&(NEFFECT(node)))) + return 1; /* recursion */ + else { + SET_EFFECT_STATUS(node, NST_MARK2); + r = subexp_recursive_check(NEFFECT(node).target); + CLEAR_EFFECT_STATUS(node, NST_MARK2); + } + break; + + default: + break; + } + + return r; +} + + +static int +subexp_recursive_check_trav(Node* node, ScanEnv* env) +{ +#define FOUND_CALLED_NODE 1 + + int type; + int r = 0; + + type = NTYPE(node); + switch (type) { + case N_LIST: + case N_ALT: + { + int ret; + do { + ret = subexp_recursive_check_trav(NCONS(node).left, env); + if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE; + else if (ret < 0) return ret; + } while (IS_NOT_NULL(node = NCONS(node).right)); + } + break; + + case N_QUALIFIER: + r = subexp_recursive_check_trav(NQUALIFIER(node).target, env); + if (NQUALIFIER(node).upper == 0) { + if (r == FOUND_CALLED_NODE) + NQUALIFIER(node).is_refered = 1; + } + break; + + case N_ANCHOR: + { + AnchorNode* an = &(NANCHOR(node)); + switch (an->type) { + case ANCHOR_PREC_READ: + case ANCHOR_PREC_READ_NOT: + case ANCHOR_LOOK_BEHIND: + case ANCHOR_LOOK_BEHIND_NOT: + r = subexp_recursive_check_trav(an->target, env); + break; + } + } + break; + + case N_EFFECT: + { + EffectNode* en = &(NEFFECT(node)); + + if (! 
IS_EFFECT_RECURSION(en)) { + if (IS_EFFECT_CALLED(en)) { + SET_EFFECT_STATUS(node, NST_MARK1); + r = subexp_recursive_check(en->target); + if (r != 0) SET_EFFECT_STATUS(node, NST_RECURSION); + CLEAR_EFFECT_STATUS(node, NST_MARK1); + } + } + r = subexp_recursive_check_trav(en->target, env); + if (IS_EFFECT_CALLED(en)) + r |= FOUND_CALLED_NODE; + } + break; + + default: + break; + } + + return r; +} + +static int +setup_subexp_call(Node* node, ScanEnv* env) +{ + int type; + int r = 0; + + type = NTYPE(node); + switch (type) { + case N_LIST: + do { + r = setup_subexp_call(NCONS(node).left, env); + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_ALT: + do { + r = setup_subexp_call(NCONS(node).left, env); + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_QUALIFIER: + r = setup_subexp_call(NQUALIFIER(node).target, env); + break; + case N_EFFECT: + r = setup_subexp_call(NEFFECT(node).target, env); + break; + + case N_CALL: + { + int n, num, *refs; + UChar *p; + CallNode* cn = &(NCALL(node)); + Node** nodes = SCANENV_MEM_NODES(env); + +#ifdef USE_NAMED_GROUP + n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end, &refs); +#else + n = -1; +#endif + if (n <= 0) { + /* name not found, check group number. 
(?*ddd) */ + p = cn->name; + num = onig_scan_unsigned_number(&p, cn->name_end, env->enc); + if (num <= 0 || p != cn->name_end) { + onig_scan_env_set_error_string(env, + ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end); + return ONIGERR_UNDEFINED_NAME_REFERENCE; + } +#ifdef USE_NAMED_GROUP + if (env->num_named > 0 && + IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && + !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) { + return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED; + } +#endif + if (num > env->num_mem) { + onig_scan_env_set_error_string(env, + ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end); + return ONIGERR_UNDEFINED_GROUP_REFERENCE; + } + cn->ref_num = num; + goto set_call_attr; + } + else if (n > 1) { + onig_scan_env_set_error_string(env, + ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end); + return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL; + } + else { + cn->ref_num = refs[0]; + set_call_attr: + cn->target = nodes[cn->ref_num]; + if (IS_NULL(cn->target)) { + onig_scan_env_set_error_string(env, + ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end); + return ONIGERR_UNDEFINED_NAME_REFERENCE; + } + SET_EFFECT_STATUS(cn->target, NST_CALLED); + BIT_STATUS_ON_AT(env->bt_mem_start, cn->ref_num); + cn->unset_addr_list = env->unset_addr_list; + } + } + break; + + case N_ANCHOR: + { + AnchorNode* an = &(NANCHOR(node)); + + switch (an->type) { + case ANCHOR_PREC_READ: + case ANCHOR_PREC_READ_NOT: + case ANCHOR_LOOK_BEHIND: + case ANCHOR_LOOK_BEHIND_NOT: + r = setup_subexp_call(an->target, env); + break; + } + } + break; + + default: + break; + } + + return r; +} +#endif + +/* divide different length alternatives in look-behind. + (?<=A|B) ==> (?<=A)|(?<=B) + (? 
(?type; + + head = an->target; + np = NCONS(head).left; + tmp_node = *node; *node = *head; *head = tmp_node; + NCONS(node).left = head; + NANCHOR(head).target = np; + + np = node; + while ((np = NCONS(np).right) != NULL_NODE) { + insert_node = onig_node_new_anchor(anc_type); + CHECK_NULL_RETURN_VAL(insert_node, ONIGERR_MEMORY); + NANCHOR(insert_node).target = NCONS(np).left; + NCONS(np).left = insert_node; + } + + if (anc_type == ANCHOR_LOOK_BEHIND_NOT) { + np = node; + do { + np->type = N_LIST; /* alt -> list */ + } while ((np = NCONS(np).right) != NULL_NODE); + } + return 0; +} + +static int +setup_look_behind(Node* node, regex_t* reg, ScanEnv* env) +{ + int r, len; + AnchorNode* an = &(NANCHOR(node)); + + r = get_char_length_tree(an->target, reg, &len); + if (r == 0) + an->char_len = len; + else if (r == GET_CHAR_LEN_VARLEN) + r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) { + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND)) + r = divide_look_behind_alternatives(node); + else + r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + } + + return r; +} + +static int +next_setup(Node* node, Node* next_node, regex_t* reg) +{ + int type; + + retry: + type = NTYPE(node); + if (type == N_QUALIFIER) { + QualifierNode* qn = &(NQUALIFIER(node)); + if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) { +#ifdef USE_QUALIFIER_PEEK_NEXT + qn->next_head_exact = get_head_value_node(next_node, 1, reg); +#endif + /* automatic posseivation a*b ==> (?>a*)b */ + if (qn->lower <= 1) { + int ttype = NTYPE(qn->target); + if (IS_NODE_TYPE_SIMPLE(ttype)) { + Node *x, *y; + x = get_head_value_node(qn->target, 0, reg); + if (IS_NOT_NULL(x)) { + y = get_head_value_node(next_node, 0, reg); + if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) { + Node* en = onig_node_new_effect(EFFECT_STOP_BACKTRACK); + CHECK_NULL_RETURN_VAL(en, ONIGERR_MEMORY); + SET_EFFECT_STATUS(en, NST_SIMPLE_REPEAT); + swap_node(node, en); + NEFFECT(node).target = en; + } + } + 
} + } + } + } + else if (type == N_EFFECT) { + EffectNode* en = &(NEFFECT(node)); + if (en->type == EFFECT_MEMORY) { + node = en->target; + goto retry; + } + } + return 0; +} + +#define IN_ALT (1<<0) +#define IN_NOT (1<<1) +#define IN_REPEAT (1<<2) + +/* setup_tree does the following work. + 1. check empty loop. (set qn->target_empty_info) + 2. expand ignore-case in char class. + 3. set memory status bit flags. (reg->mem_stats) + 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact]. + 5. find invalid patterns in look-behind. + 6. expand repeated string. + */ +static int +setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) +{ + int type; + int r = 0; + + type = NTYPE(node); + switch (type) { + case N_LIST: + { + Node* prev = NULL_NODE; + do { + r = setup_tree(NCONS(node).left, reg, state, env); + if (IS_NOT_NULL(prev) && r == 0) { + r = next_setup(prev, NCONS(node).left, reg); + } + prev = NCONS(node).left; + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + } + break; + + case N_ALT: + do { + r = setup_tree(NCONS(node).left, reg, (state | IN_ALT), env); + } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + break; + + case N_CCLASS: + if (IS_IGNORECASE(reg->options)) { + int i; + UChar c, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + BitSetRef bs = NCCLASS(node).bs; + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + c = (UChar )i; + ONIGENC_MBC_TO_LOWER(reg->enc, &c, lowbuf); + if (*lowbuf != c) { + if (BITSET_AT(bs, c)) BITSET_SET_BIT(bs, *lowbuf); + if (BITSET_AT(bs, *lowbuf)) BITSET_SET_BIT(bs, c); + } + } + } + break; + + case N_STRING: + if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) { + StrNode* sn = &NSTRING(node); + UChar* p = sn->s; + + while (p < sn->end) { + if (ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p)) { + NSTRING_SET_CASE_AMBIG(node); + break; + } + p++; + } + } + break; + + case N_CTYPE: + case N_ANYCHAR: + break; + +#ifdef USE_SUBEXP_CALL + case N_CALL: + break; +#endif + + case N_BACKREF: + { + int i; + 
int* p; + Node** nodes = SCANENV_MEM_NODES(env); + BackrefNode* br = &(NBACKREF(node)); + p = BACKREFS_P(br); + for (i = 0; i < br->back_num; i++) { + if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; + BIT_STATUS_ON_AT(env->backrefed_mem, p[i]); + BIT_STATUS_ON_AT(env->bt_mem_start, p[i]); + SET_EFFECT_STATUS(nodes[p[i]], NST_MEM_BACKREFED); + } + } + break; + + case N_QUALIFIER: + { + OnigDistance d; + QualifierNode* qn = &(NQUALIFIER(node)); + Node* target = qn->target; + + if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) { + r = get_min_match_length(target, &d, env); + if (r) break; + if (d == 0) { + qn->target_empty_info = NQ_TARGET_IS_EMPTY; +#ifdef USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK + r = qualifiers_memory_node_info(target); + if (r < 0) break; + if (r > 0) { + qn->target_empty_info = r; + } +#endif +#if 0 + r = get_max_match_length(target, &d, env); + if (r == 0 && d == 0) { + /* ()* ==> ()?, ()+ ==> () */ + qn->upper = 1; + if (qn->lower > 1) qn->lower = 1; + if (NTYPE(target) == N_STRING) { + qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */ + } + } +#endif + } + } + + if (qn->lower != qn->upper) + state |= IN_REPEAT; + r = setup_tree(target, reg, state, env); + if (r) break; + + /* expand string */ +#define EXPAND_STRING_MAX_LENGTH 100 + if (NTYPE(target) == N_STRING) { + if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper && + qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) { + int len = NSTRING_LEN(target); + StrNode* sn = &(NSTRING(target)); + + if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) { + int i, n = qn->lower; + onig_node_conv_to_str_node(node, NSTRING(target).flag); + for (i = 0; i < n; i++) { + r = onig_node_str_cat(node, sn->s, sn->end); + if (r) break; + } + onig_node_free(target); + break; /* break case N_QUALIFIER: */ + } + } + } + +#ifdef USE_OP_PUSH_OR_JUMP_EXACT + if (qn->greedy && (qn->target_empty_info != 0)) { + if (NTYPE(target) == N_QUALIFIER) { + QualifierNode* tqn = &(NQUALIFIER(target)); 
+ if (IS_NOT_NULL(tqn->head_exact)) { + qn->head_exact = tqn->head_exact; + tqn->head_exact = NULL; + } + } + else { + qn->head_exact = get_head_value_node(qn->target, 1, reg); + } + } +#endif + } + break; + + case N_EFFECT: + { + EffectNode* en = &(NEFFECT(node)); + + switch (en->type) { + case EFFECT_OPTION: + { + OnigOptionType options = reg->options; + reg->options = NEFFECT(node).option; + r = setup_tree(NEFFECT(node).target, reg, state, env); + reg->options = options; + } + break; + + case EFFECT_MEMORY: + if ((state & (IN_ALT | IN_NOT | IN_REPEAT)) != 0) { + BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum); + /* SET_EFFECT_STATUS(node, NST_MEM_IN_ALT_NOT); */ + } + /* fall */ + case EFFECT_STOP_BACKTRACK: + { + Node* target = en->target; + r = setup_tree(target, reg, state, env); + if (NTYPE(target) == N_QUALIFIER) { + QualifierNode* tqn = &(NQUALIFIER(target)); + if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 && + tqn->greedy != 0) { /* (?>a*), a*+ etc... */ + int qtype = NTYPE(tqn->target); + if (IS_NODE_TYPE_SIMPLE(qtype)) + SET_EFFECT_STATUS(node, NST_SIMPLE_REPEAT); + } + } + } + break; + } + } + break; + + case N_ANCHOR: + { + AnchorNode* an = &(NANCHOR(node)); + + switch (an->type) { + case ANCHOR_PREC_READ: + r = setup_tree(an->target, reg, state, env); + break; + case ANCHOR_PREC_READ_NOT: + r = setup_tree(an->target, reg, (state | IN_NOT), env); + break; + +/* allowed node types in look-behind */ +#define ALLOWED_TYPE_IN_LB \ + ( N_LIST | N_ALT | N_STRING | N_CCLASS | N_CTYPE | \ + N_ANYCHAR | N_ANCHOR | N_EFFECT | N_QUALIFIER | N_CALL ) + +#define ALLOWED_EFFECT_IN_LB ( EFFECT_MEMORY ) +#define ALLOWED_EFFECT_IN_LB_NOT 0 + +#define ALLOWED_ANCHOR_IN_LB \ +( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF ) +#define ALLOWED_ANCHOR_IN_LB_NOT \ +( ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF ) + /* can't allow all anchors, because \G in look-behind through Search(). + ex. 
/(?<=\G)zz/.match("azz") => success. */ + + case ANCHOR_LOOK_BEHIND: + { + r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB, + ALLOWED_EFFECT_IN_LB, ALLOWED_ANCHOR_IN_LB); + if (r < 0) return r; + if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + r = setup_look_behind(node, reg, env); + if (r != 0) return r; + r = setup_tree(an->target, reg, state, env); + } + break; + + case ANCHOR_LOOK_BEHIND_NOT: + { + r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB, + ALLOWED_EFFECT_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT); + if (r < 0) return r; + if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + r = setup_look_behind(node, reg, env); + if (r != 0) return r; + r = setup_tree(an->target, reg, (state | IN_NOT), env); + } + break; + } + } + break; + + default: + break; + } + + return r; +} + +/* set skip map for Boyer-Moor search */ +static int +set_bm_skip(UChar* s, UChar* end, OnigEncoding enc, int ignore_case, + UChar skip[], int** int_skip) +{ + int i, len; + UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + + len = end - s; + if (len < ONIG_CHAR_TABLE_SIZE) { + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = len; + + if (ignore_case) { + for (i = 0; i < len - 1; i++) { + ONIGENC_MBC_TO_LOWER(enc, &(s[i]), lowbuf); + skip[*lowbuf] = len - 1 - i; + } + } + else { + for (i = 0; i < len - 1; i++) + skip[s[i]] = len - 1 - i; + } + } + else { + if (IS_NULL(*int_skip)) { + *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE); + if (IS_NULL(*int_skip)) return ONIGERR_MEMORY; + } + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = len; + + if (ignore_case) { + for (i = 0; i < len - 1; i++) { + ONIGENC_MBC_TO_LOWER(enc, &(s[i]), lowbuf); + (*int_skip)[*lowbuf] = len - 1 - i; + } + } + else { + for (i = 0; i < len - 1; i++) + (*int_skip)[s[i]] = len - 1 - i; + } + } + return 0; +} + +#define OPT_EXACT_MAXLEN 24 + +typedef struct { + OnigDistance min; /* min byte length */ + OnigDistance max; /* max byte length */ +} MinMaxLen; + +typedef struct { + 
MinMaxLen mmd; + BitStatusType backrefed_status; + OnigEncoding enc; + OnigOptionType options; + ScanEnv* scan_env; +} OptEnv; + +typedef struct { + int left_anchor; + int right_anchor; +} OptAncInfo; + +typedef struct { + MinMaxLen mmd; /* info position */ + OptAncInfo anc; + + int reach_end; + int ignore_case; + int len; + UChar s[OPT_EXACT_MAXLEN]; +} OptExactInfo; + +typedef struct { + MinMaxLen mmd; /* info position */ + OptAncInfo anc; + + int value; /* weighted value */ + UChar map[ONIG_CHAR_TABLE_SIZE]; +} OptMapInfo; + +typedef struct { + MinMaxLen len; + + OptAncInfo anc; + OptExactInfo exb; /* boundary */ + OptExactInfo exm; /* middle */ + OptExactInfo expr; /* prec read (?=...) */ + + OptMapInfo map; /* boundary */ +} NodeOptInfo; + + +static int +map_position_value(int i) +{ + static int vals[] = { + 10, 10, 10, 10, 10, 10, 10, 10, 10, 1, 1, 10, 10, 1, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 1, 6, 3, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, + 5, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 5, 5, 5, + 5, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 10, + }; + + if (i < sizeof(vals)/sizeof(vals[0])) return vals[i]; + + return 7; /* Take it easy. 
*/ +} + +static int +distance_value(MinMaxLen* mm) +{ + /* 1000 / (min-max-dist + 1) */ + static int dist_vals[] = { + 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100, + 91, 83, 77, 71, 67, 63, 59, 56, 53, 50, + 48, 45, 43, 42, 40, 38, 37, 36, 34, 33, + 32, 31, 30, 29, 29, 28, 27, 26, 26, 25, + 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, + 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, + 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, + 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, + 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 10, 10, 10, 10, 10 + }; + + int d; + + if (mm->max == ONIG_INFINITE_DISTANCE) return 0; + + d = mm->max - mm->min; + if (d < sizeof(dist_vals)/sizeof(dist_vals[0])) + /* return dist_vals[d] * 16 / (mm->min + 12); */ + return dist_vals[d]; + else + return 1; +} + +static int +comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2) +{ + if (v2 <= 0) return -1; + if (v1 <= 0) return 1; + + v1 *= distance_value(d1); + v2 *= distance_value(d2); + + if (v2 > v1) return 1; + if (v2 < v1) return -1; + + if (d2->min < d1->min) return 1; + if (d2->min > d1->min) return -1; + return 0; +} + +static int +is_equal_mml(MinMaxLen* a, MinMaxLen* b) +{ + return (a->min == b->min && a->max == b->max) ? 
1 : 0; +} + + +static void +set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max) +{ + mml->min = min; + mml->max = max; +} + +static void +clear_mml(MinMaxLen* mml) +{ + mml->min = mml->max = 0; +} + +static void +copy_mml(MinMaxLen* to, MinMaxLen* from) +{ + to->min = from->min; + to->max = from->max; +} + +static void +add_mml(MinMaxLen* to, MinMaxLen* from) +{ + to->min = distance_add(to->min, from->min); + to->max = distance_add(to->max, from->max); +} + +static void +add_len_mml(MinMaxLen* to, OnigDistance len) +{ + to->min = distance_add(to->min, len); + to->max = distance_add(to->max, len); +} + +static void +alt_merge_mml(MinMaxLen* to, MinMaxLen* from) +{ + if (to->min > from->min) to->min = from->min; + if (to->max < from->max) to->max = from->max; +} + +static void +copy_opt_env(OptEnv* to, OptEnv* from) +{ + *to = *from; +} + +static void +clear_opt_anc_info(OptAncInfo* anc) +{ + anc->left_anchor = 0; + anc->right_anchor = 0; +} + +static void +copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from) +{ + *to = *from; +} + +static void +concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right, + OnigDistance left_len, OnigDistance right_len) +{ + clear_opt_anc_info(to); + + to->left_anchor = left->left_anchor; + if (left_len == 0) { + to->left_anchor |= right->left_anchor; + } + + to->right_anchor = right->right_anchor; + if (right_len == 0) { + to->right_anchor |= left->right_anchor; + } +} + +static int +is_left_anchor(int anc) +{ + if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF || + anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ || + anc == ANCHOR_PREC_READ_NOT) + return 0; + + return 1; +} + +static int +is_set_opt_anc_info(OptAncInfo* to, int anc) +{ + if ((to->left_anchor & anc) != 0) return 1; + + return ((to->right_anchor & anc) != 0 ? 
1 : 0); +} + +static void +add_opt_anc_info(OptAncInfo* to, int anc) +{ + if (is_left_anchor(anc)) + to->left_anchor |= anc; + else + to->right_anchor |= anc; +} + +static void +remove_opt_anc_info(OptAncInfo* to, int anc) +{ + if (is_left_anchor(anc)) + to->left_anchor &= ~anc; + else + to->right_anchor &= ~anc; +} + +static void +alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add) +{ + to->left_anchor &= add->left_anchor; + to->right_anchor &= add->right_anchor; +} + +static int +is_full_opt_exact_info(OptExactInfo* ex) +{ + return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0); +} + +static void +clear_opt_exact_info(OptExactInfo* ex) +{ + clear_mml(&ex->mmd); + clear_opt_anc_info(&ex->anc); + ex->reach_end = 0; + ex->ignore_case = 0; + ex->len = 0; + ex->s[0] = '\0'; +} + +static void +copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from) +{ + *to = *from; +} + +static void +concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add) +{ + int i, n; + OptAncInfo tanc; + + if (! to->ignore_case && add->ignore_case) { + if (to->len >= add->len) return ; /* avoid */ + + to->ignore_case = 1; + } + + for (i = to->len, n = 0; n < add->len && i < OPT_EXACT_MAXLEN; i++, n++) + to->s[i] = add->s[n]; + + to->len = i; + to->reach_end = (n == add->len ? add->reach_end : 0); + + concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1); + if (! 
to->reach_end) tanc.right_anchor = 0; + copy_opt_anc_info(&to->anc, &tanc); +} + +static void +concat_opt_exact_info_str(OptExactInfo* to, + UChar* s, UChar* end, int raw, OnigEncoding enc) +{ + int i, j, len; + UChar *p; + + for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) { + if (raw) { + to->s[i++] = *p++; + } + else { + len = enc_len(enc, *p); + if (i + len > OPT_EXACT_MAXLEN) break; + for (j = 0; j < len; j++) + to->s[i++] = *p++; + } + } + + to->len = i; +} + +static void +alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env) +{ + int i, j, len; + + if (add->len == 0 || to->len == 0) { + clear_opt_exact_info(to); + return ; + } + + if (! is_equal_mml(&to->mmd, &add->mmd)) { + clear_opt_exact_info(to); + return ; + } + + for (i = 0; i < to->len && i < add->len; ) { + if (to->s[i] != add->s[i]) break; + len = enc_len(env->enc, to->s[i]); + + for (j = 1; j < len; j++) { + if (to->s[i+j] != add->s[i+j]) break; + } + if (j < len) break; + i += len; + } + + if (! add->reach_end || i < add->len || i < to->len) { + to->reach_end = 0; + } + to->len = i; + to->ignore_case |= add->ignore_case; + + alt_merge_opt_anc_info(&to->anc, &add->anc); + if (! to->reach_end) to->anc.right_anchor = 0; +} + +static void +select_opt_exact_info(OptExactInfo* now, OptExactInfo* alt) +{ + int vlen1, vlen2; + + vlen1 = now->len * (now->ignore_case ? 1 : 2); + vlen2 = alt->len * (alt->ignore_case ? 
1 : 2); + + if (comp_distance_value(&now->mmd, &alt->mmd, vlen1, vlen2) > 0) + copy_opt_exact_info(now, alt); +} + +static void +clear_opt_map_info(OptMapInfo* map) +{ + int i; + + clear_mml(&map->mmd); + clear_opt_anc_info(&map->anc); + map->value = 0; + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) + map->map[i] = 0; +} + +static void +copy_opt_map_info(OptMapInfo* to, OptMapInfo* from) +{ + *to = *from; +} + +static void +add_char_opt_map_info(OptMapInfo* map, int c) +{ + if (map->map[c] == 0) { + map->map[c] = 1; + map->value += map_position_value(c); + } +} + +static void +add_char_amb_opt_map_info(OptMapInfo* map, int c, OnigEncoding enc) +{ + UChar x, low[ONIGENC_MBC_TO_LOWER_MAXLEN]; + + add_char_opt_map_info(map, c); + + x = (UChar )c; + ONIGENC_MBC_TO_LOWER(enc, &x, low); + if (*low != x) { + add_char_opt_map_info(map, (int )(*low)); + } + else { + int i; + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) { + x = (UChar )i; + ONIGENC_MBC_TO_LOWER(enc, &x, low); + if ((int )(*low) == c) add_char_opt_map_info(map, i); + } + } +} + +static void +select_opt_map_info(OptMapInfo* now, OptMapInfo* alt) +{ + static int z = 1<<15; /* 32768: something big value */ + + int v1, v2; + + if (alt->value == 0) return ; + if (now->value == 0) { + copy_opt_map_info(now, alt); + return ; + } + + v1 = z / now->value; + v2 = z / alt->value; + if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0) + copy_opt_map_info(now, alt); +} + +static int +comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m) +{ +#define COMP_EM_BASE 20 + int ve, vm; + + if (m->value <= 0) return -1; + + ve = COMP_EM_BASE * e->len * (e->ignore_case ? 1 : 2); + vm = COMP_EM_BASE * 5 * 2 / m->value; + return comp_distance_value(&e->mmd, &m->mmd, ve, vm); +} + +static void +alt_merge_opt_map_info(OptMapInfo* to, OptMapInfo* add) +{ + int i, val; + + /* if (! 
is_equal_mml(&to->mmd, &add->mmd)) return ; */ + if (to->value == 0) return ; + if (add->value == 0 || to->mmd.max < add->mmd.min) { + clear_opt_map_info(to); + return ; + } + + alt_merge_mml(&to->mmd, &add->mmd); + + val = 0; + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) { + if (add->map[i]) + to->map[i] = 1; + + if (to->map[i]) + val += map_position_value(i); + } + to->value = val; + + alt_merge_opt_anc_info(&to->anc, &add->anc); +} + +static void +set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd) +{ + copy_mml(&(opt->exb.mmd), mmd); + copy_mml(&(opt->expr.mmd), mmd); + copy_mml(&(opt->map.mmd), mmd); +} + +static void +clear_node_opt_info(NodeOptInfo* opt) +{ + clear_mml(&opt->len); + clear_opt_anc_info(&opt->anc); + clear_opt_exact_info(&opt->exb); + clear_opt_exact_info(&opt->exm); + clear_opt_exact_info(&opt->expr); + clear_opt_map_info(&opt->map); +} + +static void +copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from) +{ + *to = *from; +} + +static void +concat_left_node_opt_info(NodeOptInfo* to, NodeOptInfo* add) +{ + int exb_reach, exm_reach; + OptAncInfo tanc; + + concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max); + copy_opt_anc_info(&to->anc, &tanc); + + if (add->exb.len > 0 && to->len.max == 0) { + concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc, + to->len.max, add->len.max); + copy_opt_anc_info(&add->exb.anc, &tanc); + } + + if (add->map.value > 0 && to->len.max == 0) { + if (add->map.mmd.max == 0) + add->map.anc.left_anchor |= to->anc.left_anchor; + } + + exb_reach = to->exb.reach_end; + exm_reach = to->exm.reach_end; + + if (add->len.max != 0) + to->exb.reach_end = to->exm.reach_end = 0; + + if (add->exb.len > 0) { + if (exb_reach) { + concat_opt_exact_info(&to->exb, &add->exb); + clear_opt_exact_info(&add->exb); + } + else if (exm_reach) { + concat_opt_exact_info(&to->exm, &add->exb); + clear_opt_exact_info(&add->exb); + } + } + select_opt_exact_info(&to->exm, &add->exb); + select_opt_exact_info(&to->exm, 
&add->exm); + + if (to->expr.len > 0) { + if (add->len.max > 0) { + if (to->expr.len > (int )add->len.max) + to->expr.len = add->len.max; + + if (to->expr.mmd.max == 0) + select_opt_exact_info(&to->exb, &to->expr); + else + select_opt_exact_info(&to->exm, &to->expr); + } + } + else if (add->expr.len > 0) { + copy_opt_exact_info(&to->expr, &add->expr); + } + + select_opt_map_info(&to->map, &add->map); + + add_mml(&to->len, &add->len); +} + +static void +alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env) +{ + alt_merge_opt_anc_info (&to->anc, &add->anc); + alt_merge_opt_exact_info(&to->exb, &add->exb, env); + alt_merge_opt_exact_info(&to->exm, &add->exm, env); + alt_merge_opt_exact_info(&to->expr, &add->expr, env); + alt_merge_opt_map_info (&to->map, &add->map); + + alt_merge_mml(&to->len, &add->len); +} + + +#define MAX_NODE_OPT_INFO_REF_COUNT 5 + +static int +optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) +{ + int type; + int r = 0; + + clear_node_opt_info(opt); + set_bound_node_opt_info(opt, &env->mmd); + + type = NTYPE(node); + switch (type) { + case N_LIST: + { + OptEnv nenv; + NodeOptInfo nopt; + Node* nd = node; + + copy_opt_env(&nenv, env); + do { + r = optimize_node_left(NCONS(nd).left, &nopt, &nenv); + if (r == 0) { + add_mml(&nenv.mmd, &nopt.len); + concat_left_node_opt_info(opt, &nopt); + } + } while (r == 0 && IS_NOT_NULL(nd = NCONS(nd).right)); + } + break; + + case N_ALT: + { + NodeOptInfo nopt; + Node* nd = node; + + do { + r = optimize_node_left(NCONS(nd).left, &nopt, env); + if (r == 0) { + if (nd == node) copy_node_opt_info(opt, &nopt); + else alt_merge_node_opt_info(opt, &nopt, env); + } + } while ((r == 0) && IS_NOT_NULL(nd = NCONS(nd).right)); + } + break; + + case N_STRING: + { + UChar *p; + int len, plen; + StrNode* sn = &(NSTRING(node)); + int slen = sn->end - sn->s; + int is_raw = NSTRING_IS_RAW(node); + + if ((! 
IS_IGNORECASE(env->options)) || is_raw) { + concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, + NSTRING_IS_RAW(node), env->enc); + if (slen > 0) { + add_char_opt_map_info(&opt->map, *(sn->s)); + } + } + else { + for (p = sn->s; p < sn->end; ) { + len = enc_len(env->enc, *p); + if (len == 1 && ONIGENC_IS_MBC_CASE_AMBIG(env->enc, p)) { + break; + } + p += len; + } + + plen = p - sn->s; + if (plen > slen / 5) { + concat_opt_exact_info_str(&opt->exb, sn->s, p, is_raw, env->enc); + concat_opt_exact_info_str(&opt->exm, p, sn->end, is_raw, env->enc); + opt->exm.ignore_case = 1; + if (opt->exm.len == sn->end - p) + opt->exm.reach_end = 1; + + copy_mml(&(opt->exm.mmd), &(opt->exb.mmd)); + add_len_mml(&(opt->exm.mmd), plen); + } + else { + concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, + is_raw, env->enc); + opt->exb.ignore_case = 1; + } + + if (slen > 0) { + if (p == sn->s) + add_char_amb_opt_map_info(&opt->map, *(sn->s), env->enc); + else + add_char_opt_map_info(&opt->map, *(sn->s)); + } + } + + if (opt->exb.len == slen) + opt->exb.reach_end = 1; + + set_mml(&opt->len, slen, slen); + } + break; + + case N_CCLASS: + { + int i, z, len, found, mb_found; + CClassNode* cc = &(NCCLASS(node)); + + /* no need to check ignore case. 
(setted in setup_tree()) */ + found = mb_found = 0; + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + z = BITSET_AT(cc->bs, i); + if ((z && !cc->not) || (!z && cc->not)) { + found = 1; + add_char_opt_map_info(&opt->map, i); + } + } + + if (IS_NULL(cc->mbuf)) { + if (cc->not) { + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + add_char_opt_map_info(&opt->map, i); + } + mb_found = 1; + } + } + else { + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + z = ONIGENC_IS_MBC_HEAD(env->enc, i); + if (z) { + mb_found = 1; + add_char_opt_map_info(&opt->map, i); + } + } + } + + if (mb_found) { + len = ONIGENC_MBC_MAXLEN_DIST(env->enc); + set_mml(&opt->len, 1, len); + } + else if (found) { + len = 1; + set_mml(&opt->len, 1, len); + } + } + break; + + case N_CTYPE: + { + int c; + int len, min, max; + + min = ONIGENC_MBC_MAXLEN_DIST(env->enc); + max = 0; + +#define IS_WORD_HEAD_BYTE(enc,b) \ + (ONIGENC_IS_MBC_ASCII(&b) ? ONIGENC_IS_CODE_WORD(enc,((OnigCodePoint )b)) \ + : ONIGENC_IS_MBC_HEAD(enc,b)) + + switch (NCTYPE(node).type) { + case CTYPE_WORD: + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (IS_WORD_HEAD_BYTE(env->enc, c)) { + add_char_opt_map_info(&opt->map, c); + len = enc_len(env->enc, c); + if (len < min) min = len; + if (len > max) max = len; + } + } + break; + + case CTYPE_NOT_WORD: + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (! 
IS_WORD_HEAD_BYTE(env->enc, c)) { + add_char_opt_map_info(&opt->map, c); + len = enc_len(env->enc, c); + if (len < min) min = len; + if (len > max) max = len; + } + } + break; + } + + set_mml(&opt->len, min, max); + } + break; + + case N_ANYCHAR: + { + OnigDistance len = ONIGENC_MBC_MAXLEN_DIST(env->enc); + set_mml(&opt->len, 1, len); + } + break; + + case N_ANCHOR: + switch (NANCHOR(node).type) { + case ANCHOR_BEGIN_BUF: + case ANCHOR_BEGIN_POSITION: + case ANCHOR_BEGIN_LINE: + case ANCHOR_END_BUF: + case ANCHOR_SEMI_END_BUF: + case ANCHOR_END_LINE: + add_opt_anc_info(&opt->anc, NANCHOR(node).type); + break; + + case ANCHOR_PREC_READ: + { + NodeOptInfo nopt; + + r = optimize_node_left(NANCHOR(node).target, &nopt, env); + if (r == 0) { + if (nopt.exb.len > 0) + copy_opt_exact_info(&opt->expr, &nopt.exb); + else if (nopt.exm.len > 0) + copy_opt_exact_info(&opt->expr, &nopt.exm); + + opt->expr.reach_end = 0; + + if (nopt.map.value > 0) + copy_opt_map_info(&opt->map, &nopt.map); + } + } + break; + + case ANCHOR_PREC_READ_NOT: + case ANCHOR_LOOK_BEHIND: /* Sorry, I can't make use of it. 
*/ + case ANCHOR_LOOK_BEHIND_NOT: + break; + } + break; + + case N_BACKREF: + { + int i; + int* backs; + OnigDistance min, max, tmin, tmax; + Node** nodes = SCANENV_MEM_NODES(env->scan_env); + BackrefNode* br = &(NBACKREF(node)); + + if (br->state & NST_RECURSION) { + set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE); + break; + } + backs = BACKREFS_P(br); + r = get_min_match_length(nodes[backs[0]], &min, env->scan_env); + if (r != 0) break; + r = get_max_match_length(nodes[backs[0]], &max, env->scan_env); + if (r != 0) break; + for (i = 1; i < br->back_num; i++) { + r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env); + if (r != 0) break; + r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env); + if (r != 0) break; + if (min > tmin) min = tmin; + if (max < tmax) max = tmax; + } + if (r == 0) set_mml(&opt->len, min, max); + } + break; + +#ifdef USE_SUBEXP_CALL + case N_CALL: + if (IS_CALL_RECURSION(&(NCALL(node)))) + set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE); + else { + OnigOptionType save = env->options; + env->options = NEFFECT(NCALL(node).target).option; + r = optimize_node_left(NCALL(node).target, opt, env); + env->options = save; + } + break; +#endif + + case N_QUALIFIER: + { + int i; + OnigDistance min, max; + NodeOptInfo nopt; + QualifierNode* qn = &(NQUALIFIER(node)); + + r = optimize_node_left(qn->target, &nopt, env); + if (r) break; + + if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) { + if (env->mmd.max == 0 && + NTYPE(qn->target) == N_ANYCHAR && qn->greedy) { + if (IS_POSIXLINE(env->options)) + add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_PL); + else + add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR); + } + } + else { + if (qn->lower > 0) { + copy_node_opt_info(opt, &nopt); + if (nopt.exb.len > 0) { + if (nopt.exb.reach_end) { + for (i = 2; i < qn->lower && + ! 
is_full_opt_exact_info(&opt->exb); i++) { + concat_opt_exact_info(&opt->exb, &nopt.exb); + } + if (i < qn->lower) { + opt->exb.reach_end = 0; + } + } + } + + if (qn->lower != qn->upper) { + opt->exb.reach_end = 0; + opt->exm.reach_end = 0; + } + if (qn->lower > 1) + opt->exm.reach_end = 0; + } + } + + min = distance_multiply(nopt.len.min, qn->lower); + if (IS_REPEAT_INFINITE(qn->upper)) + max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0); + else + max = distance_multiply(nopt.len.max, qn->upper); + + set_mml(&opt->len, min, max); + } + break; + + case N_EFFECT: + { + EffectNode* en = &(NEFFECT(node)); + + switch (en->type) { + case EFFECT_OPTION: + { + OnigOptionType save = env->options; + + env->options = en->option; + r = optimize_node_left(en->target, opt, env); + env->options = save; + } + break; + + case EFFECT_MEMORY: +#ifdef USE_SUBEXP_CALL + en->opt_count++; + if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) { + OnigDistance min, max; + + min = 0; + max = ONIG_INFINITE_DISTANCE; + if (IS_EFFECT_MIN_FIXED(en)) min = en->min_len; + if (IS_EFFECT_MAX_FIXED(en)) max = en->max_len; + set_mml(&opt->len, min, max); + } + else +#endif + { + r = optimize_node_left(en->target, opt, env); + + if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) { + if (BIT_STATUS_AT(env->backrefed_status, en->regnum)) + remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK); + } + } + break; + + case EFFECT_STOP_BACKTRACK: + r = optimize_node_left(en->target, opt, env); + break; + } + } + break; + + default: +#ifdef ONIG_DEBUG + fprintf(stderr, "optimize_node_left: undefined node type %d\n", + NTYPE(node)); +#endif + r = ONIGERR_TYPE_BUG; + break; + } + + return r; +} + +static int +set_optimize_exact_info(regex_t* reg, OptExactInfo* e) +{ + int r; + + if (e->len == 0) return 0; + + reg->exact = onig_strdup(e->s, e->s + e->len); + CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); + + reg->exact_end = reg->exact + e->len; + + if (e->ignore_case) { + UChar 
buf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + int len, low_len, i, j, alloc_size; + + alloc_size = e->len; + i = j = 0; + while (i < e->len) { + low_len = ONIGENC_MBC_TO_LOWER(reg->enc, &(e->s[i]), buf); + len = enc_len(reg->enc, e->s[i]); + if (low_len > alloc_size - i) { + reg->exact = xrealloc(reg->exact, alloc_size * 2); + CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); + alloc_size *= 2; + } + + xmemcpy(&(reg->exact[j]), buf, low_len); + i += len; + j += low_len; + } + reg->exact_end = reg->exact + j; + reg->optimize = ONIG_OPTIMIZE_EXACT_IC; + } + else { + int allow_reverse; + + if (e->anc.left_anchor & ANCHOR_BEGIN_LINE) + allow_reverse = 1; + else + allow_reverse = + ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end); + + if (e->len >= 3 || (e->len >= 2 && allow_reverse)) { + r = set_bm_skip(reg->exact, reg->exact_end, reg->enc, 0, + reg->map, &(reg->int_map)); + if (r) return r; + + reg->optimize = (allow_reverse != 0 + ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV); + } + else { + reg->optimize = ONIG_OPTIMIZE_EXACT; + } + } + + reg->dmin = e->mmd.min; + reg->dmax = e->mmd.max; + + if (reg->dmin != ONIG_INFINITE_DISTANCE) { + reg->threshold_len = reg->dmin + (reg->exact_end - reg->exact); + } + + return 0; +} + +static void +set_optimize_map_info(regex_t* reg, OptMapInfo* m) +{ + int i; + + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) + reg->map[i] = m->map[i]; + + reg->optimize = ONIG_OPTIMIZE_MAP; + reg->dmin = m->mmd.min; + reg->dmax = m->mmd.max; + + if (reg->dmin != ONIG_INFINITE_DISTANCE) { + reg->threshold_len = reg->dmin + 1; + } +} + +static void +set_sub_anchor(regex_t* reg, OptAncInfo* anc) +{ + reg->sub_anchor |= anc->left_anchor & ANCHOR_BEGIN_LINE; + reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE; +} + +#ifdef ONIG_DEBUG +static void print_optimize_info(FILE* f, regex_t* reg); +#endif + +static int +set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) +{ + + int r; + NodeOptInfo opt; + 
OptEnv env; + + env.enc = reg->enc; + env.options = reg->options; + env.scan_env = scan_env; + clear_mml(&env.mmd); + + r = optimize_node_left(node, &opt, &env); + if (r) return r; + + reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF | + ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_PL); + + reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF); + + if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) { + reg->anchor_dmin = opt.len.min; + reg->anchor_dmax = opt.len.max; + } + + if (opt.exb.len > 0 || opt.exm.len > 0) { + select_opt_exact_info(&opt.exb, &opt.exm); + if (opt.map.value > 0 && + comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) { + goto set_map; + } + else { + r = set_optimize_exact_info(reg, &opt.exb); + set_sub_anchor(reg, &opt.exb.anc); + } + } + else if (opt.map.value > 0) { + set_map: + set_optimize_map_info(reg, &opt.map); + set_sub_anchor(reg, &opt.map.anc); + } + else { + reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE; + if (opt.len.max == 0) + reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE; + } + +#if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) + print_optimize_info(stderr, reg); +#endif + return r; +} + +static void +clear_optimize_info(regex_t* reg) +{ + reg->optimize = ONIG_OPTIMIZE_NONE; + reg->anchor = 0; + reg->anchor_dmin = 0; + reg->anchor_dmax = 0; + reg->sub_anchor = 0; + reg->exact_end = (UChar* )NULL; + reg->threshold_len = 0; + if (IS_NOT_NULL(reg->exact)) { + xfree(reg->exact); + reg->exact = (UChar* )NULL; + } +} + +#ifdef ONIG_DEBUG + +static void +print_distance_range(FILE* f, OnigDistance a, OnigDistance b) +{ + if (a == ONIG_INFINITE_DISTANCE) + fputs("inf", f); + else + fprintf(f, "(%u)", a); + + fputs("-", f); + + if (b == ONIG_INFINITE_DISTANCE) + fputs("inf", f); + else + fprintf(f, "(%u)", b); +} + +static void +print_anchor(FILE* f, int anchor) +{ + int q = 0; + + fprintf(f, "["); + + if (anchor & ANCHOR_BEGIN_BUF) { + fprintf(f, 
"begin-buf"); + q = 1; + } + if (anchor & ANCHOR_BEGIN_LINE) { + if (q) fprintf(f, ", "); + q = 1; + fprintf(f, "begin-line"); + } + if (anchor & ANCHOR_BEGIN_POSITION) { + if (q) fprintf(f, ", "); + q = 1; + fprintf(f, "begin-pos"); + } + if (anchor & ANCHOR_END_BUF) { + if (q) fprintf(f, ", "); + q = 1; + fprintf(f, "end-buf"); + } + if (anchor & ANCHOR_SEMI_END_BUF) { + if (q) fprintf(f, ", "); + q = 1; + fprintf(f, "semi-end-buf"); + } + if (anchor & ANCHOR_END_LINE) { + if (q) fprintf(f, ", "); + q = 1; + fprintf(f, "end-line"); + } + if (anchor & ANCHOR_ANYCHAR_STAR) { + if (q) fprintf(f, ", "); + q = 1; + fprintf(f, "anychar-star"); + } + if (anchor & ANCHOR_ANYCHAR_STAR_PL) { + if (q) fprintf(f, ", "); + fprintf(f, "anychar-star-pl"); + } + + fprintf(f, "]"); +} + +static void +print_optimize_info(FILE* f, regex_t* reg) +{ + static char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV", + "EXACT_IC", "MAP" }; + + fprintf(f, "optimize: %s\n", on[reg->optimize]); + fprintf(f, " anchor: "); print_anchor(f, reg->anchor); + if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0) + print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax); + fprintf(f, "\n"); + + if (reg->optimize) { + fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor); + fprintf(f, "\n"); + } + fprintf(f, "\n"); + + if (reg->exact) { + UChar *p; + fprintf(f, "exact: ["); + for (p = reg->exact; p < reg->exact_end; p++) { + fputc(*p, f); + } + fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact)); + } + else if (reg->optimize & ONIG_OPTIMIZE_MAP) { + int i, n = 0; + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) + if (reg->map[i]) n++; + + fprintf(f, "map: n=%d\n", n); + if (n > 0) { + fputc('[', f); + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) + if (reg->map[i] && enc_len(reg->enc, i) == 1 && + ONIGENC_IS_CODE_PRINT(reg->enc, i)) + fputc(i, f); + fprintf(f, "]\n"); + } + } +} +#endif /* ONIG_DEBUG */ + + +static void +onig_free_body(regex_t* reg) +{ + if (IS_NOT_NULL(reg->p)) 
xfree(reg->p); + if (IS_NOT_NULL(reg->exact)) xfree(reg->exact); + if (IS_NOT_NULL(reg->int_map)) xfree(reg->int_map); + if (IS_NOT_NULL(reg->int_map_backward)) xfree(reg->int_map_backward); + if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range); + if (IS_NOT_NULL(reg->chain)) onig_free(reg->chain); + +#ifdef USE_NAMED_GROUP + onig_names_free(reg); +#endif +} + +extern void +onig_free(regex_t* reg) +{ + if (IS_NOT_NULL(reg)) { + onig_free_body(reg); + xfree(reg); + } +} + +#define REGEX_TRANSFER(to,from) do {\ + (to)->state = ONIG_STATE_MODIFY;\ + onig_free_body(to);\ + xmemcpy(to, from, sizeof(regex_t));\ + xfree(from);\ +} while (0) + +static void +onig_transfer(regex_t* to, regex_t* from) +{ + THREAD_ATOMIC_START; + REGEX_TRANSFER(to, from); + THREAD_ATOMIC_END; +} + +#define REGEX_CHAIN_HEAD(reg) do {\ + while (IS_NOT_NULL((reg)->chain)) {\ + (reg) = (reg)->chain;\ + }\ +} while (0) + +static void +onig_chain_link_add(regex_t* to, regex_t* add) +{ + THREAD_ATOMIC_START; + REGEX_CHAIN_HEAD(to); + to->chain = add; + THREAD_ATOMIC_END; +} + +extern void +onig_chain_reduce(regex_t* reg) +{ + regex_t *head, *prev; + + THREAD_ATOMIC_START; + prev = reg; + head = prev->chain; + if (IS_NOT_NULL(head)) { + reg->state = ONIG_STATE_MODIFY; + while (IS_NOT_NULL(head->chain)) { + prev = head; + head = head->chain; + } + prev->chain = (regex_t* )NULL; + REGEX_TRANSFER(reg, head); + } + THREAD_ATOMIC_END; +} + +#if 0 +extern int +onig_clone(regex_t** to, regex_t* from) +{ + int r, size; + regex_t* reg; + + if (ONIG_STATE(from) == ONIG_STATE_NORMAL) { + from->state++; /* increment as search counter */ + if (IS_NOT_NULL(from->chain)) { + onig_chain_reduce(from); + from->state++; + } + } + else { + int n = 0; + while (ONIG_STATE(from) < ONIG_STATE_NORMAL) { + if (++n > THREAD_PASS_LIMIT_COUNT) + return ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT; + THREAD_PASS; + } + from->state++; /* increment as search counter */ + } + + r = onig_alloc_init(&reg, ONIG_OPTION_NONE, from->enc,
ONIG_SYNTAX_DEFAULT); + if (r != 0) { + from->state--; + return r; + } + + xmemcpy(reg, from, sizeof(onig_t)); + reg->state = ONIG_STATE_NORMAL; + reg->chain = (regex_t* )NULL; + + if (from->p) { + reg->p = (UChar* )xmalloc(reg->alloc); + if (IS_NULL(reg->p)) goto mem_error; + xmemcpy(reg->p, from->p, reg->alloc); + } + + if (from->exact) { + reg->exact = (UChar* )xmalloc(from->exact_end - from->exact); + if (IS_NULL(reg->exact)) goto mem_error; + reg->exact_end = reg->exact + (from->exact_end - from->exact); + xmemcpy(reg->exact, from->exact, reg->exact_end - reg->exact); + } + + if (from->int_map) { + size = sizeof(int) * ONIG_CHAR_TABLE_SIZE; + reg->int_map = (int* )xmalloc(size); + if (IS_NULL(reg->int_map)) goto mem_error; + xmemcpy(reg->int_map, from->int_map, size); + } + + if (from->int_map_backward) { + size = sizeof(int) * ONIG_CHAR_TABLE_SIZE; + reg->int_map_backward = (int* )xmalloc(size); + if (IS_NULL(reg->int_map_backward)) goto mem_error; + xmemcpy(reg->int_map_backward, from->int_map_backward, size); + } + +#ifdef USE_NAMED_GROUP + reg->name_table = names_clone(from); /* names_clone is not implemented */ +#endif + + from->state--; + *to = reg; + return 0; + + mem_error: + from->state--; + return ONIGERR_MEMORY; +} +#endif + +#ifdef ONIG_DEBUG +static void print_compiled_byte_code_list P_((FILE* f, regex_t* reg)); +#endif +#ifdef ONIG_DEBUG_PARSE_TREE +static void print_tree P_((FILE* f, Node* node)); +#endif + +extern int +onig_compile(regex_t* reg, UChar* pattern, UChar* pattern_end, + OnigErrorInfo* einfo) +{ +#define COMPILE_INIT_SIZE 20 + + int r, init_size; + Node* root; + ScanEnv scan_env; +#ifdef USE_SUBEXP_CALL + UnsetAddrList uslist; +#endif + + reg->state = ONIG_STATE_COMPILING; + + if (reg->alloc == 0) { + init_size = (pattern_end - pattern) * 2; + if (init_size <= 0) init_size = COMPILE_INIT_SIZE; + r = BBUF_INIT(reg, init_size); + if (r != 0) goto end; + } + else + reg->used = 0; + + reg->num_mem = 0; + reg->num_repeat = 0; + 
reg->num_null_check = 0; + reg->repeat_range_alloc = 0; + reg->repeat_range = (OnigRepeatRange* )NULL; + + r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env); + if (r != 0) goto err; + +#ifdef USE_NAMED_GROUP + /* mixed use named group and no-named group */ + if (scan_env.num_named > 0 && + IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && + !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) { + if (scan_env.num_named != scan_env.num_mem) + r = disable_noname_group_capture(&root, reg, &scan_env); + else + r = numbered_ref_check(root); + + if (r != 0) goto err; + } +#endif + +#ifdef ONIG_DEBUG_PARSE_TREE + print_tree(stderr, root); +#endif + +#ifdef USE_SUBEXP_CALL + if (scan_env.num_call > 0) { + r = unset_addr_list_init(&uslist, scan_env.num_call); + if (r != 0) goto err; + scan_env.unset_addr_list = &uslist; + r = setup_subexp_call(root, &scan_env); + if (r != 0) goto err_unset; + r = subexp_recursive_check_trav(root, &scan_env); + if (r < 0) goto err_unset; + r = subexp_inf_recursive_check_trav(root, &scan_env); + if (r != 0) goto err_unset; + + reg->num_call = scan_env.num_call; + } + else + reg->num_call = 0; +#endif + + r = setup_tree(root, reg, 0, &scan_env); + if (r != 0) goto err_unset; + + reg->capture_history = scan_env.capture_history; + reg->bt_mem_start = scan_env.bt_mem_start; + reg->bt_mem_start |= reg->capture_history; + if (IS_FIND_CONDITION(reg->options)) + BIT_STATUS_ON_ALL(reg->bt_mem_end); + else { + reg->bt_mem_end = scan_env.bt_mem_end; + reg->bt_mem_end |= reg->capture_history; + } + + clear_optimize_info(reg); +#ifndef ONIG_DONT_OPTIMIZE + r = set_optimize_info_from_tree(root, reg, &scan_env); + if (r != 0) goto err_unset; +#endif + + if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) { + xfree(scan_env.mem_nodes_dynamic); + scan_env.mem_nodes_dynamic = (Node** )NULL; + } + + r = compile_tree(root, reg); + if (r == 0) { + r = add_opcode(reg, OP_END); +#ifdef USE_SUBEXP_CALL + if (scan_env.num_call > 
0) { + r = unset_addr_list_fix(&uslist, reg); + unset_addr_list_end(&uslist); + if (r) goto err; + } +#endif + + if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0)) + reg->stack_pop_level = STACK_POP_LEVEL_ALL; + else { + if (reg->bt_mem_start != 0) + reg->stack_pop_level = STACK_POP_LEVEL_MEM_START; + else + reg->stack_pop_level = STACK_POP_LEVEL_FREE; + } + } +#ifdef USE_SUBEXP_CALL + else if (scan_env.num_call > 0) { + unset_addr_list_end(&uslist); + } +#endif + onig_node_free(root); + +#ifdef ONIG_DEBUG_COMPILE +#ifdef USE_NAMED_GROUP + onig_print_names(stderr, reg); +#endif + print_compiled_byte_code_list(stderr, reg); +#endif + + end: + reg->state = ONIG_STATE_NORMAL; + return r; + + err_unset: +#ifdef USE_SUBEXP_CALL + if (scan_env.num_call > 0) { + unset_addr_list_end(&uslist); + } +#endif + err: + if (IS_NOT_NULL(scan_env.error)) { + if (IS_NOT_NULL(einfo)) { + einfo->par = scan_env.error; + einfo->par_end = scan_env.error_end; + } + } + + if (IS_NOT_NULL(root)) onig_node_free(root); + if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) + xfree(scan_env.mem_nodes_dynamic); + return r; +} + +extern int +onig_recompile(regex_t* reg, UChar* pattern, UChar* pattern_end, + OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, + OnigErrorInfo* einfo) +{ + int r; + regex_t *new_reg; + + r = onig_new(&new_reg, pattern, pattern_end, option, enc, syntax, einfo); + if (r) return r; + if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) { + onig_transfer(reg, new_reg); + } + else { + onig_chain_link_add(reg, new_reg); + } + return 0; +} + +static int onig_inited = 0; + +extern int +onig_alloc_init(regex_t** reg, OnigOptionType option, OnigEncoding enc, + OnigSyntaxType* syntax) +{ + if (! 
onig_inited) + onig_init(); + + if (ONIGENC_IS_UNDEF(enc)) + return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED; + + *reg = (regex_t* )xmalloc(sizeof(regex_t)); + if (IS_NULL(*reg)) return ONIGERR_MEMORY; + + if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) { + option |= syntax->options; + option &= ~ONIG_OPTION_SINGLELINE; + } + else + option |= syntax->options; + + (*reg)->state = ONIG_STATE_NORMAL; + (*reg)->enc = enc; + (*reg)->options = option; + (*reg)->syntax = syntax; + (*reg)->optimize = 0; + (*reg)->exact = (UChar* )NULL; + (*reg)->int_map = (int* )NULL; + (*reg)->int_map_backward = (int* )NULL; + (*reg)->chain = (regex_t* )NULL; + + (*reg)->p = (UChar* )NULL; + (*reg)->alloc = 0; + (*reg)->used = 0; + (*reg)->name_table = (void* )NULL; + + return 0; +} + +extern int +onig_new(regex_t** reg, UChar* pattern, UChar* pattern_end, + OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, + OnigErrorInfo* einfo) +{ + int r; + + if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL; + + r = onig_alloc_init(reg, option, enc, syntax); + if (r) return r; + + r = onig_compile(*reg, pattern, pattern_end, einfo); + if (r) { + onig_free(*reg); + *reg = NULL; + } + return r; +} + +extern int +onig_init() +{ + if (onig_inited != 0) + return 0; + + onig_inited = 1; + + THREAD_ATOMIC_START; + + onigenc_init(); + onigenc_set_default_caseconv_table((UChar* )0); + +#ifdef ONIG_DEBUG_STATISTICS + onig_statistics_init(); +#endif + + THREAD_ATOMIC_END; + return 0; +} + +extern int +onig_end() +{ +#ifdef ONIG_DEBUG_STATISTICS + onig_print_statistics(stderr); +#endif + +#ifdef USE_RECYCLE_NODE + onig_free_node_list(); +#endif + + onig_inited = 0; + return 0; +} + + +#ifdef ONIG_DEBUG + +OnigOpInfoType OnigOpInfo[] = { + { OP_FINISH, "finish", ARG_NON }, + { OP_END, "end", ARG_NON }, + { OP_EXACT1, "exact1", ARG_SPECIAL }, + { OP_EXACT2, "exact2", ARG_SPECIAL }, + { OP_EXACT3, "exact3", ARG_SPECIAL }, + { OP_EXACT4, "exact4", ARG_SPECIAL }, + { OP_EXACT5, "exact5", ARG_SPECIAL 
}, + { OP_EXACTN, "exactn", ARG_SPECIAL }, + { OP_EXACTMB2N1, "exactmb2-n1", ARG_SPECIAL }, + { OP_EXACTMB2N2, "exactmb2-n2", ARG_SPECIAL }, + { OP_EXACTMB2N3, "exactmb2-n3", ARG_SPECIAL }, + { OP_EXACTMB2N, "exactmb2-n", ARG_SPECIAL }, + { OP_EXACTMB3N, "exactmb3n" , ARG_SPECIAL }, + { OP_EXACTMBN, "exactmbn", ARG_SPECIAL }, + { OP_EXACT1_IC, "exact1-ic", ARG_SPECIAL }, + { OP_EXACTN_IC, "exactn-ic", ARG_SPECIAL }, + { OP_CCLASS, "cclass", ARG_SPECIAL }, + { OP_CCLASS_MB, "cclass-mb", ARG_SPECIAL }, + { OP_CCLASS_MIX, "cclass-mix", ARG_SPECIAL }, + { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL }, + { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL }, + { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL }, + { OP_ANYCHAR, "anychar", ARG_NON }, + { OP_ANYCHAR_ML, "anychar-ml", ARG_NON }, + { OP_ANYCHAR_STAR, "anychar*", ARG_NON }, + { OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON }, + { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL }, + { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL }, + { OP_WORD, "word", ARG_NON }, + { OP_NOT_WORD, "not-word", ARG_NON }, + { OP_WORD_SB, "word-sb", ARG_NON }, + { OP_WORD_MB, "word-mb", ARG_NON }, + { OP_WORD_BOUND, "word-bound", ARG_NON }, + { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON }, + { OP_WORD_BEGIN, "word-begin", ARG_NON }, + { OP_WORD_END, "word-end", ARG_NON }, + { OP_BEGIN_BUF, "begin-buf", ARG_NON }, + { OP_END_BUF, "end-buf", ARG_NON }, + { OP_BEGIN_LINE, "begin-line", ARG_NON }, + { OP_END_LINE, "end-line", ARG_NON }, + { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON }, + { OP_BEGIN_POSITION, "begin-position", ARG_NON }, + { OP_BACKREF1, "backref1", ARG_NON }, + { OP_BACKREF2, "backref2", ARG_NON }, + { OP_BACKREF3, "backref3", ARG_NON }, + { OP_BACKREFN, "backrefn", ARG_MEMNUM }, + { OP_BACKREFN_IC, "backrefn-ic", ARG_MEMNUM }, + { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL }, + { OP_BACKREF_MULTI_IC, "backref_multi-ic",ARG_SPECIAL }, + { OP_MEMORY_START_PUSH, "mem-start-push", 
ARG_MEMNUM }, + { OP_MEMORY_START, "mem-start", ARG_MEMNUM }, + { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM }, + { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM }, + { OP_MEMORY_END, "mem-end", ARG_MEMNUM }, + { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM }, + { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION }, + { OP_SET_OPTION, "set-option", ARG_OPTION }, + { OP_FAIL, "fail", ARG_NON }, + { OP_JUMP, "jump", ARG_RELADDR }, + { OP_PUSH, "push", ARG_RELADDR }, + { OP_POP, "pop", ARG_NON }, + { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL }, + { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL }, + { OP_REPEAT, "repeat", ARG_SPECIAL }, + { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL }, + { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM }, + { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM }, + { OP_NULL_CHECK_START, "null-check-start",ARG_MEMNUM }, + { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM }, + { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM }, + { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM }, + { OP_PUSH_POS, "push-pos", ARG_NON }, + { OP_POP_POS, "pop-pos", ARG_NON }, + { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR }, + { OP_FAIL_POS, "fail-pos", ARG_NON }, + { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON }, + { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON }, + { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL }, + { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL }, + { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON }, + { OP_CALL, "call", ARG_ABSADDR }, + { OP_RETURN, "return", ARG_NON }, + { -1, "", ARG_NON } +}; + +static char* +op2name(int opcode) +{ + int i; + + for (i = 0; OnigOpInfo[i].opcode >= 0; i++) { + if (opcode == OnigOpInfo[i].opcode) + return OnigOpInfo[i].name; + } + return ""; +} + +static int +op2arg_type(int opcode) +{ + int i; + + for (i = 0; OnigOpInfo[i].opcode >= 0; i++) { + if (opcode == OnigOpInfo[i].opcode) + return OnigOpInfo[i].arg_type; + } + return 
ARG_SPECIAL; +} + +static void +Indent(FILE* f, int indent) +{ + int i; + for (i = 0; i < indent; i++) putc(' ', f); +} + +static void +p_string(FILE* f, int len, UChar* s) +{ + fputs(":", f); + while (len-- > 0) { fputc(*s++, f); } +} + +static void +p_len_string(FILE* f, LengthType len, int mb_len, UChar* s) +{ + int x = len * mb_len; + + fprintf(f, ":%d:", len); + while (x-- > 0) { fputc(*s++, f); } +} + +extern void +onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) +{ + int i, n, arg_type; + RelAddrType addr; + LengthType len; + MemNumType mem; + OnigCodePoint code; + UChar *q; + + fprintf(f, "[%s", op2name(*bp)); + arg_type = op2arg_type(*bp); + if (arg_type != ARG_SPECIAL) { + bp++; + switch (arg_type) { + case ARG_NON: + break; + case ARG_RELADDR: + addr = *((RelAddrType* )bp); + bp += SIZE_RELADDR; + fprintf(f, ":(%d)", addr); + break; + case ARG_ABSADDR: + GET_ABSADDR_INC(addr, bp); + fprintf(f, ":(%d)", addr); + break; + case ARG_LENGTH: + GET_LENGTH_INC(len, bp); + fprintf(f, ":%d", len); + break; + case ARG_MEMNUM: + mem = *((MemNumType* )bp); + bp += SIZE_MEMNUM; + fprintf(f, ":%d", mem); + break; + case ARG_OPTION: + { + OnigOptionType option = *((OnigOptionType* )bp); + bp += SIZE_OPTION; + fprintf(f, ":%d", option); + } + break; + } + } + else { + switch (*bp++) { + case OP_EXACT1: + case OP_ANYCHAR_STAR_PEEK_NEXT: + case OP_ANYCHAR_ML_STAR_PEEK_NEXT: + p_string(f, 1, bp++); break; + case OP_EXACT2: + p_string(f, 2, bp); bp += 2; break; + case OP_EXACT3: + p_string(f, 3, bp); bp += 3; break; + case OP_EXACT4: + p_string(f, 4, bp); bp += 4; break; + case OP_EXACT5: + p_string(f, 5, bp); bp += 5; break; + case OP_EXACTN: + GET_LENGTH_INC(len, bp); + p_len_string(f, len, 1, bp); + bp += len; + break; + + case OP_EXACTMB2N1: + p_string(f, 2, bp); bp += 2; break; + case OP_EXACTMB2N2: + p_string(f, 4, bp); bp += 4; break; + case OP_EXACTMB2N3: + p_string(f, 6, bp); bp += 6; break; + case OP_EXACTMB2N: + GET_LENGTH_INC(len, bp); + 
p_len_string(f, len, 2, bp); + bp += len * 2; + break; + case OP_EXACTMB3N: + GET_LENGTH_INC(len, bp); + p_len_string(f, len, 3, bp); + bp += len * 3; + break; + case OP_EXACTMBN: + { + int mb_len; + + GET_LENGTH_INC(mb_len, bp); + GET_LENGTH_INC(len, bp); + fprintf(f, ":%d:%d:", mb_len, len); + n = len * mb_len; + while (n-- > 0) { fputc(*bp++, f); } + } + break; + + case OP_EXACT1_IC: + p_string(f, 1, bp++); + break; + case OP_EXACTN_IC: + GET_LENGTH_INC(len, bp); + p_len_string(f, len, 1, bp); + bp += len; + break; + + case OP_CCLASS: + n = bitset_on_num((BitSetRef )bp); + bp += SIZE_BITSET; + fprintf(f, ":%d", n); + break; + + case OP_CCLASS_NOT: + n = bitset_on_num((BitSetRef )bp); + bp += SIZE_BITSET; + fprintf(f, ":%d", n); + break; + + case OP_CCLASS_MB: + case OP_CCLASS_MB_NOT: + GET_LENGTH_INC(len, bp); + q = bp; +#ifndef PLATFORM_UNALIGNED_WORD_ACCESS + ALIGNMENT_RIGHT(q); +#endif + GET_CODE_POINT(code, q); + bp += len; + fprintf(f, ":%d:%d", (int )code, len); + break; + + case OP_CCLASS_MIX: + case OP_CCLASS_MIX_NOT: + n = bitset_on_num((BitSetRef )bp); + bp += SIZE_BITSET; + GET_LENGTH_INC(len, bp); + q = bp; +#ifndef PLATFORM_UNALIGNED_WORD_ACCESS + ALIGNMENT_RIGHT(q); +#endif + GET_CODE_POINT(code, q); + bp += len; + fprintf(f, ":%d:%d:%d", n, (int )code, len); + break; + + case OP_BACKREF_MULTI: + case OP_BACKREF_MULTI_IC: + fputs(" ", f); + GET_LENGTH_INC(len, bp); + for (i = 0; i < len; i++) { + GET_MEMNUM_INC(mem, bp); + if (i > 0) fputs(", ", f); + fprintf(f, "%d", mem); + } + break; + + case OP_REPEAT: + case OP_REPEAT_NG: + { + mem = *((MemNumType* )bp); + bp += SIZE_MEMNUM; + addr = *((RelAddrType* )bp); + bp += SIZE_RELADDR; + fprintf(f, ":%d:%d", mem, addr); + } + break; + + case OP_PUSH_OR_JUMP_EXACT1: + case OP_PUSH_IF_PEEK_NEXT: + addr = *((RelAddrType* )bp); + bp += SIZE_RELADDR; + fprintf(f, ":(%d)", addr); + p_string(f, 1, bp); + bp += 1; + break; + + case OP_LOOK_BEHIND: + GET_LENGTH_INC(len, bp); + fprintf(f, ":%d", len); + break; + 
+ case OP_PUSH_LOOK_BEHIND_NOT: + GET_RELADDR_INC(addr, bp); + GET_LENGTH_INC(len, bp); + fprintf(f, ":%d:(%d)", len, addr); + break; + + default: + fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n", + *--bp); + } + } + fputs("]", f); + if (nextp) *nextp = bp; +} + +static void +print_compiled_byte_code_list(FILE* f, regex_t* reg) +{ + int ncode; + UChar* bp = reg->p; + UChar* end = reg->p + reg->used; + + fprintf(f, "code length: %d\n", reg->used); + + ncode = 0; + while (bp < end) { + ncode++; + if (bp > reg->p) { + if (ncode % 5 == 0) + fprintf(f, "\n"); + else + fputs(" ", f); + } + onig_print_compiled_byte_code(f, bp, &bp); + } + + fprintf(f, "\n"); +} + +static void +print_indent_tree(FILE* f, Node* node, int indent) +{ + int i, type; + int add = 3; + UChar* p; + + Indent(f, indent); + if (IS_NULL(node)) { + fprintf(f, "ERROR: null node!!!\n"); + exit (0); + } + + type = NTYPE(node); + switch (type) { + case N_LIST: + case N_ALT: + if (NTYPE(node) == N_LIST) + fprintf(f, "\n", (int )node); + else + fprintf(f, "\n", (int )node); + + print_indent_tree(f, NCONS(node).left, indent + add); + while (IS_NOT_NULL(node = NCONS(node).right)) { + if (NTYPE(node) != type) { + fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node)); + exit(0); + } + print_indent_tree(f, NCONS(node).left, indent + add); + } + break; + + case N_STRING: + fprintf(f, "", + (NSTRING_IS_RAW(node) ? 
"-raw" : ""), (int )node); + for (p = NSTRING(node).s; p < NSTRING(node).end; p++) { + if (*p >= 0x20 && *p < 0x7f) + fputc(*p, f); + else { + fprintf(f, " 0x%02x", *p); + } + } + break; + + case N_CCLASS: + fprintf(f, "", (int )node); + if (NCCLASS(node).not) fputs(" not", f); + if (NCCLASS(node).mbuf) { + BBuf* bbuf = NCCLASS(node).mbuf; + for (i = 0; i < bbuf->used; i++) { + if (i > 0) fprintf(f, ","); + fprintf(f, "%0x", bbuf->p[i]); + } + } +#if 0 + fprintf(f, "\n"); + Indent(f, indent); + for (i = 0; i < SINGLE_BYTE_SIZE; i++) + fputc((BITSET_AT(NCCLASS(node).bs, i) ? '1' : '0'), f); +#endif + break; + + case N_CTYPE: + fprintf(f, " ", (int )node); + switch (NCTYPE(node).type) { + case CTYPE_WORD: fputs("word", f); break; + case CTYPE_NOT_WORD: fputs("not word", f); break; + default: + fprintf(f, "ERROR: undefined ctype.\n"); + exit(0); + } + break; + + case N_ANYCHAR: + fprintf(f, "", (int )node); + break; + + case N_ANCHOR: + fprintf(f, " ", (int )node); + switch (NANCHOR(node).type) { + case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break; + case ANCHOR_END_BUF: fputs("end buf", f); break; + case ANCHOR_BEGIN_LINE: fputs("begin line", f); break; + case ANCHOR_END_LINE: fputs("end line", f); break; + case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break; + case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break; + + case ANCHOR_WORD_BOUND: fputs("word bound", f); break; + case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break; +#ifdef USE_WORD_BEGIN_END + case ANCHOR_WORD_BEGIN: fputs("word begin", f); break; + case ANCHOR_WORD_END: fputs("word end", f); break; +#endif + case ANCHOR_PREC_READ: fputs("prec read", f); break; + case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); break; + case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); break; + case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); break; + + default: + fprintf(f, "ERROR: undefined anchor type.\n"); + break; + } + break; + + case N_BACKREF: + { + int* p; + BackrefNode* br 
= &(NBACKREF(node)); + p = BACKREFS_P(br); + fprintf(f, "", (int )node); + for (i = 0; i < br->back_num; i++) { + if (i > 0) fputs(", ", f); + fprintf(f, "%d", p[i]); + } + } + break; + +#ifdef USE_SUBEXP_CALL + case N_CALL: + { + CallNode* cn = &(NCALL(node)); + fprintf(f, "", (int )node); + p_string(f, cn->name_end - cn->name, cn->name); + } + break; +#endif + + case N_QUALIFIER: + fprintf(f, "{%d,%d}%s\n", (int )node, + NQUALIFIER(node).lower, NQUALIFIER(node).upper, + (NQUALIFIER(node).greedy ? "" : "?")); + print_indent_tree(f, NQUALIFIER(node).target, indent + add); + break; + + case N_EFFECT: + fprintf(f, " ", (int )node); + switch (NEFFECT(node).type) { + case EFFECT_OPTION: + fprintf(f, "option:%d\n", NEFFECT(node).option); + print_indent_tree(f, NEFFECT(node).target, indent + add); + break; + case EFFECT_MEMORY: + fprintf(f, "memory:%d", NEFFECT(node).regnum); + break; + case EFFECT_STOP_BACKTRACK: + fprintf(f, "stop-bt"); + break; + + default: + break; + } + fprintf(f, "\n"); + print_indent_tree(f, NEFFECT(node).target, indent + add); + break; + + default: + fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node)); + break; + } + + if (type != N_LIST && type != N_ALT && type != N_QUALIFIER && + type != N_EFFECT) + fprintf(f, "\n"); + fflush(f); +} +#endif /* ONIG_DEBUG */ + +#ifdef ONIG_DEBUG_PARSE_TREE +static void +print_tree(FILE* f, Node* node) +{ + print_indent_tree(f, node, 0); +} +#endif diff --git a/regenc.c b/regenc.c new file mode 100644 index 0000000000..21598ca7c7 --- /dev/null +++ b/regenc.c @@ -0,0 +1,586 @@ +/********************************************************************** + + regenc.c - Oniguruma (regular expression library) + + Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#include "regenc.h" + +OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT; + +extern int +onigenc_init() +{ + return 0; +} + +extern OnigEncoding 
+onigenc_get_default_encoding() +{ + return OnigEncDefaultCharEncoding; +} + +extern int +onigenc_set_default_encoding(OnigEncoding enc) +{ + OnigEncDefaultCharEncoding = enc; + return 0; +} + +extern UChar* +onigenc_get_right_adjust_char_head(OnigEncoding enc, UChar* start, UChar* s) +{ + UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s); + if (p < s) { + p += enc_len(enc, *p); + } + return p; +} + +extern UChar* +onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc, + UChar* start, UChar* s, UChar** prev) +{ + UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s); + + if (p < s) { + if (prev) *prev = p; + p += enc_len(enc, *p); + } + else { + if (prev) *prev = (UChar* )NULL; /* Sorry */ + } + return p; +} + +extern UChar* +onigenc_get_prev_char_head(OnigEncoding enc, UChar* start, UChar* s) +{ + if (s <= start) + return (UChar* )NULL; + + return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1); +} + +extern UChar* +onigenc_step_back(OnigEncoding enc, UChar* start, UChar* s, int n) +{ + while (ONIG_IS_NOT_NULL(s) && n-- > 0) { + if (s <= start) + return (UChar* )NULL; + + s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s - 1); + } + return s; +} + + +#ifndef ONIG_RUBY_M17N + +#ifndef NOT_RUBY +#define USE_APPLICATION_TO_LOWER_CASE_TABLE +#endif + +UChar* OnigEncAsciiToLowerCaseTable = (UChar* )0; + +#ifndef USE_APPLICATION_TO_LOWER_CASE_TABLE +static UChar BuiltInAsciiToLowerCaseTable[] = { + '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', + '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', + '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', + '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', + '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', + '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', + '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', + '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', + '\100', '\141', '\142', '\143', '\144', 
'\145', '\146', '\147', + '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', + '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', + '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137', + '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147', + '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', + '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', + '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177', + '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', + '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', + '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', + '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', + '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', + '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', + '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', + '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', + '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307', + '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317', + '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327', + '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337', + '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', + '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', + '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', + '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377', +}; +#endif /* not USE_APPLICATION_TO_LOWER_CASE_TABLE */ + +unsigned short OnigEncAsciiCtypeTable[256] = { + 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, + 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, + 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, + 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, + 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, + 0x10d0, 0x10d0, 
0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, + 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, + 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, + 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, + 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, + 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, + 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x18d0, + 0x10d0, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, + 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, + 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, + 0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x1004, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 +}; + +extern void +onigenc_set_default_caseconv_table(UChar* table) +{ + if (table == (UChar* )0) { +#ifndef USE_APPLICATION_TO_LOWER_CASE_TABLE + table = BuiltInAsciiToLowerCaseTable; +#else + return ; +#endif + } + + if (table != 
OnigEncAsciiToLowerCaseTable) { + OnigEncAsciiToLowerCaseTable = table; + } +} + +extern UChar* +onigenc_get_left_adjust_char_head(OnigEncoding enc, UChar* start, UChar* s) +{ + return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s); +} + +extern int +onigenc_nothing_get_all_fold_match_code(OnigCodePoint** codes) +{ + return 0; +} + +extern int +onigenc_nothing_get_fold_match_info(UChar* p, UChar* end, + OnigEncFoldMatchInfo** info) +{ + return -1; +} + +extern int +onigenc_nothing_get_ctype_code_range(int ctype, int* nsb, int* nmb, + OnigCodePointRange* sbr[], OnigCodePointRange* mbr[]) +{ + return -1; +} + +/* for single byte encodings */ +extern int +onigenc_ascii_mbc_to_lower(UChar* p, UChar* lower) +{ + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + return 1; /* return byte length of converted char to lower */ +} + +extern int +onigenc_ascii_mbc_is_case_ambig(UChar* p) +{ + return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); +} + +extern OnigCodePoint +onigenc_single_byte_mbc_to_code(UChar* p, UChar* end) +{ + return (OnigCodePoint )(*p); +} + +extern int +onigenc_single_byte_code_to_mbclen(OnigCodePoint code) +{ + return 1; +} + +extern int +onigenc_single_byte_code_to_mbc_first(OnigCodePoint code) +{ + return (code & 0xff); +} + +extern int +onigenc_single_byte_code_to_mbc(OnigCodePoint code, UChar *buf) +{ + *buf = (UChar )(code & 0xff); + return 1; +} + +extern UChar* +onigenc_single_byte_left_adjust_char_head(UChar* start, UChar* s) +{ + return s; +} + +extern int +onigenc_single_byte_is_allowed_reverse_match(UChar* s, UChar* end) +{ + return TRUE; +} + +extern OnigCodePoint +onigenc_mbn_mbc_to_code(OnigEncoding enc, UChar* p, UChar* end) +{ + int c, i, len; + OnigCodePoint n; + + c = *p++; + len = enc_len(enc, c); + n = c; + if (len == 1) return n; + + for (i = 1; i < len; i++) { + if (p >= end) break; + c = *p++; + n <<= 8; n += c; + } + return n; +} + +extern int +onigenc_mbn_mbc_to_lower(OnigEncoding enc, UChar* p, UChar* lower) +{ + int len; + + if 
(ONIGENC_IS_MBC_ASCII(p)) { + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + return 1; + } + else { + len = enc_len(enc, *p); + if (lower != p) { + /* memcpy(lower, p, len); */ + int i; + for (i = 0; i < len; i++) { + *lower++ = *p++; + } + } + return len; /* return byte length of converted to lower char */ + } +} + +extern int +onigenc_mbn_mbc_is_case_ambig(UChar* p) +{ + if (ONIGENC_IS_MBC_ASCII(p)) + return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); + + return FALSE; +} + +extern int +onigenc_mb2_code_to_mbclen(OnigCodePoint code) +{ + if ((code & 0xff00) != 0) return 2; + else return 1; +} + +extern int +onigenc_mb4_code_to_mbclen(OnigCodePoint code) +{ + if ((code & 0xff000000) != 0) return 4; + else if ((code & 0xff0000) != 0) return 3; + else if ((code & 0xff00) != 0) return 2; + else return 1; +} + +extern int +onigenc_mb2_code_to_mbc_first(OnigCodePoint code) +{ + int first; + + if ((code & 0xff00) != 0) { + first = (code >> 8) & 0xff; + } + else { + return (int )code; + } + return first; +} + +extern int +onigenc_mb4_code_to_mbc_first(OnigCodePoint code) +{ + int first; + + if ((code & 0xff000000) != 0) { + first = (code >> 24) & 0xff; + } + else if ((code & 0xff0000) != 0) { + first = (code >> 16) & 0xff; + } + else if ((code & 0xff00) != 0) { + first = (code >> 8) & 0xff; + } + else { + return (int )code; + } + return first; +} + +extern int +onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf) +{ + UChar *p = buf; + + if ((code & 0xff00) != 0) { + *p++ = (UChar )((code >> 8) & 0xff); + } + *p++ = (UChar )(code & 0xff); + +#if 1 + if (enc_len(enc, buf[0]) != (p - buf)) + return ONIGERR_INVALID_WIDE_CHAR_VALUE; +#endif + return p - buf; +} + +extern int +onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf) +{ + UChar *p = buf; + + if ((code & 0xff000000) != 0) { + *p++ = (UChar )((code >> 24) & 0xff); + } + if ((code & 0xff0000) != 0) { + *p++ = (UChar )((code >> 16) & 0xff); + } + if ((code & 0xff00) != 0) { + *p++ 
= (UChar )((code >> 8) & 0xff); + } + *p++ = (UChar )(code & 0xff); + +#if 1 + if (enc_len(enc, buf[0]) != (p - buf)) + return ONIGERR_INVALID_WIDE_CHAR_VALUE; +#endif + return p - buf; +} + +extern int +onigenc_mb2_code_is_ctype(OnigEncoding enc, OnigCodePoint code, + unsigned int ctype) +{ + if ((ctype & ONIGENC_CTYPE_WORD) != 0) { + if (code < 128) + return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); + else { + int first = onigenc_mb2_code_to_mbc_first(code); + return (enc_len(enc, first) > 1 ? TRUE : FALSE); + } + + ctype &= ~ONIGENC_CTYPE_WORD; + if (ctype == 0) return FALSE; + } + + if (code < 128) + return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); + else + return FALSE; +} + +extern int +onigenc_mb4_code_is_ctype(OnigEncoding enc, OnigCodePoint code, + unsigned int ctype) +{ + if ((ctype & ONIGENC_CTYPE_WORD) != 0) { + if (code < 128) + return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); + else { + int first = onigenc_mb4_code_to_mbc_first(code); + return (enc_len(enc, first) > 1 ? TRUE : FALSE); + } + + ctype &= ~ONIGENC_CTYPE_WORD; + if (ctype == 0) return FALSE; + } + + if (code < 128) + return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); + else + return FALSE; +} + +extern int +onigenc_get_all_fold_match_code_ss_0xdf(OnigCodePoint** codes) +{ + static OnigCodePoint list[] = { 0xdf }; + *codes = list; + return 1; +} + +extern int +onigenc_get_fold_match_info_ss_0xdf(UChar* p, UChar* end, + OnigEncFoldMatchInfo** info) +{ + /* German alphabet ess-tsett(U+00DF) */ + static OnigEncFoldMatchInfo ss = { + 3, + { 1, 2, 2 }, + { "\337", "ss", "SS" } /* 0337: 0xdf */ + }; + + if (p >= end) return -1; + + if (*p == 0xdf) { + *info = &ss; + return 1; + } + else if (p + 1 < end) { + if ((*p == 'S' && *(p+1) == 'S') || + (*p == 's' && *(p+1) == 's')) { + *info = &ss; + return 2; + } + } + + return -1; /* is not a fold string. 
*/ +} + +#else /* ONIG_RUBY_M17N */ + +extern int +onigenc_is_code_ctype(OnigEncoding enc, OnigCodePoint code, int ctype) +{ + switch (ctype) { + case ONIGENC_CTYPE_ALPHA: + return m17n_isalpha(enc, code); + break; + case ONIGENC_CTYPE_BLANK: + return ONIGENC_IS_CODE_BLANK(enc, (int )(code)); + break; + case ONIGENC_CTYPE_CNTRL: + return m17n_iscntrl(enc, code); + break; + case ONIGENC_CTYPE_DIGIT: + return m17n_isdigit(enc, code); + break; + case ONIGENC_CTYPE_GRAPH: + return ONIGENC_IS_CODE_GRAPH(enc, (int )(code)); + break; + case ONIGENC_CTYPE_LOWER: + return m17n_islower(enc, code); + break; + case ONIGENC_CTYPE_PRINT: + return m17n_isprint(enc, code); + break; + case ONIGENC_CTYPE_PUNCT: + return m17n_ispunct(enc, code); + break; + case ONIGENC_CTYPE_SPACE: + return m17n_isspace(enc, code); + break; + case ONIGENC_CTYPE_UPPER: + return m17n_isupper(enc, code); + break; + case ONIGENC_CTYPE_XDIGIT: + return m17n_isxdigit(enc, code); + break; + case ONIGENC_CTYPE_WORD: + return m17n_iswchar(enc, code); + break; + case ONIGENC_CTYPE_ASCII: + return (code < 128 ? 
TRUE : FALSE); + break; + case ONIGENC_CTYPE_ALNUM: + return m17n_isalnum(enc, code); + break; + default: + break; + } + + return 0; +} + +extern int +onigenc_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf) +{ + int c, len; + + m17n_mbcput(enc, code, buf); + c = m17n_firstbyte(enc, code); + len = enc_len(enc, c); + return len; +} + +extern int +onigenc_mbc_to_lower(OnigEncoding enc, UChar* p, UChar* buf) +{ + unsigned int c, low; + + c = m17n_codepoint(enc, p, p + enc_len(enc, *p)); + low = m17n_tolower(enc, c); + m17n_mbcput(enc, low, buf); + + return m17n_codelen(enc, low); +} + +extern int +onigenc_mbc_is_case_ambig(OnigEncoding enc, UChar* p) +{ + unsigned int c = m17n_codepoint(enc, p, p + enc_len(enc, *p)); + + if (m17n_isupper(enc, c) || m17n_islower(enc, c)) + return TRUE; + return FALSE; +} + +extern UChar* +onigenc_get_left_adjust_char_head(OnigEncoding enc, UChar* start, UChar* s) +{ + UChar *p; + int len; + + if (s <= start) return s; + p = s; + + while (!m17n_islead(enc, *p) && p > start) p--; + while (p + (len = enc_len(enc, *p)) < s) { + p += len; + } + if (p + len == s) return s; + return p; +} + +extern int +onigenc_is_allowed_reverse_match(OnigEncoding enc, UChar* s, UChar* end) +{ + return ONIGENC_IS_SINGLEBYTE(enc); +} + +extern void +onigenc_set_default_caseconv_table(UChar* table) { } + +#endif /* ONIG_RUBY_M17N */ diff --git a/regenc.h b/regenc.h new file mode 100644 index 0000000000..e0c6211d32 --- /dev/null +++ b/regenc.h @@ -0,0 +1,96 @@ +/********************************************************************** + + regenc.h - Oniguruma (regular expression library) + + Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#ifndef REGENC_H +#define REGENC_H + +#ifndef RUBY_PLATFORM +#include "config.h" +#endif +#include "oniguruma.h" + +#ifndef NULL +#define NULL ((void* )0) +#endif + +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 
0 +#endif + +/* error codes */ +/* internal error */ +#define ONIGERR_MEMORY -5 +#define ONIGERR_TYPE_BUG -6 +/* syntax error [-400, -999] */ +#define ONIGERR_INVALID_WIDE_CHAR_VALUE -400 +#define ONIGERR_TOO_BIG_WIDE_CHAR_VALUE -401 + +#define ONIG_NEWLINE '\n' +#define ONIG_IS_NEWLINE(c) ((c) == ONIG_NEWLINE) +#define ONIG_IS_NULL(p) (((void*)(p)) == (void*)0) +#define ONIG_IS_NOT_NULL(p) (((void*)(p)) != (void*)0) +#define ONIG_CHECK_NULL_RETURN(p) if (ONIG_IS_NULL(p)) return NULL +#define ONIG_CHECK_NULL_RETURN_VAL(p,val) if (ONIG_IS_NULL(p)) return (val) + + +#ifdef ONIG_RUBY_M17N + +#define ONIG_ENCODING_INIT_DEFAULT ONIG_ENCODING_UNDEF + +#else /* ONIG_RUBY_M17N */ + +#define ONIG_ENCODING_INIT_DEFAULT ONIG_ENCODING_ASCII + +/* for encoding system implementation (internal) */ +ONIG_EXTERN int onigenc_nothing_get_all_fold_match_code P_((OnigCodePoint** codes)); +ONIG_EXTERN int onigenc_nothing_get_fold_match_info P_((UChar* p, UChar* end, OnigEncFoldMatchInfo** info)); +ONIG_EXTERN int onigenc_nothing_get_ctype_code_range P_((int ctype, int* nsb, int* nmb, OnigCodePointRange* sbr[], OnigCodePointRange* mbr[])); + +/* methods for single byte encoding */ +ONIG_EXTERN int onigenc_ascii_mbc_to_lower P_((UChar* p, UChar* lower)); +ONIG_EXTERN int onigenc_ascii_mbc_is_case_ambig P_((UChar* p)); +ONIG_EXTERN OnigCodePoint onigenc_single_byte_mbc_to_code P_((UChar* p, UChar* end)); +ONIG_EXTERN int onigenc_single_byte_code_to_mbclen P_((OnigCodePoint code)); +ONIG_EXTERN int onigenc_single_byte_code_to_mbc_first P_((OnigCodePoint code)); +ONIG_EXTERN int onigenc_single_byte_code_to_mbc P_((OnigCodePoint code, UChar *buf)); +ONIG_EXTERN UChar* onigenc_single_byte_left_adjust_char_head P_((UChar* start, UChar* s)); +ONIG_EXTERN int onigenc_single_byte_is_allowed_reverse_match P_((UChar* s, UChar* end)); + +/* methods for multi byte encoding */ +ONIG_EXTERN OnigCodePoint onigenc_mbn_mbc_to_code P_((OnigEncoding enc, UChar* p, UChar* end)); +ONIG_EXTERN int 
onigenc_mbn_mbc_to_lower P_((OnigEncoding enc, UChar* p, UChar* lower)); +ONIG_EXTERN int onigenc_mbn_mbc_is_case_ambig P_((UChar* p)); +ONIG_EXTERN int onigenc_mb2_code_to_mbclen P_((OnigCodePoint code)); +ONIG_EXTERN int onigenc_mb2_code_to_mbc_first P_((OnigCodePoint code)); +ONIG_EXTERN int onigenc_mb2_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); +ONIG_EXTERN int onigenc_mb2_code_is_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); +ONIG_EXTERN int onigenc_mb4_code_to_mbclen P_((OnigCodePoint code)); +ONIG_EXTERN int onigenc_mb4_code_to_mbc_first P_((OnigCodePoint code)); +ONIG_EXTERN int onigenc_mb4_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); +ONIG_EXTERN int onigenc_mb4_code_is_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); + +ONIG_EXTERN int onigenc_get_all_fold_match_code_ss_0xdf P_((OnigCodePoint** codes)); +ONIG_EXTERN int onigenc_get_fold_match_info_ss_0xdf P_((UChar* p, UChar* end, OnigEncFoldMatchInfo** info)); + +#endif /* is not ONIG_RUBY_M17N */ + + +ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding; +ONIG_EXTERN UChar* OnigEncAsciiToLowerCaseTable; +ONIG_EXTERN unsigned short OnigEncAsciiCtypeTable[]; + +#define ONIGENC_ASCII_CODE_TO_LOWER_CASE(c) OnigEncAsciiToLowerCaseTable[c] +#define ONIGENC_IS_ASCII_CODE_CTYPE(code,ctype) \ + ((OnigEncAsciiCtypeTable[code] & ctype) != 0) +#define ONIGENC_IS_ASCII_CODE_CASE_AMBIG(code) \ + ONIGENC_IS_ASCII_CODE_CTYPE(code, (ONIGENC_CTYPE_UPPER | ONIGENC_CTYPE_LOWER)) + +#endif /* REGENC_H */ diff --git a/regerror.c b/regerror.c new file mode 100644 index 0000000000..5a6c31b82e --- /dev/null +++ b/regerror.c @@ -0,0 +1,291 @@ +/********************************************************************** + + regerror.c - Oniguruma (regular expression library) + + Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#include "regint.h" +#include /* for 
vsnprintf() */ + +#ifdef HAVE_STDARG_PROTOTYPES +#include +#define va_init_list(a,b) va_start(a,b) +#else +#include +#define va_init_list(a,b) va_start(a) +#endif + +extern char* +onig_error_code_to_format(int code) +{ + char *p; + + if (code >= 0) return (char* )0; + + switch (code) { + case ONIG_MISMATCH: + p = "mismatch"; break; + case ONIG_NO_SUPPORT_CONFIG: + p = "no support in this configuration"; break; + case ONIGERR_MEMORY: + p = "fail to memory allocation"; break; + case ONIGERR_MATCH_STACK_LIMIT_OVER: + p = "match-stack limit over"; break; + case ONIGERR_TYPE_BUG: + p = "undefined type (bug)"; break; + case ONIGERR_PARSER_BUG: + p = "internal parser error (bug)"; break; + case ONIGERR_STACK_BUG: + p = "stack error (bug)"; break; + case ONIGERR_UNDEFINED_BYTECODE: + p = "undefined bytecode (bug)"; break; + case ONIGERR_UNEXPECTED_BYTECODE: + p = "unexpected bytecode (bug)"; break; + case ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED: + p = "default multibyte-encoding is not setted"; break; + case ONIGERR_SPECIFIED_ENCODING_CANT_CONVERT_TO_WIDE_CHAR: + p = "can't convert to wide-char on specified multibyte-encoding"; break; + case ONIGERR_INVALID_ARGUMENT: + p = "invalid argument"; break; + case ONIGERR_END_PATTERN_AT_LEFT_BRACE: + p = "end pattern at left brace"; break; + case ONIGERR_END_PATTERN_AT_LEFT_BRACKET: + p = "end pattern at left bracket"; break; + case ONIGERR_EMPTY_CHAR_CLASS: + p = "empty char-class"; break; + case ONIGERR_PREMATURE_END_OF_CHAR_CLASS: + p = "premature end of char-class"; break; + case ONIGERR_END_PATTERN_AT_BACKSLASH: + p = "end pattern at backslash"; break; + case ONIGERR_END_PATTERN_AT_META: + p = "end pattern at meta"; break; + case ONIGERR_END_PATTERN_AT_CONTROL: + p = "end pattern at control"; break; + case ONIGERR_META_CODE_SYNTAX: + p = "illegal meta-code syntax"; break; + case ONIGERR_CONTROL_CODE_SYNTAX: + p = "illegal control-code syntax"; break; + case ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE: + p = "char-class value at 
end of range"; break; + case ONIGERR_CHAR_CLASS_VALUE_AT_START_OF_RANGE: + p = "char-class value at start of range"; break; + case ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS: + p = "unmatched range specifier in char-class"; break; + case ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED: + p = "target of repeat operator is not specified"; break; + case ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID: + p = "target of repeat operator is invalid"; break; + case ONIGERR_NESTED_REPEAT_OPERATOR: + p = "nested repeat operator"; break; + case ONIGERR_UNMATCHED_CLOSE_PARENTHESIS: + p = "unmatched close parenthesis"; break; + case ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS: + p = "end pattern with unmatched parenthesis"; break; + case ONIGERR_END_PATTERN_IN_GROUP: + p = "end pattern in group"; break; + case ONIGERR_UNDEFINED_GROUP_OPTION: + p = "undefined group option"; break; + case ONIGERR_INVALID_POSIX_BRACKET_TYPE: + p = "invalid POSIX bracket type"; break; + case ONIGERR_INVALID_LOOK_BEHIND_PATTERN: + p = "invalid pattern in look-behind"; break; + case ONIGERR_INVALID_REPEAT_RANGE_PATTERN: + p = "invalid repeat range {lower,upper}"; break; + case ONIGERR_TOO_BIG_NUMBER: + p = "too big number"; break; + case ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE: + p = "too big number for repeat range"; break; + case ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE: + p = "upper is smaller than lower in repeat range"; break; + case ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS: + p = "empty range in char class"; break; + case ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE: + p = "mismatch multibyte code length in char-class range"; break; + case ONIGERR_TOO_MANY_MULTI_BYTE_RANGES: + p = "too many multibyte code ranges are specified"; break; + case ONIGERR_TOO_SHORT_MULTI_BYTE_STRING: + p = "too short multibyte code string"; break; + case ONIGERR_TOO_BIG_BACKREF_NUMBER: + p = "too big backref number"; break; + case ONIGERR_INVALID_BACKREF: +#ifdef USE_NAMED_GROUP + p = "invalid backref number/name"; 
break; +#else + p = "invalid backref number"; break; +#endif + case ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED: + p = "numbered backref/call is not allowed. (use name)"; break; + case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE: + p = "too big wide-char value"; break; + case ONIGERR_TOO_LONG_WIDE_CHAR_VALUE: + p = "too long wide-char value"; break; + case ONIGERR_INVALID_WIDE_CHAR_VALUE: + p = "invalid wide-char value"; break; + case ONIGERR_EMPTY_GROUP_NAME: + p = "group name is empty"; break; + case ONIGERR_INVALID_GROUP_NAME: + p = "invalid group name <%n>"; break; + case ONIGERR_INVALID_CHAR_IN_GROUP_NAME: +#ifdef USE_NAMED_GROUP + p = "invalid char in group name <%n>"; break; +#else + p = "invalid char in group number <%n>"; break; +#endif + case ONIGERR_UNDEFINED_NAME_REFERENCE: + p = "undefined name <%n> reference"; break; + case ONIGERR_UNDEFINED_GROUP_REFERENCE: + p = "undefined group <%n> reference"; break; + case ONIGERR_MULTIPLEX_DEFINED_NAME: + p = "multiplex defined name <%n>"; break; + case ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL: + p = "multiplex definition name <%n> call"; break; + case ONIGERR_NEVER_ENDING_RECURSION: + p = "never ending recursion"; break; + case ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY: + p = "group number is too big for capture history"; break; + case ONIGERR_INVALID_CHAR_PROPERTY_NAME: + p = "invalid character property name"; break; + case ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT: + p = "over thread pass limit count"; break; + + default: + p = "undefined error code"; break; + } + + return p; +} + + +/* for ONIG_MAX_ERROR_MESSAGE_LEN */ +#define MAX_ERROR_PAR_LEN 30 + +extern int +#ifdef HAVE_STDARG_PROTOTYPES +onig_error_code_to_str(UChar* s, int code, ...) 
+#else +onig_error_code_to_str(s, code, va_alist) + UChar* s; + int code; + va_dcl +#endif +{ + UChar *p, *q; + OnigErrorInfo* einfo; + int len; + va_list vargs; + + va_init_list(vargs, code); + + switch (code) { + case ONIGERR_UNDEFINED_NAME_REFERENCE: + case ONIGERR_UNDEFINED_GROUP_REFERENCE: + case ONIGERR_MULTIPLEX_DEFINED_NAME: + case ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL: + case ONIGERR_INVALID_GROUP_NAME: + case ONIGERR_INVALID_CHAR_IN_GROUP_NAME: + einfo = va_arg(vargs, OnigErrorInfo*); + len = einfo->par_end - einfo->par; + q = onig_error_code_to_format(code); + p = s; + while (*q != '\0') { + if (*q == '%') { + q++; + if (*q == 'n') { /* '%n': name */ + if (len > MAX_ERROR_PAR_LEN) { + xmemcpy(p, einfo->par, MAX_ERROR_PAR_LEN - 3); + p += (MAX_ERROR_PAR_LEN - 3); + xmemcpy(p, "...", 3); + p += 3; + } + else { + xmemcpy(p, einfo->par, len); + p += len; + } + q++; + } + else + goto normal_char; + } + else { + normal_char: + *p++ = *q++; + } + } + *p = '\0'; + len = p - s; + break; + + default: + q = onig_error_code_to_format(code); + len = strlen(q); + xmemcpy(s, q, len); + s[len] = '\0'; + break; + } + + va_end(vargs); + return len; +} + + +void +#ifdef HAVE_STDARG_PROTOTYPES +onig_snprintf_with_pattern(char buf[], int bufsize, OnigEncoding enc, + char* pat, char* pat_end, char *fmt, ...) 
+#else +onig_snprintf_with_pattern(buf, bufsize, enc, pat, pat_end, fmt, va_alist) + char buf[]; + int bufsize; + OnigEncoding enc; + char* pat; + char* pat_end; + const char *fmt; + va_dcl +#endif +{ + /* Format the message into buf, then append ": /pat/" when it fits, escaping '/' and printing non-printable bytes as \ooo. */ + int n, need, len; + UChar *p, *s; + va_list args; + + va_init_list(args, fmt); + n = vsnprintf(buf, bufsize, fmt, args); + va_end(args); + + /* worst case is 4 output bytes per pattern byte, plus ": /" and the closing '/' */ + need = (pat_end - pat) * 4 + 4; + + /* a negative n means vsnprintf failed, so buf cannot be appended to safely */ + if (n >= 0 && n + need < bufsize) { + strcat(buf, ": /"); + s = buf + strlen(buf); + + p = pat; + while (p < (UChar* )pat_end) { + if (*p == MC_ESC) { + *s++ = *p++; + len = enc_len(enc, *p); + while (len-- > 0) *s++ = *p++; + } + else if (*p == '/') { + *s++ = MC_ESC; + *s++ = *p++; + } + else if (ONIGENC_IS_MBC_HEAD(enc, *p)) { + len = enc_len(enc, *p); + while (len-- > 0) *s++ = *p++; + } + else if (!ONIGENC_IS_CODE_PRINT(enc, *p) && + !ONIGENC_IS_CODE_SPACE(enc, *p)) { + char b[5]; + char *bp = b; + /* emit the escape text held in b and consume exactly one pattern byte */ + sprintf(b, "\\%03o", *p & 0377); + p++; + len = strlen(b); + while (len-- > 0) *s++ = *bp++; + } + else { + *s++ = *p++; + } + } + + *s++ = '/'; + *s = '\0'; + } +} diff --git a/regexec.c b/regexec.c new file mode 100644 index 0000000000..2ded602e15 --- /dev/null +++ b/regexec.c @@ -0,0 +1,3299 @@ +/********************************************************************** + + regexec.c - Oniguruma (regular expression library) + + Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#include "regint.h" + +static void +region_list_clear(OnigRegion** list) +{ + int i; + + if (IS_NOT_NULL(list)) { + for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { + if (IS_NOT_NULL(list[i])) { + xfree(list[i]); + list[i] = (OnigRegion* )0; + } + } + } +} + +static void +region_list_free(OnigRegion* r) +{ + if (IS_NOT_NULL(r->list)) { + region_list_clear(r->list); + xfree(r->list); + r->list = (OnigRegion** )0; + } +} + +static OnigRegion** +region_list_new() +{ + int i; + OnigRegion** list; + + list = (OnigRegion** )xmalloc(sizeof(OnigRegion*) + *
(ONIG_MAX_CAPTURE_HISTORY_GROUP + 1)); + CHECK_NULL_RETURN(list); + for (i = 0; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { + list[i] = (OnigRegion* )0; + } + + return list; +} + +extern void +onig_region_clear(OnigRegion* region) +{ + int i; + + for (i = 0; i < region->num_regs; i++) { + region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; + } + region_list_clear(region->list); +} + +extern int +onig_region_resize(OnigRegion* region, int n) +{ + /* Make region hold n registers (capacity at least ONIG_NREGION), reset them to ONIG_REGION_NOTPOS and clear any capture-history list. Returns 0, or ONIGERR_MEMORY with num_regs unchanged. */ + int i, num_regs; + int *beg, *end; + + num_regs = n; + + if (n < ONIG_NREGION) + n = ONIG_NREGION; + + if (region->allocated == 0) { + beg = (int* )xmalloc(n * sizeof(int)); + end = (int* )xmalloc(n * sizeof(int)); + + if (beg == 0 || end == 0) { + /* do not leak the half that was obtained */ + if (beg) xfree(beg); + if (end) xfree(end); + return ONIGERR_MEMORY; + } + + region->beg = beg; + region->end = end; + region->allocated = n; + } + else if (region->allocated < n) { + /* go through temporaries so a failed xrealloc keeps the old block reachable */ + beg = (int* )xrealloc(region->beg, n * sizeof(int)); + if (beg == 0) return ONIGERR_MEMORY; + region->beg = beg; + + end = (int* )xrealloc(region->end, n * sizeof(int)); + if (end == 0) return ONIGERR_MEMORY; + region->end = end; + + region->allocated = n; + } + + /* publish the new count only once the storage is known to be large enough */ + region->num_regs = num_regs; + + for (i = 0; i < region->num_regs; i++) { + region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; + } + + if (IS_NOT_NULL(region->list)) + region_list_clear(region->list); + + return 0; +} + +static int +region_ensure_size(OnigRegion* region, int n) +{ + int i, new_size; + + if (region->allocated >= n) + return 0; + + new_size = region->allocated; + if (new_size == 0) + new_size = ONIG_NREGION; + while (new_size < n) + new_size *= 2; + + if (region->allocated == 0) { + region->beg = (int* )xmalloc(new_size * sizeof(int)); + region->end = (int* )xmalloc(new_size * sizeof(int)); + if (region->beg == 0 || region->end == 0) + return ONIGERR_MEMORY; + + region->allocated =
new_size; + } + + for (i = region->num_regs; i < n; i++) { + region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; + } + return 0; +} + +static int +region_list_add_entry(OnigRegion* region, int group, int start, int end) +{ + int r, pos; + OnigRegion** list; + + if (group > ONIG_MAX_CAPTURE_HISTORY_GROUP) + return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; + + if (IS_NULL(region->list)) { + region->list = region_list_new(); + CHECK_NULL_RETURN_VAL(region->list, ONIGERR_MEMORY); + } + + list = region->list; + if (IS_NULL(list[group])) { + list[group] = onig_region_new(); + CHECK_NULL_RETURN_VAL(list[group], ONIGERR_MEMORY); + } + + r = region_ensure_size(list[group], list[group]->num_regs + 1); + if (r != 0) return r; + + pos = list[group]->num_regs; + list[group]->beg[pos] = start; + list[group]->end[pos] = end; + list[group]->num_regs++; + + return 0; +} + +static void +onig_region_init(OnigRegion* region) +{ + region->num_regs = 0; + region->allocated = 0; + region->beg = (int* )0; + region->end = (int* )0; + region->list = (OnigRegion** )0; +} + +extern OnigRegion* +onig_region_new() +{ + /* Allocate an empty region; returns NULL when the allocation fails, which region_list_add_entry already tests for. */ + OnigRegion* r; + + r = (OnigRegion* )xmalloc(sizeof(OnigRegion)); + if (IS_NULL(r)) return (OnigRegion* )0; + onig_region_init(r); + return r; +} + +extern void +onig_region_free(OnigRegion* r, int free_self) +{ + if (r) { + if (r->allocated > 0) { + if (r->beg) xfree(r->beg); + if (r->end) xfree(r->end); + r->allocated = 0; + } + region_list_free(r); + if (free_self) xfree(r); + } +} + +extern void +onig_region_copy(OnigRegion* to, OnigRegion* from) +{ +#define RREGC_SIZE (sizeof(int) * from->num_regs) + int i; + + if (to == from) return; + + if (to->allocated == 0) { + if (from->num_regs > 0) { + to->beg = (int* )xmalloc(RREGC_SIZE); + to->end = (int* )xmalloc(RREGC_SIZE); + to->allocated = from->num_regs; + } + } + else if (to->allocated < from->num_regs) { + to->beg = (int* )xrealloc(to->beg, RREGC_SIZE); + to->end = (int* )xrealloc(to->end, RREGC_SIZE); + to->allocated = from->num_regs; + } + + for (i = 0; i
< from->num_regs; i++) { + to->beg[i] = from->beg[i]; + to->end[i] = from->end[i]; + } + to->num_regs = from->num_regs; + + if (IS_NOT_NULL(from->list)) { + if (IS_NULL(to->list)) { + to->list = region_list_new(); + } + + for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { + if (IS_NOT_NULL(from->list[i])) { + if (IS_NULL(to->list[i])) + to->list[i] = onig_region_new(); + + onig_region_copy(to->list[i], from->list[i]); + } + else { + if (IS_NOT_NULL(to->list[i])) { + xfree(to->list[i]); + to->list[i] = (OnigRegion* )0; + } + } + } + } + else + region_list_free(to); +} + + +/** stack **/ +#define INVALID_STACK_INDEX -1 +typedef int StackIndex; + +typedef struct _StackType { + unsigned int type; + union { + struct { + UChar *pcode; /* byte code position */ + UChar *pstr; /* string position */ + UChar *pstr_prev; /* previous char position of pstr */ + } state; + struct { + int count; /* for OP_REPEAT_INC, OP_REPEAT_INC_NG */ + UChar *pcode; /* byte code position (head of repeated target) */ + int num; /* repeat id */ + } repeat; + struct { + StackIndex si; /* index of stack */ + } repeat_inc; + struct { + int num; /* memory num */ + UChar *pstr; /* start/end position */ + /* Following information is setted, if this stack type is MEM-START */ + StackIndex start; /* prev. info (for backtrack "(...)*" ) */ + StackIndex end; /* prev. 
info (for backtrack "(...)*" ) */ + } mem; + struct { + int num; /* null check id */ + UChar *pstr; /* start position */ + } null_check; +#ifdef USE_SUBEXP_CALL + struct { + UChar *ret_addr; /* byte code position */ + int num; /* null check id */ + UChar *pstr; /* string position */ + } call_frame; +#endif + } u; +} StackType; + +/* stack type */ +/* used by normal-POP */ +#define STK_ALT 0x0001 +#define STK_LOOK_BEHIND_NOT 0x0003 +#define STK_POS_NOT 0x0005 +/* avoided by normal-POP, but value should be small */ +#define STK_NULL_CHECK_START 0x0100 +/* handled by normal-POP */ +#define STK_MEM_START 0x0200 +#define STK_MEM_END 0x0300 +#define STK_REPEAT_INC 0x0400 +/* avoided by normal-POP */ +#define STK_POS 0x0500 /* used when POP-POS */ +#define STK_STOP_BT 0x0600 /* mark for "(?>...)" */ +#define STK_REPEAT 0x0700 +#define STK_CALL_FRAME 0x0800 +#define STK_RETURN 0x0900 +#define STK_MEM_END_MARK 0x0a00 +#define STK_VOID 0x0b00 /* for fill a blank */ +#define STK_NULL_CHECK_END 0x0c00 /* for recursive call */ + +/* stack type check mask */ +#define STK_MASK_POP_USED 0x00ff +#define IS_TO_VOID_TARGET(stk) \ + (((stk)->type & STK_MASK_POP_USED) || (stk)->type == STK_NULL_CHECK_START) + +typedef struct { + void* stack_p; + int stack_n; + OnigOptionType options; + OnigRegion* region; + UChar* start; /* search start position (for \G: BEGIN_POSITION) */ +} MatchArg; + +#define MATCH_ARG_INIT(msa, arg_option, arg_region, arg_start) do {\ + (msa).stack_p = (void* )0;\ + (msa).options = (arg_option);\ + (msa).region = (arg_region);\ + (msa).start = (arg_start);\ +} while (0) + +#define MATCH_ARG_FREE(msa) if ((msa).stack_p) xfree((msa).stack_p) + + +#define STACK_INIT(alloc_addr, ptr_num, stack_num) do {\ + if (msa->stack_p) {\ + alloc_addr = (char* )xalloca(sizeof(char*) * (ptr_num));\ + stk_alloc = (StackType* )(msa->stack_p);\ + stk_base = stk_alloc;\ + stk = stk_base;\ + stk_end = stk_base + msa->stack_n;\ + }\ + else {\ + alloc_addr = (char* )xalloca(sizeof(char*) 
* (ptr_num)\ + + sizeof(StackType) * (stack_num));\ + stk_alloc = (StackType* )(alloc_addr + sizeof(char*) * (ptr_num));\ + stk_base = stk_alloc;\ + stk = stk_base;\ + stk_end = stk_base + (stack_num);\ + }\ +} while(0) + +#define STACK_SAVE do{\ + if (stk_base != stk_alloc) {\ + msa->stack_p = stk_base;\ + msa->stack_n = stk_end - stk_base;\ + };\ +} while(0) + +static int +stack_double(StackType** arg_stk_base, StackType** arg_stk_end, + StackType** arg_stk, StackType* stk_alloc, MatchArg* msa) +{ + int n; + StackType *x, *stk_base, *stk_end, *stk; + + stk_base = *arg_stk_base; + stk_end = *arg_stk_end; + stk = *arg_stk; + + n = stk_end - stk_base; + if (stk_base == stk_alloc && IS_NULL(msa->stack_p)) { + x = (StackType* )xmalloc(sizeof(StackType) * n * 2); + if (IS_NULL(x)) { + STACK_SAVE; + return ONIGERR_MEMORY; + } + xmemcpy(x, stk_base, n * sizeof(StackType)); + n *= 2; + } + else { + n *= 2; + if (n > MATCH_STACK_LIMIT_SIZE) return ONIGERR_MATCH_STACK_LIMIT_OVER; + x = (StackType* )xrealloc(stk_base, sizeof(StackType) * n); + if (IS_NULL(x)) { + STACK_SAVE; + return ONIGERR_MEMORY; + } + } + *arg_stk = x + (stk - stk_base); + *arg_stk_base = x; + *arg_stk_end = x + n; + return 0; +} + +#define STACK_ENSURE(n) do {\ + if (stk_end - stk < (n)) {\ + int r = stack_double(&stk_base, &stk_end, &stk, stk_alloc, msa);\ + if (r != 0) { STACK_SAVE; return r; } \ + }\ +} while(0) + +#define STACK_AT(index) (stk_base + (index)) +#define GET_STACK_INDEX(stk) ((stk) - stk_base) + +#define STACK_PUSH(stack_type,pat,s,sprev) do {\ + STACK_ENSURE(1);\ + stk->type = (stack_type);\ + stk->u.state.pcode = (pat);\ + stk->u.state.pstr = (s);\ + stk->u.state.pstr_prev = (sprev);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_ENSURED(stack_type,pat) do {\ + stk->type = (stack_type);\ + stk->u.state.pcode = (pat);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_TYPE(stack_type) do {\ + STACK_ENSURE(1);\ + stk->type = (stack_type);\ + STACK_INC;\ +} while(0) + +#define 
STACK_PUSH_ALT(pat,s,sprev) STACK_PUSH(STK_ALT,pat,s,sprev) +#define STACK_PUSH_POS(s,sprev) STACK_PUSH(STK_POS,NULL_UCHARP,s,sprev) +#define STACK_PUSH_POS_NOT(pat,s,sprev) STACK_PUSH(STK_POS_NOT,pat,s,sprev) +#define STACK_PUSH_STOP_BT STACK_PUSH_TYPE(STK_STOP_BT) +#define STACK_PUSH_LOOK_BEHIND_NOT(pat,s,sprev) \ + STACK_PUSH(STK_LOOK_BEHIND_NOT,pat,s,sprev) + +#define STACK_PUSH_REPEAT(id, pat) do {\ + STACK_ENSURE(1);\ + stk->type = STK_REPEAT;\ + stk->u.repeat.num = (id);\ + stk->u.repeat.pcode = (pat);\ + stk->u.repeat.count = 0;\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_REPEAT_INC(sindex) do {\ + STACK_ENSURE(1);\ + stk->type = STK_REPEAT_INC;\ + stk->u.repeat_inc.si = (sindex);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_MEM_START(mnum, s) do {\ + STACK_ENSURE(1);\ + stk->type = STK_MEM_START;\ + stk->u.mem.num = (mnum);\ + stk->u.mem.pstr = (s);\ + stk->u.mem.start = mem_start_stk[mnum];\ + stk->u.mem.end = mem_end_stk[mnum];\ + mem_start_stk[mnum] = GET_STACK_INDEX(stk);\ + mem_end_stk[mnum] = INVALID_STACK_INDEX;\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_MEM_END(mnum, s) do {\ + STACK_ENSURE(1);\ + stk->type = STK_MEM_END;\ + stk->u.mem.num = (mnum);\ + stk->u.mem.pstr = (s);\ + stk->u.mem.start = mem_start_stk[mnum];\ + stk->u.mem.end = mem_end_stk[mnum];\ + mem_end_stk[mnum] = GET_STACK_INDEX(stk);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_MEM_END_MARK(mnum) do {\ + STACK_ENSURE(1);\ + stk->type = STK_MEM_END_MARK;\ + stk->u.mem.num = (mnum);\ + STACK_INC;\ +} while(0) + +#define STACK_GET_MEM_START(mnum, k) do {\ + int level = 0;\ + k = stk;\ + while (k > stk_base) {\ + k--;\ + if ((k->type == STK_MEM_END_MARK || k->type == STK_MEM_END) \ + && k->u.mem.num == (mnum)) {\ + level++;\ + }\ + else if (k->type == STK_MEM_START && k->u.mem.num == (mnum)) {\ + if (level == 0) break;\ + level--;\ + }\ + }\ +} while (0) + +#define STACK_GET_MEM_RANGE(k, mnum, start, end) do {\ + int level = 0;\ + while (k < stk) {\ + if (k->type == 
STK_MEM_START && k->u.mem.num == (mnum)) {\ + if (level == 0) (start) = k->u.mem.pstr;\ + level++;\ + }\ + else if (k->type == STK_MEM_END && k->u.mem.num == (mnum)) {\ + level--;\ + if (level == 0) {\ + (end) = k->u.mem.pstr;\ + break;\ + }\ + }\ + k++;\ + }\ +} while (0) + +#define STACK_PUSH_NULL_CHECK_START(cnum, s) do {\ + STACK_ENSURE(1);\ + stk->type = STK_NULL_CHECK_START;\ + stk->u.null_check.num = (cnum);\ + stk->u.null_check.pstr = (s);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_NULL_CHECK_END(cnum) do {\ + STACK_ENSURE(1);\ + stk->type = STK_NULL_CHECK_END;\ + stk->u.null_check.num = (cnum);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_CALL_FRAME(pat) do {\ + STACK_ENSURE(1);\ + stk->type = STK_CALL_FRAME;\ + stk->u.call_frame.ret_addr = (pat);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_RETURN do {\ + STACK_ENSURE(1);\ + stk->type = STK_RETURN;\ + STACK_INC;\ +} while(0) + + +#ifdef ONIG_DEBUG +#define STACK_BASE_CHECK(p) \ + if ((p) < stk_base) goto stack_error; +#else +#define STACK_BASE_CHECK(p) +#endif + +#define STACK_POP_ONE do {\ + stk--;\ + STACK_BASE_CHECK(stk); \ +} while(0) + +#define STACK_POP do {\ + switch (pop_level) {\ + case STACK_POP_LEVEL_FREE:\ + while (1) {\ + stk--;\ + STACK_BASE_CHECK(stk); \ + if ((stk->type & STK_MASK_POP_USED) != 0) break;\ + }\ + break;\ + case STACK_POP_LEVEL_MEM_START:\ + while (1) {\ + stk--;\ + STACK_BASE_CHECK(stk); \ + if ((stk->type & STK_MASK_POP_USED) != 0) break;\ + else if (stk->type == STK_MEM_START) {\ + mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ + mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ + }\ + }\ + break;\ + default:\ + while (1) {\ + stk--;\ + STACK_BASE_CHECK(stk); \ + if ((stk->type & STK_MASK_POP_USED) != 0) break;\ + else if (stk->type == STK_MEM_START) {\ + mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ + mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ + }\ + else if (stk->type == STK_REPEAT_INC) {\ + STACK_AT(stk->u.repeat_inc.si)->u.repeat.count--;\ + }\ 
+ else if (stk->type == STK_MEM_END) {\ + mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ + mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ + }\ + }\ + break;\ + }\ +} while(0) + +#define STACK_POP_TIL_POS_NOT do {\ + while (1) {\ + stk--;\ + STACK_BASE_CHECK(stk); \ + if (stk->type == STK_POS_NOT) break;\ + else if (stk->type == STK_MEM_START) {\ + mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ + mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ + }\ + else if (stk->type == STK_REPEAT_INC) {\ + STACK_AT(stk->u.repeat_inc.si)->u.repeat.count--;\ + }\ + else if (stk->type == STK_MEM_END) {\ + mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ + mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ + }\ + }\ +} while(0) + +#define STACK_POP_TIL_LOOK_BEHIND_NOT do {\ + while (1) {\ + stk--;\ + STACK_BASE_CHECK(stk); \ + if (stk->type == STK_LOOK_BEHIND_NOT) break;\ + else if (stk->type == STK_MEM_START) {\ + mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ + mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ + }\ + else if (stk->type == STK_REPEAT_INC) {\ + STACK_AT(stk->u.repeat_inc.si)->u.repeat.count--;\ + }\ + else if (stk->type == STK_MEM_END) {\ + mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ + mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ + }\ + }\ +} while(0) + +#define STACK_POS_END(k) do {\ + k = stk;\ + while (1) {\ + k--;\ + STACK_BASE_CHECK(k); \ + if (IS_TO_VOID_TARGET(k)) {\ + k->type = STK_VOID;\ + }\ + else if (k->type == STK_POS) {\ + k->type = STK_VOID;\ + break;\ + }\ + }\ +} while(0) + +#define STACK_STOP_BT_END do {\ + StackType *k = stk;\ + while (1) {\ + k--;\ + STACK_BASE_CHECK(k); \ + if (IS_TO_VOID_TARGET(k)) {\ + k->type = STK_VOID;\ + }\ + else if (k->type == STK_STOP_BT) {\ + k->type = STK_VOID;\ + break;\ + }\ + }\ +} while(0) + +#define STACK_NULL_CHECK(isnull,id,s) do {\ + StackType* k = stk;\ + while (1) {\ + k--;\ + STACK_BASE_CHECK(k); \ + if (k->type == STK_NULL_CHECK_START) {\ + if (k->u.null_check.num == (id)) {\ + (isnull) = 
(k->u.null_check.pstr == (s));\ + break;\ + }\ + }\ + }\ +} while(0) + +#define STACK_NULL_CHECK_REC(isnull,id,s) do {\ + int level = 0;\ + StackType* k = stk;\ + while (1) {\ + k--;\ + STACK_BASE_CHECK(k); \ + if (k->type == STK_NULL_CHECK_START) {\ + if (k->u.null_check.num == (id)) {\ + if (level == 0) {\ + (isnull) = (k->u.null_check.pstr == (s));\ + break;\ + }\ + else level--;\ + }\ + }\ + else if (k->type == STK_NULL_CHECK_END) {\ + level++;\ + }\ + }\ +} while(0) + +#define STACK_NULL_CHECK_MEMST(isnull,id,s,reg) do {\ + StackType* k = stk;\ + while (1) {\ + k--;\ + STACK_BASE_CHECK(k); \ + if (k->type == STK_NULL_CHECK_START) {\ + if (k->u.null_check.num == (id)) {\ + if (k->u.null_check.pstr != (s)) {\ + (isnull) = 0;\ + break;\ + }\ + else {\ + UChar* endp;\ + (isnull) = 1;\ + while (k < stk) {\ + if (k->type == STK_MEM_START) {\ + if (k->u.mem.end == INVALID_STACK_INDEX) {\ + (isnull) = 0; break;\ + }\ + if (BIT_STATUS_AT(reg->bt_mem_end, k->u.mem.num))\ + endp = STACK_AT(k->u.mem.end)->u.mem.pstr;\ + else\ + endp = (UChar* )k->u.mem.end;\ + if (STACK_AT(k->u.mem.start)->u.mem.pstr != endp) {\ + (isnull) = 0; break;\ + }\ + else if (endp != s) {\ + (isnull) = -1; /* empty, but position changed */ \ + }\ + }\ + k++;\ + }\ + break;\ + }\ + }\ + }\ + }\ +} while(0) + +#define STACK_NULL_CHECK_MEMST_REC(isnull,id,s,reg) do {\ + int level = 0;\ + StackType* k = stk;\ + while (1) {\ + k--;\ + STACK_BASE_CHECK(k); \ + if (k->type == STK_NULL_CHECK_START) {\ + if (k->u.null_check.num == (id)) {\ + if (level == 0) {\ + if (k->u.null_check.pstr != (s)) {\ + (isnull) = 0;\ + break;\ + }\ + else {\ + UChar* endp;\ + (isnull) = 1;\ + while (k < stk) {\ + if (k->type == STK_MEM_START) {\ + if (k->u.mem.end == INVALID_STACK_INDEX) {\ + (isnull) = 0; break;\ + }\ + if (BIT_STATUS_AT(reg->bt_mem_end, k->u.mem.num))\ + endp = STACK_AT(k->u.mem.end)->u.mem.pstr;\ + else\ + endp = (UChar* )k->u.mem.end;\ + if (STACK_AT(k->u.mem.start)->u.mem.pstr != endp) {\ + (isnull) = 
0; break;\ + }\ + else if (endp != s) {\ + (isnull) = -1; /* empty, but position changed */ \ + }\ + }\ + k++;\ + }\ + break;\ + }\ + }\ + else {\ + level--;\ + }\ + }\ + }\ + else if (k->type == STK_NULL_CHECK_END) {\ + if (k->u.null_check.num == (id)) level++;\ + }\ + }\ +} while(0) + +#define STACK_GET_REPEAT(id, k) do {\ + int level = 0;\ + k = stk;\ + while (1) {\ + k--;\ + STACK_BASE_CHECK(k); \ + if (k->type == STK_REPEAT) {\ + if (level == 0) {\ + if (k->u.repeat.num == (id)) {\ + break;\ + }\ + }\ + }\ + else if (k->type == STK_CALL_FRAME) level--;\ + else if (k->type == STK_RETURN) level++;\ + }\ +} while (0) + +#define STACK_RETURN(addr) do {\ + int level = 0;\ + StackType* k = stk;\ + while (1) {\ + k--;\ + STACK_BASE_CHECK(k); \ + if (k->type == STK_CALL_FRAME) {\ + if (level == 0) {\ + (addr) = k->u.call_frame.ret_addr;\ + break;\ + }\ + else level--;\ + }\ + else if (k->type == STK_RETURN)\ + level++;\ + }\ +} while(0) + + +#define STRING_CMP(s1,s2,len) do {\ + while (len-- > 0) {\ + if (*s1++ != *s2++) goto fail;\ + }\ +} while(0) + +#define STRING_CMP_IC(s1,ps2,len) do {\ + if (string_cmp_ic(encode, s1, ps2, len) == 0) \ + goto fail; \ +} while(0) + +static int string_cmp_ic(OnigEncoding enc, + UChar* s1, UChar** ps2, int mblen) +{ + UChar buf1[ONIGENC_MBC_TO_LOWER_MAXLEN]; + UChar buf2[ONIGENC_MBC_TO_LOWER_MAXLEN]; + UChar *p1, *p2, *end, *s2; + int len1, len2; + + s2 = *ps2; + end = s1 + mblen; + while (s1 < end) { + len1 = ONIGENC_MBC_TO_LOWER(enc, s1, buf1); + len2 = ONIGENC_MBC_TO_LOWER(enc, s2, buf2); + if (len1 != len2) return 0; + p1 = buf1; + p2 = buf2; + while (len1-- > 0) { + if (*p1 != *p2) return 0; + p1++; + p2++; + } + + s1 += enc_len(enc, *s1); + s2 += enc_len(enc, *s2); + } + + *ps2 = s2; + return 1; +} + +#define STRING_CMP_VALUE(s1,s2,len,is_fail) do {\ + is_fail = 0;\ + while (len-- > 0) {\ + if (*s1++ != *s2++) {\ + is_fail = 1; break;\ + }\ + }\ +} while(0) + +#define STRING_CMP_VALUE_IC(s1,ps2,len,is_fail) do {\ + if 
(string_cmp_ic(encode, s1, ps2, len) == 0) \ + is_fail = 1; \ + else \ + is_fail = 0; \ +} while(0) + +#define ON_STR_BEGIN(s) ((s) == str) +#define ON_STR_END(s) ((s) == end) +#define IS_EMPTY_STR (str == end) + +#define DATA_ENSURE(n) \ + if (s + (n) > end) goto fail + +#define DATA_ENSURE_CHECK(n) (s + (n) <= end) + +#ifdef ONIG_DEBUG_STATISTICS + +#define USE_TIMEOFDAY + +#ifdef USE_TIMEOFDAY +#ifdef HAVE_SYS_TIME_H +#include +#endif +#ifdef HAVE_UNISTD_H +#include +#endif +static struct timeval ts, te; +#define GETTIME(t) gettimeofday(&(t), (struct timezone* )0) +#define TIMEDIFF(te,ts) (((te).tv_usec - (ts).tv_usec) + \ + (((te).tv_sec - (ts).tv_sec)*1000000)) +#else +#ifdef HAVE_SYS_TIMES_H +#include +#endif +static struct tms ts, te; +#define GETTIME(t) times(&(t)) +#define TIMEDIFF(te,ts) ((te).tms_utime - (ts).tms_utime) +#endif + +static int OpCounter[256]; +static int OpPrevCounter[256]; +static unsigned long OpTime[256]; +static int OpCurr = OP_FINISH; +static int OpPrevTarget = OP_FAIL; +static int MaxStackDepth = 0; + +#define STAT_OP_IN(opcode) do {\ + if (opcode == OpPrevTarget) OpPrevCounter[OpCurr]++;\ + OpCurr = opcode;\ + OpCounter[opcode]++;\ + GETTIME(ts);\ +} while (0) + +#define STAT_OP_OUT do {\ + GETTIME(te);\ + OpTime[OpCurr] += TIMEDIFF(te, ts);\ +} while (0) + +#ifdef RUBY_PLATFORM +/* + * :nodoc: + */ +static VALUE onig_stat_print() +{ + onig_print_statistics(stderr); + return Qnil; +} +#endif + +extern void onig_statistics_init() +{ + int i; + for (i = 0; i < 256; i++) { + OpCounter[i] = OpPrevCounter[i] = 0; OpTime[i] = 0; + } + MaxStackDepth = 0; + +#ifdef RUBY_PLATFORM + rb_define_global_function("onig_stat_print", onig_stat_print, 0); +#endif +} + +extern void +onig_print_statistics(FILE* f) +{ + int i; + fprintf(f, " count prev time\n"); + for (i = 0; OnigOpInfo[i].opcode >= 0; i++) { + fprintf(f, "%8d: %8d: %10ld: %s\n", + OpCounter[i], OpPrevCounter[i], OpTime[i], OnigOpInfo[i].name); + } + fprintf(f, "\nmax stack depth: 
%d\n", MaxStackDepth); +} + +#define STACK_INC do {\ + stk++;\ + if (stk - stk_base > MaxStackDepth) \ + MaxStackDepth = stk - stk_base;\ +} while (0) + +#else +#define STACK_INC stk++ + +#define STAT_OP_IN(opcode) +#define STAT_OP_OUT +#endif + +extern int +onig_is_in_code_range(UChar* p, OnigCodePoint code) +{ + OnigCodePoint n, *data; + OnigCodePoint low, high, x; + + GET_CODE_POINT(n, p); + data = (OnigCodePoint* )p; + data++; + + for (low = 0, high = n; low < high; ) { + x = (low + high) >> 1; + if (code > data[x * 2 + 1]) + low = x + 1; + else + high = x; + } + + return ((low < n && code >= data[low * 2]) ? 1 : 0); +} + + +/* matching region of POSIX API */ +typedef int regoff_t; + +typedef struct { + regoff_t rm_so; + regoff_t rm_eo; +} posix_regmatch_t; + +/* match data(str - end) from position (sstart). */ +/* if sstart == str then set sprev to NULL. */ +static int +match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart, + UChar* sprev, MatchArg* msa) +{ + static UChar FinishCode[] = { OP_FINISH }; + + int i, n, num_mem, best_len, pop_level; + LengthType tlen, tlen2; + MemNumType mem; + RelAddrType addr; + OnigOptionType option = reg->options; + OnigEncoding encode = reg->enc; + int ignore_case; + UChar *s, *q, *sbegin; + UChar *p = reg->p; + char *alloca_base; + StackType *stk_alloc, *stk_base, *stk, *stk_end; + StackType *stkp; /* used as any purpose. 
*/ + StackIndex *repeat_stk; + StackIndex *mem_start_stk, *mem_end_stk; + n = reg->num_repeat + reg->num_mem * 2; + + STACK_INIT(alloca_base, n, INIT_MATCH_STACK_SIZE); + ignore_case = IS_IGNORECASE(option); + pop_level = reg->stack_pop_level; + num_mem = reg->num_mem; + repeat_stk = (StackIndex* )alloca_base; + + mem_start_stk = (StackIndex* )(repeat_stk + reg->num_repeat); + mem_end_stk = mem_start_stk + num_mem; + mem_start_stk--; /* for index start from 1, + mem_start_stk[1]..mem_start_stk[num_mem] */ + mem_end_stk--; /* for index start from 1, + mem_end_stk[1]..mem_end_stk[num_mem] */ + for (i = 1; i <= num_mem; i++) { + mem_start_stk[i] = mem_end_stk[i] = INVALID_STACK_INDEX; + } + +#ifdef ONIG_DEBUG_MATCH + fprintf(stderr, "match_at: str: %d, end: %d, start: %d, sprev: %d\n", + (int )str, (int )end, (int )sstart, (int )sprev); + fprintf(stderr, "size: %d, start offset: %d\n", + (int )(end - str), (int )(sstart - str)); +#endif + + STACK_PUSH_ENSURED(STK_ALT, FinishCode); /* bottom stack */ + best_len = ONIG_MISMATCH; + s = sstart; + while (1) { +#ifdef ONIG_DEBUG_MATCH + { + UChar *q, *bp, buf[50]; + int len; + fprintf(stderr, "%4d> \"", (int )(s - str)); + bp = buf; + for (i = 0, q = s; i < 7 && q < end; i++) { + len = enc_len(encode, *q); + while (len-- > 0) *bp++ = *q++; + } + if (q < end) { xmemcpy(bp, "...\"", 4); bp += 4; } + else { xmemcpy(bp, "\"", 1); bp += 1; } + *bp = 0; + fputs(buf, stderr); + for (i = 0; i < 20 - (bp - buf); i++) fputc(' ', stderr); + onig_print_compiled_byte_code(stderr, p, NULL); + fprintf(stderr, "\n"); + } +#endif + + sbegin = s; + switch (*p++) { + case OP_END: STAT_OP_IN(OP_END); + n = s - sstart; + if (n > best_len) { + OnigRegion* region = msa->region; + best_len = n; + if (region) { +#ifdef USE_POSIX_REGION_OPTION + if (IS_POSIX_REGION(msa->options)) { + posix_regmatch_t* rmt = (posix_regmatch_t* )region; + + rmt[0].rm_so = sstart - str; + rmt[0].rm_eo = s - str; + for (i = 1; i <= num_mem; i++) { + if (mem_end_stk[i] 
!= INVALID_STACK_INDEX) { + if (BIT_STATUS_AT(reg->bt_mem_start, i)) + rmt[i].rm_so = STACK_AT(mem_start_stk[i])->u.mem.pstr - str; + else + rmt[i].rm_so = (UChar* )((void* )(mem_start_stk[i])) - str; + + rmt[i].rm_eo = (BIT_STATUS_AT(reg->bt_mem_end, i) + ? STACK_AT(mem_end_stk[i])->u.mem.pstr + : (UChar* )((void* )mem_end_stk[i])) - str; + } + else { + rmt[i].rm_so = rmt[i].rm_eo = ONIG_REGION_NOTPOS; + } + } + } + else { +#endif /* USE_POSIX_REGION_OPTION */ + region->beg[0] = sstart - str; + region->end[0] = s - str; + for (i = 1; i <= num_mem; i++) { + if (mem_end_stk[i] != INVALID_STACK_INDEX) { + if (BIT_STATUS_AT(reg->bt_mem_start, i)) + region->beg[i] = STACK_AT(mem_start_stk[i])->u.mem.pstr - str; + else + region->beg[i] = (UChar* )((void* )mem_start_stk[i]) - str; + + region->end[i] = (BIT_STATUS_AT(reg->bt_mem_end, i) + ? STACK_AT(mem_end_stk[i])->u.mem.pstr + : (UChar* )((void* )mem_end_stk[i])) - str; + } + else { + region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS; + } + } + + if (reg->capture_history != 0) { + UChar *pstart, *pend; + for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { + if (BIT_STATUS_AT(reg->capture_history, i) != 0) { + stkp = stk_base; + do { + STACK_GET_MEM_RANGE(stkp, i, pstart, pend); + if (stkp < stk) { + int r; + r = region_list_add_entry(region, i, + pstart - str, pend - str); + if (r) { + STACK_SAVE; + return r; + } + } + stkp++; + } while (stkp < stk); + } + } + } /* list of captures */ +#ifdef USE_POSIX_REGION_OPTION + } /* else IS_POSIX_REGION() */ +#endif + } /* if (region) */ + } /* n > best_len */ + STAT_OP_OUT; + + if (IS_FIND_CONDITION(option)) { + if (IS_FIND_NOT_EMPTY(option) && s == sstart) { + best_len = ONIG_MISMATCH; + goto fail; /* for retry */ + } + if (IS_FIND_LONGEST(option) && s < end) { + goto fail; /* for retry */ + } + } + else { + /* default behavior: return first-matching result. 
*/ + goto finish; + } + break; + + case OP_EXACT1: STAT_OP_IN(OP_EXACT1); +#if 0 + DATA_ENSURE(1); + if (*p != *s) goto fail; + p++; s++; +#endif + if (*p != *s++) goto fail; + DATA_ENSURE(0); + p++; + STAT_OP_OUT; + break; + + case OP_EXACT1_IC: STAT_OP_IN(OP_EXACT1_IC); + { + int len; + UChar *q, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + + len = ONIGENC_MBC_TO_LOWER(encode, s, lowbuf); + DATA_ENSURE(len); + q = lowbuf; + s += enc_len(encode, *s); + while (len-- > 0) { + if (*p != *q) goto fail; + p++; q++; + } + } + STAT_OP_OUT; + break; + + case OP_EXACT2: STAT_OP_IN(OP_EXACT2); + DATA_ENSURE(2); + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + sprev = s; + p++; s++; + STAT_OP_OUT; + continue; + break; + + case OP_EXACT3: STAT_OP_IN(OP_EXACT3); + DATA_ENSURE(3); + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + sprev = s; + p++; s++; + STAT_OP_OUT; + continue; + break; + + case OP_EXACT4: STAT_OP_IN(OP_EXACT4); + DATA_ENSURE(4); + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + sprev = s; + p++; s++; + STAT_OP_OUT; + continue; + break; + + case OP_EXACT5: STAT_OP_IN(OP_EXACT5); + DATA_ENSURE(5); + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + sprev = s; + p++; s++; + STAT_OP_OUT; + continue; + break; + + case OP_EXACTN: STAT_OP_IN(OP_EXACTN); + GET_LENGTH_INC(tlen, p); + DATA_ENSURE(tlen); + while (tlen-- > 0) { + if (*p++ != *s++) goto fail; + } + sprev = s - 1; + STAT_OP_OUT; + continue; + break; + + case OP_EXACTN_IC: STAT_OP_IN(OP_EXACTN_IC); + { + int len; + UChar *q, *endp, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + + GET_LENGTH_INC(tlen, p); + endp = p + tlen; + + while (p < endp) { + len = ONIGENC_MBC_TO_LOWER(encode, s, lowbuf); + DATA_ENSURE(len); + sprev 
= s; + s += enc_len(encode, *s); + q = lowbuf; + while (len-- > 0) { + if (*p != *q) goto fail; + p++; q++; + } + } + } + + STAT_OP_OUT; + continue; + break; + + case OP_EXACTMB2N1: STAT_OP_IN(OP_EXACTMB2N1); + DATA_ENSURE(2); + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + STAT_OP_OUT; + break; + + case OP_EXACTMB2N2: STAT_OP_IN(OP_EXACTMB2N2); + DATA_ENSURE(4); + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + sprev = s; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + STAT_OP_OUT; + continue; + break; + + case OP_EXACTMB2N3: STAT_OP_IN(OP_EXACTMB2N3); + DATA_ENSURE(6); + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + sprev = s; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + STAT_OP_OUT; + continue; + break; + + case OP_EXACTMB2N: STAT_OP_IN(OP_EXACTMB2N); + GET_LENGTH_INC(tlen, p); + DATA_ENSURE(tlen * 2); + while (tlen-- > 0) { + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + } + sprev = s - 2; + STAT_OP_OUT; + continue; + break; + + case OP_EXACTMB3N: STAT_OP_IN(OP_EXACTMB3N); + GET_LENGTH_INC(tlen, p); + DATA_ENSURE(tlen * 3); + while (tlen-- > 0) { + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + if (*p != *s) goto fail; + p++; s++; + } + sprev = s - 3; + STAT_OP_OUT; + continue; + break; + + case OP_EXACTMBN: STAT_OP_IN(OP_EXACTMBN); + GET_LENGTH_INC(tlen, p); /* mb-len */ + GET_LENGTH_INC(tlen2, p); /* string len */ + tlen2 *= tlen; + DATA_ENSURE(tlen2); + while (tlen2-- > 0) { + if (*p != *s) goto fail; + p++; s++; + } + sprev = s - tlen; + STAT_OP_OUT; + continue; + break; + + case OP_CCLASS: STAT_OP_IN(OP_CCLASS); + DATA_ENSURE(1); + if (BITSET_AT(((BitSetRef )p), *s) == 0) goto fail; + p += SIZE_BITSET; + s += enc_len(encode, *s); /* OP_CCLASS can match 
mb-code. \D, \S */ + STAT_OP_OUT; + break; + + case OP_CCLASS_MB: STAT_OP_IN(OP_CCLASS_MB); + if (! ONIGENC_IS_MBC_HEAD(encode, *s)) goto fail; + + cclass_mb: + GET_LENGTH_INC(tlen, p); + { + OnigCodePoint code; + UChar *ss; + int mb_len = enc_len(encode, *s); + + DATA_ENSURE(mb_len); + ss = s; + s += mb_len; + code = ONIGENC_MBC_TO_CODE(encode, ss, s); + +#ifdef PLATFORM_UNALIGNED_WORD_ACCESS + if (! onig_is_in_code_range(p, code)) goto fail; +#else + q = p; + ALIGNMENT_RIGHT(q); + if (! onig_is_in_code_range(q, code)) goto fail; +#endif + } + p += tlen; + STAT_OP_OUT; + break; + + case OP_CCLASS_MIX: STAT_OP_IN(OP_CCLASS_MIX); + DATA_ENSURE(1); + if (ONIGENC_IS_MBC_HEAD(encode, *s)) { + p += SIZE_BITSET; + goto cclass_mb; + } + else { + if (BITSET_AT(((BitSetRef )p), *s) == 0) + goto fail; + + p += SIZE_BITSET; + GET_LENGTH_INC(tlen, p); + p += tlen; + s++; + } + STAT_OP_OUT; + break; + + case OP_CCLASS_NOT: STAT_OP_IN(OP_CCLASS_NOT); + DATA_ENSURE(1); + if (BITSET_AT(((BitSetRef )p), *s) != 0) goto fail; + p += SIZE_BITSET; + s += enc_len(encode, *s); + STAT_OP_OUT; + break; + + case OP_CCLASS_MB_NOT: STAT_OP_IN(OP_CCLASS_MB_NOT); + if (! 
ONIGENC_IS_MBC_HEAD(encode, *s)) { + DATA_ENSURE(1); + s++; + GET_LENGTH_INC(tlen, p); + p += tlen; + goto cc_mb_not_success; + } + + cclass_mb_not: + GET_LENGTH_INC(tlen, p); + { + OnigCodePoint code; + UChar *ss; + int mb_len = enc_len(encode, *s); + + if (s + mb_len > end) { + s = end; + p += tlen; + goto cc_mb_not_success; + } + + ss = s; + s += mb_len; + code = ONIGENC_MBC_TO_CODE(encode, ss, s); + +#ifdef PLATFORM_UNALIGNED_WORD_ACCESS + if (onig_is_in_code_range(p, code)) goto fail; +#else + q = p; + ALIGNMENT_RIGHT(q); + if (onig_is_in_code_range(q, code)) goto fail; +#endif + } + p += tlen; + + cc_mb_not_success: + STAT_OP_OUT; + break; + + case OP_CCLASS_MIX_NOT: STAT_OP_IN(OP_CCLASS_MIX_NOT); + DATA_ENSURE(1); + if (ONIGENC_IS_MBC_HEAD(encode, *s)) { + p += SIZE_BITSET; + goto cclass_mb_not; + } + else { + if (BITSET_AT(((BitSetRef )p), *s) != 0) + goto fail; + + p += SIZE_BITSET; + GET_LENGTH_INC(tlen, p); + p += tlen; + s++; + } + STAT_OP_OUT; + break; + + case OP_ANYCHAR: STAT_OP_IN(OP_ANYCHAR); + n = enc_len(encode, *s); + if (n > 1) { + DATA_ENSURE(n); + s += n; + } + else { + DATA_ENSURE(1); + if (ONIG_IS_NEWLINE(*s)) goto fail; + s++; + } + STAT_OP_OUT; + break; + + case OP_ANYCHAR_ML: STAT_OP_IN(OP_ANYCHAR_ML); + n = enc_len(encode, *s); + DATA_ENSURE(n); + s += n; + STAT_OP_OUT; + break; + + case OP_ANYCHAR_STAR: STAT_OP_IN(OP_ANYCHAR_STAR); + while (s < end) { + STACK_PUSH_ALT(p, s, sprev); + n = enc_len(encode, *s); + if (n > 1) { + DATA_ENSURE(n); + sprev = s; + s += n; + } + else { + if (ONIG_IS_NEWLINE(*s)) goto fail; + sprev = s; + s++; + } + } + STAT_OP_OUT; + break; + + case OP_ANYCHAR_ML_STAR: STAT_OP_IN(OP_ANYCHAR_ML_STAR); + while (s < end) { + STACK_PUSH_ALT(p, s, sprev); + n = enc_len(encode, *s); + if (n > 1) { + DATA_ENSURE(n); + sprev = s; + s += n; + } + else { + sprev = s; + s++; + } + } + STAT_OP_OUT; + break; + + case OP_ANYCHAR_STAR_PEEK_NEXT: STAT_OP_IN(OP_ANYCHAR_STAR_PEEK_NEXT); + while (s < end) { + if (*p == *s) { + 
STACK_PUSH_ALT(p + 1, s, sprev); + } + n = enc_len(encode, *s); + if (n > 1) { + DATA_ENSURE(n); + sprev = s; + s += n; + } + else { + if (ONIG_IS_NEWLINE(*s)) goto fail; + sprev = s; + s++; + } + } + p++; + STAT_OP_OUT; + break; + + case OP_ANYCHAR_ML_STAR_PEEK_NEXT:STAT_OP_IN(OP_ANYCHAR_ML_STAR_PEEK_NEXT); + while (s < end) { + if (*p == *s) { + STACK_PUSH_ALT(p + 1, s, sprev); + } + n = enc_len(encode, *s); + if (n >1) { + DATA_ENSURE(n); + sprev = s; + s += n; + } + else { + sprev = s; + s++; + } + } + p++; + STAT_OP_OUT; + break; + + case OP_WORD: STAT_OP_IN(OP_WORD); + DATA_ENSURE(1); + if (! ONIGENC_IS_MBC_WORD(encode, s, end)) + goto fail; + + s += enc_len(encode, *s); + STAT_OP_OUT; + break; + + case OP_NOT_WORD: STAT_OP_IN(OP_NOT_WORD); + DATA_ENSURE(1); + if (ONIGENC_IS_MBC_WORD(encode, s, end)) + goto fail; + + s += enc_len(encode, *s); + STAT_OP_OUT; + break; + + case OP_WORD_BOUND: STAT_OP_IN(OP_WORD_BOUND); + if (ON_STR_BEGIN(s)) { + DATA_ENSURE(1); + if (! ONIGENC_IS_MBC_WORD(encode, s, end)) + goto fail; + } + else if (ON_STR_END(s)) { + if (! 
ONIGENC_IS_MBC_WORD(encode, sprev, end)) + goto fail; + } + else { + if (ONIGENC_IS_MBC_WORD(encode, s, end) + == ONIGENC_IS_MBC_WORD(encode, sprev, end)) + goto fail; + } + STAT_OP_OUT; + continue; + break; + + case OP_NOT_WORD_BOUND: STAT_OP_IN(OP_NOT_WORD_BOUND); + if (ON_STR_BEGIN(s)) { + if (DATA_ENSURE_CHECK(1) && ONIGENC_IS_MBC_WORD(encode, s, end)) + goto fail; + } + else if (ON_STR_END(s)) { + if (ONIGENC_IS_MBC_WORD(encode, sprev, end)) + goto fail; + } + else { + if (ONIGENC_IS_MBC_WORD(encode, s, end) + != ONIGENC_IS_MBC_WORD(encode, sprev, end)) + goto fail; + } + STAT_OP_OUT; + continue; + break; + +#ifdef USE_WORD_BEGIN_END + case OP_WORD_BEGIN: STAT_OP_IN(OP_WORD_BEGIN); + if (DATA_ENSURE_CHECK(1) && ONIGENC_IS_MBC_WORD(encode, s, end)) { + if (ON_STR_BEGIN(s) || !ONIGENC_IS_MBC_WORD(encode, sprev, end)) { + STAT_OP_OUT; + continue; + } + } + goto fail; + break; + + case OP_WORD_END: STAT_OP_IN(OP_WORD_END); + if (!ON_STR_BEGIN(s) && ONIGENC_IS_MBC_WORD(encode, sprev, end)) { + if (ON_STR_END(s) || !ONIGENC_IS_MBC_WORD(encode, s, end)) { + STAT_OP_OUT; + continue; + } + } + goto fail; + break; +#endif + + case OP_BEGIN_BUF: STAT_OP_IN(OP_BEGIN_BUF); + if (! ON_STR_BEGIN(s)) goto fail; + + STAT_OP_OUT; + continue; + break; + + case OP_END_BUF: STAT_OP_IN(OP_END_BUF); + if (! 
ON_STR_END(s)) goto fail; + + STAT_OP_OUT; + continue; + break; + + case OP_BEGIN_LINE: STAT_OP_IN(OP_BEGIN_LINE); + if (ON_STR_BEGIN(s)) { + if (IS_NOTBOL(msa->options)) goto fail; + STAT_OP_OUT; + continue; + } + else if (ONIG_IS_NEWLINE(*sprev) && !ON_STR_END(s)) { + STAT_OP_OUT; + continue; + } + goto fail; + break; + + case OP_END_LINE: STAT_OP_IN(OP_END_LINE); + if (ON_STR_END(s)) { +#ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE + if (IS_EMPTY_STR || !ONIG_IS_NEWLINE(*sprev)) { +#endif + if (IS_NOTEOL(msa->options)) goto fail; + STAT_OP_OUT; + continue; +#ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE + } +#endif + } + else if (ONIG_IS_NEWLINE(*s)) { + STAT_OP_OUT; + continue; + } + goto fail; + break; + + case OP_SEMI_END_BUF: STAT_OP_IN(OP_SEMI_END_BUF); + if (ON_STR_END(s)) { +#ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE + if (IS_EMPTY_STR || !ONIG_IS_NEWLINE(*sprev)) { +#endif + if (IS_NOTEOL(msa->options)) goto fail; /* Is it needed? */ + STAT_OP_OUT; + continue; +#ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE + } +#endif + } + if (ONIG_IS_NEWLINE(*s) && ON_STR_END(s+1)) { + STAT_OP_OUT; + continue; + } + goto fail; + break; + + case OP_BEGIN_POSITION: STAT_OP_IN(OP_BEGIN_POSITION); + if (s != msa->start) + goto fail; + + STAT_OP_OUT; + continue; + break; + + case OP_MEMORY_START_PUSH: STAT_OP_IN(OP_MEMORY_START_PUSH); + GET_MEMNUM_INC(mem, p); + STACK_PUSH_MEM_START(mem, s); + STAT_OP_OUT; + continue; + break; + + case OP_MEMORY_START: STAT_OP_IN(OP_MEMORY_START); + GET_MEMNUM_INC(mem, p); + mem_start_stk[mem] = (StackIndex )((void* )s); + STAT_OP_OUT; + continue; + break; + + case OP_MEMORY_END_PUSH: STAT_OP_IN(OP_MEMORY_END_PUSH); + GET_MEMNUM_INC(mem, p); + STACK_PUSH_MEM_END(mem, s); + STAT_OP_OUT; + continue; + break; + + case OP_MEMORY_END: STAT_OP_IN(OP_MEMORY_END); + GET_MEMNUM_INC(mem, p); + mem_end_stk[mem] = (StackIndex )((void* )s); + STAT_OP_OUT; + continue; + break; + +#ifdef USE_SUBEXP_CALL + case 
OP_MEMORY_END_PUSH_REC: STAT_OP_IN(OP_MEMORY_END_PUSH_REC); + GET_MEMNUM_INC(mem, p); + STACK_GET_MEM_START(mem, stkp); /* should be before push mem-end. */ + STACK_PUSH_MEM_END(mem, s); + mem_start_stk[mem] = GET_STACK_INDEX(stkp); + STAT_OP_OUT; + continue; + break; + + case OP_MEMORY_END_REC: STAT_OP_IN(OP_MEMORY_END_REC); + GET_MEMNUM_INC(mem, p); + mem_end_stk[mem] = (StackIndex )((void* )s); + STACK_GET_MEM_START(mem, stkp); + + if (BIT_STATUS_AT(reg->bt_mem_start, mem)) + mem_start_stk[mem] = GET_STACK_INDEX(stkp); + else + mem_start_stk[mem] = (StackIndex )((void* )stkp->u.mem.pstr); + + STACK_PUSH_MEM_END_MARK(mem); + STAT_OP_OUT; + continue; + break; +#endif + + case OP_BACKREF1: STAT_OP_IN(OP_BACKREF1); + mem = 1; + goto backref; + break; + + case OP_BACKREF2: STAT_OP_IN(OP_BACKREF2); + mem = 2; + goto backref; + break; + + case OP_BACKREF3: STAT_OP_IN(OP_BACKREF3); + mem = 3; + goto backref; + break; + + case OP_BACKREFN: STAT_OP_IN(OP_BACKREFN); + GET_MEMNUM_INC(mem, p); + backref: + { + int len; + UChar *pstart, *pend; + + /* if you want to remove following line, + you should check in parse and compile time. */ + if (mem > num_mem) goto fail; + if (mem_end_stk[mem] == INVALID_STACK_INDEX) goto fail; + if (mem_start_stk[mem] == INVALID_STACK_INDEX) goto fail; + + if (BIT_STATUS_AT(reg->bt_mem_start, mem)) + pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; + else + pstart = (UChar* )((void* )mem_start_stk[mem]); + + pend = (BIT_STATUS_AT(reg->bt_mem_end, mem) + ? STACK_AT(mem_end_stk[mem])->u.mem.pstr + : (UChar* )((void* )mem_end_stk[mem])); + n = pend - pstart; + DATA_ENSURE(n); + sprev = s; + STRING_CMP(pstart, s, n); + while (sprev + (len = enc_len(encode, *sprev)) < s) + sprev += len; + + STAT_OP_OUT; + continue; + } + break; + + case OP_BACKREFN_IC: STAT_OP_IN(OP_BACKREFN_IC); + GET_MEMNUM_INC(mem, p); + { + int len; + UChar *pstart, *pend; + + /* if you want to remove following line, + you should check in parse and compile time. 
*/ + if (mem > num_mem) goto fail; + if (mem_end_stk[mem] == INVALID_STACK_INDEX) goto fail; + if (mem_start_stk[mem] == INVALID_STACK_INDEX) goto fail; + + if (BIT_STATUS_AT(reg->bt_mem_start, mem)) + pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; + else + pstart = (UChar* )((void* )mem_start_stk[mem]); + + pend = (BIT_STATUS_AT(reg->bt_mem_end, mem) + ? STACK_AT(mem_end_stk[mem])->u.mem.pstr + : (UChar* )((void* )mem_end_stk[mem])); + n = pend - pstart; + DATA_ENSURE(n); + sprev = s; + STRING_CMP_IC(pstart, &s, n); + while (sprev + (len = enc_len(encode, *sprev)) < s) + sprev += len; + + STAT_OP_OUT; + continue; + } + break; + + case OP_BACKREF_MULTI: STAT_OP_IN(OP_BACKREF_MULTI); + { + int len, is_fail; + UChar *pstart, *pend, *swork; + + GET_LENGTH_INC(tlen, p); + for (i = 0; i < tlen; i++) { + GET_MEMNUM_INC(mem, p); + + if (mem_end_stk[mem] == INVALID_STACK_INDEX) continue; + if (mem_start_stk[mem] == INVALID_STACK_INDEX) continue; + + if (BIT_STATUS_AT(reg->bt_mem_start, mem)) + pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; + else + pstart = (UChar* )((void* )mem_start_stk[mem]); + + pend = (BIT_STATUS_AT(reg->bt_mem_end, mem) + ? 
STACK_AT(mem_end_stk[mem])->u.mem.pstr + : (UChar* )((void* )mem_end_stk[mem])); + n = pend - pstart; + DATA_ENSURE(n); + sprev = s; + swork = s; + STRING_CMP_VALUE(pstart, swork, n, is_fail); + if (is_fail) continue; + s = swork; + while (sprev + (len = enc_len(encode, *sprev)) < s) + sprev += len; + + p += (SIZE_MEMNUM * (tlen - i - 1)); + break; /* success */ + } + if (i == tlen) goto fail; + STAT_OP_OUT; + continue; + } + break; + + case OP_BACKREF_MULTI_IC: STAT_OP_IN(OP_BACKREF_MULTI_IC); + { + int len, is_fail; + UChar *pstart, *pend, *swork; + + GET_LENGTH_INC(tlen, p); + for (i = 0; i < tlen; i++) { + GET_MEMNUM_INC(mem, p); + + if (mem_end_stk[mem] == INVALID_STACK_INDEX) continue; + if (mem_start_stk[mem] == INVALID_STACK_INDEX) continue; + + if (BIT_STATUS_AT(reg->bt_mem_start, mem)) + pstart = STACK_AT(mem_start_stk[mem])->u.mem.pstr; + else + pstart = (UChar* )((void* )mem_start_stk[mem]); + + pend = (BIT_STATUS_AT(reg->bt_mem_end, mem) + ? STACK_AT(mem_end_stk[mem])->u.mem.pstr + : (UChar* )((void* )mem_end_stk[mem])); + n = pend - pstart; + DATA_ENSURE(n); + sprev = s; + swork = s; + STRING_CMP_VALUE_IC(pstart, &swork, n, is_fail); + if (is_fail) continue; + s = swork; + while (sprev + (len = enc_len(encode, *sprev)) < s) + sprev += len; + + p += (SIZE_MEMNUM * (tlen - i - 1)); + break; /* success */ + } + if (i == tlen) goto fail; + STAT_OP_OUT; + continue; + } + break; + + case OP_SET_OPTION_PUSH: STAT_OP_IN(OP_SET_OPTION_PUSH); + GET_OPTION_INC(option, p); + ignore_case = IS_IGNORECASE(option); + STACK_PUSH_ALT(p, s, sprev); + p += SIZE_OP_SET_OPTION + SIZE_OP_FAIL; + STAT_OP_OUT; + continue; + break; + + case OP_SET_OPTION: STAT_OP_IN(OP_SET_OPTION); + GET_OPTION_INC(option, p); + ignore_case = IS_IGNORECASE(option); + STAT_OP_OUT; + continue; + break; + + case OP_NULL_CHECK_START: STAT_OP_IN(OP_NULL_CHECK_START); + GET_MEMNUM_INC(mem, p); /* mem: null check id */ + STACK_PUSH_NULL_CHECK_START(mem, s); + STAT_OP_OUT; + continue; + break; + + 
case OP_NULL_CHECK_END: STAT_OP_IN(OP_NULL_CHECK_END); + { + int isnull; + + GET_MEMNUM_INC(mem, p); /* mem: null check id */ + STACK_NULL_CHECK(isnull, mem, s); + if (isnull) { +#ifdef ONIG_DEBUG_MATCH + fprintf(stderr, "NULL_CHECK_END: skip id:%d, s:%d\n", + (int )mem, (int )s); +#endif + null_check_found: + /* empty loop founded, skip next instruction */ + switch (*p++) { + case OP_JUMP: + case OP_PUSH: + p += SIZE_RELADDR; + break; + case OP_REPEAT_INC: + case OP_REPEAT_INC_NG: + p += SIZE_MEMNUM; + break; + default: + goto unexpected_bytecode_error; + break; + } + } + } + STAT_OP_OUT; + continue; + break; + +#ifdef USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK + case OP_NULL_CHECK_END_MEMST: STAT_OP_IN(OP_NULL_CHECK_END_MEMST); + { + int isnull; + + GET_MEMNUM_INC(mem, p); /* mem: null check id */ + STACK_NULL_CHECK_MEMST(isnull, mem, s, reg); + if (isnull) { +#ifdef ONIG_DEBUG_MATCH + fprintf(stderr, "NULL_CHECK_END_MEMST: skip id:%d, s:%d\n", + (int )mem, (int )s); +#endif + if (isnull == -1) goto fail; + goto null_check_found; + } + } + STAT_OP_OUT; + continue; + break; +#endif + +#ifdef USE_SUBEXP_CALL + case OP_NULL_CHECK_END_MEMST_PUSH: + STAT_OP_IN(OP_NULL_CHECK_END_MEMST_PUSH); + { + int isnull; + + GET_MEMNUM_INC(mem, p); /* mem: null check id */ +#ifdef USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK + STACK_NULL_CHECK_MEMST_REC(isnull, mem, s, reg); +#else + STACK_NULL_CHECK_REC(isnull, mem, s); +#endif + if (isnull) { +#ifdef ONIG_DEBUG_MATCH + fprintf(stderr, "NULL_CHECK_END_MEMST_PUSH: skip id:%d, s:%d\n", + (int )mem, (int )s); +#endif + if (isnull == -1) goto fail; + goto null_check_found; + } + else { + STACK_PUSH_NULL_CHECK_END(mem); + } + } + STAT_OP_OUT; + continue; + break; +#endif + + case OP_JUMP: STAT_OP_IN(OP_JUMP); + GET_RELADDR_INC(addr, p); + p += addr; + STAT_OP_OUT; + continue; + break; + + case OP_PUSH: STAT_OP_IN(OP_PUSH); + GET_RELADDR_INC(addr, p); + STACK_PUSH_ALT(p + addr, s, sprev); + STAT_OP_OUT; + continue; + break; + + 
case OP_POP: STAT_OP_IN(OP_POP); + STACK_POP_ONE; + STAT_OP_OUT; + continue; + break; + + case OP_PUSH_OR_JUMP_EXACT1: STAT_OP_IN(OP_PUSH_OR_JUMP_EXACT1); + GET_RELADDR_INC(addr, p); + if (*p == *s && DATA_ENSURE_CHECK(1)) { + p++; + STACK_PUSH_ALT(p + addr, s, sprev); + STAT_OP_OUT; + continue; + } + p += (addr + 1); + STAT_OP_OUT; + continue; + break; + + case OP_PUSH_IF_PEEK_NEXT: STAT_OP_IN(OP_PUSH_IF_PEEK_NEXT); + GET_RELADDR_INC(addr, p); + if (*p == *s) { + p++; + STACK_PUSH_ALT(p + addr, s, sprev); + STAT_OP_OUT; + continue; + } + p++; + STAT_OP_OUT; + continue; + break; + + case OP_REPEAT: STAT_OP_IN(OP_REPEAT); + { + GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ + GET_RELADDR_INC(addr, p); + + STACK_ENSURE(1); + repeat_stk[mem] = GET_STACK_INDEX(stk); + STACK_PUSH_REPEAT(mem, p); + + if (reg->repeat_range[mem].lower == 0) { + STACK_PUSH_ALT(p + addr, s, sprev); + } + } + STAT_OP_OUT; + continue; + break; + + case OP_REPEAT_NG: STAT_OP_IN(OP_REPEAT_NG); + { + GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ + GET_RELADDR_INC(addr, p); + + STACK_ENSURE(1); + repeat_stk[mem] = GET_STACK_INDEX(stk); + STACK_PUSH_REPEAT(mem, p); + + if (reg->repeat_range[mem].lower == 0) { + STACK_PUSH_ALT(p, s, sprev); + p += addr; + } + } + STAT_OP_OUT; + continue; + break; + + case OP_REPEAT_INC: STAT_OP_IN(OP_REPEAT_INC); + { + StackIndex si; + + GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ +#ifdef USE_SUBEXP_CALL + if (reg->num_call > 0) { + STACK_GET_REPEAT(mem, stkp); + si = GET_STACK_INDEX(stkp); + } + else { + si = repeat_stk[mem]; + stkp = STACK_AT(si); + } +#else + si = repeat_stk[mem]; + stkp = STACK_AT(si); +#endif + stkp->u.repeat.count++; + if (stkp->u.repeat.count == reg->repeat_range[mem].upper) { + /* end of repeat. Nothing to do. 
*/ + } + else if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { + STACK_PUSH_ALT(p, s, sprev); + p = stkp->u.repeat.pcode; + } + else { + p = stkp->u.repeat.pcode; + } + STACK_PUSH_REPEAT_INC(si); + } + STAT_OP_OUT; + continue; + break; + + case OP_REPEAT_INC_NG: STAT_OP_IN(OP_REPEAT_INC_NG); + { + StackIndex si; + + GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */ +#ifdef USE_SUBEXP_CALL + if (reg->num_call > 0) { + STACK_GET_REPEAT(mem, stkp); + si = GET_STACK_INDEX(stkp); + } + else { + si = repeat_stk[mem]; + stkp = STACK_AT(si); + } +#else + si = repeat_stk[mem]; + stkp = STACK_AT(si); +#endif + stkp->u.repeat.count++; + if (stkp->u.repeat.count < reg->repeat_range[mem].upper) { + if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { + UChar* pcode = stkp->u.repeat.pcode; + + STACK_PUSH_REPEAT_INC(si); + STACK_PUSH_ALT(pcode, s, sprev); + } + else { + p = stkp->u.repeat.pcode; + STACK_PUSH_REPEAT_INC(si); + } + } + else if (stkp->u.repeat.count == reg->repeat_range[mem].upper) { + STACK_PUSH_REPEAT_INC(si); + } + } + STAT_OP_OUT; + continue; + break; + + case OP_PUSH_POS: STAT_OP_IN(OP_PUSH_POS); + STACK_PUSH_POS(s, sprev); + STAT_OP_OUT; + continue; + break; + + case OP_POP_POS: STAT_OP_IN(OP_POP_POS); + { + STACK_POS_END(stkp); + s = stkp->u.state.pstr; + sprev = stkp->u.state.pstr_prev; + } + STAT_OP_OUT; + continue; + break; + + case OP_PUSH_POS_NOT: STAT_OP_IN(OP_PUSH_POS_NOT); + GET_RELADDR_INC(addr, p); + STACK_PUSH_POS_NOT(p + addr, s, sprev); + STAT_OP_OUT; + continue; + break; + + case OP_FAIL_POS: STAT_OP_IN(OP_FAIL_POS); + STACK_POP_TIL_POS_NOT; + goto fail; + break; + + case OP_PUSH_STOP_BT: STAT_OP_IN(OP_PUSH_STOP_BT); + STACK_PUSH_STOP_BT; + STAT_OP_OUT; + continue; + break; + + case OP_POP_STOP_BT: STAT_OP_IN(OP_POP_STOP_BT); + STACK_STOP_BT_END; + STAT_OP_OUT; + continue; + break; + + case OP_LOOK_BEHIND: STAT_OP_IN(OP_LOOK_BEHIND); + GET_LENGTH_INC(tlen, p); + s = ONIGENC_STEP_BACK(encode, str, s, (int )tlen); + if (IS_NULL(s)) 
goto fail; + sprev = onigenc_get_prev_char_head(encode, str, s); + STAT_OP_OUT; + continue; + break; + + case OP_PUSH_LOOK_BEHIND_NOT: STAT_OP_IN(OP_PUSH_LOOK_BEHIND_NOT); + GET_RELADDR_INC(addr, p); + GET_LENGTH_INC(tlen, p); + q = ONIGENC_STEP_BACK(encode, str, s, (int )tlen); + if (IS_NULL(q)) { + /* too short case -> success. ex. /(?p + addr; + STAT_OP_OUT; + continue; + break; + + case OP_RETURN: STAT_OP_IN(OP_RETURN); + STACK_RETURN(p); + STACK_PUSH_RETURN; + STAT_OP_OUT; + continue; + break; +#endif + + case OP_FINISH: + goto finish; + break; + + fail: + STAT_OP_OUT; + /* fall */ + case OP_FAIL: STAT_OP_IN(OP_FAIL); + STACK_POP; + p = stk->u.state.pcode; + s = stk->u.state.pstr; + sprev = stk->u.state.pstr_prev; + STAT_OP_OUT; + continue; + break; + + default: + goto bytecode_error; + + } /* end of switch */ + sprev = sbegin; + } /* end of while(1) */ + + finish: + STACK_SAVE; + return best_len; + +#ifdef ONIG_DEBUG + stack_error: + STACK_SAVE; + return ONIGERR_STACK_BUG; +#endif + + bytecode_error: + STACK_SAVE; + return ONIGERR_UNDEFINED_BYTECODE; + + unexpected_bytecode_error: + STACK_SAVE; + return ONIGERR_UNEXPECTED_BYTECODE; +} + + +static UChar* +slow_search(OnigEncoding enc, UChar* target, UChar* target_end, + UChar* text, UChar* text_end, UChar* text_range) +{ + UChar *t, *p, *s, *end; + + end = text_end - (target_end - target) + 1; + if (end > text_range) + end = text_range; + + s = text; + + while (s < end) { + if (*s == *target) { + p = s + 1; + t = target + 1; + while (t < target_end) { + if (*t != *p++) + break; + t++; + } + if (t == target_end) + return s; + } + s += enc_len(enc, *s); + } + + return (UChar* )NULL; +} + +#if 0 +static int +str_trans_match_after_head_byte(OnigEncoding enc, + int len, UChar* t, UChar* tend, UChar* p) +{ + while (--len > 0) { + if (*t != *p) break; + t++; p++; + } + + if (len == 0) { + int lowlen; + UChar *q, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + + while (t < tend) { + len = enc_len(enc, *p); + lowlen = 
ONIGENC_MBC_TO_LOWER(enc, p, lowbuf); + q = lowbuf; + while (lowlen > 0) { + if (*t++ != *q++) break; + lowlen--; + } + if (lowlen > 0) break; + p += len; + } + if (t == tend) + return 1; + } + + return 0; +} +#endif +/* Return 1 if the text at p, lower-cased character by character, equals the target bytes [t, tend); otherwise 0. */ +static int +str_lower_case_match(OnigEncoding enc, UChar* t, UChar* tend, UChar* p) +{ + int len, lowlen; + UChar *q, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + + while (t < tend) { + len = enc_len(enc, *p); + lowlen = ONIGENC_MBC_TO_LOWER(enc, p, lowbuf); + q = lowbuf; + while (lowlen > 0) { + if (t >= tend || *t++ != *q++) return 0; /* target ended inside a character: mismatch, never read past tend */ + lowlen--; + } + p += len; + } + + return 1; +} +/* Forward case-insensitive scan of text for target: each text character is lower-cased and its bytes compared with target. Returns the match start or NULL. */ +static UChar* +slow_search_ic(OnigEncoding enc, + UChar* target, UChar* target_end, + UChar* text, UChar* text_end, UChar* text_range) +{ + int len, lowlen; + UChar *t, *p, *s, *end; + UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + + end = text_end - (target_end - target) + 1; + if (end > text_range) + end = text_range; + + s = text; + + while (s < end) { + len = enc_len(enc, *s); + lowlen = ONIGENC_MBC_TO_LOWER(enc, s, lowbuf); + if (*target == *lowbuf) { + p = lowbuf + 1; + t = target + 1; + while (--lowlen > 0) { + if (t >= target_end || *p != *t) break; /* stay inside target */ + p++; t++; + } + if (lowlen == 0) { + if (str_lower_case_match(enc, t, target_end, s + len)) + return s; + } + } + + s += len; + } + + return (UChar* )NULL; +} +/* Backward exact scan: start at the lower of text_start and (text_end - target length), step back one character at a time down to text. Returns the match start or NULL. */ +static UChar* +slow_search_backward(OnigEncoding enc, UChar* target, UChar* target_end, + UChar* text, UChar* adjust_text, UChar* text_end, UChar* text_start) +{ + UChar *t, *p, *s; + + s = text_end - (target_end - target); + if (s > text_start) + s = text_start; + else + s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, adjust_text, s); + + while (s >= text) { + if (*s == *target) { + p = s + 1; + t = target + 1; + while (t < target_end) { + if (*t != *p++) + break; + t++; + } + if (t == target_end) + return s; + } + s = onigenc_get_prev_char_head(enc, adjust_text, s); + } + + return (UChar* )NULL; +} +/* Backward case-insensitive scan: same walk as slow_search_backward, comparing the lower-cased text characters with target. Returns the match start or NULL. */ +static UChar* +slow_search_backward_ic(OnigEncoding enc, + UChar* target,UChar* target_end, +
UChar* text, UChar* adjust_text, + UChar* text_end, UChar* text_start) +{ + int len, lowlen; + UChar *t, *p, *s; + UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + + s = text_end - (target_end - target); + if (s > text_start) + s = text_start; + else + s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, adjust_text, s); + + while (s >= text) { + len = enc_len(enc, *s); + lowlen = ONIGENC_MBC_TO_LOWER(enc, s, lowbuf); + if (*target == *lowbuf) { + p = lowbuf + 1; + t = target + 1; + while (--lowlen > 0) { + if (t >= target_end || *p != *t) break; /* stay inside target */ + p++; t++; + } + if (lowlen == 0) { + if (str_lower_case_match(enc, t, target_end, s + len)) + return s; + } + } + + s = onigenc_get_prev_char_head(enc, adjust_text, s); + } + + return (UChar* )NULL; +} + +static UChar* +bm_search_notrev(regex_t* reg, UChar* target, UChar* target_end, + UChar* text, UChar* text_end, UChar* text_range) +{ + UChar *s, *t, *p, *end; + UChar *tail; + int skip; + + end = text_range + (target_end - target) - 1; + if (end > text_end) + end = text_end; + + tail = target_end - 1; + s = text; + while ((s - text) < target_end - target) { + s += enc_len(reg->enc, *s); + } + s--; /* set to text check tail position.
*/ + + if (IS_NULL(reg->int_map)) { + while (s < end) { + p = s; + t = tail; + while (t >= target && *p == *t) { + p--; t--; + } + if (t < target) return p + 1; + + skip = reg->map[*s]; + p++; + t = p; + while ((p - t) < skip) { + p += enc_len(reg->enc, *p); + } + s += (p - t); + } + } + else { + while (s < end) { + p = s; + t = tail; + while (t >= target && *p == *t) { + p--; t--; + } + if (t < target) return p + 1; + + skip = reg->int_map[*s]; + p++; + t = p; + while ((p - t) < skip) { + p += enc_len(reg->enc, *p); + } + s += (p - t); + } + } + return (UChar* )NULL; +} + +static UChar* +bm_search(regex_t* reg, UChar* target, UChar* target_end, + UChar* text, UChar* text_end, UChar* text_range) +{ + UChar *s, *t, *p, *end; + UChar *tail; + + end = text_range + (target_end - target) - 1; + if (end > text_end) + end = text_end; + + tail = target_end - 1; + s = text + (target_end - target) - 1; + if (IS_NULL(reg->int_map)) { + while (s < end) { + p = s; + t = tail; + while (t >= target && *p == *t) { + p--; t--; + } + if (t < target) return p + 1; + s += reg->map[*s]; + } + } + else { /* see int_map[] */ + while (s < end) { + p = s; + t = tail; + while (t >= target && *p == *t) { + p--; t--; + } + if (t < target) return p + 1; + s += reg->int_map[*s]; + } + } + return (UChar* )NULL; +} + +static int +set_bm_backward_skip(UChar* s, UChar* end, OnigEncoding enc, + int ignore_case, int** skip) +{ + int i, len; + UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + + if (IS_NULL(*skip)) { + *skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE); + if (IS_NULL(*skip)) return ONIGERR_MEMORY; + } + + len = end - s; + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) + (*skip)[i] = len; + + if (ignore_case) { + for (i = len - 1; i > 0; i--) { + ONIGENC_MBC_TO_LOWER(enc, &(s[i]), lowbuf); + (*skip)[*lowbuf] = i; + } + } + else { + for (i = len - 1; i > 0; i--) + (*skip)[s[i]] = i; + } + return 0; +} + +static UChar* +bm_search_backward(regex_t* reg, UChar* target, UChar* target_end, 
UChar* text, + UChar* adjust_text, UChar* text_end, UChar* text_start) +{ + UChar *s, *t, *p; + + s = text_end - (target_end - target); + if (text_start < s) + s = text_start; + else + s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, adjust_text, s); + + while (s >= text) { + p = s; + t = target; + while (t < target_end && *p == *t) { + p++; t++; + } + if (t == target_end) + return s; + + s -= reg->int_map_backward[*s]; + s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, adjust_text, s); + } + + return (UChar* )NULL; +} + +static UChar* +map_search(OnigEncoding enc, UChar map[], UChar* text, UChar* text_range) +{ + UChar *s = text; + + while (s < text_range) { + if (map[*s]) return s; + + s += enc_len(enc, *s); + } + return (UChar* )NULL; +} + +static UChar* +map_search_backward(OnigEncoding enc, UChar map[], + UChar* text, UChar* adjust_text, UChar* text_start) +{ + UChar *s = text_start; + + while (s >= text) { + if (map[*s]) return s; + + s = onigenc_get_prev_char_head(enc, adjust_text, s); + } + return (UChar* )NULL; +} + +extern int +onig_match(regex_t* reg, UChar* str, UChar* end, UChar* at, OnigRegion* region, + OnigOptionType option) +{ + int r; + UChar *prev; + MatchArg msa; + + MATCH_ARG_INIT(msa, option, region, at); + + if (region +#ifdef USE_POSIX_REGION_OPTION + && !IS_POSIX_REGION(option) +#endif + ) { + r = onig_region_resize(region, reg->num_mem + 1); + } + else + r = 0; + + if (r == 0) { + prev = onigenc_get_prev_char_head(reg->enc, str, at); + r = match_at(reg, str, end, at, prev, &msa); + } + MATCH_ARG_FREE(msa); + return r; +} + +static int +forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, + UChar* range, UChar** low, UChar** high, UChar** low_prev) +{ + UChar *p, *pprev = (UChar* )NULL; + +#ifdef ONIG_DEBUG_SEARCH + fprintf(stderr, "forward_search_range: str: %d, end: %d, s: %d, range: %d\n", + (int )str, (int )end, (int )s, (int )range); +#endif + + p = s; + if (reg->dmin > 0) { + if (ONIGENC_IS_SINGLEBYTE(reg->enc)) { + p += 
reg->dmin; + } + else { + UChar *q = p + reg->dmin; + while (p < q) p += enc_len(reg->enc, *p); + } + } + + retry: + switch (reg->optimize) { + case ONIG_OPTIMIZE_EXACT: + p = slow_search(reg->enc, reg->exact, reg->exact_end, p, end, range); + break; + case ONIG_OPTIMIZE_EXACT_IC: + p = slow_search_ic(reg->enc, reg->exact, reg->exact_end, p, end, range); + break; + + case ONIG_OPTIMIZE_EXACT_BM: + p = bm_search(reg, reg->exact, reg->exact_end, p, end, range); + break; + + case ONIG_OPTIMIZE_EXACT_BM_NOT_REV: + p = bm_search_notrev(reg, reg->exact, reg->exact_end, p, end, range); + break; + + case ONIG_OPTIMIZE_MAP: + p = map_search(reg->enc, reg->map, p, range); + break; + } + + if (p && p < range) { + if (p - reg->dmin < s) { + retry_gate: + pprev = p; + p += enc_len(reg->enc, *p); + goto retry; + } + + if (reg->sub_anchor) { + UChar* prev; + + switch (reg->sub_anchor) { + case ANCHOR_BEGIN_LINE: + if (!ON_STR_BEGIN(p)) { + prev = onigenc_get_prev_char_head(reg->enc, + (pprev ? pprev : str), p); + if (!ONIG_IS_NEWLINE(*prev)) + goto retry_gate; + } + break; + + case ANCHOR_END_LINE: + if (ON_STR_END(p)) { + prev = onigenc_get_prev_char_head(reg->enc, + (pprev ? pprev : str), p); + if (prev && ONIG_IS_NEWLINE(*prev)) + goto retry_gate; + } + else if (!ONIG_IS_NEWLINE(*p)) + goto retry_gate; + break; + } + } + + if (reg->dmax == 0) { + *low = p; + if (low_prev) { + if (*low > s) + *low_prev = onigenc_get_prev_char_head(reg->enc, s, p); + else + *low_prev = onigenc_get_prev_char_head(reg->enc, + (pprev ? pprev : str), p); + } + } + else { + if (reg->dmax != ONIG_INFINITE_DISTANCE) { + *low = p - reg->dmax; + if (*low > s) { + *low = onigenc_get_right_adjust_char_head_with_prev(reg->enc, s, + *low, low_prev); + if (low_prev && IS_NULL(*low_prev)) + *low_prev = onigenc_get_prev_char_head(reg->enc, + (pprev ? pprev : s), *low); + } + else { + if (low_prev) + *low_prev = onigenc_get_prev_char_head(reg->enc, + (pprev ? 
pprev : str), *low); + } + } + } + /* no needs to adjust *high, *high is used as range check only */ + *high = p - reg->dmin; + +#ifdef ONIG_DEBUG_SEARCH + fprintf(stderr, + "forward_search_range success: low: %d, high: %d, dmin: %d, dmax: %d\n", + (int )(*low - str), (int )(*high - str), reg->dmin, reg->dmax); +#endif + return 1; /* success */ + } + + return 0; /* fail */ +} + +static int set_bm_backward_skip P_((UChar* s, UChar* end, OnigEncoding enc, + int ignore_case, int** skip)); + +#define BM_BACKWARD_SEARCH_LENGTH_THRESHOLD 100 + +static int +backward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s, + UChar* range, UChar* adjrange, UChar** low, UChar** high) +{ + int r; + UChar *p; + + range += reg->dmin; + p = s; + + retry: + switch (reg->optimize) { + case ONIG_OPTIMIZE_EXACT: + exact_method: + p = slow_search_backward(reg->enc, reg->exact, reg->exact_end, + range, adjrange, end, p); + break; + + case ONIG_OPTIMIZE_EXACT_IC: + p = slow_search_backward_ic(reg->enc, reg->exact, + reg->exact_end, range, adjrange, end, p); + break; + + case ONIG_OPTIMIZE_EXACT_BM: + case ONIG_OPTIMIZE_EXACT_BM_NOT_REV: + if (IS_NULL(reg->int_map_backward)) { + if (s - range < BM_BACKWARD_SEARCH_LENGTH_THRESHOLD) + goto exact_method; + + r = set_bm_backward_skip(reg->exact, reg->exact_end, reg->enc, 0, + &(reg->int_map_backward)); + if (r) return r; + } + p = bm_search_backward(reg, reg->exact, reg->exact_end, range, adjrange, + end, p); + break; + + case ONIG_OPTIMIZE_MAP: + p = map_search_backward(reg->enc, reg->map, range, adjrange, p); + break; + } + + if (p) { + if (reg->sub_anchor) { + UChar* prev; + + switch (reg->sub_anchor) { + case ANCHOR_BEGIN_LINE: + if (!ON_STR_BEGIN(p)) { + prev = onigenc_get_prev_char_head(reg->enc, adjrange, p); + if (!ONIG_IS_NEWLINE(*prev)) { + p = prev; + goto retry; + } + } + break; + + case ANCHOR_END_LINE: + if (ON_STR_END(p)) { + prev = onigenc_get_prev_char_head(reg->enc, adjrange, p); + if (IS_NULL(prev)) goto fail; + if 
(ONIG_IS_NEWLINE(*prev)) { + p = prev; + goto retry; + } + } + else if (!ONIG_IS_NEWLINE(*p)) { + p = onigenc_get_prev_char_head(reg->enc, adjrange, p); + if (IS_NULL(p)) goto fail; + goto retry; + } + break; + } + } + + /* no needs to adjust *high, *high is used as range check only */ + if (reg->dmax != ONIG_INFINITE_DISTANCE) { + *low = p - reg->dmax; + *high = p - reg->dmin; + *high = onigenc_get_right_adjust_char_head(reg->enc, adjrange, *high); + } + +#ifdef ONIG_DEBUG_SEARCH + fprintf(stderr, "backward_search_range: low: %d, high: %d\n", + (int )(*low - str), (int )(*high - str)); +#endif + return 1; /* success */ + } + + fail: +#ifdef ONIG_DEBUG_SEARCH + fprintf(stderr, "backward_search_range: fail.\n"); +#endif + return 0; /* fail */ +} + + +extern int +onig_search(regex_t* reg, UChar* str, UChar* end, + UChar* start, UChar* range, OnigRegion* region, OnigOptionType option) +{ + int r; + UChar *s, *prev; + MatchArg msa; + + if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) { + reg->state++; /* increment as search counter */ + if (IS_NOT_NULL(reg->chain)) { + onig_chain_reduce(reg); + reg->state++; + } + } + else { + int n = 0; + while (ONIG_STATE(reg) < ONIG_STATE_NORMAL) { + if (++n > THREAD_PASS_LIMIT_COUNT) + return ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT; + THREAD_PASS; + } + reg->state++; /* increment as search counter */ + } + +#ifdef ONIG_DEBUG_SEARCH + fprintf(stderr, "onig_search (entry point): str: %d, end: %d, start: %d, range: %d\n", + (int )str, (int )(end - str), (int )(start - str), (int )(range - str)); +#endif + + if (region +#ifdef USE_POSIX_REGION_OPTION + && !IS_POSIX_REGION(option) +#endif + ) { + r = onig_region_resize(region, reg->num_mem + 1); + if (r) goto finish_no_msa; + } + + if (start > end || start < str) goto mismatch_no_msa; + +#define MATCH_AND_RETURN_CHECK \ + r = match_at(reg, str, end, s, prev, &msa);\ + if (r != ONIG_MISMATCH) {\ + if (r >= 0) goto match;\ + goto finish; /* error */ \ + } + + /* anchor optimize: resume search range 
*/ + if (reg->anchor != 0 && str < end) { + UChar* semi_end; + + if (reg->anchor & ANCHOR_BEGIN_POSITION) { + /* search start-position only */ + begin_position: + if (range > start) + range = start + 1; + else + range = start; + } + else if (reg->anchor & ANCHOR_BEGIN_BUF) { + /* search str-position only */ + if (range > start) { + if (start != str) goto mismatch_no_msa; + range = str + 1; + } + else { + if (range <= str) { + start = str; + range = str; + } + else + goto mismatch_no_msa; + } + } + else if (reg->anchor & ANCHOR_END_BUF) { + semi_end = end; + + end_buf: + if ((OnigDistance )(semi_end - str) < reg->anchor_dmin) + goto mismatch_no_msa; + + if (range > start) { + if ((OnigDistance )(semi_end - start) > reg->anchor_dmax) { + start = semi_end - reg->anchor_dmax; + if (start < end) + start = onigenc_get_right_adjust_char_head(reg->enc, str, start); + else { /* match with empty at end */ + start = onigenc_get_prev_char_head(reg->enc, str, end); + } + } + if ((OnigDistance )(semi_end - (range - 1)) < reg->anchor_dmin) { + range = semi_end - reg->anchor_dmin + 1; + } + + if (start >= range) goto mismatch_no_msa; + } + else { + if ((OnigDistance )(semi_end - range) > reg->anchor_dmax) { + range = semi_end - reg->anchor_dmax; + } + if ((OnigDistance )(semi_end - start) < reg->anchor_dmin) { + start = semi_end - reg->anchor_dmin; + start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, start); + if (range > start) goto mismatch_no_msa; + } + } + } + else if (reg->anchor & ANCHOR_SEMI_END_BUF) { + if (ONIG_IS_NEWLINE(end[-1])) { + semi_end = end - 1; + if (semi_end > str && start <= semi_end) { + goto end_buf; + } + } + else { + semi_end = end; + goto end_buf; + } + } + else if ((reg->anchor & ANCHOR_ANYCHAR_STAR_PL)) { + goto begin_position; + } + } + else if (str == end) { /* empty string */ + static UChar* address_for_empty_string = ""; + +#ifdef ONIG_DEBUG_SEARCH + fprintf(stderr, "onig_search: empty string.\n"); +#endif + + if (reg->threshold_len == 0) { + s = 
start = end = str = address_for_empty_string; + prev = (UChar* )NULL; + + MATCH_ARG_INIT(msa, option, region, start); + MATCH_AND_RETURN_CHECK; + goto mismatch; + } + goto mismatch_no_msa; + } + +#ifdef ONIG_DEBUG_SEARCH + fprintf(stderr, "onig_search(apply anchor): end: %d, start: %d, range: %d\n", + (int )(end - str), (int )(start - str), (int )(range - str)); +#endif + + MATCH_ARG_INIT(msa, option, region, start); + + s = start; + if (range > start) { /* forward search */ + if (s > str) + prev = onigenc_get_prev_char_head(reg->enc, str, s); + else + prev = (UChar* )NULL; + + if (reg->optimize != ONIG_OPTIMIZE_NONE) { + UChar *sch_range, *low, *high, *low_prev; + + sch_range = range; + if (reg->dmax != 0) { + if (reg->dmax == ONIG_INFINITE_DISTANCE) + sch_range = end; + else { + sch_range += reg->dmax; + if (sch_range > end) sch_range = end; + } + } + if (reg->dmax != ONIG_INFINITE_DISTANCE && + (end - start) >= reg->threshold_len) { + do { + if (! forward_search_range(reg, str, end, s, sch_range, + &low, &high, &low_prev)) goto mismatch; + if (s < low) { + s = low; + prev = low_prev; + } + while (s <= high) { + MATCH_AND_RETURN_CHECK; + prev = s; + s += enc_len(reg->enc, *s); + } + if ((reg->anchor & ANCHOR_ANYCHAR_STAR) != 0) { + if (IS_NOT_NULL(prev)) { + while (!ONIG_IS_NEWLINE(*prev) && s < range) { + prev = s; + s += enc_len(reg->enc, *s); + } + } + } + } while (s < range); + goto mismatch; + } + else { /* check only. */ + if ((end - start) < reg->threshold_len || + ! forward_search_range(reg, str, end, s, sch_range, + &low, &high, (UChar** )NULL)) goto mismatch; + } + } + + do { + MATCH_AND_RETURN_CHECK; + prev = s; + s += enc_len(reg->enc, *s); + } while (s <= range); /* exec s == range, because empty match with /$/. 
*/ + } + else { /* backward search */ + if (reg->optimize != ONIG_OPTIMIZE_NONE) { + UChar *low, *high, *adjrange, *sch_start; + + adjrange = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, range); + if (reg->dmax != ONIG_INFINITE_DISTANCE && + (end - range) >= reg->threshold_len) { + do { + sch_start = s + reg->dmax; + if (sch_start > end) sch_start = end; + if (backward_search_range(reg, str, end, sch_start, range, adjrange, + &low, &high) <= 0) + goto mismatch; + + if (s > high) + s = high; + + while (s >= low) { + prev = onigenc_get_prev_char_head(reg->enc, str, s); + MATCH_AND_RETURN_CHECK; + s = prev; + } + } while (s >= range); + goto mismatch; + } + else { /* check only. */ + if ((end - range) < reg->threshold_len) goto mismatch; + + sch_start = s; + if (reg->dmax != 0) { + if (reg->dmax == ONIG_INFINITE_DISTANCE) + sch_start = end; + else { + sch_start += reg->dmax; + if (sch_start > end) sch_start = end; + else + sch_start = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, + start, sch_start); + } + } + if (backward_search_range(reg, str, end, sch_start, range, adjrange, + &low, &high) <= 0) goto mismatch; + } + } + + do { + prev = onigenc_get_prev_char_head(reg->enc, str, s); + MATCH_AND_RETURN_CHECK; + s = prev; + } while (s >= range); + } + + mismatch: + r = ONIG_MISMATCH; + + finish: + MATCH_ARG_FREE(msa); + reg->state--; /* decrement as search counter */ + + /* If result is mismatch and no FIND_NOT_EMPTY option, + then the region is not setted in match_at(). 
*/ + if (IS_FIND_NOT_EMPTY(reg->options) && region +#ifdef USE_POSIX_REGION_OPTION + && !IS_POSIX_REGION(option) +#endif + ) { + onig_region_clear(region); + } + +#ifdef ONIG_DEBUG + if (r != ONIG_MISMATCH) + fprintf(stderr, "onig_search: error %d\n", r); +#endif + return r; + + mismatch_no_msa: + r = ONIG_MISMATCH; + finish_no_msa: + reg->state--; /* decrement as search counter */ +#ifdef ONIG_DEBUG + if (r != ONIG_MISMATCH) + fprintf(stderr, "onig_search: error %d\n", r); +#endif + return r; + + match: + reg->state--; /* decrement as search counter */ + MATCH_ARG_FREE(msa); + return s - str; +} + +extern OnigEncoding +onig_get_encoding(regex_t* reg) +{ + return reg->enc; +} + +extern OnigOptionType +onig_get_options(regex_t* reg) +{ + return reg->options; +} + +extern OnigSyntaxType* +onig_get_syntax(regex_t* reg) +{ + return reg->syntax; +} + +extern const char* +onig_version(void) +{ +#define MSTR(a) # a + + return (MSTR(ONIGURUMA_VERSION_MAJOR) "." + MSTR(ONIGURUMA_VERSION_MINOR) "." + MSTR(ONIGURUMA_VERSION_TEENY)); +} diff --git a/reggnu.c b/reggnu.c new file mode 100644 index 0000000000..9c6a2161c2 --- /dev/null +++ b/reggnu.c @@ -0,0 +1,256 @@ +/********************************************************************** + + reggnu.c - Oniguruma (regular expression library) + + Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#include "regint.h" + +#ifndef ONIGGNU_H /* name changes from oniggnu.h to regex.h in ruby. 
*/ +#include "oniggnu.h" +#endif + +#if defined(RUBY_PLATFORM) || defined(RUBY) +#ifndef ONIG_RUBY_M17N +#define USE_COMPATIBILITY_FOR_RUBY_EXTENSION_LIBRARY +#endif +#endif + +#ifndef NULL +#define NULL ((void* )0) +#endif +/* Release the contents of region r through onig_region_free(); r itself is not freed. */ +extern void +re_free_registers(OnigRegion* r) +{ + /* 0: don't free self */ + onig_region_free(r, 0); +} +/* Move startpos onto a character head: to the right when range > 0, otherwise to the left. Returned unchanged when the encoding's max character length is 1 or startpos is not inside (0, size). */ +extern int +re_adjust_startpos(regex_t* reg, const char* string, int size, + int startpos, int range) +{ + if (startpos > 0 && ONIGENC_MBC_MAXLEN(reg->enc) != 1 && startpos < size) { + UChar *p; + UChar *s = (UChar* )string + startpos; + + if (range > 0) { + p = onigenc_get_right_adjust_char_head(reg->enc, (UChar* )string, s); + } + else { + p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, (UChar* )string, s); + } + return p - (UChar* )string; + } + + return startpos; +} +/* Wrapper: call onig_match() on str[0..size) at offset pos with ONIG_OPTION_NONE and return its result. */ +extern int +re_match(regex_t* reg, const char* str, int size, int pos, + struct re_registers* regs) +{ + return onig_match(reg, (UChar* )str, (UChar* )(str + size), + (UChar* )(str + pos), regs, ONIG_OPTION_NONE); +} +/* Wrapper: call onig_search() on string[0..size), starting at startpos with the search range ending at startpos + range, and return its result. */ +extern int +re_search(regex_t* bufp, const char* string, int size, int startpos, int range, + struct re_registers* regs) +{ + return onig_search(bufp, (UChar* )string, (UChar* )(string + size), + (UChar* )(string + startpos), + (UChar* )(string + startpos + range), + regs, ONIG_OPTION_NONE); +} +/* Compile pattern[0..size) into reg with onig_compile(). On a non-zero result the error text is written to ebuf when ebuf is not NULL. Returns onig_compile()'s result. */ +extern int +re_compile_pattern(const char* pattern, int size, regex_t* reg, char* ebuf) +{ + int r; + OnigErrorInfo einfo; + + r = onig_compile(reg, (UChar* )pattern, (UChar* )(pattern + size), &einfo); + if (r != 0) { + if (IS_NOT_NULL(ebuf)) + (void )onig_error_code_to_str((UChar* )ebuf, r, &einfo); + } + + return r; +} +/* Recompile reg from pattern[0..size) with onig_recompile(), reusing reg->options and taking the default encoding and syntax. Error text goes to ebuf when it is not NULL. */ +extern int +re_recompile_pattern(const char* pattern, int size, regex_t* reg, char* ebuf) +{ + int r; + OnigErrorInfo einfo; + OnigEncoding enc; + + /* I think encoding and options should be arguments of this function. + But this is adapted to present re.c.
(2002/11/29) + */ + enc = OnigEncDefaultCharEncoding; + + r = onig_recompile(reg, (UChar* )pattern, (UChar* )(pattern + size), + reg->options, enc, OnigDefaultSyntax, &einfo); + if (r != 0) { + if (IS_NOT_NULL(ebuf)) + (void )onig_error_code_to_str((UChar* )ebuf, r, &einfo); + } + return r; +} +/* Release reg through onig_free(). */ +extern void +re_free_pattern(regex_t* reg) +{ + onig_free(reg); +} +/* Wrapper over onig_alloc_init() using ONIG_OPTION_DEFAULT, the default encoding and the default syntax; returns its result. */ +extern int +re_alloc_pattern(regex_t** reg) +{ + return onig_alloc_init(reg, ONIG_OPTION_DEFAULT, OnigEncDefaultCharEncoding, + OnigDefaultSyntax); +} +/* Pass table to onigenc_set_default_caseconv_table(). */ +extern void +re_set_casetable(const char* table) +{ + onigenc_set_default_caseconv_table((UChar* )table); +} +/* 256-entry per-byte tables selected through re_mbctab by re_mbcinit(). NOTE(review): values look like the number of bytes that follow each lead byte -- confirm against the users of re_mbctab. */ +#ifdef USE_COMPATIBILITY_FOR_RUBY_EXTENSION_LIBRARY +static const unsigned char mbctab_ascii[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static const unsigned char mbctab_euc[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, +}; + +static const unsigned char mbctab_sjis[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 +}; + +static const unsigned char mbctab_utf8[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 0, 0, +}; + +const unsigned char *re_mbctab = mbctab_ascii; +#endif + +extern void +#ifdef ONIG_RUBY_M17N +re_mbcinit(OnigEncoding enc) +#else +re_mbcinit(int mb_code) +#endif +{ +#ifdef ONIG_RUBY_M17N + + onigenc_set_default_encoding(enc); + +#else + + OnigEncoding enc; + + switch (mb_code) { + case MBCTYPE_ASCII: + enc = ONIG_ENCODING_ASCII; + break; + case MBCTYPE_EUC: + enc = ONIG_ENCODING_EUC_JP; + break; + case MBCTYPE_SJIS: + enc = ONIG_ENCODING_SJIS; + break; + case MBCTYPE_UTF8: + enc = ONIG_ENCODING_UTF8; + break; + default: + return ; + break; + } + + onigenc_set_default_encoding(enc); +#endif + +#ifdef USE_COMPATIBILITY_FOR_RUBY_EXTENSION_LIBRARY + switch (mb_code) { + case MBCTYPE_ASCII: + re_mbctab = mbctab_ascii; + break; + case MBCTYPE_EUC: + re_mbctab = mbctab_euc; + break; + case MBCTYPE_SJIS: + re_mbctab = mbctab_sjis; + break; + case MBCTYPE_UTF8: + re_mbctab = mbctab_utf8; + break; + } +#endif +} diff --git a/regint.h b/regint.h new file mode 100644 index 0000000000..35736b6dcb --- /dev/null +++ b/regint.h @@ -0,0 +1,685 @@ +/********************************************************************** + + regint.h - Oniguruma (regular expression library) + + Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#ifndef REGINT_H +#define REGINT_H + +/* for debug */ +/* #define ONIG_DEBUG_PARSE_TREE */ +/* #define ONIG_DEBUG_COMPILE */ +/* #define ONIG_DEBUG_SEARCH */ +/* #define ONIG_DEBUG_MATCH */ +/* #define ONIG_DONT_OPTIMIZE */ + +/* for byte-code statistical data. 
*/ +/* #define ONIG_DEBUG_STATISTICS */ + +#if defined(ONIG_DEBUG_PARSE_TREE) || defined(ONIG_DEBUG_MATCH) || \ + defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_STATISTICS) +#ifndef ONIG_DEBUG +#define ONIG_DEBUG +#endif +#endif + +#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ + (defined(__ppc__) && defined(__APPLE__)) || \ + defined(__x86_64) || defined(__x86_64__) || \ + defined(__mc68020__) +#define PLATFORM_UNALIGNED_WORD_ACCESS +#endif + +/* config */ +/* spec. config */ +#define USE_NAMED_GROUP +#define USE_SUBEXP_CALL +#define USE_FOLD_MATCH /* ess-tsett etc... */ +#define USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK /* /(?:()|())*\2/ */ +#define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */ +#define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR +/* internal config */ +#define USE_RECYCLE_NODE +#define USE_OP_PUSH_OR_JUMP_EXACT +#define USE_QUALIFIER_PEEK_NEXT + +#define INIT_MATCH_STACK_SIZE 160 +#define MATCH_STACK_LIMIT_SIZE 500000 + +/* interface to external system */ +#ifdef NOT_RUBY /* gived from Makefile */ +#include "config.h" +#define USE_VARIABLE_META_CHARS +#define USE_VARIABLE_SYNTAX +#define USE_WORD_BEGIN_END /* "\<": word-begin, "\>": word-end */ +#define USE_POSIX_REGION_OPTION /* needed for POSIX API support */ +#define THREAD_ATOMIC_START /* depend on thread system */ +#define THREAD_ATOMIC_END /* depend on thread system */ +#define THREAD_PASS /* depend on thread system */ +#define xmalloc malloc +#define xrealloc realloc +#define xfree free +#else +#include "ruby.h" +#include "version.h" +#include "rubysig.h" /* for DEFER_INTS, ENABLE_INTS */ +#define THREAD_ATOMIC_START DEFER_INTS +#define THREAD_ATOMIC_END ENABLE_INTS +#define THREAD_PASS rb_thread_schedule() +#define DEFAULT_WARN_FUNCTION rb_warn +#define DEFAULT_VERB_WARN_FUNCTION rb_warning + +#if defined(RUBY_VERSION_MAJOR) +#if RUBY_VERSION_MAJOR > 1 || \ +(RUBY_VERSION_MAJOR == 1 && \ + defined(RUBY_VERSION_MINOR) && RUBY_VERSION_MINOR >= 
8) +#define USE_ST_HASH_TABLE +#endif +#endif + +#endif /* else NOT_RUBY */ + +#define THREAD_PASS_LIMIT_COUNT 10 +#define xmemset memset +#define xmemcpy memcpy +#define xmemmove memmove +#if defined(_WIN32) && !defined(__CYGWIN__) +#define xalloca _alloca +#ifdef NOT_RUBY +#define vsnprintf _vsnprintf +#endif +#else +#define xalloca alloca +#endif + +#ifdef HAVE_STDLIB_H +#include +#endif + +#if defined(HAVE_ALLOCA_H) && !defined(__GNUC__) +#include +#endif + +#ifdef HAVE_STRING_H +# include +#else +# include +#endif + +#include +#include + +#ifdef ONIG_DEBUG +# include +#endif + +#include "regenc.h" +#include "oniguruma.h" + +#ifdef MIN +#undef MIN +#endif +#ifdef MAX +#undef MAX +#endif +#define MIN(a,b) (((a)>(b))?(b):(a)) +#define MAX(a,b) (((a)<(b))?(b):(a)) + +#define IS_NULL(p) (((void*)(p)) == (void*)0) +#define IS_NOT_NULL(p) (((void*)(p)) != (void*)0) +#define CHECK_NULL_RETURN(p) if (IS_NULL(p)) return NULL +#define CHECK_NULL_RETURN_VAL(p,val) if (IS_NULL(p)) return (val) +#define NULL_UCHARP ((UChar* )0) + +#ifndef PLATFORM_UNALIGNED_WORD_ACCESS +#define WORD_ALIGNMENT_SIZE SIZEOF_INT + +#define GET_ALIGNMENT_PAD_SIZE(addr,pad_size) do {\ + (pad_size) = WORD_ALIGNMENT_SIZE \ + - ((unsigned int )(addr) % WORD_ALIGNMENT_SIZE);\ + if ((pad_size) == WORD_ALIGNMENT_SIZE) (pad_size) = 0;\ +} while (0) + +#define ALIGNMENT_RIGHT(addr) do {\ + (addr) += (WORD_ALIGNMENT_SIZE - 1);\ + (addr) -= ((unsigned int )(addr) % WORD_ALIGNMENT_SIZE);\ +} while (0) + + +#define B_SHIFT 8 +#define B_MASK 0xff + +#define SERIALIZE_2BYTE_INT(i,p) do {\ + *(p) = ((i) >> B_SHIFT) & B_MASK;\ + *((p)+1) = (i) & B_MASK;\ +} while (0) + +#define SERIALIZE_4BYTE_INT(i,p) do {\ + *(p) = ((i) >> B_SHIFT*3) & B_MASK;\ + *((p)+1) = ((i) >> B_SHIFT*2) & B_MASK;\ + *((p)+2) = ((i) >> B_SHIFT ) & B_MASK;\ + *((p)+3) = (i) & B_MASK;\ +} while (0) + +#define SERIALIZE_8BYTE_INT(i,p) do {\ + *(p) = ((i) >> B_SHIFT*7) & B_MASK;\ + *((p)+1) = ((i) >> B_SHIFT*6) & B_MASK;\ + *((p)+2) = ((i) >> 
B_SHIFT*5) & B_MASK;\ + *((p)+3) = ((i) >> B_SHIFT*4) & B_MASK;\ + *((p)+4) = ((i) >> B_SHIFT*3) & B_MASK;\ + *((p)+5) = ((i) >> B_SHIFT*2) & B_MASK;\ + *((p)+6) = ((i) >> B_SHIFT ) & B_MASK;\ + *((p)+7) = (i) & B_MASK;\ +} while (0) + +#define GET_2BYTE_INT_INC(type,i,p) do {\ + (i) = (type )(((unsigned int )(*(p)) << B_SHIFT) | (unsigned int )((p)[1]));\ + (p) += 2;\ +} while (0) + +#define GET_4BYTE_INT_INC(type,i,p) do {\ + (i) = (type )(((unsigned int )((p)[0]) << B_SHIFT*3) | \ + ((unsigned int )((p)[1]) << B_SHIFT*2) | \ + ((unsigned int )((p)[2]) << B_SHIFT ) | \ + ((unsigned int )((p)[3]) )); \ + (p) += 4;\ +} while (0) + +#define GET_8BYTE_INT_INC(type,i,p) do {\ + (i) = (type )(((unsigned long )((p)[0]) << B_SHIFT*7) | \ + ((unsigned long )((p)[1]) << B_SHIFT*6) | \ + ((unsigned long )((p)[2]) << B_SHIFT*5) | \ + ((unsigned long )((p)[3]) << B_SHIFT*4) | \ + ((unsigned long )((p)[4]) << B_SHIFT*3) | \ + ((unsigned long )((p)[5]) << B_SHIFT*2) | \ + ((unsigned long )((p)[6]) << B_SHIFT ) | \ + ((unsigned long )((p)[7]) )); \ + (p) += 8;\ +} while (0) + +#if SIZEOF_SHORT == 2 +#define GET_SHORT_INC(i,p) GET_2BYTE_INT_INC(short,i,p) +#define SERIALIZE_SHORT(i,p) SERIALIZE_2BYTE_INT(i,p) +#elif SIZEOF_SHORT == 4 +#define GET_SHORT_INC(i,p) GET_4BYTE_INT_INC(short,i,p) +#define SERIALIZE_SHORT(i,p) SERIALIZE_4BYTE_INT(i,p) +#elif SIZEOF_SHORT == 8 +#define GET_SHORT_INC(i,p) GET_8BYTE_INT_INC(short,i,p) +#define SERIALIZE_SHORT(i,p) SERIALIZE_8BYTE_INT(i,p) +#endif + +#if SIZEOF_INT == 2 +#define GET_INT_INC(i,p) GET_2BYTE_INT_INC(int,i,p) +#define GET_UINT_INC(i,p) GET_2BYTE_INT_INC(unsigned,i,p) +#define SERIALIZE_INT(i,p) SERIALIZE_2BYTE_INT(i,p) +#define SERIALIZE_UINT(i,p) SERIALIZE_2BYTE_INT(i,p) +#elif SIZEOF_INT == 4 +#define GET_INT_INC(i,p) GET_4BYTE_INT_INC(int,i,p) +#define GET_UINT_INC(i,p) GET_4BYTE_INT_INC(unsigned,i,p) +#define SERIALIZE_INT(i,p) SERIALIZE_4BYTE_INT(i,p) +#define SERIALIZE_UINT(i,p) SERIALIZE_4BYTE_INT(i,p) +#elif SIZEOF_INT 
== 8 +#define GET_INT_INC(i,p) GET_8BYTE_INT_INC(int,i,p) +#define GET_UINT_INC(i,p) GET_8BYTE_INT_INC(unsigned,i,p) +#define SERIALIZE_INT(i,p) SERIALIZE_8BYTE_INT(i,p) +#define SERIALIZE_UINT(i,p) SERIALIZE_8BYTE_INT(i,p) +#endif + +#endif /* PLATFORM_UNALIGNED_WORD_ACCESS */ + +/* stack pop level */ +#define STACK_POP_LEVEL_FREE 0 +#define STACK_POP_LEVEL_MEM_START 1 +#define STACK_POP_LEVEL_ALL 2 + +/* optimize flags */ +#define ONIG_OPTIMIZE_NONE 0 +#define ONIG_OPTIMIZE_EXACT 1 /* Slow Search */ +#define ONIG_OPTIMIZE_EXACT_BM 2 /* Boyer Moore Search */ +#define ONIG_OPTIMIZE_EXACT_BM_NOT_REV 3 /* BM (but not simple match) */ +#define ONIG_OPTIMIZE_EXACT_IC 4 /* Slow Search (ignore case) */ +#define ONIG_OPTIMIZE_MAP 5 /* char map */ + +/* bit status */ +typedef unsigned int BitStatusType; + +#define BIT_STATUS_BITS_NUM (sizeof(BitStatusType) * 8) +#define BIT_STATUS_CLEAR(stats) (stats) = 0 +#define BIT_STATUS_ON_ALL(stats) (stats) = ~((BitStatusType )0) +#define BIT_STATUS_AT(stats,n) \ + ((n) < BIT_STATUS_BITS_NUM ? ((stats) & (1 << n)) : ((stats) & 1)) + +#define BIT_STATUS_ON_AT(stats,n) do {\ + if ((n) < BIT_STATUS_BITS_NUM)\ + (stats) |= (1 << (n));\ + else\ + (stats) |= 1;\ +} while (0) + +#define BIT_STATUS_ON_AT_SIMPLE(stats,n) do {\ + if ((n) < BIT_STATUS_BITS_NUM)\ + (stats) |= (1 << (n));\ +} while (0) + + +#define INT_MAX_LIMIT ((1UL << (SIZEOF_INT * 8 - 1)) - 1) + +#define DIGITVAL(code) ((code) - '0') +#define ODIGITVAL(code) DIGITVAL(code) +#define XDIGITVAL(enc,code) \ + (ONIGENC_IS_CODE_DIGIT(enc,code) ? DIGITVAL(code) \ + : (ONIGENC_IS_CODE_UPPER(enc,code) ? 
(code) - 'A' + 10 : (code) - 'a' + 10)) + +#define IS_SINGLELINE(option) ((option) & ONIG_OPTION_SINGLELINE) +#define IS_MULTILINE(option) ((option) & ONIG_OPTION_MULTILINE) +#define IS_IGNORECASE(option) ((option) & ONIG_OPTION_IGNORECASE) +#define IS_EXTEND(option) ((option) & ONIG_OPTION_EXTEND) +#define IS_FIND_LONGEST(option) ((option) & ONIG_OPTION_FIND_LONGEST) +#define IS_FIND_NOT_EMPTY(option) ((option) & ONIG_OPTION_FIND_NOT_EMPTY) +#define IS_POSIXLINE(option) (IS_SINGLELINE(option) && IS_MULTILINE(option)) +#define IS_FIND_CONDITION(option) ((option) & \ + (ONIG_OPTION_FIND_LONGEST | ONIG_OPTION_FIND_NOT_EMPTY)) +#define IS_NOTBOL(option) ((option) & ONIG_OPTION_NOTBOL) +#define IS_NOTEOL(option) ((option) & ONIG_OPTION_NOTEOL) +#define IS_POSIX_REGION(option) ((option) & ONIG_OPTION_POSIX_REGION) + +/* OP_SET_OPTION is required for these options. +#define IS_DYNAMIC_OPTION(option) \ + (((option) & (ONIG_OPTION_MULTILINE | ONIG_OPTION_IGNORECASE)) != 0) +*/ +/* ignore-case and multibyte status are included in compiled code. 
*/ +#define IS_DYNAMIC_OPTION(option) 0 + + +/* bitset */ +#define BITS_PER_BYTE 8 +#define SINGLE_BYTE_SIZE (1 << BITS_PER_BYTE) +#define BITS_IN_ROOM (sizeof(Bits) * BITS_PER_BYTE) +#define BITSET_SIZE (SINGLE_BYTE_SIZE / BITS_IN_ROOM) + +#ifdef PLATFORM_UNALIGNED_WORD_ACCESS +typedef unsigned int Bits; +#else +typedef unsigned char Bits; +#endif +typedef Bits BitSet[BITSET_SIZE]; +typedef Bits* BitSetRef; + +#define SIZE_BITSET sizeof(BitSet) + +#define BITSET_CLEAR(bs) do {\ + int i;\ + for (i = 0; i < BITSET_SIZE; i++) { (bs)[i] = 0; }\ +} while (0) + +#define BS_ROOM(bs,pos) (bs)[pos / BITS_IN_ROOM] +#define BS_BIT(pos) (1 << (pos % BITS_IN_ROOM)) + +#define BITSET_AT(bs, pos) (BS_ROOM(bs,pos) & BS_BIT(pos)) +#define BITSET_SET_BIT(bs, pos) BS_ROOM(bs,pos) |= BS_BIT(pos) +#define BITSET_CLEAR_BIT(bs, pos) BS_ROOM(bs,pos) &= ~(BS_BIT(pos)) +#define BITSET_INVERT_BIT(bs, pos) BS_ROOM(bs,pos) ^= BS_BIT(pos) + +/* bytes buffer */ +typedef struct _BBuf { + UChar* p; + unsigned int used; + unsigned int alloc; +} BBuf; + +#define BBUF_INIT(buf,size) onig_bbuf_init((BBuf* )(buf), (size)) + +#define BBUF_SIZE_INC(buf,inc) do{\ + (buf)->alloc += (inc);\ + (buf)->p = (UChar* )xrealloc((buf)->p, (buf)->alloc);\ + if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\ +} while (0) + +#define BBUF_EXPAND(buf,low) do{\ + do { (buf)->alloc *= 2; } while ((buf)->alloc < (unsigned int )low);\ + (buf)->p = (UChar* )xrealloc((buf)->p, (buf)->alloc);\ + if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\ +} while (0) + +#define BBUF_ENSURE_SIZE(buf,size) do{\ + unsigned int new_alloc = (buf)->alloc;\ + while (new_alloc < (unsigned int )(size)) { new_alloc *= 2; }\ + if ((buf)->alloc != new_alloc) {\ + (buf)->p = (UChar* )xrealloc((buf)->p, new_alloc);\ + if (IS_NULL((buf)->p)) return(ONIGERR_MEMORY);\ + (buf)->alloc = new_alloc;\ + }\ +} while (0) + +#define BBUF_WRITE(buf,pos,bytes,n) do{\ + int used = (pos) + (n);\ + if ((buf)->alloc < (unsigned int )used) BBUF_EXPAND((buf),used);\ + 
xmemcpy((buf)->p + (pos), (bytes), (n));\ + if ((buf)->used < (unsigned int )used) (buf)->used = used;\ +} while (0) + +#define BBUF_WRITE1(buf,pos,byte) do{\ + int used = (pos) + 1;\ + if ((buf)->alloc < (unsigned int )used) BBUF_EXPAND((buf),used);\ + (buf)->p[(pos)] = (byte);\ + if ((buf)->used < (unsigned int )used) (buf)->used = used;\ +} while (0) + +#define BBUF_ADD(buf,bytes,n) BBUF_WRITE((buf),(buf)->used,(bytes),(n)) +#define BBUF_ADD1(buf,byte) BBUF_WRITE1((buf),(buf)->used,(byte)) +#define BBUF_GET_ADD_ADDRESS(buf) ((buf)->p + (buf)->used) +#define BBUF_GET_OFFSET_POS(buf) ((buf)->used) + +/* from < to */ +#define BBUF_MOVE_RIGHT(buf,from,to,n) do {\ + if ((unsigned int )((to)+(n)) > (buf)->alloc) BBUF_EXPAND((buf),(to) + (n));\ + xmemmove((buf)->p + (to), (buf)->p + (from), (n));\ + if ((unsigned int )((to)+(n)) > (buf)->used) (buf)->used = (to) + (n);\ +} while (0) + +/* from > to */ +#define BBUF_MOVE_LEFT(buf,from,to,n) do {\ + xmemmove((buf)->p + (to), (buf)->p + (from), (n));\ +} while (0) + +/* from > to */ +#define BBUF_MOVE_LEFT_REDUCE(buf,from,to) do {\ + xmemmove((buf)->p + (to), (buf)->p + (from), (buf)->used - (from));\ + (buf)->used -= (from - to);\ +} while (0) + +#define BBUF_INSERT(buf,pos,bytes,n) do {\ + if (pos >= (buf)->used) {\ + BBUF_WRITE(buf,pos,bytes,n);\ + }\ + else {\ + BBUF_MOVE_RIGHT((buf),(pos),(pos) + (n),((buf)->used - (pos)));\ + xmemcpy((buf)->p + (pos), (bytes), (n));\ + }\ +} while (0) + +#define BBUF_GET_BYTE(buf, pos) (buf)->p[(pos)] + + +#define ANCHOR_BEGIN_BUF (1<<0) +#define ANCHOR_BEGIN_LINE (1<<1) +#define ANCHOR_BEGIN_POSITION (1<<2) +#define ANCHOR_END_BUF (1<<3) +#define ANCHOR_SEMI_END_BUF (1<<4) +#define ANCHOR_END_LINE (1<<5) + +#define ANCHOR_WORD_BOUND (1<<6) +#define ANCHOR_NOT_WORD_BOUND (1<<7) +#define ANCHOR_WORD_BEGIN (1<<8) +#define ANCHOR_WORD_END (1<<9) +#define ANCHOR_PREC_READ (1<<10) +#define ANCHOR_PREC_READ_NOT (1<<11) +#define ANCHOR_LOOK_BEHIND (1<<12) +#define ANCHOR_LOOK_BEHIND_NOT 
(1<<13) + +#define ANCHOR_ANYCHAR_STAR (1<<14) /* ".*" optimize info */ +#define ANCHOR_ANYCHAR_STAR_PL (1<<15) /* ".*" optimize info (posix-line) */ + +/* operation code */ +enum OpCode { + OP_FINISH = 0, /* matching process terminator (no more alternative) */ + OP_END = 1, /* pattern code terminator (success end) */ + + OP_EXACT1 = 2, /* single byte, N = 1 */ + OP_EXACT2, /* single byte, N = 2 */ + OP_EXACT3, /* single byte, N = 3 */ + OP_EXACT4, /* single byte, N = 4 */ + OP_EXACT5, /* single byte, N = 5 */ + OP_EXACTN, /* single byte */ + OP_EXACTMB2N1, /* mb-length = 2 N = 1 */ + OP_EXACTMB2N2, /* mb-length = 2 N = 2 */ + OP_EXACTMB2N3, /* mb-length = 2 N = 3 */ + OP_EXACTMB2N, /* mb-length = 2 */ + OP_EXACTMB3N, /* mb-length = 3 */ + OP_EXACTMBN, /* other length */ + + OP_EXACT1_IC, /* single byte, N = 1, ignore case */ + OP_EXACTN_IC, /* single byte, ignore case */ + + OP_CCLASS, + OP_CCLASS_MB, + OP_CCLASS_MIX, + OP_CCLASS_NOT, + OP_CCLASS_MB_NOT, + OP_CCLASS_MIX_NOT, + + OP_ANYCHAR, /* "." */ + OP_ANYCHAR_ML, /* "." 
multi-line */ + OP_ANYCHAR_STAR, /* ".*" */ + OP_ANYCHAR_ML_STAR, /* ".*" multi-line */ + OP_ANYCHAR_STAR_PEEK_NEXT, + OP_ANYCHAR_ML_STAR_PEEK_NEXT, + + OP_WORD, + OP_NOT_WORD, + OP_WORD_SB, + OP_WORD_MB, + OP_WORD_BOUND, + OP_NOT_WORD_BOUND, + OP_WORD_BEGIN, + OP_WORD_END, + + OP_BEGIN_BUF, + OP_END_BUF, + OP_BEGIN_LINE, + OP_END_LINE, + OP_SEMI_END_BUF, + OP_BEGIN_POSITION, + + OP_BACKREF1, + OP_BACKREF2, + OP_BACKREF3, + OP_BACKREFN, + OP_BACKREFN_IC, + OP_BACKREF_MULTI, + OP_BACKREF_MULTI_IC, + + OP_MEMORY_START, + OP_MEMORY_START_PUSH, /* push back-tracker to stack */ + OP_MEMORY_END_PUSH, /* push back-tracker to stack */ + OP_MEMORY_END_PUSH_REC, /* push back-tracker to stack */ + OP_MEMORY_END, + OP_MEMORY_END_REC, /* push marker to stack */ + + OP_SET_OPTION_PUSH, /* set option and push recover option */ + OP_SET_OPTION, /* set option */ + + OP_FAIL, /* pop stack and move */ + OP_JUMP, + OP_PUSH, + OP_POP, + OP_PUSH_OR_JUMP_EXACT1, /* if match exact then push, else jump. */ + OP_PUSH_IF_PEEK_NEXT, /* if match exact then push, else none. */ + OP_REPEAT, /* {n,m} */ + OP_REPEAT_NG, /* {n,m}? (non greedy) */ + OP_REPEAT_INC, + OP_REPEAT_INC_NG, /* non greedy */ + OP_NULL_CHECK_START, /* null loop checker start */ + OP_NULL_CHECK_END, /* null loop checker end */ + OP_NULL_CHECK_END_MEMST, /* null loop checker end (with capture status) */ + OP_NULL_CHECK_END_MEMST_PUSH, /* with capture status and push check-end */ + + OP_PUSH_POS, /* (?=...) start */ + OP_POP_POS, /* (?=...) end */ + OP_PUSH_POS_NOT, /* (?!...) start */ + OP_FAIL_POS, /* (?!...) end */ + OP_PUSH_STOP_BT, /* (?>...) start */ + OP_POP_STOP_BT, /* (?>...) end */ + OP_LOOK_BEHIND, /* (?<=...) start (no needs end opcode) */ + OP_PUSH_LOOK_BEHIND_NOT, /* (? 
*/ + OP_RETURN +}; + +/* arguments type */ +#define ARG_SPECIAL -1 +#define ARG_NON 0 +#define ARG_RELADDR 1 +#define ARG_ABSADDR 2 +#define ARG_LENGTH 3 +#define ARG_MEMNUM 4 +#define ARG_OPTION 5 + +typedef short int RelAddrType; +typedef short int AbsAddrType; +typedef short int LengthType; +typedef short int MemNumType; +typedef int RepeatNumType; + +#define SIZE_OPCODE 1 +#define SIZE_RELADDR sizeof(RelAddrType) +#define SIZE_ABSADDR sizeof(AbsAddrType) +#define SIZE_LENGTH sizeof(LengthType) +#define SIZE_MEMNUM sizeof(MemNumType) +#define SIZE_REPEATNUM sizeof(RepeatNumType) +#define SIZE_OPTION sizeof(OnigOptionType) +#define SIZE_CODE_POINT sizeof(OnigCodePoint) + +#ifdef PLATFORM_UNALIGNED_WORD_ACCESS +#define GET_RELADDR_INC(addr,p) do{\ + addr = *((RelAddrType* )(p));\ + (p) += SIZE_RELADDR;\ +} while(0) + +#define GET_ABSADDR_INC(addr,p) do{\ + addr = *((AbsAddrType* )(p));\ + (p) += SIZE_ABSADDR;\ +} while(0) + +#define GET_LENGTH_INC(len,p) do{\ + len = *((LengthType* )(p));\ + (p) += SIZE_LENGTH;\ +} while(0) + +#define GET_MEMNUM_INC(num,p) do{\ + num = *((MemNumType* )(p));\ + (p) += SIZE_MEMNUM;\ +} while(0) + +#define GET_REPEATNUM_INC(num,p) do{\ + num = *((RepeatNumType* )(p));\ + (p) += SIZE_REPEATNUM;\ +} while(0) + +#define GET_OPTION_INC(option,p) do{\ + option = *((OnigOptionType* )(p));\ + (p) += SIZE_OPTION;\ +} while(0) +#else + +#define GET_RELADDR_INC(addr,p) GET_SHORT_INC(addr,p) +#define GET_ABSADDR_INC(addr,p) GET_SHORT_INC(addr,p) +#define GET_LENGTH_INC(len,p) GET_SHORT_INC(len,p) +#define GET_MEMNUM_INC(num,p) GET_SHORT_INC(num,p) +#define GET_REPEATNUM_INC(num,p) GET_INT_INC(num,p) +#define GET_OPTION_INC(option,p) GET_UINT_INC(option,p) + +#define SERIALIZE_RELADDR(addr,p) SERIALIZE_SHORT(addr,p) +#define SERIALIZE_ABSADDR(addr,p) SERIALIZE_SHORT(addr,p) +#define SERIALIZE_LENGTH(len,p) SERIALIZE_SHORT(len,p) +#define SERIALIZE_MEMNUM(num,p) SERIALIZE_SHORT(num,p) +#define SERIALIZE_REPEATNUM(num,p) SERIALIZE_INT(num,p) 
+#define SERIALIZE_OPTION(option,p) SERIALIZE_UINT(option,p) + +#define SERIALIZE_BUFSIZE SIZEOF_INT + +#endif /* PLATFORM_UNALIGNED_WORD_ACCESS */ + +/* code point's address must be aligned address. */ +#define GET_CODE_POINT(code,p) code = *((OnigCodePoint* )(p)) +#define GET_BYTE_INC(byte,p) do{\ + byte = *(p);\ + (p)++;\ +} while(0) + + +/* op-code + arg size */ +#define SIZE_OP_ANYCHAR_STAR SIZE_OPCODE +#define SIZE_OP_ANYCHAR_STAR_PEEK_NEXT (SIZE_OPCODE + 1) +#define SIZE_OP_JUMP (SIZE_OPCODE + SIZE_RELADDR) +#define SIZE_OP_PUSH (SIZE_OPCODE + SIZE_RELADDR) +#define SIZE_OP_POP SIZE_OPCODE +#define SIZE_OP_PUSH_OR_JUMP_EXACT1 (SIZE_OPCODE + SIZE_RELADDR + 1) +#define SIZE_OP_PUSH_IF_PEEK_NEXT (SIZE_OPCODE + SIZE_RELADDR + 1) +#define SIZE_OP_REPEAT_INC (SIZE_OPCODE + SIZE_MEMNUM) +#define SIZE_OP_REPEAT_INC_NG (SIZE_OPCODE + SIZE_MEMNUM) +#define SIZE_OP_PUSH_POS SIZE_OPCODE +#define SIZE_OP_PUSH_POS_NOT (SIZE_OPCODE + SIZE_RELADDR) +#define SIZE_OP_POP_POS SIZE_OPCODE +#define SIZE_OP_FAIL_POS SIZE_OPCODE +#define SIZE_OP_SET_OPTION (SIZE_OPCODE + SIZE_OPTION) +#define SIZE_OP_SET_OPTION_PUSH (SIZE_OPCODE + SIZE_OPTION) +#define SIZE_OP_FAIL SIZE_OPCODE +#define SIZE_OP_MEMORY_START (SIZE_OPCODE + SIZE_MEMNUM) +#define SIZE_OP_MEMORY_START_PUSH (SIZE_OPCODE + SIZE_MEMNUM) +#define SIZE_OP_MEMORY_END_PUSH (SIZE_OPCODE + SIZE_MEMNUM) +#define SIZE_OP_MEMORY_END_PUSH_REC (SIZE_OPCODE + SIZE_MEMNUM) +#define SIZE_OP_MEMORY_END (SIZE_OPCODE + SIZE_MEMNUM) +#define SIZE_OP_MEMORY_END_REC (SIZE_OPCODE + SIZE_MEMNUM) +#define SIZE_OP_PUSH_STOP_BT SIZE_OPCODE +#define SIZE_OP_POP_STOP_BT SIZE_OPCODE +#define SIZE_OP_NULL_CHECK_START (SIZE_OPCODE + SIZE_MEMNUM) +#define SIZE_OP_NULL_CHECK_END (SIZE_OPCODE + SIZE_MEMNUM) +#define SIZE_OP_LOOK_BEHIND (SIZE_OPCODE + SIZE_LENGTH) +#define SIZE_OP_PUSH_LOOK_BEHIND_NOT (SIZE_OPCODE + SIZE_RELADDR + SIZE_LENGTH) +#define SIZE_OP_FAIL_LOOK_BEHIND_NOT SIZE_OPCODE +#define SIZE_OP_CALL (SIZE_OPCODE + SIZE_ABSADDR) +#define 
SIZE_OP_RETURN SIZE_OPCODE + + +typedef struct { + UChar esc; + UChar anychar; + UChar anytime; + UChar zero_or_one_time; + UChar one_or_more_time; + UChar anychar_anytime; +} OnigMetaCharTableType; + +extern OnigMetaCharTableType OnigMetaCharTable; + +#define MC_ESC OnigMetaCharTable.esc +#define MC_ANYCHAR OnigMetaCharTable.anychar +#define MC_ANYTIME OnigMetaCharTable.anytime +#define MC_ZERO_OR_ONE_TIME OnigMetaCharTable.zero_or_one_time +#define MC_ONE_OR_MORE_TIME OnigMetaCharTable.one_or_more_time +#define MC_ANYCHAR_ANYTIME OnigMetaCharTable.anychar_anytime + + +#ifdef ONIG_DEBUG + +typedef struct { + short int opcode; + char* name; + short int arg_type; +} OnigOpInfoType; + +extern OnigOpInfoType OnigOpInfo[]; + +extern void onig_print_compiled_byte_code P_((FILE* f, UChar* bp, UChar** nextp)); + +#ifdef ONIG_DEBUG_STATISTICS +extern void onig_statistics_init P_((void)); +extern void onig_print_statistics P_((FILE* f)); +#endif +#endif + +extern char* onig_error_code_to_format P_((int code)); +extern void onig_snprintf_with_pattern PV_((char buf[], int bufsize, OnigEncoding enc, char* pat, char* pat_end, char *fmt, ...)); +extern UChar* onig_strdup P_((UChar* s, UChar* end)); +extern int onig_bbuf_init P_((BBuf* buf, int size)); +extern int onig_alloc_init P_((regex_t** reg, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax)); +extern int onig_compile P_((regex_t* reg, UChar* pattern, UChar* pattern_end, OnigErrorInfo* einfo)); +extern void onig_chain_reduce P_((regex_t* reg)); +extern int onig_is_in_code_range P_((UChar* p, OnigCodePoint code)); + +#endif /* REGINT_H */ diff --git a/regparse.c b/regparse.c new file mode 100644 index 0000000000..2260df4155 --- /dev/null +++ b/regparse.c @@ -0,0 +1,4815 @@ +/********************************************************************** + + regparse.c - Oniguruma (regular expression library) + + Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) + 
+**********************************************************************/ +#include "regparse.h" + +#define WARN_BUFSIZE 256 + +#define SYN_POSIX_COMMON_OP \ + ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_POSIX_BRACKET | \ + ONIG_SYN_OP_DECIMAL_BACKREF | \ + ONIG_SYN_OP_BRACKET_CC | ONIG_SYN_OP_ASTERISK_ZERO_INF | \ + ONIG_SYN_OP_LINE_ANCHOR | \ + ONIG_SYN_OP_ESC_CONTROL_CHARS ) + +#define SYN_GNU_REGEX_OP \ + ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_BRACKET_CC | \ + ONIG_SYN_OP_POSIX_BRACKET | ONIG_SYN_OP_DECIMAL_BACKREF | \ + ONIG_SYN_OP_BRACE_INTERVAL | ONIG_SYN_OP_LPAREN_SUBEXP | \ + ONIG_SYN_OP_VBAR_ALT | \ + ONIG_SYN_OP_ASTERISK_ZERO_INF | ONIG_SYN_OP_PLUS_ONE_INF | \ + ONIG_SYN_OP_QMARK_ZERO_ONE | \ + ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR | ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR | \ + ONIG_SYN_OP_ESC_W_WORD | \ + ONIG_SYN_OP_ESC_B_WORD_BOUND | ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END | \ + ONIG_SYN_OP_ESC_S_WHITE_SPACE | ONIG_SYN_OP_ESC_D_DIGIT | \ + ONIG_SYN_OP_LINE_ANCHOR ) + +#define SYN_GNU_REGEX_BV \ + ( ONIG_SYN_CONTEXT_INDEP_ANCHORS | ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS | \ + ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS | ONIG_SYN_ALLOW_INVALID_INTERVAL | \ + ONIG_SYN_BACKSLASH_ESCAPE_IN_CC | ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC ) + +#ifdef USE_VARIABLE_SYNTAX +OnigSyntaxType OnigSyntaxPosixBasic = { + ( SYN_POSIX_COMMON_OP | ONIG_SYN_OP_ESC_LPAREN_SUBEXP | + ONIG_SYN_OP_ESC_BRACE_INTERVAL ) + , 0 + , 0 + , ( ONIG_OPTION_SINGLELINE | ONIG_OPTION_MULTILINE ) +}; + +OnigSyntaxType OnigSyntaxPosixExtended = { + ( SYN_POSIX_COMMON_OP | ONIG_SYN_OP_LPAREN_SUBEXP | + ONIG_SYN_OP_BRACE_INTERVAL | + ONIG_SYN_OP_PLUS_ONE_INF | ONIG_SYN_OP_QMARK_ZERO_ONE | ONIG_SYN_OP_VBAR_ALT ) + , 0 + , ( ONIG_SYN_CONTEXT_INDEP_ANCHORS | + ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS | ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS | + ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP | + ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC ) + , ( ONIG_OPTION_SINGLELINE | ONIG_OPTION_MULTILINE ) +}; + +OnigSyntaxType OnigSyntaxEmacs = { + ( 
ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_BRACKET_CC | + ONIG_SYN_OP_ESC_BRACE_INTERVAL | + ONIG_SYN_OP_ESC_LPAREN_SUBEXP | ONIG_SYN_OP_ESC_VBAR_ALT | + ONIG_SYN_OP_ASTERISK_ZERO_INF | ONIG_SYN_OP_PLUS_ONE_INF | + ONIG_SYN_OP_QMARK_ZERO_ONE | ONIG_SYN_OP_DECIMAL_BACKREF | + ONIG_SYN_OP_LINE_ANCHOR | ONIG_SYN_OP_ESC_CONTROL_CHARS ) + , ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR + , ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC + , ONIG_OPTION_NONE +}; + +OnigSyntaxType OnigSyntaxGrep = { + ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_BRACKET_CC | ONIG_SYN_OP_POSIX_BRACKET | + ONIG_SYN_OP_BRACE_INTERVAL | ONIG_SYN_OP_ESC_LPAREN_SUBEXP | + ONIG_SYN_OP_ESC_VBAR_ALT | + ONIG_SYN_OP_ASTERISK_ZERO_INF | ONIG_SYN_OP_ESC_PLUS_ONE_INF | + ONIG_SYN_OP_ESC_QMARK_ZERO_ONE | ONIG_SYN_OP_LINE_ANCHOR | + ONIG_SYN_OP_ESC_W_WORD | ONIG_SYN_OP_ESC_B_WORD_BOUND | + ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END | ONIG_SYN_OP_DECIMAL_BACKREF ) + , 0 + , ( ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC | ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC ) + , ONIG_OPTION_NONE +}; + +OnigSyntaxType OnigSyntaxGnuRegex = { + SYN_GNU_REGEX_OP + , 0 + , SYN_GNU_REGEX_BV + , ONIG_OPTION_NONE +}; + +OnigSyntaxType OnigSyntaxJava = { + (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | + ONIG_SYN_OP_ESC_CONTROL_CHARS | ONIG_SYN_OP_ESC_C_CONTROL | + ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 ) + & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) + , ( ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE | ONIG_SYN_OP2_QMARK_GROUP_EFFECT | + ONIG_SYN_OP2_OPTION_PERL | ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT | + ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL | ONIG_SYN_OP2_CCLASS_SET_OP | + ONIG_SYN_OP2_ESC_V_VTAB | ONIG_SYN_OP2_ESC_U_HEX4 | + ONIG_SYN_OP2_ESC_P_CHAR_PROPERTY ) + , ( SYN_GNU_REGEX_BV | ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND ) + , ONIG_OPTION_SINGLELINE +}; + +OnigSyntaxType OnigSyntaxPerl = { + (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | + ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | + ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS | + 
ONIG_SYN_OP_ESC_C_CONTROL ) + & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) + , ( ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE | + ONIG_SYN_OP2_QMARK_GROUP_EFFECT | ONIG_SYN_OP2_OPTION_PERL | + ONIG_SYN_OP2_ESC_P_CHAR_PROPERTY ) + , SYN_GNU_REGEX_BV + , ONIG_OPTION_SINGLELINE +}; +#endif /* USE_VARIABLE_SYNTAX */ + +OnigSyntaxType OnigSyntaxRuby = { + (( SYN_GNU_REGEX_OP | ONIG_SYN_OP_QMARK_NON_GREEDY | + ONIG_SYN_OP_ESC_OCTAL3 | ONIG_SYN_OP_ESC_X_HEX2 | + ONIG_SYN_OP_ESC_X_BRACE_HEX8 | ONIG_SYN_OP_ESC_CONTROL_CHARS | + ONIG_SYN_OP_ESC_C_CONTROL ) + & ~ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END ) + , ( ONIG_SYN_OP2_QMARK_GROUP_EFFECT | + ONIG_SYN_OP2_OPTION_RUBY | + ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP | ONIG_SYN_OP2_ESC_K_NAMED_BACKREF | + ONIG_SYN_OP2_ESC_G_SUBEXP_CALL | + ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT | + ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL | + ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB ) + , ( SYN_GNU_REGEX_BV | + ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV | + ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND | + ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP | + ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME | + ONIG_SYN_WARN_CC_OP_NOT_ESCAPED | + ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT ) + , ONIG_OPTION_NONE +}; + +OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY; + +#ifdef USE_VARIABLE_SYNTAX +extern int +onig_set_default_syntax(OnigSyntaxType* syntax) +{ + if (IS_NULL(syntax)) + syntax = ONIG_SYNTAX_RUBY; + + OnigDefaultSyntax = syntax; + return 0; +} + +extern void +onig_copy_syntax(OnigSyntaxType* to, OnigSyntaxType* from) +{ + *to = *from; +} + +extern void +onig_set_syntax_op(OnigSyntaxType* syntax, unsigned int op) +{ + syntax->op = op; +} + +extern void +onig_set_syntax_op2(OnigSyntaxType* syntax, unsigned int op2) +{ + syntax->op2 = op2; +} + +extern void +onig_set_syntax_behavior(OnigSyntaxType* syntax, unsigned int behavior) +{ + syntax->behavior = behavior; +} + +extern void +onig_set_syntax_options(OnigSyntaxType* syntax, OnigOptionType options) +{ + 
syntax->options = options; +} +#endif + +OnigMetaCharTableType OnigMetaCharTable = { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )0 /* anychar '.' */ + , (OnigCodePoint )0 /* anytime '*' */ + , (OnigCodePoint )0 /* zero or one time '?' */ + , (OnigCodePoint )0 /* one or more time '+' */ + , (OnigCodePoint )0 /* anychar anytime */ +}; + +#ifdef USE_VARIABLE_META_CHARS +extern int onig_set_meta_char(unsigned int what, unsigned int c) +{ + switch (what) { + case ONIG_META_CHAR_ESCAPE: + OnigMetaCharTable.esc = c; + break; + case ONIG_META_CHAR_ANYCHAR: + OnigMetaCharTable.anychar = c; + break; + case ONIG_META_CHAR_ANYTIME: + OnigMetaCharTable.anytime = c; + break; + case ONIG_META_CHAR_ZERO_OR_ONE_TIME: + OnigMetaCharTable.zero_or_one_time = c; + break; + case ONIG_META_CHAR_ONE_OR_MORE_TIME: + OnigMetaCharTable.one_or_more_time = c; + break; + case ONIG_META_CHAR_ANYCHAR_ANYTIME: + OnigMetaCharTable.anychar_anytime = c; + break; + default: + return ONIGERR_INVALID_ARGUMENT; + break; + } + return 0; +} +#endif /* USE_VARIABLE_META_CHARS */ + + +extern void onig_null_warn(char* s) { } + +#ifdef DEFAULT_WARN_FUNCTION +static OnigWarnFunc onig_warn = (OnigWarnFunc )DEFAULT_WARN_FUNCTION; +#else +static OnigWarnFunc onig_warn = onig_null_warn; +#endif + +#ifdef DEFAULT_VERB_WARN_FUNCTION +static OnigWarnFunc onig_verb_warn = (OnigWarnFunc )DEFAULT_VERB_WARN_FUNCTION; +#else +static OnigWarnFunc onig_verb_warn = onig_null_warn; +#endif + +extern void onig_set_warn_func(OnigWarnFunc f) +{ + onig_warn = f; +} + +extern void onig_set_verb_warn_func(OnigWarnFunc f) +{ + onig_verb_warn = f; +} + +static void +bbuf_free(BBuf* bbuf) +{ + if (IS_NOT_NULL(bbuf)) { + if (IS_NOT_NULL(bbuf->p)) xfree(bbuf->p); + xfree(bbuf); + } +} + +static int +bbuf_clone(BBuf** rto, BBuf* from) +{ + int r; + BBuf *to; + + *rto = to = (BBuf* )xmalloc(sizeof(BBuf)); + CHECK_NULL_RETURN_VAL(to, ONIGERR_MEMORY); + r = BBUF_INIT(to, from->alloc); + if (r != 0) return r; + to->used = from->used; 
+ xmemcpy(to->p, from->p, from->used); + return 0; +} + +#define ONOFF(v,f,negative) (negative) ? ((v) &= ~(f)) : ((v) |= (f)) + +#define SET_ALL_MULTI_BYTE_RANGE(pbuf) \ + add_code_range_to_buf(pbuf, (OnigCodePoint )0x80, ~((OnigCodePoint )0)) + +#define ADD_ALL_MULTI_BYTE_RANGE(code, mbuf) do {\ + if (! ONIGENC_IS_SINGLEBYTE(code)) {\ + r = SET_ALL_MULTI_BYTE_RANGE(&(mbuf));\ + if (r) return r;\ + }\ +} while (0) + + +#define BITSET_IS_EMPTY(bs,empty) do {\ + int i;\ + empty = 1;\ + for (i = 0; i < BITSET_SIZE; i++) {\ + if ((bs)[i] != 0) {\ + empty = 0; break;\ + }\ + }\ +} while (0) + +static void +bitset_set_range(BitSetRef bs, int from, int to) +{ + int i; + for (i = from; i <= to && i < SINGLE_BYTE_SIZE; i++) { + BITSET_SET_BIT(bs, i); + } +} + +#if 0 +static void +bitset_set_all(BitSetRef bs) +{ + int i; + for (i = 0; i < BITSET_SIZE; i++) { + bs[i] = ~((Bits )0); + } +} +#endif + +static void +bitset_invert(BitSetRef bs) +{ + int i; + for (i = 0; i < BITSET_SIZE; i++) { + bs[i] = ~(bs[i]); + } +} + +static void +bitset_invert_to(BitSetRef from, BitSetRef to) +{ + int i; + for (i = 0; i < BITSET_SIZE; i++) { + to[i] = ~(from[i]); + } +} + +static void +bitset_and(BitSetRef dest, BitSetRef bs) +{ + int i; + for (i = 0; i < BITSET_SIZE; i++) { + dest[i] &= bs[i]; + } +} + +static void +bitset_or(BitSetRef dest, BitSetRef bs) +{ + int i; + for (i = 0; i < BITSET_SIZE; i++) { + dest[i] |= bs[i]; + } +} + +static void +bitset_copy(BitSetRef dest, BitSetRef bs) +{ + int i; + for (i = 0; i < BITSET_SIZE; i++) { + dest[i] = bs[i]; + } +} + +extern int +onig_strncmp(UChar* s1, UChar* s2, int n) +{ + int x; + + while (n-- > 0) { + x = *s2++ - *s1++; + if (x) return x; + } + return 0; +} + +static void +k_strcpy(UChar* dest, UChar* src, UChar* end) +{ + int len = end - src; + if (len > 0) { + xmemcpy(dest, src, len); + dest[len] = (UChar )0; + } +} + +extern UChar* +onig_strdup(UChar* s, UChar* end) +{ + int len = end - s; + + if (len > 0) { + UChar* r = (UChar* 
)xmalloc(len + 1); + CHECK_NULL_RETURN(r); + xmemcpy(r, s, len); + r[len] = (UChar )0; + return r; + } + else return NULL; +} + +/* scan pattern methods */ +#define PEND_VALUE -1 + +#define PFETCH(c) do { (c) = *p++; } while (0) +#define PUNFETCH p-- +#define PINC p++ +#define PPEEK (p < end ? *p : PEND_VALUE) +#define PEND (p < end ? 0 : 1) + + +static UChar* +k_strcat_capa(UChar* dest, UChar* dest_end, UChar* src, UChar* src_end, + int capa) +{ + UChar* r; + + if (dest) + r = (UChar* )xrealloc(dest, capa + 1); + else + r = (UChar* )xmalloc(capa + 1); + + CHECK_NULL_RETURN(r); + k_strcpy(r + (dest_end - dest), src, src_end); + return r; +} + +/* dest on static area */ +static UChar* +strcat_capa_from_static(UChar* dest, UChar* dest_end, + UChar* src, UChar* src_end, int capa) +{ + UChar* r; + + r = (UChar* )xmalloc(capa + 1); + CHECK_NULL_RETURN(r); + k_strcpy(r, dest, dest_end); + k_strcpy(r + (dest_end - dest), src, src_end); + return r; +} + +#ifdef USE_NAMED_GROUP + +#define INIT_NAME_BACKREFS_ALLOC_NUM 8 + +typedef struct { + UChar* name; + int name_len; /* byte length */ + int back_num; /* number of backrefs */ + int back_alloc; + int back_ref1; + int* back_refs; +} NameEntry; + +#ifdef USE_ST_HASH_TABLE + +#include + +typedef st_table NameTable; +typedef st_data_t HashDataType; /* 1.6 st.h doesn't define st_data_t type */ + +#define NAMEBUF_SIZE 24 +#define NAMEBUF_SIZE_1 25 + +#ifdef ONIG_DEBUG +static int +i_print_name_entry(UChar* key, NameEntry* e, void* arg) +{ + int i; + FILE* fp = (FILE* )arg; + + fprintf(fp, "%s: ", e->name); + if (e->back_num == 0) + fputs("-", fp); + else if (e->back_num == 1) + fprintf(fp, "%d", e->back_ref1); + else { + for (i = 0; i < e->back_num; i++) { + if (i > 0) fprintf(fp, ", "); + fprintf(fp, "%d", e->back_refs[i]); + } + } + fputs("\n", fp); + return ST_CONTINUE; +} + +extern int +onig_print_names(FILE* fp, regex_t* reg) +{ + NameTable* t = (NameTable* )reg->name_table; + + if (IS_NOT_NULL(t)) { + fprintf(fp, "name 
table\n"); + st_foreach(t, i_print_name_entry, (HashDataType )fp); + fputs("\n", fp); + } + return 0; +} +#endif + +static int +i_free_name_entry(UChar* key, NameEntry* e, void* arg) +{ + xfree(e->name); /* == key */ + if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs); + return ST_DELETE; +} + +static int +names_clear(regex_t* reg) +{ + NameTable* t = (NameTable* )reg->name_table; + + if (IS_NOT_NULL(t)) { + st_foreach(t, i_free_name_entry, 0); + } + return 0; +} + +extern int +onig_names_free(regex_t* reg) +{ + int r; + NameTable* t; + + r = names_clear(reg); + if (r) return r; + + t = (NameTable* )reg->name_table; + if (IS_NOT_NULL(t)) st_free_table(t); + reg->name_table = (void* )NULL; + return 0; +} + +static NameEntry* +name_find(regex_t* reg, UChar* name, UChar* name_end) +{ + int len; + UChar namebuf[NAMEBUF_SIZE_1]; + UChar *key; + NameEntry* e; + NameTable* t = (NameTable* )reg->name_table; + + e = (NameEntry* )NULL; + if (IS_NOT_NULL(t)) { + if (*name_end == '\0') { + key = name; + } + else { + /* dirty, but st.c API claims NULL terminated key. */ + len = name_end - name; + if (len <= NAMEBUF_SIZE) { + xmemcpy(namebuf, name, len); + namebuf[len] = '\0'; + key = namebuf; + } + else { + key = onig_strdup(name, name_end); + if (IS_NULL(key)) return (NameEntry* )NULL; + } + } + + st_lookup(t, (HashDataType )key, (HashDataType * )&e); + if (key != name && key != namebuf) xfree(key); + } + return e; +} + +typedef struct { + int (*func)(UChar*,UChar*,int,int*,regex_t*,void*); + regex_t* reg; + void* arg; + int ret; +} INamesArg; + +static int +i_names(UChar* key, NameEntry* e, INamesArg* arg) +{ + int r = (*(arg->func))(e->name, e->name + strlen(e->name), e->back_num, + (e->back_num > 1 ? 
e->back_refs : &(e->back_ref1)), + arg->reg, arg->arg); + if (r != 0) { + arg->ret = r; + return ST_STOP; + } + return ST_CONTINUE; +} + +extern int +onig_foreach_name(regex_t* reg, + int (*func)(UChar*,UChar*,int,int*,regex_t*,void*), + void* arg) +{ + INamesArg narg; + NameTable* t = (NameTable* )reg->name_table; + + narg.ret = 0; + if (IS_NOT_NULL(t)) { + narg.func = func; + narg.reg = reg; + narg.arg = arg; + st_foreach(t, i_names, (HashDataType )&narg); + } + return narg.ret; +} + +extern int +onig_number_of_names(regex_t* reg) +{ + NameTable* t = (NameTable* )reg->name_table; + + if (IS_NOT_NULL(t)) + return t->num_entries; + else + return 0; +} + +#else /* USE_ST_HASH_TABLE */ + +#define INIT_NAMES_ALLOC_NUM 8 + +typedef struct { + NameEntry* e; + int num; + int alloc; +} NameTable; + + +#ifdef ONIG_DEBUG +extern int +onig_print_names(FILE* fp, regex_t* reg) +{ + int i, j; + NameEntry* e; + NameTable* t = (NameTable* )reg->name_table; + + if (IS_NOT_NULL(t) && t->num > 0) { + fprintf(fp, "name table\n"); + for (i = 0; i < t->num; i++) { + e = &(t->e[i]); + fprintf(fp, "%s: ", e->name); + if (e->back_num == 0) { + fputs("-", fp); + } + else if (e->back_num == 1) { + fprintf(fp, "%d", e->back_ref1); + } + else { + for (j = 0; j < e->back_num; j++) { + if (j > 0) fprintf(fp, ", "); + fprintf(fp, "%d", e->back_refs[j]); + } + } + fputs("\n", fp); + } + fputs("\n", fp); + } + return 0; +} +#endif + +static int +names_clear(regex_t* reg) +{ + int i; + NameEntry* e; + NameTable* t = (NameTable* )reg->name_table; + + if (IS_NOT_NULL(t)) { + for (i = 0; i < t->num; i++) { + e = &(t->e[i]); + if (IS_NOT_NULL(e->name)) { + xfree(e->name); + e->name = NULL; + e->name_len = 0; + e->back_num = 0; + e->back_alloc = 0; + if (IS_NOT_NULL(e->back_refs)) xfree(e->back_refs); + e->back_refs = (int* )NULL; + } + } + if (IS_NOT_NULL(t->e)) { + xfree(t->e); + t->e = NULL; + } + t->num = 0; + } + return 0; +} + +extern int +onig_names_free(regex_t* reg) +{ + int r; + NameTable* t; 
+ + r = names_clear(reg); + if (r) return r; + + t = (NameTable* )reg->name_table; + if (IS_NOT_NULL(t)) xfree(t); + reg->name_table = NULL; + return 0; +} + +static NameEntry* +name_find(regex_t* reg, UChar* name, UChar* name_end) +{ + int i, len; + NameEntry* e; + NameTable* t = (NameTable* )reg->name_table; + + if (IS_NOT_NULL(t)) { + len = name_end - name; + for (i = 0; i < t->num; i++) { + e = &(t->e[i]); + if (len == e->name_len && onig_strncmp(name, e->name, len) == 0) + return e; + } + } + return (NameEntry* )NULL; +} + +extern int +onig_foreach_name(regex_t* reg, + int (*func)(UChar*,UChar*,int,int*,regex_t*,void*), + void* arg) +{ + int i, r; + NameEntry* e; + NameTable* t = (NameTable* )reg->name_table; + + if (IS_NOT_NULL(t)) { + for (i = 0; i < t->num; i++) { + e = &(t->e[i]); + r = (*func)(e->name, e->name + e->name_len, e->back_num, + (e->back_num > 1 ? e->back_refs : &(e->back_ref1)), + reg, arg); + if (r != 0) return r; + } + } + return 0; +} + +extern int +onig_number_of_names(regex_t* reg) +{ + NameTable* t = (NameTable* )reg->name_table; + + if (IS_NOT_NULL(t)) + return t->num; + else + return 0; +} + +#endif /* else USE_ST_HASH_TABLE */ + +static int +name_add(regex_t* reg, UChar* name, UChar* name_end, int backref, ScanEnv* env) +{ + int alloc; + NameEntry* e; + NameTable* t = (NameTable* )reg->name_table; + + if (name_end - name <= 0) + return ONIGERR_EMPTY_GROUP_NAME; + + e = name_find(reg, name, name_end); + if (IS_NULL(e)) { +#ifdef USE_ST_HASH_TABLE + if (IS_NULL(t)) { + reg->name_table = t = st_init_strtable(); + } + e = (NameEntry* )xmalloc(sizeof(NameEntry)); + CHECK_NULL_RETURN_VAL(e, ONIGERR_MEMORY); + + e->name = onig_strdup(name, name_end); + if (IS_NULL(e->name)) return ONIGERR_MEMORY; + st_insert(t, (HashDataType )e->name, (HashDataType )e); + + e->name_len = name_end - name; + e->back_num = 0; + e->back_alloc = 0; + e->back_refs = (int* )NULL; + +#else + + if (IS_NULL(t)) { + alloc = INIT_NAMES_ALLOC_NUM; + t = (NameTable* 
)xmalloc(sizeof(NameTable)); + CHECK_NULL_RETURN_VAL(t, ONIGERR_MEMORY); + t->e = NULL; + t->alloc = 0; + t->num = 0; + + t->e = (NameEntry* )xmalloc(sizeof(NameEntry) * alloc); + if (IS_NULL(t->e)) { + xfree(t); + return ONIGERR_MEMORY; + } + t->alloc = alloc; + reg->name_table = t; + goto clear; + } + else if (t->num == t->alloc) { + int i; + + alloc = t->alloc * 2; + t->e = (NameEntry* )xrealloc(t->e, sizeof(NameEntry) * alloc); + CHECK_NULL_RETURN_VAL(t->e, ONIGERR_MEMORY); + t->alloc = alloc; + + clear: + for (i = t->num; i < t->alloc; i++) { + t->e[i].name = NULL; + t->e[i].name_len = 0; + t->e[i].back_num = 0; + t->e[i].back_alloc = 0; + t->e[i].back_refs = (int* )NULL; + } + } + e = &(t->e[t->num]); + t->num++; + e->name = onig_strdup(name, name_end); + e->name_len = name_end - name; +#endif + } + + if (e->back_num >= 1 && + ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME)) { + onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINED_NAME, + name, name_end); + return ONIGERR_MULTIPLEX_DEFINED_NAME; + } + + e->back_num++; + if (e->back_num == 1) { + e->back_ref1 = backref; + } + else { + if (e->back_num == 2) { + alloc = INIT_NAME_BACKREFS_ALLOC_NUM; + e->back_refs = (int* )xmalloc(sizeof(int) * alloc); + CHECK_NULL_RETURN_VAL(e->back_refs, ONIGERR_MEMORY); + e->back_alloc = alloc; + e->back_refs[0] = e->back_ref1; + e->back_refs[1] = backref; + } + else { + if (e->back_num > e->back_alloc) { + alloc = e->back_alloc * 2; + e->back_refs = (int* )xrealloc(e->back_refs, sizeof(int) * alloc); + CHECK_NULL_RETURN_VAL(e->back_refs, ONIGERR_MEMORY); + e->back_alloc = alloc; + } + e->back_refs[e->back_num - 1] = backref; + } + } + + return 0; +} + +extern int +onig_name_to_group_numbers(regex_t* reg, UChar* name, UChar* name_end, + int** nums) +{ + NameEntry* e; + + e = name_find(reg, name, name_end); + if (IS_NULL(e)) return ONIGERR_UNDEFINED_NAME_REFERENCE; + + switch (e->back_num) { + case 0: + break; + case 1: + *nums = &(e->back_ref1); 
+ break; + default: + *nums = e->back_refs; + break; + } + return e->back_num; +} + +extern int +onig_name_to_backref_number(regex_t* reg, UChar* name, UChar* name_end, + OnigRegion *region) +{ + int i, n, *nums; + + n = onig_name_to_group_numbers(reg, name, name_end, &nums); + if (n < 0) + return n; + else if (n == 0) + return ONIGERR_PARSER_BUG; + else if (n == 1) + return nums[0]; + else { + if (IS_NOT_NULL(region)) { + for (i = n - 1; i >= 0; i--) { + if (region->beg[nums[i]] != ONIG_REGION_NOTPOS) + return nums[i]; + } + } + return nums[n - 1]; + } +} + +#else /* USE_NAMED_GROUP */ + +extern int +onig_name_to_group_numbers(regex_t* reg, UChar* name, UChar* name_end, + int** nums) +{ + return ONIG_NO_SUPPORT_CONFIG; +} + +extern int +onig_name_to_backref_number(regex_t* reg, UChar* name, UChar* name_end, + OnigRegion* region) +{ + return ONIG_NO_SUPPORT_CONFIG; +} + +extern int +onig_foreach_name(regex_t* reg, + int (*func)(UChar*,UChar*,int,int*,regex_t*,void*), + void* arg) +{ + return ONIG_NO_SUPPORT_CONFIG; +} + +extern int +onig_number_of_names(regex_t* reg) +{ + return 0; +} +#endif /* else USE_NAMED_GROUP */ + + +#define INIT_SCANENV_MEMNODES_ALLOC_SIZE 16 + +static void +scan_env_clear(ScanEnv* env) +{ + int i; + + BIT_STATUS_CLEAR(env->capture_history); + BIT_STATUS_CLEAR(env->bt_mem_start); + BIT_STATUS_CLEAR(env->bt_mem_end); + BIT_STATUS_CLEAR(env->backrefed_mem); + env->error = (UChar* )NULL; + env->error_end = (UChar* )NULL; + env->num_call = 0; + env->num_mem = 0; +#ifdef USE_NAMED_GROUP + env->num_named = 0; +#endif + env->mem_alloc = 0; + env->mem_nodes_dynamic = (Node** )NULL; + + for (i = 0; i < SCANENV_MEMNODES_SIZE; i++) + env->mem_nodes_static[i] = NULL_NODE; +} + +static int +scan_env_add_mem_entry(ScanEnv* env) +{ + int i, need, alloc; + Node** p; + + need = env->num_mem + 1; + if (need >= SCANENV_MEMNODES_SIZE) { + if (env->mem_alloc <= need) { + if (IS_NULL(env->mem_nodes_dynamic)) { + alloc = INIT_SCANENV_MEMNODES_ALLOC_SIZE; + p = 
(Node** )xmalloc(sizeof(Node*) * alloc); + xmemcpy(p, env->mem_nodes_static, + sizeof(Node*) * SCANENV_MEMNODES_SIZE); + } + else { + alloc = env->mem_alloc * 2; + p = (Node** )xrealloc(env->mem_nodes_dynamic, sizeof(Node*) * alloc); + } + CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY); + + for (i = env->num_mem + 1; i < alloc; i++) + p[i] = NULL_NODE; + + env->mem_nodes_dynamic = p; + env->mem_alloc = alloc; + } + } + + env->num_mem++; + return env->num_mem; +} + +static int +scan_env_set_mem_node(ScanEnv* env, int num, Node* node) +{ + if (env->num_mem >= num) + SCANENV_MEM_NODES(env)[num] = node; + else + return ONIGERR_PARSER_BUG; + return 0; +} + + +#ifdef USE_RECYCLE_NODE +typedef struct _FreeNode { + struct _FreeNode* next; +} FreeNode; + +static FreeNode* FreeNodeList = (FreeNode* )NULL; +#endif + +extern void +onig_node_free(Node* node) +{ + if (IS_NULL(node)) return ; + + switch (NTYPE(node)) { + case N_STRING: + if (IS_NOT_NULL(NSTRING(node).s) && NSTRING(node).s != NSTRING(node).buf) { + xfree(NSTRING(node).s); + } + break; + + case N_LIST: + case N_ALT: + onig_node_free(NCONS(node).left); + onig_node_free(NCONS(node).right); + break; + + case N_CCLASS: + if (NCCLASS(node).mbuf) + bbuf_free(NCCLASS(node).mbuf); + break; + + case N_QUALIFIER: + if (NQUALIFIER(node).target) + onig_node_free(NQUALIFIER(node).target); + break; + + case N_EFFECT: + if (NEFFECT(node).target) + onig_node_free(NEFFECT(node).target); + break; + + case N_BACKREF: + if (IS_NOT_NULL(NBACKREF(node).back_dynamic)) + xfree(NBACKREF(node).back_dynamic); + break; + + case N_ANCHOR: + if (NANCHOR(node).target) + onig_node_free(NANCHOR(node).target); + break; + } + +#ifdef USE_RECYCLE_NODE + { + FreeNode* n; + + n = (FreeNode* )node; + n->next = FreeNodeList; + FreeNodeList = n; + } +#else + xfree(node); +#endif +} + +#ifdef USE_RECYCLE_NODE +extern int +onig_free_node_list() +{ + FreeNode* n; + + THREAD_ATOMIC_START; + while (FreeNodeList) { + n = FreeNodeList; + FreeNodeList = 
FreeNodeList->next; + xfree(n); + } + THREAD_ATOMIC_END; + return 0; +} +#endif + +static Node* +node_new() +{ + Node* node; + +#ifdef USE_RECYCLE_NODE + if (IS_NOT_NULL(FreeNodeList)) { + node = (Node* )FreeNodeList; + FreeNodeList = FreeNodeList->next; + return node; + } +#endif + + node = (Node* )xmalloc(sizeof(Node)); + return node; +} + + +static void +initialize_cclass(CClassNode* cc) +{ + BITSET_CLEAR(cc->bs); + cc->not = 0; + cc->mbuf = NULL; +} + +static Node* +node_new_cclass() +{ + Node* node = node_new(); + CHECK_NULL_RETURN(node); + node->type = N_CCLASS; + + initialize_cclass(&(NCCLASS(node))); + return node; +} + +static Node* +node_new_ctype(int type) +{ + Node* node = node_new(); + CHECK_NULL_RETURN(node); + node->type = N_CTYPE; + NCTYPE(node).type = type; + return node; +} + +static Node* +node_new_anychar() +{ + Node* node = node_new(); + CHECK_NULL_RETURN(node); + node->type = N_ANYCHAR; + return node; +} + +static Node* +node_new_list(Node* left, Node* right) +{ + Node* node = node_new(); + CHECK_NULL_RETURN(node); + node->type = N_LIST; + NCONS(node).left = left; + NCONS(node).right = right; + return node; +} + +static Node* +node_new_alt(Node* left, Node* right) +{ + Node* node = node_new(); + CHECK_NULL_RETURN(node); + node->type = N_ALT; + NCONS(node).left = left; + NCONS(node).right = right; + return node; +} + +extern Node* +onig_node_new_anchor(int type) +{ + Node* node = node_new(); + CHECK_NULL_RETURN(node); + node->type = N_ANCHOR; + NANCHOR(node).type = type; + NANCHOR(node).target = NULL; + NANCHOR(node).char_len = -1; + return node; +} + +static Node* +node_new_backref(int back_num, int* backrefs, int by_name, ScanEnv* env) +{ + int i; + Node* node = node_new(); + + CHECK_NULL_RETURN(node); + node->type = N_BACKREF; + NBACKREF(node).state = 0; + NBACKREF(node).back_num = back_num; + NBACKREF(node).back_dynamic = (int* )NULL; + if (by_name != 0) + NBACKREF(node).state |= NST_NAME_REF; + + for (i = 0; i < back_num; i++) { + if 
(backrefs[i] <= env->num_mem && + IS_NULL(SCANENV_MEM_NODES(env)[backrefs[i]])) { + NBACKREF(node).state |= NST_RECURSION; /* /...(\1).../ */ + break; + } + } + + if (back_num <= NODE_BACKREFS_SIZE) { + for (i = 0; i < back_num; i++) + NBACKREF(node).back_static[i] = backrefs[i]; + } + else { + int* p = (int* )xmalloc(sizeof(int) * back_num); + if (IS_NULL(p)) { + onig_node_free(node); + return NULL; + } + NBACKREF(node).back_dynamic = p; + for (i = 0; i < back_num; i++) + p[i] = backrefs[i]; + } + return node; +} + +#ifdef USE_SUBEXP_CALL +static Node* +node_new_call(UChar* name, UChar* name_end) +{ + Node* node = node_new(); + CHECK_NULL_RETURN(node); + + node->type = N_CALL; + NCALL(node).state = 0; + NCALL(node).ref_num = CALLNODE_REFNUM_UNDEF; + NCALL(node).target = NULL_NODE; + NCALL(node).name = name; + NCALL(node).name_end = name_end; + return node; +} +#endif + +static Node* +node_new_qualifier(int lower, int upper, int by_number) +{ + Node* node = node_new(); + CHECK_NULL_RETURN(node); + node->type = N_QUALIFIER; + NQUALIFIER(node).target = NULL; + NQUALIFIER(node).lower = lower; + NQUALIFIER(node).upper = upper; + NQUALIFIER(node).greedy = 1; + NQUALIFIER(node).by_number = by_number; + NQUALIFIER(node).target_empty_info = NQ_TARGET_ISNOT_EMPTY; + NQUALIFIER(node).head_exact = NULL_NODE; + NQUALIFIER(node).next_head_exact = NULL_NODE; + NQUALIFIER(node).is_refered = 0; + return node; +} + +static Node* +node_new_effect(int type) +{ + Node* node = node_new(); + CHECK_NULL_RETURN(node); + node->type = N_EFFECT; + NEFFECT(node).type = type; + NEFFECT(node).state = 0; + NEFFECT(node).regnum = 0; + NEFFECT(node).option = 0; + NEFFECT(node).target = NULL; + NEFFECT(node).call_addr = -1; + NEFFECT(node).opt_count = 0; + return node; +} + +extern Node* +onig_node_new_effect(int type) +{ + return node_new_effect(type); +} + +static Node* +node_new_effect_memory(OnigOptionType option, int is_named) +{ + Node* node = node_new_effect(EFFECT_MEMORY); + 
CHECK_NULL_RETURN(node); + if (is_named != 0) + SET_EFFECT_STATUS(node, NST_NAMED_GROUP); + +#ifdef USE_SUBEXP_CALL + NEFFECT(node).option = option; +#endif + return node; +} + +static Node* +node_new_option(OnigOptionType option) +{ + Node* node = node_new_effect(EFFECT_OPTION); + CHECK_NULL_RETURN(node); + NEFFECT(node).option = option; + return node; +} + +extern int +onig_node_str_cat(Node* node, UChar* s, UChar* end) +{ + int addlen = end - s; + + if (addlen > 0) { + int len = NSTRING(node).end - NSTRING(node).s; + + if (NSTRING(node).capa > 0 || (len + addlen > NODE_STR_BUF_SIZE - 1)) { + UChar* p; + int capa = len + addlen + NODE_STR_MARGIN; + + if (capa <= NSTRING(node).capa) { + k_strcpy(NSTRING(node).s + len, s, end); + } + else { + if (NSTRING(node).s == NSTRING(node).buf) + p = strcat_capa_from_static(NSTRING(node).s, NSTRING(node).end, + s, end, capa); + else + p = k_strcat_capa(NSTRING(node).s, NSTRING(node).end, s, end, capa); + + CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY); + NSTRING(node).s = p; + NSTRING(node).capa = capa; + } + } + else { + k_strcpy(NSTRING(node).s + len, s, end); + } + NSTRING(node).end = NSTRING(node).s + len + addlen; + } + + return 0; +} + +static int +node_str_cat_char(Node* node, UChar c) +{ + UChar s[1]; + + s[0] = c; + return onig_node_str_cat(node, s, s + 1); +} + +extern void +onig_node_conv_to_str_node(Node* node, int flag) +{ + node->type = N_STRING; + + NSTRING(node).flag = flag; + NSTRING(node).capa = 0; + NSTRING(node).s = NSTRING(node).buf; + NSTRING(node).end = NSTRING(node).buf; +} + +static Node* +node_new_str(UChar* s, UChar* end) +{ + Node* node = node_new(); + CHECK_NULL_RETURN(node); + + node->type = N_STRING; + NSTRING(node).capa = 0; + NSTRING(node).flag = 0; + NSTRING(node).s = NSTRING(node).buf; + NSTRING(node).end = NSTRING(node).buf; + if (onig_node_str_cat(node, s, end)) { + onig_node_free(node); + return NULL; + } + return node; +} + +static Node* +node_new_str_raw(UChar* s, UChar* end) +{ + Node* node 
= node_new_str(s, end); + NSTRING_SET_RAW(node); + return node; +} + +static Node* +node_new_empty() +{ + return node_new_str(NULL, NULL); +} + +static Node* +node_new_str_char(UChar c) +{ + UChar p[1]; + + p[0] = c; + return node_new_str(p, p + 1); +} + +static Node* +node_new_str_raw_char(UChar c) +{ + UChar p[1]; + + p[0] = c; + return node_new_str_raw(p, p + 1); +} + +static Node* +str_node_split_last_char(StrNode* sn, OnigEncoding enc) +{ + UChar *p; + Node* n = NULL_NODE; + + if (sn->end > sn->s) { + p = onigenc_get_prev_char_head(enc, sn->s, sn->end); + if (p && p > sn->s) { /* can be splitted. */ + n = node_new_str(p, sn->end); + if ((sn->flag & NSTR_RAW) != 0) + NSTRING_SET_RAW(n); + sn->end = p; + } + } + return n; +} + +static int +str_node_can_be_split(StrNode* sn, OnigEncoding enc) +{ + if (sn->end > sn->s) { + return ((enc_len(enc, *(sn->s)) < sn->end - sn->s) ? 1 : 0); + } + return 0; +} + +extern int +onig_scan_unsigned_number(UChar** src, UChar* end, OnigEncoding enc) +{ + unsigned int num, val; + int c; + UChar* p = *src; + + num = 0; + while (!PEND) { + PFETCH(c); + if (ONIGENC_IS_CODE_DIGIT(enc, c)) { + val = (unsigned int )DIGITVAL(c); + if ((INT_MAX_LIMIT - val) / 10UL < num) + return -1; /* overflow */ + + num = num * 10 + val; + } + else { + PUNFETCH; + break; + } + } + *src = p; + return num; +} + +static int +scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen, + OnigEncoding enc) +{ + int c; + unsigned int num, val; + UChar* p = *src; + + num = 0; + while (!PEND && maxlen-- != 0) { + PFETCH(c); + if (ONIGENC_IS_CODE_XDIGIT(enc, c)) { + val = (unsigned int )XDIGITVAL(enc,c); + if ((INT_MAX_LIMIT - val) / 16UL < num) + return -1; /* overflow */ + + num = (num << 4) + XDIGITVAL(enc,c); + } + else { + PUNFETCH; + break; + } + } + *src = p; + return num; +} + +static int +scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen, + OnigEncoding enc) +{ + int c; + unsigned int num, val; + UChar* p = *src; + + num = 0; + 
while (!PEND && maxlen-- != 0) { + PFETCH(c); + if (ONIGENC_IS_CODE_DIGIT(enc, c) && c < '8') { + val = ODIGITVAL(c); + if ((INT_MAX_LIMIT - val) / 8UL < num) + return -1; /* overflow */ + + num = (num << 3) + val; + } + else { + PUNFETCH; + break; + } + } + *src = p; + return num; +} + + +#define BBUF_WRITE_CODE_POINT(bbuf,pos,code) \ + BBUF_WRITE(bbuf, pos, &(code), SIZE_CODE_POINT) + +/* data format: + [n][from-1][to-1][from-2][to-2] ... [from-n][to-n] + (all data size is OnigCodePoint) + */ +static int +new_code_range(BBuf** pbuf) +{ +#define INIT_MULTI_BYTE_RANGE_SIZE (SIZE_CODE_POINT * 5) + int r; + OnigCodePoint n; + BBuf* bbuf; + + bbuf = *pbuf = (BBuf* )xmalloc(sizeof(BBuf)); + CHECK_NULL_RETURN_VAL(*pbuf, ONIGERR_MEMORY); + r = BBUF_INIT(*pbuf, INIT_MULTI_BYTE_RANGE_SIZE); + if (r) return r; + + n = 0; + BBUF_WRITE_CODE_POINT(bbuf, 0, n); + return 0; +} + +static int +add_code_range_to_buf(BBuf** pbuf, OnigCodePoint from, OnigCodePoint to) +{ + int r, inc_n, pos; + int low, high, bound, x; + OnigCodePoint n, *data; + BBuf* bbuf; + + if (from > to) { + n = from; from = to; to = n; + } + + if (IS_NULL(*pbuf)) { + r = new_code_range(pbuf); + if (r) return r; + bbuf = *pbuf; + n = 0; + } + else { + bbuf = *pbuf; + GET_CODE_POINT(n, bbuf->p); + } + data = (OnigCodePoint* )(bbuf->p); + data++; + + for (low = 0, bound = n; low < bound; ) { + x = (low + bound) >> 1; + if (from > data[x*2 + 1]) + low = x + 1; + else + bound = x; + } + + for (high = low, bound = n; high < bound; ) { + x = (high + bound) >> 1; + if (to >= data[x*2] - 1) + high = x + 1; + else + bound = x; + } + + inc_n = low + 1 - high; + if (n + inc_n > ONIG_MAX_MULTI_BYTE_RANGES_NUM) + return ONIGERR_TOO_MANY_MULTI_BYTE_RANGES; + + if (inc_n != 1) { + if (from > data[low*2]) + from = data[low*2]; + if (to < data[(high - 1)*2 + 1]) + to = data[(high - 1)*2 + 1]; + } + + if (inc_n != 0 && (OnigCodePoint )high < n) { + int from_pos = SIZE_CODE_POINT * (1 + high * 2); + int to_pos = SIZE_CODE_POINT * 
(1 + (low + 1) * 2); + int size = (n - high) * 2 * SIZE_CODE_POINT; + + if (inc_n > 0) { + BBUF_MOVE_RIGHT(bbuf, from_pos, to_pos, size); + } + else { + BBUF_MOVE_LEFT_REDUCE(bbuf, from_pos, to_pos); + } + } + + pos = SIZE_CODE_POINT * (1 + low * 2); + BBUF_ENSURE_SIZE(bbuf, pos + SIZE_CODE_POINT * 2); + BBUF_WRITE_CODE_POINT(bbuf, pos, from); + BBUF_WRITE_CODE_POINT(bbuf, pos + SIZE_CODE_POINT, to); + n += inc_n; + BBUF_WRITE_CODE_POINT(bbuf, 0, n); + + return 0; +} + +static int +add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to) +{ + if (from > to) { + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) + return 0; + else + return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; + } + + return add_code_range_to_buf(pbuf, from, to); +} + +static int +not_code_range_buf(BBuf* bbuf, BBuf** pbuf) +{ + int r, i, n; + OnigCodePoint pre, from, to, *data; + + *pbuf = (BBuf* )NULL; + if (IS_NULL(bbuf)) { + set_all: + return SET_ALL_MULTI_BYTE_RANGE(pbuf); + } + + data = (OnigCodePoint* )(bbuf->p); + GET_CODE_POINT(n, data); + data++; + if (n <= 0) goto set_all; + + r = 0; + pre = 0x80; + for (i = 0; i < n; i++) { + from = data[i*2]; + to = data[i*2+1]; + if (pre <= from - 1) { + r = add_code_range_to_buf(pbuf, pre, from - 1); + if (r != 0) return r; + } + if (to == ~((OnigCodePoint )0)) break; + pre = to + 1; + } + if (to < ~((OnigCodePoint )0)) { + r = add_code_range_to_buf(pbuf, to + 1, ~((OnigCodePoint )0)); + } + return r; +} + +#define SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2) do {\ + BBuf *tbuf; \ + int tnot; \ + tnot = not1; not1 = not2; not2 = tnot; \ + tbuf = bbuf1; bbuf1 = bbuf2; bbuf2 = tbuf; \ +} while (0) + +static int +or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) +{ + int r; + OnigCodePoint i, n1, *data1; + OnigCodePoint from, to; + + *pbuf = (BBuf* )NULL; + if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) { + if (not1 != 0 || not2 != 0) + return SET_ALL_MULTI_BYTE_RANGE(pbuf); + return 0; + } + + r = 0; + if 
(IS_NULL(bbuf2)) + SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2); + + if (IS_NULL(bbuf1)) { + if (not1 != 0) { + return SET_ALL_MULTI_BYTE_RANGE(pbuf); + } + else { + if (not2 == 0) { + return bbuf_clone(pbuf, bbuf2); + } + else { + return not_code_range_buf(bbuf2, pbuf); + } + } + } + + if (not1 != 0) + SWAP_BBUF_NOT(bbuf1, not1, bbuf2, not2); + + data1 = (OnigCodePoint* )(bbuf1->p); + GET_CODE_POINT(n1, data1); + data1++; + + if (not2 == 0 && not1 == 0) { /* 1 OR 2 */ + r = bbuf_clone(pbuf, bbuf2); + } + else if (not1 == 0) { /* 1 OR (not 2) */ + r = not_code_range_buf(bbuf2, pbuf); + } + if (r != 0) return r; + + for (i = 0; i < n1; i++) { + from = data1[i*2]; + to = data1[i*2+1]; + r = add_code_range_to_buf(pbuf, from, to); + if (r != 0) return r; + } + return 0; +} + +static int +and_code_range1(BBuf** pbuf, OnigCodePoint from1, OnigCodePoint to1, + OnigCodePoint* data, int n) +{ + int i, r; + OnigCodePoint from2, to2; + + for (i = 0; i < n; i++) { + from2 = data[i*2]; + to2 = data[i*2+1]; + if (from2 < from1) { + if (to2 < from1) continue; + else { + from1 = to2 + 1; + } + } + else if (from2 <= to1) { + if (to2 < to1) { + if (from1 <= from2 - 1) { + r = add_code_range_to_buf(pbuf, from1, from2-1); + if (r != 0) return r; + } + from1 = to2 + 1; + } + else { + to1 = from2 - 1; + } + } + else { + from1 = from2; + } + if (from1 > to1) break; + } + if (from1 <= to1) { + r = add_code_range_to_buf(pbuf, from1, to1); + if (r != 0) return r; + } + return 0; +} + +static int +and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf) +{ + int r; + OnigCodePoint i, j, n1, n2, *data1, *data2; + OnigCodePoint from, to, from1, to1, from2, to2; + + *pbuf = (BBuf* )NULL; + if (IS_NULL(bbuf1)) { + if (not1 != 0 && IS_NOT_NULL(bbuf2)) /* not1 != 0 -> not2 == 0 */ + return bbuf_clone(pbuf, bbuf2); + return 0; + } + else if (IS_NULL(bbuf2)) { + if (not2 != 0) + return bbuf_clone(pbuf, bbuf1); + return 0; + } + + if (not1 != 0) + SWAP_BBUF_NOT(bbuf1, not1, bbuf2, 
not2); + + data1 = (OnigCodePoint* )(bbuf1->p); + data2 = (OnigCodePoint* )(bbuf2->p); + GET_CODE_POINT(n1, data1); + GET_CODE_POINT(n2, data2); + data1++; + data2++; + + if (not2 == 0 && not1 == 0) { /* 1 AND 2 */ + for (i = 0; i < n1; i++) { + from1 = data1[i*2]; + to1 = data1[i*2+1]; + for (j = 0; j < n2; j++) { + from2 = data2[j*2]; + to2 = data2[j*2+1]; + if (from2 > to1) break; + if (to2 < from1) continue; + from = MAX(from1, from2); + to = MIN(to1, to2); + r = add_code_range_to_buf(pbuf, from, to); + if (r != 0) return r; + } + } + } + else if (not1 == 0) { /* 1 AND (not 2) */ + for (i = 0; i < n1; i++) { + from1 = data1[i*2]; + to1 = data1[i*2+1]; + r = and_code_range1(pbuf, from1, to1, data2, n2); + if (r != 0) return r; + } + } + + return 0; +} + +static int +and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) +{ + int r, not1, not2; + BBuf *buf1, *buf2, *pbuf; + BitSetRef bsr1, bsr2; + BitSet bs1, bs2; + + not1 = dest->not; + bsr1 = dest->bs; + buf1 = dest->mbuf; + not2 = cc->not; + bsr2 = cc->bs; + buf2 = cc->mbuf; + + if (not1 != 0) { + bitset_invert_to(bsr1, bs1); + bsr1 = bs1; + } + if (not2 != 0) { + bitset_invert_to(bsr2, bs2); + bsr2 = bs2; + } + bitset_and(bsr1, bsr2); + if (bsr1 != dest->bs) { + bitset_copy(dest->bs, bsr1); + bsr1 = dest->bs; + } + if (not1 != 0) { + bitset_invert(dest->bs); + } + + if (! 
ONIGENC_IS_SINGLEBYTE(enc)) { + if (not1 != 0 && not2 != 0) { + r = or_code_range_buf(buf1, 0, buf2, 0, &pbuf); + } + else { + r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf); + if (r == 0 && not1 != 0) { + BBuf *tbuf; + r = not_code_range_buf(pbuf, &tbuf); + if (r != 0) { + bbuf_free(pbuf); + return r; + } + bbuf_free(pbuf); + pbuf = tbuf; + } + } + if (r != 0) return r; + + dest->mbuf = pbuf; + bbuf_free(buf1); + return r; + } + return 0; +} + +static int +or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc) +{ + int r, not1, not2; + BBuf *buf1, *buf2, *pbuf; + BitSetRef bsr1, bsr2; + BitSet bs1, bs2; + + not1 = dest->not; + bsr1 = dest->bs; + buf1 = dest->mbuf; + not2 = cc->not; + bsr2 = cc->bs; + buf2 = cc->mbuf; + + if (not1 != 0) { + bitset_invert_to(bsr1, bs1); + bsr1 = bs1; + } + if (not2 != 0) { + bitset_invert_to(bsr2, bs2); + bsr2 = bs2; + } + bitset_or(bsr1, bsr2); + if (bsr1 != dest->bs) { + bitset_copy(dest->bs, bsr1); + bsr1 = dest->bs; + } + if (not1 != 0) { + bitset_invert(dest->bs); + } + + if (! 
ONIGENC_IS_SINGLEBYTE(enc)) { + if (not1 != 0 && not2 != 0) { + r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf); + } + else { + r = or_code_range_buf(buf1, not1, buf2, not2, &pbuf); + if (r == 0 && not1 != 0) { + BBuf *tbuf; + r = not_code_range_buf(pbuf, &tbuf); + if (r != 0) { + bbuf_free(pbuf); + return r; + } + bbuf_free(pbuf); + pbuf = tbuf; + } + } + if (r != 0) return r; + + dest->mbuf = pbuf; + bbuf_free(buf1); + return r; + } + else + return 0; +} + +static int +conv_backslash_value(int c, ScanEnv* env) +{ + if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_CONTROL_CHARS)) { + switch (c) { + case 'n': return '\n'; + case 't': return '\t'; + case 'r': return '\r'; + case 'f': return '\f'; + case 'a': return '\007'; + case 'b': return '\010'; + case 'e': return '\033'; + case 'v': + if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_V_VTAB)) + return '\v'; + break; + + default: + break; + } + } + return c; +} + +static int +is_invalid_qualifier_target(Node* node) +{ + switch (NTYPE(node)) { + case N_ANCHOR: + return 1; + break; + + case N_EFFECT: + if (NEFFECT(node).type == EFFECT_OPTION) + return is_invalid_qualifier_target(NEFFECT(node).target); + break; + + case N_LIST: /* ex. (?:\G\A)* */ + do { + if (! is_invalid_qualifier_target(NCONS(node).left)) return 0; + } while (IS_NOT_NULL(node = NCONS(node).right)); + return 0; + break; + + case N_ALT: /* ex. 
(?:abc|\A)* */ + do { + if (is_invalid_qualifier_target(NCONS(node).left)) return 1; + } while (IS_NOT_NULL(node = NCONS(node).right)); + break; + + default: + break; + } + return 0; +} + +/* ?:0, *:1, +:2, ??:3, *?:4, +?:5 */ +static int +popular_qualifier_num(QualifierNode* qf) +{ + if (qf->greedy) { + if (qf->lower == 0) { + if (qf->upper == 1) return 0; + else if (IS_REPEAT_INFINITE(qf->upper)) return 1; + } + else if (qf->lower == 1) { + if (IS_REPEAT_INFINITE(qf->upper)) return 2; + } + } + else { + if (qf->lower == 0) { + if (qf->upper == 1) return 3; + else if (IS_REPEAT_INFINITE(qf->upper)) return 4; + } + else if (qf->lower == 1) { + if (IS_REPEAT_INFINITE(qf->upper)) return 5; + } + } + return -1; +} + +extern void +onig_reduce_nested_qualifier(Node* pnode, Node* cnode) +{ +#define NQ_ASIS 0 /* as is */ +#define NQ_DEL 1 /* delete parent */ +#define NQ_A 2 /* to '*' */ +#define NQ_AQ 3 /* to '*?' */ +#define NQ_QQ 4 /* to '??' */ +#define NQ_P_QQ 5 /* to '+)??' */ +#define NQ_PQ_Q 6 /* to '+?)?' */ + + static char reduces[][6] = { + {NQ_DEL, NQ_A, NQ_A, NQ_QQ, NQ_AQ, NQ_ASIS}, /* '?' */ + {NQ_DEL, NQ_DEL, NQ_DEL, NQ_P_QQ, NQ_P_QQ, NQ_DEL}, /* '*' */ + {NQ_A, NQ_A, NQ_DEL, NQ_ASIS, NQ_P_QQ, NQ_DEL}, /* '+' */ + {NQ_DEL, NQ_AQ, NQ_AQ, NQ_DEL, NQ_AQ, NQ_AQ}, /* '??' */ + {NQ_DEL, NQ_DEL, NQ_DEL, NQ_DEL, NQ_DEL, NQ_DEL}, /* '*?' */ + {NQ_ASIS, NQ_PQ_Q, NQ_DEL, NQ_AQ, NQ_AQ, NQ_DEL} /* '+?' 
*/ + }; + + int pnum, cnum; + QualifierNode *p, *c; + + p = &(NQUALIFIER(pnode)); + c = &(NQUALIFIER(cnode)); + pnum = popular_qualifier_num(p); + cnum = popular_qualifier_num(c); + + switch(reduces[cnum][pnum]) { + case NQ_DEL: + *p = *c; + break; + case NQ_A: + p->target = c->target; + p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 1; + break; + case NQ_AQ: + p->target = c->target; + p->lower = 0; p->upper = REPEAT_INFINITE; p->greedy = 0; + break; + case NQ_QQ: + p->target = c->target; + p->lower = 0; p->upper = 1; p->greedy = 0; + break; + case NQ_P_QQ: + p->target = cnode; + p->lower = 0; p->upper = 1; p->greedy = 0; + c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 1; + return ; + break; + case NQ_PQ_Q: + p->target = cnode; + p->lower = 0; p->upper = 1; p->greedy = 1; + c->lower = 1; c->upper = REPEAT_INFINITE; c->greedy = 0; + return ; + break; + case NQ_ASIS: + p->target = cnode; + return ; + break; + } + + c->target = NULL_NODE; + onig_node_free(cnode); +} + + +enum TokenSyms { + TK_EOT = 0, /* end of token */ + TK_BYTE = 1, + TK_RAW_BYTE = 2, + TK_CODE_POINT, + TK_ANYCHAR, + TK_CHAR_TYPE, + TK_BACKREF, + TK_CALL, + TK_ANCHOR, + TK_OP_REPEAT, + TK_INTERVAL, + TK_ANYCHAR_ANYTIME, /* SQL '%' == .* */ + TK_ALT, + TK_SUBEXP_OPEN, + TK_SUBEXP_CLOSE, + TK_CC_OPEN, + TK_QUOTE_OPEN, + TK_CHAR_PROPERTY, /* \p{...}, \P{...} */ + /* in cc */ + TK_CC_CLOSE, + TK_CC_RANGE, + TK_POSIX_BRACKET_OPEN, + TK_CC_AND, /* && */ + TK_CC_CC_OPEN /* [ */ +}; + +typedef struct { + enum TokenSyms type; + int escaped; + int base; /* is number: 8, 16 (used in [....]) */ + UChar* backp; + union { + int c; + OnigCodePoint code; + int anchor; + int subtype; + struct { + int lower; + int upper; + int greedy; + int possessive; + } repeat; + struct { + int num; + int ref1; + int* refs; + int by_name; + } backref; + struct { + UChar* name; + UChar* name_end; + } call; + struct { + int not; + } prop; + } u; +} OnigToken; + + +static int +fetch_range_qualifier(UChar** src, UChar* 
end, OnigToken* tok, ScanEnv* env) +{ + int low, up, syn_allow, non_low = 0; + int c; + UChar* p = *src; + + syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL); + + if (PEND) { + if (syn_allow) + return 1; /* "....{" : OK! */ + else + return ONIGERR_END_PATTERN_AT_LEFT_BRACE; /* "....{" syntax error */ + } + + if (! syn_allow) { + c = PPEEK; + if (c == ')' || c == '(' || c == '|') { + return ONIGERR_END_PATTERN_AT_LEFT_BRACE; + } + } + + low = onig_scan_unsigned_number(&p, end, env->enc); + if (low < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; + if (low > ONIG_MAX_REPEAT_NUM) + return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; + + if (p == *src) { /* can't read low */ + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV)) { + /* allow {,n} as {0,n} */ + low = 0; + non_low = 1; + } + else + goto invalid; + } + + if (PEND) goto invalid; + PFETCH(c); + if (c == ',') { + UChar* prev = p; + up = onig_scan_unsigned_number(&p, end, env->enc); + if (up < 0) return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; + if (up > ONIG_MAX_REPEAT_NUM) + return ONIGERR_TOO_BIG_NUMBER_FOR_REPEAT_RANGE; + + if (p == prev) { + if (non_low != 0) + goto invalid; + up = REPEAT_INFINITE; /* {n,} : {n,infinite} */ + } + } + else { + if (non_low != 0) + goto invalid; + + PUNFETCH; + up = low; /* {n} : exact n times */ + } + + if (PEND) goto invalid; + PFETCH(c); + if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) { + if (c != MC_ESC) goto invalid; + PFETCH(c); + } + if (c != '}') goto invalid; + + if (!IS_REPEAT_INFINITE(up) && low > up) { + return ONIGERR_UPPER_SMALLER_THAN_LOWER_IN_REPEAT_RANGE; + } + + tok->type = TK_INTERVAL; + tok->u.repeat.lower = low; + tok->u.repeat.upper = up; + *src = p; + return 0; + + invalid: + if (syn_allow) + return 1; /* OK */ + else + return ONIGERR_INVALID_REPEAT_RANGE_PATTERN; +} + +/* \M-, \C-, \c, or \... 
*/ +static int +fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env) +{ + int c; + UChar* p = *src; + + if (PEND) return ONIGERR_END_PATTERN_AT_BACKSLASH; + + PFETCH(c); + switch (c) { + case 'M': + if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META)) { + if (PEND) return ONIGERR_END_PATTERN_AT_META; + PFETCH(c); + if (c != '-') return ONIGERR_META_CODE_SYNTAX; + if (PEND) return ONIGERR_END_PATTERN_AT_META; + PFETCH(c); + if (c == MC_ESC) { + c = fetch_escaped_value(&p, end, env); + if (c < 0) return c; + } + c = ((c & 0xff) | 0x80); + } + else + goto backslash; + break; + + case 'C': + if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL)) { + if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL; + PFETCH(c); + if (c != '-') return ONIGERR_CONTROL_CODE_SYNTAX; + goto control; + } + else + goto backslash; + + case 'c': + if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_C_CONTROL)) { + control: + if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL; + PFETCH(c); + if (c == MC_ESC) { + c = fetch_escaped_value(&p, end, env); + if (c < 0) return c; + } + else if (c == '?') + c = 0177; + else + c &= 0x9f; + break; + } + /* fall through */ + + default: + { + backslash: + c = conv_backslash_value(c, env); + } + break; + } + + *src = p; + return c; +} + +static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env); + +#ifdef USE_NAMED_GROUP +/* + def: 0 -> define name (don't allow number name) + 1 -> reference name (allow number name) +*/ +static int +fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) +{ + int r, len, is_num; + int c = 0; + UChar *name_end; + UChar *p = *src; + + name_end = end; + r = 0; + is_num = 0; + if (PEND) { + return ONIGERR_EMPTY_GROUP_NAME; + } + else { + PFETCH(c); + if (c == '>') + return ONIGERR_EMPTY_GROUP_NAME; + + if (ONIGENC_IS_CODE_DIGIT(env->enc, c)) { + if (ref == 1) + is_num = 1; + else { + r = ONIGERR_INVALID_GROUP_NAME; + } + } + len = enc_len(env->enc, c); + while 
(!PEND && len-- > 1) + PFETCH(c); + } + + while (!PEND) { + name_end = p; + PFETCH(c); + if (c == '>' || c == ')') break; + + len = enc_len(env->enc, c); + if (is_num == 1) { + if (! ONIGENC_IS_CODE_DIGIT(env->enc, c)) { + if (!ONIGENC_IS_CODE_ALPHA(env->enc, c) && c != '_') + r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + else + r = ONIGERR_INVALID_GROUP_NAME; + } + } + else { + if (len == 1) { + if (!ONIGENC_IS_CODE_ALPHA(env->enc, c) && + !ONIGENC_IS_CODE_DIGIT(env->enc, c) && + c != '_') { + r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + } + } + } + + while (!PEND && len-- > 1) + PFETCH(c); + } + if (c != '>') { + r = ONIGERR_INVALID_GROUP_NAME; + name_end = end; + } + else { + c = **src; + if (ONIGENC_IS_CODE_UPPER(env->enc, c)) + r = ONIGERR_INVALID_GROUP_NAME; + } + + if (r == 0) { + *rname_end = name_end; + *src = p; + return 0; + } + else { + onig_scan_env_set_error_string(env, r, *src, name_end); + return r; + } +} +#else +static int +fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref) +{ + int r, len; + int c = 0; + UChar *name_end; + UChar *p = *src; + + r = 0; + while (!PEND) { + name_end = p; + PFETCH(c); + if (enc_len(env->enc, c) > 1) + r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + + if (c == '>' || c == ')') break; + if (! 
ONIGENC_IS_CODE_DIGIT(env->enc, c)) + r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME; + } + if (c != '>') { + r = ONIGERR_INVALID_GROUP_NAME; + name_end = end; + } + + if (r == 0) { + *rname_end = name_end; + *src = p; + return 0; + } + else { + err: + onig_scan_env_set_error_string(env, r, *src, name_end); + return r; + } +} +#endif + +static void +CC_ESC_WARN(ScanEnv* env, UChar *c) +{ + if (onig_warn == onig_null_warn) return ; + + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED) && + IS_SYNTAX_BV(env->syntax, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) { + char buf[WARN_BUFSIZE]; + onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, + env->pattern, env->pattern_end, + "character class has '%s' without escape", c); + (*onig_warn)(buf); + } +} + +static void +CCEND_ESC_WARN(ScanEnv* env, UChar* c) +{ + if (onig_warn == onig_null_warn) return ; + + if (IS_SYNTAX_BV((env)->syntax, ONIG_SYN_WARN_CC_OP_NOT_ESCAPED)) { + char buf[WARN_BUFSIZE]; + onig_snprintf_with_pattern(buf, WARN_BUFSIZE, (env)->enc, + (env)->pattern, (env)->pattern_end, + "regular expression has '%s' without escape", c); + (*onig_warn)(buf); + } +} + +static UChar* +find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to, + UChar **next, OnigEncoding enc) +{ + int i; + OnigCodePoint x; + UChar *q; + UChar *p = from; + + while (p < to) { + x = ONIGENC_MBC_TO_CODE(enc, p, to); + q = p + enc_len(enc, *p); + if (x == s[0]) { + for (i = 1; i < n && q < to; i++) { + x = ONIGENC_MBC_TO_CODE(enc, q, to); + if (x != s[i]) break; + q += enc_len(enc, *q); + } + if (i >= n) { + if (IS_NOT_NULL(next)) + *next = q; + return p; + } + } + p = q; + } + return NULL_UCHARP; +} + +static int +str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to, + OnigCodePoint bad, OnigEncoding enc) +{ + int i, in_esc; + OnigCodePoint x; + UChar *q; + UChar *p = from; + + in_esc = 0; + while (p < to) { + if (in_esc) { + in_esc = 0; + p += enc_len(enc, *p); + } + else { + x = ONIGENC_MBC_TO_CODE(enc, 
p, to); + q = p + enc_len(enc, *p); + if (x == s[0]) { + for (i = 1; i < n && q < to; i++) { + x = ONIGENC_MBC_TO_CODE(enc, q, to); + if (x != s[i]) break; + q += enc_len(enc, *q); + } + if (i >= n) return 1; + p += enc_len(enc, *p); + } + else { + x = ONIGENC_MBC_TO_CODE(enc, p, to); + if (x == bad) return 0; + else if (x == MC_ESC) in_esc = 1; + p = q; + } + } + } + return 0; +} + +static int +fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) +{ + int c, num; + OnigSyntaxType* syn = env->syntax; + UChar* prev; + UChar* p = *src; + + if (PEND) { + tok->type = TK_EOT; + return tok->type; + } + + PFETCH(c); + tok->type = TK_BYTE; + tok->base = 0; + tok->u.c = c; + if (c == ']') { + tok->type = TK_CC_CLOSE; + } + else if (c == '-') { + tok->type = TK_CC_RANGE; + } + else if (c == MC_ESC) { + if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC)) + goto end; + + if (PEND) return ONIGERR_END_PATTERN_AT_BACKSLASH; + + PFETCH(c); + tok->escaped = 1; + tok->u.c = c; + switch (c) { + case 'w': + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_WORD; + break; + case 'W': + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_NOT_WORD; + break; + case 'd': + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_DIGIT; + break; + case 'D': + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_NOT_DIGIT; + break; + case 's': + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_WHITE_SPACE; + break; + case 'S': + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_NOT_WHITE_SPACE; + break; + + case 'p': + case 'P': + if (PPEEK == '{' && + IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_CHAR_PROPERTY)) { + PINC; + tok->type = TK_CHAR_PROPERTY; + tok->u.prop.not = (c == 'P' ? 
1 : 0); + } + break; + + case 'x': + if (PEND) break; + + prev = p; + if (PPEEK == '{' && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { + PINC; + num = scan_unsigned_hexadecimal_number(&p, end, 8, env->enc); + if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; + if (!PEND && ONIGENC_IS_CODE_XDIGIT(env->enc, *p) && p - prev >= 9) + return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + + if (p > prev + 1 && !PEND && PPEEK == '}') { + PINC; + tok->type = TK_CODE_POINT; + tok->base = 16; + tok->u.code = (OnigCodePoint )num; + } + else { + /* can't read nothing or invalid format */ + p = prev; + } + } + else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { + num = scan_unsigned_hexadecimal_number(&p, end, 2, env->enc); + if (num < 0) return ONIGERR_TOO_BIG_NUMBER; + if (p == prev) { /* can't read nothing. */ + num = 0; /* but, it's not error */ + } + tok->type = TK_RAW_BYTE; + tok->base = 16; + tok->u.c = num; + } + break; + + case 'u': + if (PEND) break; + + prev = p; + if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { + num = scan_unsigned_hexadecimal_number(&p, end, 4, env->enc); + if (num < 0) return ONIGERR_TOO_BIG_NUMBER; + if (p == prev) { /* can't read nothing. */ + num = 0; /* but, it's not error */ + } + tok->type = TK_RAW_BYTE; + tok->base = 16; + tok->u.c = num; + } + break; + + case '0': + case '1': case '2': case '3': case '4': case '5': case '6': case '7': + if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { + PUNFETCH; + prev = p; + num = scan_unsigned_octal_number(&p, end, 3, env->enc); + if (num < 0) return ONIGERR_TOO_BIG_NUMBER; + if (p == prev) { /* can't read nothing. 
*/ + num = 0; /* but, it's not error */ + } + tok->type = TK_RAW_BYTE; + tok->base = 8; + tok->u.c = num; + } + break; + + default: + PUNFETCH; + num = fetch_escaped_value(&p, end, env); + if (num < 0) return num; + if (tok->u.c != num) { + tok->u.c = num; + tok->type = TK_RAW_BYTE; + } + break; + } + } + else if (c == '[') { + if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && PPEEK == ':') { + OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' }; + tok->backp = p; /* point at '[' is readed */ + PINC; + if (str_exist_check_with_esc(send, 2, p, end, (OnigCodePoint )']', + env->enc)) { + tok->type = TK_POSIX_BRACKET_OPEN; + } + else { + PUNFETCH; + goto cc_in_cc; + } + } + else { + cc_in_cc: + if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP)) { + tok->type = TK_CC_CC_OPEN; + } + else { + CC_ESC_WARN(env, "["); + } + } + } + else if (c == '&') { + if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) && + !PEND && PPEEK == '&') { + PINC; + tok->type = TK_CC_AND; + } + } + + end: + *src = p; + return tok->type; +} + +static int +fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env) +{ + int r, c, num; + OnigSyntaxType* syn = env->syntax; + UChar* prev; + UChar* p = *src; + + start: + if (PEND) { + tok->type = TK_EOT; + return tok->type; + } + + tok->type = TK_BYTE; + tok->base = 0; + PFETCH(c); + if (c == MC_ESC) { + if (PEND) return ONIGERR_END_PATTERN_AT_BACKSLASH; + + PFETCH(c); + tok->u.c = c; + tok->escaped = 1; + switch (c) { + case '*': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_ASTERISK_ZERO_INF)) break; + tok->type = TK_OP_REPEAT; + tok->u.repeat.lower = 0; + tok->u.repeat.upper = REPEAT_INFINITE; + goto greedy_check; + break; + + case '+': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_PLUS_ONE_INF)) break; + tok->type = TK_OP_REPEAT; + tok->u.repeat.lower = 1; + tok->u.repeat.upper = REPEAT_INFINITE; + goto greedy_check; + break; + + case '?': + if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_QMARK_ZERO_ONE)) break; + tok->type = TK_OP_REPEAT; + tok->u.repeat.lower = 0; + tok->u.repeat.upper = 1; + greedy_check: + if (!PEND && PPEEK == '?' && + IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) { + PFETCH(c); + tok->u.repeat.greedy = 0; + tok->u.repeat.possessive = 0; + } + else if (!PEND && PPEEK == '+' && + ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) && + tok->type != TK_INTERVAL) || + (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) && + tok->type == TK_INTERVAL))) { + PFETCH(c); + tok->u.repeat.greedy = 1; + tok->u.repeat.possessive = 1; + } + else { + tok->u.repeat.greedy = 1; + tok->u.repeat.possessive = 0; + } + break; + + case '{': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break; + tok->backp = p; + r = fetch_range_qualifier(&p, end, tok, env); + if (r < 0) return r; /* error */ + if (r > 0) { + /* normal char */ + } + else + goto greedy_check; + break; + + case '|': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_VBAR_ALT)) break; + tok->type = TK_ALT; + break; + + case '(': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break; + tok->type = TK_SUBEXP_OPEN; + break; + + case ')': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LPAREN_SUBEXP)) break; + tok->type = TK_SUBEXP_CLOSE; + break; + + case 'w': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_WORD; + break; + + case 'W': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_NOT_WORD; + break; + + case 'b': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break; + tok->type = TK_ANCHOR; + tok->u.anchor = ANCHOR_WORD_BOUND; + break; + + case 'B': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_B_WORD_BOUND)) break; + tok->type = TK_ANCHOR; + tok->u.anchor = ANCHOR_NOT_WORD_BOUND; + break; + +#ifdef USE_WORD_BEGIN_END + case '<': + if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break; + tok->type = TK_ANCHOR; + tok->u.anchor = ANCHOR_WORD_BEGIN; + break; + + case '>': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END)) break; + tok->type = TK_ANCHOR; + tok->u.anchor = ANCHOR_WORD_END; + break; +#endif + + case 's': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_WHITE_SPACE; + break; + + case 'S': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_NOT_WHITE_SPACE; + break; + + case 'd': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_DIGIT; + break; + + case 'D': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break; + tok->type = TK_CHAR_TYPE; + tok->u.subtype = CTYPE_NOT_DIGIT; + break; + + case 'A': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; + begin_buf: + tok->type = TK_ANCHOR; + tok->u.subtype = ANCHOR_BEGIN_BUF; + break; + + case 'Z': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; + tok->type = TK_ANCHOR; + tok->u.subtype = ANCHOR_SEMI_END_BUF; + break; + + case 'z': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break; + end_buf: + tok->type = TK_ANCHOR; + tok->u.subtype = ANCHOR_END_BUF; + break; + + case 'G': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR)) break; + tok->type = TK_ANCHOR; + tok->u.subtype = ANCHOR_BEGIN_POSITION; + break; + + case '`': + if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break; + goto begin_buf; + break; + + case '\'': + if (! 
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR)) break; + goto end_buf; + break; + + case 'x': + if (PEND) break; + + prev = p; + if (PPEEK == '{' && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) { + PINC; + num = scan_unsigned_hexadecimal_number(&p, end, 8, env->enc); + if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; + if (!PEND && ONIGENC_IS_CODE_XDIGIT(env->enc, *p) && p - prev >= 9) + return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE; + + if (p > prev + 1 && !PEND && PPEEK == '}') { + PINC; + tok->type = TK_CODE_POINT; + tok->u.code = (OnigCodePoint )num; + } + else { + /* can't read nothing or invalid format */ + p = prev; + } + } + else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) { + num = scan_unsigned_hexadecimal_number(&p, end, 2, env->enc); + if (num < 0) return ONIGERR_TOO_BIG_NUMBER; + if (p == prev) { /* can't read nothing. */ + num = 0; /* but, it's not error */ + } + tok->type = TK_RAW_BYTE; + tok->base = 16; + tok->u.c = num; + } + break; + + case 'u': + if (PEND) break; + + prev = p; + if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) { + num = scan_unsigned_hexadecimal_number(&p, end, 4, env->enc); + if (num < 0) return ONIGERR_TOO_BIG_NUMBER; + if (p == prev) { /* can't read nothing. */ + num = 0; /* but, it's not error */ + } + tok->type = TK_RAW_BYTE; + tok->base = 16; + tok->u.c = num; + } + break; + + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + PUNFETCH; + prev = p; + num = onig_scan_unsigned_number(&p, end, env->enc); + if (num < 0) return ONIGERR_TOO_BIG_NUMBER; + if (num > ONIG_MAX_BACKREF_NUM) return ONIGERR_TOO_BIG_BACKREF_NUMBER; + + if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) && + (num <= env->num_mem || num <= 9)) { /* This spec. 
from GNU regex */ + if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { + if (num > env->num_mem || IS_NULL(SCANENV_MEM_NODES(env)[num])) + return ONIGERR_INVALID_BACKREF; + } + + tok->type = TK_BACKREF; + tok->u.backref.num = 1; + tok->u.backref.ref1 = num; + tok->u.backref.by_name = 0; + break; + } + else if (c == '8' || c == '9') { + /* normal char */ + p = prev; PINC; + break; + } + + p = prev; + /* fall through */ + case '0': + if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) { + prev = p; + num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), env->enc); + if (num < 0) return ONIGERR_TOO_BIG_NUMBER; + if (p == prev) { /* can't read nothing. */ + num = 0; /* but, it's not error */ + } + tok->type = TK_RAW_BYTE; + tok->base = 8; + tok->u.c = num; + } + else if (c != '0') { + PINC; + } + break; + +#ifdef USE_NAMED_GROUP + case 'k': + if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_K_NAMED_BACKREF)) { + PFETCH(c); + if (c == '<') { + UChar* name_end; + int* backs; + + prev = p; + r = fetch_name(&p, end, &name_end, env, 1); + if (r < 0) return r; + num = onig_name_to_group_numbers(env->reg, prev, name_end, &backs); + if (num <= 0) { + onig_scan_env_set_error_string(env, + ONIGERR_UNDEFINED_NAME_REFERENCE, prev, name_end); + return ONIGERR_UNDEFINED_NAME_REFERENCE; + } + if (IS_SYNTAX_BV(syn, ONIG_SYN_STRICT_CHECK_BACKREF)) { + int i; + for (i = 0; i < num; i++) { + if (backs[i] > env->num_mem || + IS_NULL(SCANENV_MEM_NODES(env)[backs[i]])) + return ONIGERR_INVALID_BACKREF; + } + } + + tok->type = TK_BACKREF; + tok->u.backref.by_name = 1; + if (num == 1) { + tok->u.backref.num = 1; + tok->u.backref.ref1 = backs[0]; + } + else { + tok->u.backref.num = num; + tok->u.backref.refs = backs; + } + } + else + PUNFETCH; + } + break; +#endif + +#ifdef USE_SUBEXP_CALL + case 'g': + if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_G_SUBEXP_CALL)) { + PFETCH(c); + if (c == '<') { + UChar* name_end; + + prev = p; + r = fetch_name(&p, end, &name_end, env, 1); + if (r < 0) return r; + + 
tok->type = TK_CALL; + tok->u.call.name = prev; + tok->u.call.name_end = name_end; + } + else + PUNFETCH; + } + break; +#endif + + case 'Q': + if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE)) { + tok->type = TK_QUOTE_OPEN; + } + break; + + case 'p': + case 'P': + if (PPEEK == '{' && + IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_CHAR_PROPERTY)) { + PINC; + tok->type = TK_CHAR_PROPERTY; + tok->u.prop.not = (c == 'P' ? 1 : 0); + } + break; + + default: + PUNFETCH; + num = fetch_escaped_value(&p, end, env); + if (num < 0) return num; + /* set_raw: */ + if (tok->u.c != num) { + tok->type = TK_RAW_BYTE; + tok->u.c = num; + } + break; + } + } + else { + tok->u.c = c; + tok->escaped = 0; + +#ifdef USE_VARIABLE_META_CHARS + if ((c != ONIG_INEFFECTIVE_META_CHAR) && + IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) { + if (c == MC_ANYCHAR) + goto any_char; + else if (c == MC_ANYTIME) + goto anytime; + else if (c == MC_ZERO_OR_ONE_TIME) + goto zero_or_one_time; + else if (c == MC_ONE_OR_MORE_TIME) + goto one_or_more_time; + else if (c == MC_ANYCHAR_ANYTIME) { + tok->type = TK_ANYCHAR_ANYTIME; + goto out; + } + } +#endif + + switch (c) { + case '.': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_DOT_ANYCHAR)) break; + any_char: + tok->type = TK_ANYCHAR; + break; + + case '*': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ASTERISK_ZERO_INF)) break; + anytime: + tok->type = TK_OP_REPEAT; + tok->u.repeat.lower = 0; + tok->u.repeat.upper = REPEAT_INFINITE; + goto greedy_check; + break; + + case '+': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_PLUS_ONE_INF)) break; + one_or_more_time: + tok->type = TK_OP_REPEAT; + tok->u.repeat.lower = 1; + tok->u.repeat.upper = REPEAT_INFINITE; + goto greedy_check; + break; + + case '?': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_ZERO_ONE)) break; + zero_or_one_time: + tok->type = TK_OP_REPEAT; + tok->u.repeat.lower = 0; + tok->u.repeat.upper = 1; + goto greedy_check; + break; + + case '{': + if (! 
IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break; + tok->backp = p; + r = fetch_range_qualifier(&p, end, tok, env); + if (r < 0) return r; /* error */ + if (r > 0) { + /* normal char */ + } + else + goto greedy_check; + break; + + case '|': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_VBAR_ALT)) break; + tok->type = TK_ALT; + break; + + case '(': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; + tok->type = TK_SUBEXP_OPEN; + break; + + case ')': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LPAREN_SUBEXP)) break; + tok->type = TK_SUBEXP_CLOSE; + break; + + case '^': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break; + tok->type = TK_ANCHOR; + tok->u.subtype = (IS_SINGLELINE(env->option) + ? ANCHOR_BEGIN_BUF : ANCHOR_BEGIN_LINE); + break; + + case '$': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_LINE_ANCHOR)) break; + tok->type = TK_ANCHOR; + tok->u.subtype = (IS_SINGLELINE(env->option) + ? ANCHOR_END_BUF : ANCHOR_END_LINE); + break; + + case '[': + if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACKET_CC)) break; + tok->type = TK_CC_OPEN; + break; + + case ']': + if (*src > env->pattern) /* /].../ is allowed. 
*/ + CCEND_ESC_WARN(env, "]"); + break; + + case '#': + if (IS_EXTEND(env->option)) { + while (!PEND) { + PFETCH(c); + if (ONIG_IS_NEWLINE(c)) + break; + } + goto start; + break; + } + break; + + case ' ': case '\t': case '\n': case '\r': case '\f': + if (IS_EXTEND(env->option)) + goto start; + break; + + default: + break; + } + } + + out: + *src = p; + return tok->type; +} + +static int +add_ctype_to_cc_by_list(CClassNode* cc, int ctype, int not, + OnigEncoding enc) +{ + int i, r, nsb, nmb; + OnigCodePointRange *sbr, *mbr; + OnigCodePoint j; + + r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &nsb, &nmb, &sbr, &mbr); + if (r != 0) return r; + + if (not == 0) { + for (i = 0; i < nsb; i++) { + for (j = sbr[i].from; j <= sbr[i].to; j++) { + BITSET_SET_BIT(cc->bs, j); + } + } + for (i = 0; i < nmb; i++) { + r = add_code_range_to_buf(&(cc->mbuf), mbr[i].from, mbr[i].to); + if (r != 0) return r; + } + } + else { + OnigCodePoint prev = 0; + for (i = 0; i < nsb; i++) { + for (j = prev; j < sbr[i].from; j++) { + BITSET_SET_BIT(cc->bs, j); + } + prev = sbr[i].to + 1; + } + if (prev < 0x7f) { + for (j = prev; j < 0x7f; j++) { + BITSET_SET_BIT(cc->bs, j); + } + } + + prev = 0x80; + for (i = 0; i < nmb; i++) { + if (prev < mbr[i].from) { + r = add_code_range_to_buf(&(cc->mbuf), prev, mbr[i].from - 1); + if (r != 0) return r; + } + prev = mbr[i].to + 1; + } + if (prev < 0x7fffffff) { + r = add_code_range_to_buf(&(cc->mbuf), prev, 0x7fffffff); + if (r != 0) return r; + } + } + + return r; +} + +static int +add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env) +{ + int c, r; + OnigEncoding enc = env->enc; + + if (ONIGENC_CTYPE_SUPPORT_LEVEL(enc) != ONIGENC_CTYPE_SUPPORT_LEVEL_SB) { + r = add_ctype_to_cc_by_list(cc, ctype, not, env->enc); + return r; + } + + r = 0; + switch (ctype) { + case ONIGENC_CTYPE_ALPHA: + case ONIGENC_CTYPE_BLANK: + case ONIGENC_CTYPE_CNTRL: + case ONIGENC_CTYPE_DIGIT: + case ONIGENC_CTYPE_LOWER: + case ONIGENC_CTYPE_PUNCT: + case 
ONIGENC_CTYPE_SPACE: + case ONIGENC_CTYPE_UPPER: + case ONIGENC_CTYPE_XDIGIT: + case ONIGENC_CTYPE_ASCII: + case ONIGENC_CTYPE_ALNUM: + if (not != 0) { + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) + BITSET_SET_BIT(cc->bs, c); + } + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); + } + else { + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) + BITSET_SET_BIT(cc->bs, c); + } + } + break; + + case ONIGENC_CTYPE_GRAPH: + case ONIGENC_CTYPE_PRINT: + if (not != 0) { + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (! ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) + BITSET_SET_BIT(cc->bs, c); + } + } + else { + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (ONIGENC_IS_CODE_CTYPE(enc, (OnigCodePoint )c, ctype)) + BITSET_SET_BIT(cc->bs, c); + } + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); + } + break; + + case ONIGENC_CTYPE_WORD: + if (not == 0) { + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (ONIGENC_IS_CODE_SB_WORD(enc, c)) BITSET_SET_BIT(cc->bs, c); + } + ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf); + } + else { + for (c = 0; c < SINGLE_BYTE_SIZE; c++) { + if (! ONIGENC_IS_CODE_SB_WORD(enc, c) && ! 
ONIGENC_IS_MBC_HEAD(enc, c)) + BITSET_SET_BIT(cc->bs, c); + } + } + break; + + default: + return ONIGERR_PARSER_BUG; + break; + } + + return r; +} + +static int +parse_ctype_to_enc_ctype(int pctype, int* not) +{ + int ctype; + + switch (pctype) { + case CTYPE_WORD: + ctype = ONIGENC_CTYPE_WORD; + *not = 0; + break; + case CTYPE_NOT_WORD: + ctype = ONIGENC_CTYPE_WORD; + *not = 1; + break; + case CTYPE_WHITE_SPACE: + ctype = ONIGENC_CTYPE_SPACE; + *not = 0; + break; + case CTYPE_NOT_WHITE_SPACE: + ctype = ONIGENC_CTYPE_SPACE; + *not = 1; + break; + case CTYPE_DIGIT: + ctype = ONIGENC_CTYPE_DIGIT; + *not = 0; + break; + case CTYPE_NOT_DIGIT: + ctype = ONIGENC_CTYPE_DIGIT; + *not = 1; + break; + default: + return ONIGERR_PARSER_BUG; + break; + } + return ctype; +} + +typedef struct { + UChar *name; + int ctype; + short int len; +} PosixBracketEntryType; + +static int +parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env) +{ +#define POSIX_BRACKET_CHECK_LIMIT_LENGTH 20 +#define POSIX_BRACKET_NAME_MAX_LEN 6 + + static PosixBracketEntryType PBS[] = { + { "alnum", ONIGENC_CTYPE_ALNUM, 5 }, + { "alpha", ONIGENC_CTYPE_ALPHA, 5 }, + { "blank", ONIGENC_CTYPE_BLANK, 5 }, + { "cntrl", ONIGENC_CTYPE_CNTRL, 5 }, + { "digit", ONIGENC_CTYPE_DIGIT, 5 }, + { "graph", ONIGENC_CTYPE_GRAPH, 5 }, + { "lower", ONIGENC_CTYPE_LOWER, 5 }, + { "print", ONIGENC_CTYPE_PRINT, 5 }, + { "punct", ONIGENC_CTYPE_PUNCT, 5 }, + { "space", ONIGENC_CTYPE_SPACE, 5 }, + { "upper", ONIGENC_CTYPE_UPPER, 5 }, + { "xdigit", ONIGENC_CTYPE_XDIGIT, 6 }, + { "ascii", ONIGENC_CTYPE_ASCII, 5 }, /* I don't know origin. Perl? 
*/ + { (UChar* )NULL, -1, 0 } + }; + + PosixBracketEntryType *pb; + int not, i, c, r; + UChar *p = *src; + + if (PPEEK == '^') { + PINC; + not = 1; + } + else + not = 0; + + if (end - p < POSIX_BRACKET_NAME_MAX_LEN + 1) + goto not_posix_bracket; + + for (pb = PBS; IS_NOT_NULL(pb->name); pb++) { + if (onig_strncmp(p, pb->name, pb->len) == 0) { + p += pb->len; + if (end - p < 2 || *p != ':' || *(p+1) != ']') + return ONIGERR_INVALID_POSIX_BRACKET_TYPE; + + r = add_ctype_to_cc(cc, pb->ctype, not, env); + if (r != 0) return r; + + PINC; PINC; + *src = p; + return 0; + } + } + + not_posix_bracket: + c = 0; + i = 0; + while (!PEND && ((c = PPEEK) != ':') && c != ']') { + PINC; + if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break; + } + if (c == ':' && !PEND) { + PINC; + if (!PEND) { + PFETCH(c); + if (c == ']') + return ONIGERR_INVALID_POSIX_BRACKET_TYPE; + } + } + + return 1; /* 1: is not POSIX bracket, but no error. */ +} + +static int +property_name_to_ctype(UChar* p, UChar* end) +{ + static PosixBracketEntryType PBS[] = { + { "Alnum", ONIGENC_CTYPE_ALNUM, 5 }, + { "Alpha", ONIGENC_CTYPE_ALPHA, 5 }, + { "Blank", ONIGENC_CTYPE_BLANK, 5 }, + { "Cntrl", ONIGENC_CTYPE_CNTRL, 5 }, + { "Digit", ONIGENC_CTYPE_DIGIT, 5 }, + { "Graph", ONIGENC_CTYPE_GRAPH, 5 }, + { "Lower", ONIGENC_CTYPE_LOWER, 5 }, + { "Print", ONIGENC_CTYPE_PRINT, 5 }, + { "Punct", ONIGENC_CTYPE_PUNCT, 5 }, + { "Space", ONIGENC_CTYPE_SPACE, 5 }, + { "Upper", ONIGENC_CTYPE_UPPER, 5 }, + { "XDigit", ONIGENC_CTYPE_XDIGIT, 6 }, + { "ASCII", ONIGENC_CTYPE_ASCII, 5 }, + { (UChar* )NULL, -1, 0 } + }; + + PosixBracketEntryType *pb; + int len; + + len = end - p; + for (pb = PBS; IS_NOT_NULL(pb->name); pb++) { + if (len == pb->len && onig_strncmp(p, pb->name, pb->len) == 0) + return pb->ctype; + } + + return ONIGERR_INVALID_CHAR_PROPERTY_NAME; +} + +static int +fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env) +{ + int ctype; + UChar *prev, *p = *src; + int c = 0; + + while (!PEND) { + prev = p; + 
PFETCH(c); + if (c == '}') { + ctype = property_name_to_ctype(*src, prev); + if (ctype < 0) return ctype; + + *src = p; + return ctype; + } + else if (c == '(' || c == ')' || c == '{' || c == '|') + break; + } + + return ONIGERR_INVALID_CHAR_PROPERTY_NAME; +} + +static int +parse_char_property(Node** np, OnigToken* tok, UChar** src, UChar* end, + ScanEnv* env) +{ + int r, ctype; + CClassNode* cc; + + ctype = fetch_char_property_to_ctype(src, end, env); + if (ctype < 0) return ctype; + + *np = node_new_cclass(); + CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + cc = &(NCCLASS(*np)); + r = add_ctype_to_cc(cc, ctype, 0, env); + if (r != 0) return r; + if (tok->u.prop.not != 0) CCLASS_SET_NOT(cc); + + return 0; +} + + +enum CCSTATE { + CCS_VALUE, + CCS_RANGE, + CCS_COMPLETE, + CCS_START +}; + +enum CCVALTYPE { + CCV_SB, + CCV_CODE_POINT, + CCV_CLASS +}; + +static int +next_state_class(CClassNode* cc, OnigCodePoint* vs, enum CCVALTYPE* type, + enum CCSTATE* state, ScanEnv* env) +{ + int r; + + if (*state == CCS_RANGE) + return ONIGERR_CHAR_CLASS_VALUE_AT_END_OF_RANGE; + + if (*state == CCS_VALUE && *type != CCV_CLASS) { + if (*type == CCV_SB) + BITSET_SET_BIT(cc->bs, (int )(*vs)); + else if (*type == CCV_CODE_POINT) { + r = add_code_range(&(cc->mbuf), env, *vs, *vs); + if (r < 0) return r; + } + } + + *state = CCS_VALUE; + *type = CCV_CLASS; + return 0; +} + +static int +next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v, + int* vs_israw, int v_israw, + enum CCVALTYPE intype, enum CCVALTYPE* type, + enum CCSTATE* state, ScanEnv* env) +{ + int r; + + switch (*state) { + case CCS_VALUE: + if (*type == CCV_SB) + BITSET_SET_BIT(cc->bs, (int )(*vs)); + else if (*type == CCV_CODE_POINT) { + r = add_code_range(&(cc->mbuf), env, *vs, *vs); + if (r < 0) return r; + } + break; + + case CCS_RANGE: + if (intype == *type) { + if (intype == CCV_SB) { + if (*vs > v) { + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC)) + goto ccs_range_end; + else + return 
ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS; + } + bitset_set_range(cc->bs, (int )*vs, (int )v); + } + else { + r = add_code_range(&(cc->mbuf), env, *vs, v); + if (r < 0) return r; + } + } + else { + if (intype == CCV_CODE_POINT && *type == CCV_SB && + ONIGENC_IS_CONTINUOUS_SB_MB(env->enc)) { + bitset_set_range(cc->bs, (int )*vs, 0x7f); + r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )0x80, v); + if (r < 0) return r; + } + else + return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE; + } + ccs_range_end: + *state = CCS_COMPLETE; + break; + + case CCS_COMPLETE: + case CCS_START: + *state = CCS_VALUE; + break; + + default: + break; + } + + *vs_israw = v_israw; + *vs = v; + *type = intype; + return 0; +} + +static int +char_exist_check(UChar c, UChar* from, UChar* to, int ignore_escaped, + OnigEncoding enc) +{ + int in_esc; + UChar* p = from; + + in_esc = 0; + while (p < to) { + if (ignore_escaped && in_esc) { + in_esc = 0; + } + else { + if (*p == c) return 1; + if (*p == MC_ESC) in_esc = 1; + } + p += enc_len(enc, *p); + } + return 0; +} + +static int +parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end, + ScanEnv* env) +{ + int r, neg, len, fetched, and_start; + OnigCodePoint v, vs; + UChar *p; + Node* node; + CClassNode *cc, *prev_cc; + CClassNode work_cc; + + enum CCSTATE state; + enum CCVALTYPE val_type, in_type; + int val_israw, in_israw; + + prev_cc = (CClassNode* )NULL; + *np = NULL_NODE; + r = fetch_token_in_cc(tok, src, end, env); + if (r == TK_BYTE && tok->u.c == '^' && tok->escaped == 0) { + neg = 1; + r = fetch_token_in_cc(tok, src, end, env); + } + else { + neg = 0; + } + + if (r < 0) return r; + if (r == TK_CC_CLOSE) { + if (! char_exist_check(']', *src, env->pattern_end, 1, env->enc)) + return ONIGERR_EMPTY_CHAR_CLASS; + + CC_ESC_WARN(env, "]"); + r = tok->type = TK_BYTE; /* allow []...] 
*/ + } + + *np = node = node_new_cclass(); + CHECK_NULL_RETURN_VAL(node, ONIGERR_MEMORY); + cc = &(NCCLASS(node)); + + and_start = 0; + state = CCS_START; + p = *src; + while (r != TK_CC_CLOSE) { + fetched = 0; + switch (r) { + case TK_BYTE: + len = enc_len(env->enc, tok->u.c); + if (len > 1) { + PUNFETCH; + v = ONIGENC_MBC_TO_CODE(env->enc, p, end); + p += len; + in_type = CCV_CODE_POINT; + } + else { + sb_char: + v = (OnigCodePoint )tok->u.c; + in_type = CCV_SB; + } + in_israw = 0; + goto val_entry2; + break; + + case TK_RAW_BYTE: + len = enc_len(env->enc, tok->u.c); + if (len > 1 && tok->base != 0) { /* tok->base != 0 : octal or hexadec. */ + UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; + UChar* bufp = buf; + UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN; + int i, base = tok->base; + + if (len > ONIGENC_CODE_TO_MBC_MAXLEN) { + bufp = (UChar* )xmalloc(len); + if (IS_NULL(bufp)) { + r = ONIGERR_MEMORY; + goto err; + } + bufe = bufp + len; + } + bufp[0] = tok->u.c; + for (i = 1; i < len; i++) { + r = fetch_token_in_cc(tok, &p, end, env); + if (r < 0) goto raw_byte_err; + if (r != TK_RAW_BYTE || tok->base != base) break; + bufp[i] = tok->u.c; + } + if (i < len) { + r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING; + raw_byte_err: + if (bufp != buf) xfree(bufp); + goto err; + } + v = ONIGENC_MBC_TO_CODE(env->enc, bufp, bufe); + if (bufp != buf) xfree(bufp); + in_type = CCV_CODE_POINT; + } + else { + v = (OnigCodePoint )tok->u.c; + in_type = CCV_SB; + } + in_israw = 1; + goto val_entry2; + break; + + case TK_CODE_POINT: + v = tok->u.code; + in_israw = 1; + val_entry: + len = ONIGENC_CODE_TO_MBCLEN(env->enc, v); + if (len < 0) { + r = len; + goto err; + } + in_type = (len == 1 ? 
CCV_SB : CCV_CODE_POINT); + val_entry2: + r = next_state_val(cc, &vs, v, &val_israw, in_israw, in_type, &val_type, + &state, env); + if (r != 0) goto err; + break; + + case TK_POSIX_BRACKET_OPEN: + r = parse_posix_bracket(cc, &p, end, env); + if (r < 0) goto err; + if (r == 1) { /* is not POSIX bracket */ + CC_ESC_WARN(env, "["); + p = tok->backp; + v = (OnigCodePoint )tok->u.c; + in_israw = 0; + goto val_entry; + } + goto next_class; + break; + + case TK_CHAR_TYPE: + { + int ctype, not; + ctype = parse_ctype_to_enc_ctype(tok->u.subtype, ¬); + r = add_ctype_to_cc(cc, ctype, not, env); + if (r != 0) return r; + } + + next_class: + r = next_state_class(cc, &vs, &val_type, &state, env); + if (r != 0) goto err; + break; + + case TK_CHAR_PROPERTY: + { + int ctype; + + ctype = fetch_char_property_to_ctype(&p, end, env); + if (ctype < 0) return ctype; + r = add_ctype_to_cc(cc, ctype, tok->u.prop.not, env); + if (r != 0) return r; + goto next_class; + } + break; + + case TK_CC_RANGE: + if (state == CCS_VALUE) { + r = fetch_token_in_cc(tok, &p, end, env); + if (r < 0) goto err; + fetched = 1; + if (r == TK_CC_CLOSE) { /* allow [x-] */ + range_end_val: + v = (OnigCodePoint )'-'; + in_israw = 0; + goto val_entry; + } + else if (r == TK_CC_AND) { + CC_ESC_WARN(env, "-"); + goto range_end_val; + } + state = CCS_RANGE; + } + else if (state == CCS_START) { + /* [-xa] is allowed */ + v = (OnigCodePoint )tok->u.c; + in_israw = 0; + + r = fetch_token_in_cc(tok, &p, end, env); + if (r < 0) goto err; + fetched = 1; + /* [--x] or [a&&-x] is warned. 
*/ + if (r == TK_CC_RANGE || and_start != 0) + CC_ESC_WARN(env, "-"); + + goto val_entry; + } + else if (state == CCS_RANGE) { + CC_ESC_WARN(env, "-"); + goto sb_char; /* [!--x] is allowed */ + } + else { /* CCS_COMPLETE */ + r = fetch_token_in_cc(tok, &p, end, env); + if (r < 0) goto err; + fetched = 1; + if (r == TK_CC_CLOSE) goto range_end_val; /* allow [a-b-] */ + else if (r == TK_CC_AND) { + CC_ESC_WARN(env, "-"); + goto range_end_val; + } + + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC)) { + CC_ESC_WARN(env, "-"); + goto sb_char; /* [0-9-a] is allowed as [0-9\-a] */ + } + r = ONIGERR_UNMATCHED_RANGE_SPECIFIER_IN_CHAR_CLASS; + goto err; + } + break; + + case TK_CC_CC_OPEN: /* [ */ + { + Node *anode; + CClassNode* acc; + + r = parse_char_class(&anode, tok, &p, end, env); + if (r != 0) goto cc_open_err; + acc = &(NCCLASS(anode)); + r = or_cclass(cc, acc, env->enc); + + onig_node_free(anode); + cc_open_err: + if (r != 0) goto err; + } + break; + + case TK_CC_AND: /* && */ + { + if (state == CCS_VALUE) { + r = next_state_val(cc, &vs, 0, &val_israw, 0, CCV_SB, + &val_type, &state, env); + if (r != 0) goto err; + } + /* initialize local variables */ + and_start = 1; + state = CCS_START; + + if (IS_NOT_NULL(prev_cc)) { + r = and_cclass(prev_cc, cc, env->enc); + if (r != 0) goto err; + bbuf_free(cc->mbuf); + } + else { + prev_cc = cc; + cc = &work_cc; + } + initialize_cclass(cc); + } + break; + + case TK_EOT: + r = ONIGERR_PREMATURE_END_OF_CHAR_CLASS; + goto err; + break; + default: + r = ONIGERR_PARSER_BUG; + goto err; + break; + } + + if (fetched) + r = tok->type; + else { + r = fetch_token_in_cc(tok, &p, end, env); + if (r < 0) goto err; + } + } + + if (state == CCS_VALUE) { + r = next_state_val(cc, &vs, 0, &val_israw, 0, CCV_SB, + &val_type, &state, env); + if (r != 0) goto err; + } + + if (IS_NOT_NULL(prev_cc)) { + r = and_cclass(prev_cc, cc, env->enc); + if (r != 0) goto err; + bbuf_free(cc->mbuf); + cc = prev_cc; + } + + cc->not = neg; + 
if (cc->not != 0 && + IS_SYNTAX_BV(env->syntax, ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC)) { + int is_empty; + + is_empty = (IS_NULL(cc->mbuf) ? 1 : 0); + if (is_empty != 0) + BITSET_IS_EMPTY(cc->bs, is_empty); + if (is_empty == 0) + BITSET_SET_BIT(cc->bs, ONIG_NEWLINE); + } + *src = p; + return 0; + + err: + if (cc != &(NCCLASS(*np))) + bbuf_free(cc->mbuf); + onig_node_free(*np); + return r; +} + +static int parse_subexp(Node** top, OnigToken* tok, int term, + UChar** src, UChar* end, ScanEnv* env); + +static int +parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end, + ScanEnv* env) +{ + Node *target; + OnigOptionType option; + int r, c, num; + int list_capture; + UChar* p = *src; + + *np = NULL; + if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; + + option = env->option; + if (PPEEK == '?' && + IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) { + PINC; + if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; + + PFETCH(c); + switch (c) { + case '#': /* (?#...) comment */ + while (1) { + if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; + PFETCH(c); + if (c == ')') break; + } + *src = p; + return 3; /* 3: comment */ + break; + + case ':': /* (?:...) grouping only */ + group: + r = fetch_token(tok, &p, end, env); + if (r < 0) return r; + r = parse_subexp(np, tok, term, &p, end, env); + if (r < 0) return r; + *src = p; + return 1; /* group */ + break; + + case '=': + *np = onig_node_new_anchor(ANCHOR_PREC_READ); + break; + case '!': /* preceding read */ + *np = onig_node_new_anchor(ANCHOR_PREC_READ_NOT); + break; + case '>': /* (?>...) 
stop backtrack */ + *np = node_new_effect(EFFECT_STOP_BACKTRACK); + break; + + case '<': /* look behind (?<=...), (?syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { + UChar *name; + UChar *name_end; + + PUNFETCH; + list_capture = 0; + + named_group: + name = p; + r = fetch_name(&p, end, &name_end, env, 0); + if (r < 0) return r; + + num = scan_env_add_mem_entry(env); + if (num < 0) return num; + if (list_capture != 0 && num >= BIT_STATUS_BITS_NUM) + return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; + + r = name_add(env->reg, name, name_end, num, env); + if (r != 0) return r; + *np = node_new_effect_memory(env->option, 1); + CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + NEFFECT(*np).regnum = num; + if (list_capture != 0) + BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num); + env->num_named++; + } +#endif + else + return ONIGERR_UNDEFINED_GROUP_OPTION; + break; + + case '@': + if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY)) { +#ifdef USE_NAMED_GROUP + if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP)) { + PFETCH(c); + if (c == '<') { + list_capture = 1; + goto named_group; /* (?@...) 
*/ + } + PUNFETCH; + } +#endif + *np = node_new_effect_memory(env->option, 0); + CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + num = scan_env_add_mem_entry(env); + if (num < 0) { + onig_node_free(*np); + return num; + } + else if (num >= BIT_STATUS_BITS_NUM) { + onig_node_free(*np); + return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY; + } + NEFFECT(*np).regnum = num; + BIT_STATUS_ON_AT_SIMPLE(env->capture_history, num); + } + else { + return ONIGERR_UNDEFINED_GROUP_OPTION; + } + break; + +#ifdef USE_POSIXLINE_OPTION + case 'p': +#endif + case '-': case 'i': case 'm': case 's': case 'x': + { + int neg = 0; + + while (1) { + switch (c) { + case ':': + case ')': + break; + + case '-': neg = 1; break; + case 'x': ONOFF(option, ONIG_OPTION_EXTEND, neg); break; + case 'i': ONOFF(option, ONIG_OPTION_IGNORECASE, neg); break; + case 's': + if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { + ONOFF(option, ONIG_OPTION_MULTILINE, neg); + } + else + return ONIGERR_UNDEFINED_GROUP_OPTION; + break; + + case 'm': + if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_PERL)) { + ONOFF(option, ONIG_OPTION_SINGLELINE, (neg == 0 ? 
1 : 0)); + } + else if (IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_OPTION_RUBY)) { + ONOFF(option, ONIG_OPTION_MULTILINE, neg); + } + else + return ONIGERR_UNDEFINED_GROUP_OPTION; + break; +#ifdef USE_POSIXLINE_OPTION + case 'p': + ONOFF(option, ONIG_OPTION_MULTILINE|ONIG_OPTION_SINGLELINE, neg); + break; +#endif + default: + return ONIGERR_UNDEFINED_GROUP_OPTION; + } + + if (c == ')') { + *np = node_new_option(option); + CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + *src = p; + return 2; /* option only */ + } + else if (c == ':') { + OnigOptionType prev = env->option; + + env->option = option; + r = fetch_token(tok, &p, end, env); + if (r < 0) return r; + r = parse_subexp(&target, tok, term, &p, end, env); + env->option = prev; + if (r < 0) return r; + *np = node_new_option(option); + CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + NEFFECT(*np).target = target; + *src = p; + return 0; + } + + if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; + PFETCH(c); + } + } + break; + + default: + return ONIGERR_UNDEFINED_GROUP_OPTION; + } + } + else { +#ifdef USE_NAMED_GROUP + if (ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_DONT_CAPTURE_GROUP)) + goto group; +#endif + *np = node_new_effect_memory(env->option, 0); + CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + num = scan_env_add_mem_entry(env); + if (num < 0) return num; + NEFFECT(*np).regnum = num; + } + + CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + r = fetch_token(tok, &p, end, env); + if (r < 0) return r; + r = parse_subexp(&target, tok, term, &p, end, env); + if (r < 0) return r; + + if (NTYPE(*np) == N_ANCHOR) + NANCHOR(*np).target = target; + else { + NEFFECT(*np).target = target; + if (NEFFECT(*np).type == EFFECT_MEMORY) { + /* Don't move this to previous of parse_subexp() */ + r = scan_env_set_mem_node(env, NEFFECT(*np).regnum, *np); + if (r != 0) return r; + } + } + + *src = p; + return 0; +} + +static int +set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) +{ + QualifierNode* qn; + + qn = 
&(NQUALIFIER(qnode)); + if (qn->lower == 1 && qn->upper == 1) { + return 1; + } + + switch (NTYPE(target)) { + case N_STRING: + if (! group) { + StrNode* sn = &(NSTRING(target)); + if (str_node_can_be_split(sn, env->enc)) { + Node* n = str_node_split_last_char(sn, env->enc); + if (IS_NOT_NULL(n)) { + qn->target = n; + return 2; + } + } + } + break; + + case N_QUALIFIER: + { /* check redundant double repeat. */ + /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */ + QualifierNode* qnt = &(NQUALIFIER(target)); + +#ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR + if (qn->by_number == 0 && qnt->by_number == 0 && + IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) { + if (IS_REPEAT_INFINITE(qn->upper)) { + if (qn->lower == 0) { /* '*' */ + redundant: + { + char buf[WARN_BUFSIZE]; + if (onig_verb_warn != onig_null_warn) { + onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc, + env->pattern, env->pattern_end, + "redundant nested repeat operator"); + (*onig_verb_warn)(buf); + } + goto warn_exit; + } + } + else if (qn->lower == 1) { /* '+' */ + /* (?:a?)+? only allowed. */ + if (qn->greedy || !(qnt->upper == 1 && qnt->greedy)) + goto redundant; + } + } + else if (qn->upper == 1 && qn->lower == 0) { + if (qn->greedy) { /* '?' */ + if (!(qnt->lower == 1 && qnt->greedy == 0)) /* not '+?' */ + goto redundant; + } + else { /* '??' */ + /* '(?:a+)?? only allowd. (?:a*)?? can be replaced to (?:a+)?? 
*/ + if (!(qnt->greedy && qnt->lower == 1 && + IS_REPEAT_INFINITE(qnt->upper))) + goto redundant; + } + } + } + + warn_exit: +#endif + if (popular_qualifier_num(qnt) >= 0 && popular_qualifier_num(qn) >= 0) { + onig_reduce_nested_qualifier(qnode, target); + goto q_exit; + } + } + break; + + default: + break; + } + + qn->target = target; + q_exit: + return 0; +} + +#ifdef USE_FOLD_MATCH +static int +make_alt_node_from_fold_info(OnigEncFoldMatchInfo* info, Node** node) +{ + int i; + UChar *s, *end; + Node *root, **ptail, *snode; + + ptail = &root; + for (i = 0; i < info->target_num; i++) { + s = info->target_str[i]; + end = s + info->target_byte_len[i]; + /* ex. + U+00DF match "ss" and "SS, but not match "Ss". + So, string nodes must be raw. + */ + snode = node_new_str_raw(s, end); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + + *ptail = node_new_alt(snode, NULL_NODE); + CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); + ptail = &(NCONS(*ptail).right); + } + *ptail = NULL_NODE; + *node = root; + return 0; +} + +static int +make_fold_alt_node_from_cc(OnigEncoding enc, CClassNode* cc, Node** root) +{ + int i, j, flen, len, ncode, n; + UChar *s, *end, buf[ONIGENC_CODE_TO_MBC_MAXLEN]; + OnigCodePoint* codes; + Node **ptail, *snode; + OnigEncFoldMatchInfo* info; + + *root = NULL_NODE; + ptail = root; + + ncode = ONIGENC_GET_ALL_FOLD_MATCH_CODE(enc, &codes); + n = 0; + for (i = 0; i < ncode; i++) { + if (onig_is_code_in_cc(enc, codes[i], cc)) { + len = ONIGENC_CODE_TO_MBC(enc, codes[i], buf); + flen = ONIGENC_GET_FOLD_MATCH_INFO(enc, buf, buf + len, &info); + if (flen > 0) { /* fold */ + for (j = 0; j < info->target_num; j++) { + s = info->target_str[j]; + end = s + info->target_byte_len[j]; + if (onig_strncmp(s, buf, enc_len(enc, *s)) == 0) + continue; /* ignore single char. 
*/ + + snode = node_new_str_raw(s, end); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + + *ptail = node_new_alt(snode, NULL_NODE); + CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); + ptail = &(NCONS(*ptail).right); + n++; + } + } + } + } + + return n; +} +#endif + +static int +parse_exp(Node** np, OnigToken* tok, int term, + UChar** src, UChar* end, ScanEnv* env) +{ + int r, len, group = 0; + Node* qn; + Node** targetp; + + start: + *np = NULL; + if (tok->type == term) + goto end_of_token; + + switch (tok->type) { + case TK_ALT: + case TK_EOT: + end_of_token: + *np = node_new_empty(); + return tok->type; + break; + + case TK_SUBEXP_OPEN: + r = parse_effect(np, tok, TK_SUBEXP_CLOSE, src, end, env); + if (r < 0) return r; + if (r == 1) group = 1; + else if (r == 2) { /* option only */ + Node* target; + OnigOptionType prev = env->option; + + env->option = NEFFECT(*np).option; + r = fetch_token(tok, src, end, env); + if (r < 0) return r; + r = parse_subexp(&target, tok, term, src, end, env); + env->option = prev; + if (r < 0) return r; + NEFFECT(*np).target = target; + return tok->type; + } + else if (r == 3) { /* comment */ + r = fetch_token(tok, src, end, env); + if (r < 0) return r; + goto start; + } + break; + + case TK_SUBEXP_CLOSE: + if (! 
IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP)) + return ONIGERR_UNMATCHED_CLOSE_PARENTHESIS; + + if (tok->escaped) goto tk_raw_byte; + else goto tk_byte; + break; + + case TK_BYTE: + tk_byte: + { + *np = node_new_str_char((UChar )tok->u.c); + CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + + while (1) { + len = enc_len(env->enc, tok->u.c); + if (len > 1) { + r = onig_node_str_cat(*np, *src, *src + len - 1); + if (r < 0) return r; + *src += (len - 1); + } + + r = fetch_token(tok, src, end, env); + if (r < 0) return r; + if (r != TK_BYTE) break; + + r = node_str_cat_char(*np, (UChar )tok->u.c); + if (r < 0) return r; + } + + fold_entry: +#ifdef USE_FOLD_MATCH + if (IS_IGNORECASE(env->option) && ONIGENC_IS_FOLD_MATCH(env->enc)) { + int flen, ret; + Node *root, **ptail, *work, *snode, *anode; + UChar *p, *pprev; + OnigEncFoldMatchInfo* fold_info; + StrNode* sn = &(NSTRING(*np)); + + ptail = &root; + pprev = sn->s; + for (p = sn->s; p < sn->end; ) { + flen = ONIGENC_GET_FOLD_MATCH_INFO(env->enc, p, sn->end, &fold_info); + if (flen > 0) { /* fold */ + ret = make_alt_node_from_fold_info(fold_info, &anode); + if (ret != 0) return ret; + work = node_new_list(anode, NULL); + CHECK_NULL_RETURN_VAL(work, ONIGERR_MEMORY); + + if (pprev < p) { + snode = node_new_str(pprev, p); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + *ptail = node_new_list(snode, work); + CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); + } + else { + *ptail = work; + } + ptail = &(NCONS(work).right); + p += flen; + pprev = p; + } + else + p += enc_len(env->enc, *p); + } + *ptail = NULL_NODE; + if (IS_NOT_NULL(root)) { + if (pprev < sn->end) { + snode = node_new_str(pprev, sn->end); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + *ptail = node_new_list(snode, NULL_NODE); + CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY); + } + onig_node_free(*np); + *np = root; + } + } +#endif + targetp = np; + goto repeat; + } + break; + + case TK_RAW_BYTE: + tk_raw_byte: + { + int expect_len; + + *np = 
node_new_str_raw_char((UChar )tok->u.c); + CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + expect_len = enc_len(env->enc, tok->u.c); + len = 1; + while (1) { + r = fetch_token(tok, src, end, env); + if (r < 0) return r; + if (r != TK_RAW_BYTE) { +#ifndef NUMBERED_CHAR_IS_NOT_CASE_AMBIG + if (len >= expect_len) { + NSTRING_CLEAR_RAW(*np); + } +#endif + goto fold_entry; + } + + r = node_str_cat_char(*np, (UChar )tok->u.c); + if (r < 0) return r; + len++; + } + } + break; + + case TK_CODE_POINT: + { + UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; + int num = ONIGENC_CODE_TO_MBC(env->enc, tok->u.code, buf); + if (num < 0) return num; +#ifdef NUMBERED_CHAR_IS_NOT_CASE_AMBIG + *np = node_new_str_raw(buf, buf + num); +#else + *np = node_new_str(buf, buf + num); +#endif + CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + } + break; + + case TK_QUOTE_OPEN: + { + OnigCodePoint end_op[] = { (OnigCodePoint )MC_ESC, (OnigCodePoint )'E' }; + UChar *qstart, *qend, *nextp; + + qstart = *src; + qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc); + if (IS_NULL(qend)) { + nextp = qend = end; + } + *np = node_new_str(qstart, qend); + CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + *src = nextp; + } + break; + + case TK_CHAR_TYPE: + { + switch (tok->u.subtype) { + case CTYPE_WORD: + case CTYPE_NOT_WORD: + *np = node_new_ctype(tok->u.subtype); + CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + break; + + case CTYPE_WHITE_SPACE: + case CTYPE_NOT_WHITE_SPACE: + case CTYPE_DIGIT: + case CTYPE_NOT_DIGIT: + { + CClassNode* cc; + int ctype, not; + + ctype = parse_ctype_to_enc_ctype(tok->u.subtype, ¬); + + *np = node_new_cclass(); + CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + cc = &(NCCLASS(*np)); + add_ctype_to_cc(cc, ctype, 0, env); + if (not != 0) CCLASS_SET_NOT(cc); + } + break; + + default: + return ONIGERR_PARSER_BUG; + break; + } + } + break; + + case TK_CHAR_PROPERTY: + r = parse_char_property(np, tok, src, end, env); + if (r != 0) return r; + break; + + case TK_CC_OPEN: + r = 
parse_char_class(np, tok, src, end, env); + if (r != 0) return r; + +#ifdef USE_FOLD_MATCH + if (IS_IGNORECASE(env->option) && ONIGENC_IS_FOLD_MATCH(env->enc)) { + int res; + Node *alt_root, *work; + CClassNode* cc = &(NCCLASS(*np)); + + res = make_fold_alt_node_from_cc(env->enc, cc, &alt_root); + if (res < 0) return res; + if (res > 0) { + work = node_new_alt(*np, alt_root); + if (IS_NULL(work)) { + onig_node_free(alt_root); + return ONIGERR_MEMORY; + } + *np = work; + } + } +#endif + break; + + case TK_ANYCHAR: + *np = node_new_anychar(); + CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + break; + + case TK_ANYCHAR_ANYTIME: + *np = node_new_anychar(); + CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + qn = node_new_qualifier(0, REPEAT_INFINITE, 0); + CHECK_NULL_RETURN_VAL(qn, ONIGERR_MEMORY); + NQUALIFIER(qn).target = *np; + *np = qn; + break; + + case TK_BACKREF: + len = tok->u.backref.num; + *np = node_new_backref(len, + (len > 1 ? tok->u.backref.refs : &(tok->u.backref.ref1)), + tok->u.backref.by_name, env); + CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + break; + +#ifdef USE_SUBEXP_CALL + case TK_CALL: + *np = node_new_call(tok->u.call.name, tok->u.call.name_end); + CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY); + env->num_call++; + break; +#endif + + case TK_ANCHOR: + *np = onig_node_new_anchor(tok->u.anchor); + break; + + case TK_OP_REPEAT: + case TK_INTERVAL: + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS)) { + if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS)) + return ONIGERR_TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED; + else + *np = node_new_empty(); + } + else { + *src = tok->backp; + goto tk_byte; + } + break; + + default: + return ONIGERR_PARSER_BUG; + break; + } + + { + targetp = np; + + re_entry: + r = fetch_token(tok, src, end, env); + if (r < 0) return r; + + repeat: + if (r == TK_OP_REPEAT || r == TK_INTERVAL) { + if (is_invalid_qualifier_target(*targetp)) + return ONIGERR_TARGET_OF_REPEAT_OPERATOR_INVALID; + + qn = 
node_new_qualifier(tok->u.repeat.lower, tok->u.repeat.upper, + (r == TK_INTERVAL ? 1 : 0)); + CHECK_NULL_RETURN_VAL(qn, ONIGERR_MEMORY); + NQUALIFIER(qn).greedy = tok->u.repeat.greedy; + r = set_qualifier(qn, *targetp, group, env); + if (r < 0) return r; + + if (tok->u.repeat.possessive != 0) { + Node* en; + en = node_new_effect(EFFECT_STOP_BACKTRACK); + CHECK_NULL_RETURN_VAL(en, ONIGERR_MEMORY); + NEFFECT(en).target = qn; + qn = en; + } + + if (r == 0) { + *targetp = qn; + } + else if (r == 2) { /* split case: /abc+/ */ + Node *tmp; + + *targetp = node_new_list(*targetp, NULL); + CHECK_NULL_RETURN_VAL(*targetp, ONIGERR_MEMORY); + tmp = NCONS(*targetp).right = node_new_list(qn, NULL); + CHECK_NULL_RETURN_VAL(tmp, ONIGERR_MEMORY); + targetp = &(NCONS(tmp).left); + } + goto re_entry; + } + } + + return r; +} + +static int +parse_branch(Node** top, OnigToken* tok, int term, + UChar** src, UChar* end, ScanEnv* env) +{ + int r; + Node *node, **headp; + + *top = NULL; + r = parse_exp(&node, tok, term, src, end, env); + if (r < 0) return r; + + if (r == TK_EOT || r == term || r == TK_ALT) { + *top = node; + } + else { + *top = node_new_list(node, NULL); + headp = &(NCONS(*top).right); + while (r != TK_EOT && r != term && r != TK_ALT) { + r = parse_exp(&node, tok, term, src, end, env); + if (r < 0) return r; + + if (NTYPE(node) == N_LIST) { + *headp = node; + while (IS_NOT_NULL(NCONS(node).right)) node = NCONS(node).right; + headp = &(NCONS(node).right); + } + else { + *headp = node_new_list(node, NULL); + headp = &(NCONS(*headp).right); + } + } + } + + return r; +} + +/* term_tok: TK_EOT or TK_SUBEXP_CLOSE */ +static int +parse_subexp(Node** top, OnigToken* tok, int term, + UChar** src, UChar* end, ScanEnv* env) +{ + int r; + Node *node, **headp; + + *top = NULL; + r = parse_branch(&node, tok, term, src, end, env); + if (r < 0) { + onig_node_free(node); + return r; + } + + if (r == term) { + *top = node; + } + else if (r == TK_ALT) { + *top = node_new_alt(node, NULL); + 
headp = &(NCONS(*top).right); + while (r == TK_ALT) { + r = fetch_token(tok, src, end, env); + if (r < 0) return r; + r = parse_branch(&node, tok, term, src, end, env); + if (r < 0) return r; + + *headp = node_new_alt(node, NULL); + headp = &(NCONS(*headp).right); + } + + if (tok->type != term) + goto err; + } + else { + err: + if (term == TK_SUBEXP_CLOSE) + return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS; + else + return ONIGERR_PARSER_BUG; + } + + return r; +} + +static int +parse_regexp(Node** top, UChar** src, UChar* end, ScanEnv* env) +{ + int r; + OnigToken tok; + + r = fetch_token(&tok, src, end, env); + if (r < 0) return r; + r = parse_subexp(top, &tok, TK_EOT, src, end, env); + if (r < 0) return r; + return 0; +} + +extern int +onig_parse_make_tree(Node** root, UChar* pattern, UChar* end, regex_t* reg, + ScanEnv* env) +{ + int r; + UChar* p; + +#ifdef USE_NAMED_GROUP + names_clear(reg); +#endif + + scan_env_clear(env); + env->option = reg->options; + env->enc = reg->enc; + env->syntax = reg->syntax; + env->pattern = pattern; + env->pattern_end = end; + env->reg = reg; + + *root = NULL; + p = pattern; + r = parse_regexp(root, &p, end, env); + reg->num_mem = env->num_mem; + return r; +} + +extern void +onig_scan_env_set_error_string(ScanEnv* env, int ecode, + UChar* arg, UChar* arg_end) +{ + env->error = arg; + env->error_end = arg_end; +} diff --git a/regparse.h b/regparse.h new file mode 100644 index 0000000000..b2726becbd --- /dev/null +++ b/regparse.h @@ -0,0 +1,277 @@ +/********************************************************************** + + regparse.h - Oniguruma (regular expression library) + + Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#ifndef REGPARSE_H +#define REGPARSE_H + +#include "regint.h" + +/* node type */ +#define N_STRING (1<< 0) +#define N_CCLASS (1<< 1) +#define N_CTYPE (1<< 2) +#define N_ANYCHAR (1<< 3) +#define N_BACKREF (1<< 4) +#define 
N_QUALIFIER (1<< 5) +#define N_EFFECT (1<< 6) +#define N_ANCHOR (1<< 7) +#define N_LIST (1<< 8) +#define N_ALT (1<< 9) +#define N_CALL (1<<10) + +#define IS_NODE_TYPE_SIMPLE(type) \ + (((type) & (N_STRING | N_CCLASS | N_CTYPE | N_ANYCHAR | N_BACKREF)) != 0) + +#define NTYPE(node) ((node)->type) +#define NCONS(node) ((node)->u.cons) +#define NSTRING(node) ((node)->u.str) +#define NCCLASS(node) ((node)->u.cclass) +#define NCTYPE(node) ((node)->u.ctype) +#define NQUALIFIER(node) ((node)->u.qualifier) +#define NANCHOR(node) ((node)->u.anchor) +#define NBACKREF(node) ((node)->u.backref) +#define NEFFECT(node) ((node)->u.effect) +#define NCALL(node) ((node)->u.call) + +#define CTYPE_WORD (1<<0) +#define CTYPE_NOT_WORD (1<<1) +#define CTYPE_WHITE_SPACE (1<<2) +#define CTYPE_NOT_WHITE_SPACE (1<<3) +#define CTYPE_DIGIT (1<<4) +#define CTYPE_NOT_DIGIT (1<<5) + + +#define ANCHOR_ANYCHAR_STAR_MASK (ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_PL) +#define ANCHOR_END_BUF_MASK (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF) + +#define EFFECT_MEMORY (1<<0) +#define EFFECT_OPTION (1<<1) +#define EFFECT_STOP_BACKTRACK (1<<2) + +#define REPEAT_INFINITE -1 +#define IS_REPEAT_INFINITE(n) ((n) == REPEAT_INFINITE) + +#define NODE_STR_MARGIN 16 +#define NODE_STR_BUF_SIZE 24 /* sizeof(CClassNode) - sizeof(int)*4 */ +#define NODE_BACKREFS_SIZE 7 + +#define NSTR_RAW (1<<0) /* by backslashed number */ +#define NSTR_CASE_AMBIG (1<<1) + +#define NSTRING_LEN(node) ((node)->u.str.end - (node)->u.str.s) +#define NSTRING_SET_RAW(node) (node)->u.str.flag |= NSTR_RAW +#define NSTRING_CLEAR_RAW(node) (node)->u.str.flag &= ~NSTR_RAW +#define NSTRING_SET_CASE_AMBIG(node) (node)->u.str.flag |= NSTR_CASE_AMBIG +#define NSTRING_IS_RAW(node) (((node)->u.str.flag & NSTR_RAW) != 0) +#define NSTRING_IS_CASE_AMBIG(node) \ + (((node)->u.str.flag & NSTR_CASE_AMBIG) != 0) + +#define BACKREFS_P(br) \ + (IS_NOT_NULL((br)->back_dynamic) ? 
(br)->back_dynamic : (br)->back_static); + +#define CCLASS_SET_NOT(cc) (cc)->not = 1 + +#define NQ_TARGET_ISNOT_EMPTY 0 +#define NQ_TARGET_IS_EMPTY 1 +#define NQ_TARGET_IS_EMPTY_MEM 2 +#define NQ_TARGET_IS_EMPTY_REC 3 + + +typedef struct { + UChar* s; + UChar* end; + unsigned int flag; + int capa; /* (allocated size - 1) or 0: use buf[] */ + UChar buf[NODE_STR_BUF_SIZE]; +} StrNode; + +typedef struct { + int not; + BitSet bs; + BBuf* mbuf; /* multi-byte info or NULL */ +} CClassNode; + +typedef struct { + struct _Node* target; + int lower; + int upper; + int greedy; + int by_number; /* {n,m} */ + int target_empty_info; + struct _Node* head_exact; + struct _Node* next_head_exact; + int is_refered; /* include called node. don't eliminate even if {0} */ +} QualifierNode; + +/* status bits */ +#define NST_MIN_FIXED (1<<0) +#define NST_MAX_FIXED (1<<1) +#define NST_CLEN_FIXED (1<<2) +#define NST_MARK1 (1<<3) +#define NST_MARK2 (1<<4) +#define NST_MEM_BACKREFED (1<<5) +#define NST_SIMPLE_REPEAT (1<<6) /* for stop backtrack optimization */ + +#define NST_RECURSION (1<<7) +#define NST_CALLED (1<<8) +#define NST_ADDR_FIXED (1<<9) +#define NST_NAMED_GROUP (1<<10) +#define NST_NAME_REF (1<<11) + +#define SET_EFFECT_STATUS(node,f) (node)->u.effect.state |= (f) +#define CLEAR_EFFECT_STATUS(node,f) (node)->u.effect.state &= ~(f) + +#define IS_EFFECT_CALLED(en) (((en)->state & NST_CALLED) != 0) +#define IS_EFFECT_ADDR_FIXED(en) (((en)->state & NST_ADDR_FIXED) != 0) +#define IS_EFFECT_RECURSION(en) (((en)->state & NST_RECURSION) != 0) +#define IS_EFFECT_MARK1(en) (((en)->state & NST_MARK1) != 0) +#define IS_EFFECT_MARK2(en) (((en)->state & NST_MARK2) != 0) +#define IS_EFFECT_MIN_FIXED(en) (((en)->state & NST_MIN_FIXED) != 0) +#define IS_EFFECT_MAX_FIXED(en) (((en)->state & NST_MAX_FIXED) != 0) +#define IS_EFFECT_CLEN_FIXED(en) (((en)->state & NST_CLEN_FIXED) != 0) +#define IS_EFFECT_SIMPLE_REPEAT(en) (((en)->state & NST_SIMPLE_REPEAT) != 0) +#define IS_EFFECT_NAMED_GROUP(en) 
(((en)->state & NST_NAMED_GROUP) != 0) + +#define SET_CALL_RECURSION(node) (node)->u.call.state |= NST_RECURSION +#define IS_CALL_RECURSION(cn) (((cn)->state & NST_RECURSION) != 0) +#define IS_CALL_NAME_REF(cn) (((cn)->state & NST_NAME_REF) != 0) +#define IS_BACKREF_NAME_REF(bn) (((bn)->state & NST_NAME_REF) != 0) + +typedef struct { + int state; + int type; + int regnum; + OnigOptionType option; + struct _Node* target; + AbsAddrType call_addr; + /* for multiple call reference */ + OnigDistance min_len; /* min length (byte) */ + OnigDistance max_len; /* max length (byte) */ + int char_len; /* character length */ + int opt_count; /* referenced count in optimize_node_left() */ +} EffectNode; + +#define CALLNODE_REFNUM_UNDEF -1 + +#ifdef USE_SUBEXP_CALL + +typedef struct { + int offset; + struct _Node* target; +} UnsetAddr; + +typedef struct { + int num; + int alloc; + UnsetAddr* us; +} UnsetAddrList; + +typedef struct { + int state; + int ref_num; + UChar* name; + UChar* name_end; + struct _Node* target; /* EffectNode : EFFECT_MEMORY */ + UnsetAddrList* unset_addr_list; +} CallNode; + +#endif + +typedef struct { + int state; + int back_num; + int back_static[NODE_BACKREFS_SIZE]; + int* back_dynamic; +} BackrefNode; + +typedef struct { + int type; + struct _Node* target; + int char_len; +} AnchorNode; + +typedef struct _Node { + int type; + union { + StrNode str; + CClassNode cclass; + QualifierNode qualifier; + EffectNode effect; +#ifdef USE_SUBEXP_CALL + CallNode call; +#endif + BackrefNode backref; + AnchorNode anchor; + struct { + struct _Node* left; + struct _Node* right; + } cons; + struct { + int type; + } ctype; + } u; +} Node; + +#define NULL_NODE ((Node* )0) + +#define SCANENV_MEMNODES_SIZE 8 +#define SCANENV_MEM_NODES(senv) \ + (IS_NOT_NULL((senv)->mem_nodes_dynamic) ? 
\ + (senv)->mem_nodes_dynamic : (senv)->mem_nodes_static) + +typedef struct { + OnigOptionType option; + OnigEncoding enc; + OnigSyntaxType* syntax; + BitStatusType capture_history; + BitStatusType bt_mem_start; + BitStatusType bt_mem_end; + BitStatusType backrefed_mem; + UChar* pattern; + UChar* pattern_end; + UChar* error; + UChar* error_end; + regex_t* reg; /* for reg->names only */ + int num_call; +#ifdef USE_SUBEXP_CALL + UnsetAddrList* unset_addr_list; +#endif + int num_mem; +#ifdef USE_NAMED_GROUP + int num_named; +#endif + int mem_alloc; + Node* mem_nodes_static[SCANENV_MEMNODES_SIZE]; + Node** mem_nodes_dynamic; +} ScanEnv; + + +#define IS_SYNTAX_OP(syn, opm) (((syn)->op & (opm)) != 0) +#define IS_SYNTAX_OP2(syn, opm) (((syn)->op2 & (opm)) != 0) +#define IS_SYNTAX_BV(syn, bvm) (((syn)->behavior & (bvm)) != 0) + +extern int onig_is_code_in_cc P_((OnigEncoding enc, OnigCodePoint code, CClassNode* cc)); +extern int onig_strncmp P_((UChar* s1, UChar* s2, int n)); +extern void onig_scan_env_set_error_string P_((ScanEnv* env, int ecode, UChar* arg, UChar* arg_end)); +extern int onig_scan_unsigned_number P_((UChar** src, UChar* end, OnigEncoding enc)); +extern void onig_reduce_nested_qualifier P_((Node* pnode, Node* cnode)); +extern void onig_node_conv_to_str_node P_((Node* node, int raw)); +extern int onig_node_str_cat P_((Node* node, UChar* s, UChar* end)); +extern void onig_node_free P_((Node* node)); +extern Node* onig_node_new_effect P_((int type)); +extern Node* onig_node_new_anchor P_((int type)); +extern int onig_free_node_list(); +extern int onig_names_free P_((regex_t* reg)); +extern int onig_parse_make_tree P_((Node** root, UChar* pattern, UChar* end, regex_t* reg, ScanEnv* env)); + +#ifdef ONIG_DEBUG +#ifdef USE_NAMED_GROUP +extern int onig_print_names(FILE*, regex_t*); +#endif +#endif + +#endif /* REGPARSE_H */ diff --git a/sjis.c b/sjis.c new file mode 100644 index 0000000000..8485910e69 --- /dev/null +++ b/sjis.c @@ -0,0 +1,174 @@ 
+/********************************************************************** + + sjis.c - Oniguruma (regular expression library) + + Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#include "regenc.h" + +static const char SJIS_CAN_BE_TRAIL_TABLE[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 +}; + +#define SJIS_ISMB_FIRST(byte) (OnigEncodingSJIS.len_table[byte] > 1) +#define SJIS_ISMB_TRAIL(byte) SJIS_CAN_BE_TRAIL_TABLE[(byte)] + +static OnigCodePoint +sjis_mbc_to_code(UChar* p, UChar* end) +{ + int c, i, len; + OnigCodePoint n; + + c = *p++; + len = enc_len(ONIG_ENCODING_SJIS, c); + n = c; + if (len == 1) return n; + + for (i = 1; i < len; i++) { + if (p >= end) break; + c = *p++; + n <<= 8; n += c; + } + return n; +} + +static int +sjis_code_to_mbc(OnigCodePoint code, UChar *buf) +{ + UChar *p = buf; + + if ((code & 0xff00) != 0) *p++ = (UChar )(((code >> 8) & 0xff)); + *p++ = (UChar )(code & 0xff); + +#if 0 + if (enc_len(ONIG_ENCODING_SJIS, buf[0]) != (p - buf)) + return REGERR_INVALID_WIDE_CHAR_VALUE; +#endif + return p - buf; +} + +static int +sjis_mbc_to_lower(UChar* p, UChar* lower) +{ + int len; + + if (ONIGENC_IS_MBC_ASCII(p)) { + 
*lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + return 1; + } + else { + len = enc_len(ONIG_ENCODING_SJIS, *p); + if (lower != p) { + /* memcpy(lower, p, len); */ + int i; + for (i = 0; i < len; i++) { + *lower++ = *p++; + } + } + return len; /* return byte length of converted char to lower */ + } +} + +static int +sjis_code_is_ctype(OnigCodePoint code, unsigned int ctype) +{ + if ((ctype & ONIGENC_CTYPE_WORD) != 0) { + if (code < 128) + return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); + else { + int first = onigenc_mb2_code_to_mbc_first(code); + return (enc_len(ONIG_ENCODING_SJIS, first) > 1 ? TRUE : FALSE); + } + + ctype &= ~ONIGENC_CTYPE_WORD; + if (ctype == 0) return FALSE; + } + + if (code < 128) + return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); + else + return FALSE; +} + +static UChar* +sjis_left_adjust_char_head(UChar* start, UChar* s) +{ + UChar *p; + int len; + + if (s <= start) return s; + p = s; + + if (SJIS_ISMB_TRAIL(*p)) { + while (p > start) { + if (! SJIS_ISMB_FIRST(*--p)) { + p++; + break; + } + } + } + len = enc_len(ONIG_ENCODING_SJIS, *p); + if (p + len > s) return p; + p += len; + return p + ((s - p) & ~1); +} + +static int +sjis_is_allowed_reverse_match(UChar* s, UChar* end) +{ + UChar c = *s; + return (SJIS_ISMB_TRAIL(c) ? 
FALSE : TRUE); +} + +OnigEncodingType OnigEncodingSJIS = { + { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1 + }, + "Shift_JIS", /* name */ + 2, /* max byte length */ + FALSE, /* is_fold_match */ + ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ + FALSE, /* is continuous sb mb codepoint */ + sjis_mbc_to_code, + onigenc_mb2_code_to_mbclen, + sjis_code_to_mbc, + sjis_mbc_to_lower, + onigenc_mbn_mbc_is_case_ambig, + sjis_code_is_ctype, + onigenc_nothing_get_ctype_code_range, + sjis_left_adjust_char_head, + sjis_is_allowed_reverse_match, + onigenc_nothing_get_all_fold_match_code, + onigenc_nothing_get_fold_match_info +}; diff --git a/utf8.c b/utf8.c new file mode 100644 index 0000000000..604cfac2ef --- /dev/null +++ b/utf8.c @@ -0,0 +1,566 @@ +/********************************************************************** + + utf8.c - Oniguruma (regular expression library) + + Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) + +**********************************************************************/ +#include "regenc.h" + +#define utf8_islead(c) ((UChar )((c) & 0xc0) != 0x80) + +#define ENC_IS_ISO_8859_1_CTYPE(code,ctype) \ + ((EncUnicode_ISO_8859_1_CtypeTable[code] & ctype) != 0) + +static unsigned short 
EncUnicode_ISO_8859_1_CtypeTable[256] = { + 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, + 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, + 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, + 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, + 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x1050, 0x10d0, 0x10d0, 0x10d0, + 0x10d0, 0x10d0, 0x10d0, 0x1050, 0x10d0, 0x10d0, 0x10d0, 0x10d0, + 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, + 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x1050, 0x1050, 0x1050, 0x10d0, + 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, + 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, + 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, + 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x1050, 0x18d0, + 0x1050, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, + 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, + 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, + 0x1871, 0x1871, 0x1871, 0x10d0, 0x1050, 0x10d0, 0x1050, 0x1004, + 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, + 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, + 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, + 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, + 0x1142, 0x10d0, 0x1050, 0x1050, 0x1050, 0x1050, 0x1050, 0x1050, + 0x1050, 0x1050, 0x1871, 0x10d0, 0x1050, 0x10d0, 0x1050, 0x1050, + 0x1050, 0x1050, 0x1850, 0x1850, 0x1050, 0x1871, 0x1050, 0x10d0, + 0x1050, 0x1850, 0x1871, 0x10d0, 0x1850, 0x1850, 0x1850, 0x10d0, + 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, + 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, + 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1050, + 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1871, + 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, + 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 
0x1871, 0x1871, 0x1871, + 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1050, + 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871 +}; + +static OnigCodePoint +utf8_mbc_to_code(UChar* p, UChar* end) +{ + int c, len; + OnigCodePoint n; + + c = *p++; + len = enc_len(ONIG_ENCODING_UTF8, c); + if (len > 1) { + len--; + n = c & ((1 << (6 - len)) - 1); + while (len--) { + c = *p++; + n = (n << 6) | (c & ((1 << 6) - 1)); + } + return n; + } + else + return (OnigCodePoint )c; +} + +static int +utf8_code_to_mbclen(OnigCodePoint code) +{ + if ((code & 0xffffff80) == 0) return 1; + else if ((code & 0xfffff800) == 0) { + if (code <= 0xff && code >= 0xfe) + return 1; + return 2; + } + else if ((code & 0xffff0000) == 0) return 3; + else if ((code & 0xffe00000) == 0) return 4; + else if ((code & 0xfc000000) == 0) return 5; + else if ((code & 0x80000000) == 0) return 6; + else + return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; +} + +#if 0 +static int +utf8_code_to_mbc_first(OnigCodePoint code) +{ + if ((code & 0xffffff80) == 0) + return code; + else { + if ((code & 0xfffff800) == 0) + return ((code>>6)& 0x1f) | 0xc0; + else if ((code & 0xffff0000) == 0) + return ((code>>12) & 0x0f) | 0xe0; + else if ((code & 0xffe00000) == 0) + return ((code>>18) & 0x07) | 0xf0; + else if ((code & 0xfc000000) == 0) + return ((code>>24) & 0x03) | 0xf8; + else if ((code & 0x80000000) == 0) + return ((code>>30) & 0x01) | 0xfc; + else { + return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; + } + } +} +#endif + +static int +utf8_code_to_mbc(OnigCodePoint code, UChar *buf) +{ +#define UTF8_TRAILS(code, shift) (UChar )((((code) >> (shift)) & 0x3f) | 0x80) +#define UTF8_TRAIL0(code) (UChar )(((code) & 0x3f) | 0x80) + + if ((code & 0xffffff80) == 0) { + *buf = (UChar )code; + return 1; + } + else { + UChar *p = buf; + + if ((code & 0xfffff800) == 0) { + *p++ = (UChar )(((code>>6)& 0x1f) | 0xc0); + } + else if ((code & 0xffff0000) == 0) { + *p++ = (UChar )(((code>>12) & 0x0f) | 0xe0); + *p++ = 
UTF8_TRAILS(code, 6); + } + else if ((code & 0xffe00000) == 0) { + *p++ = (UChar )(((code>>18) & 0x07) | 0xf0); + *p++ = UTF8_TRAILS(code, 12); + *p++ = UTF8_TRAILS(code, 6); + } + else if ((code & 0xfc000000) == 0) { + *p++ = (UChar )(((code>>24) & 0x03) | 0xf8); + *p++ = UTF8_TRAILS(code, 18); + *p++ = UTF8_TRAILS(code, 12); + *p++ = UTF8_TRAILS(code, 6); + } + else if ((code & 0x80000000) == 0) { + *p++ = (UChar )(((code>>30) & 0x01) | 0xfc); + *p++ = UTF8_TRAILS(code, 24); + *p++ = UTF8_TRAILS(code, 18); + *p++ = UTF8_TRAILS(code, 12); + *p++ = UTF8_TRAILS(code, 6); + } + else { + return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; + } + + *p++ = UTF8_TRAIL0(code); + return p - buf; + } +} + +static int +utf8_mbc_to_lower(UChar* p, UChar* lower) +{ + int len; + + /* !!! U+0080 - U+00ff is treated by fold match. !!! */ + if (ONIGENC_IS_MBC_ASCII(p)) { + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + return 1; /* return byte length of converted char to lower */ + } + else { + len = enc_len(ONIG_ENCODING_UTF8, *p); + if (lower != p) { + /* memcpy(lower, p, len); */ + int i; + for (i = 0; i < len; i++) { + *lower++ = *p++; + } + } + return len; /* return byte length of converted char to lower */ + } +} + +static int +utf8_mbc_is_case_ambig(UChar* p) +{ + /* !!! U+0080 - U+00ff ( 0x80[0xc2,0x80] - 0xff[0xc3,0xbf] ) + is treated by fold match. !!! 
*/ + + if (ONIGENC_IS_MBC_ASCII(p)) + return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); + + return FALSE; +} + +static int +utf8_code_is_ctype(OnigCodePoint code, unsigned int ctype) +{ + if (code < 256) { + return ENC_IS_ISO_8859_1_CTYPE(code, ctype); + } + + if ((ctype & ONIGENC_CTYPE_WORD) != 0) { + return TRUE; + } + + return FALSE; +} + +static int +utf8_get_ctype_code_range(int ctype, int* nsb, int* nmb, + OnigCodePointRange* sbr[], OnigCodePointRange* mbr[]) +{ +#define CR_SET(sbl,mbl) do { \ + *nsb = sizeof(sbl) / sizeof(OnigCodePointRange); \ + *nmb = sizeof(mbl) / sizeof(OnigCodePointRange); \ + *sbr = sbl; \ + *mbr = mbl; \ +} while (0) + +#define CR_SB_SET(sbl) do { \ + *nsb = sizeof(sbl) / sizeof(OnigCodePointRange); \ + *nmb = 0; \ + *sbr = sbl; \ +} while (0) + + static OnigCodePointRange SBAlpha[] = { + { 0x41, 0x5a }, + { 0x61, 0x7a } + }; + + static OnigCodePointRange MBAlpha[] = { + { 0xaa, 0xaa }, + { 0xb5, 0xb5 }, + { 0xba, 0xba }, + { 0xc0, 0xd6 }, + { 0xd8, 0xf6 }, + { 0xf8, 0x220 } + }; + + static OnigCodePointRange SBBlank[] = { + { 0x09, 0x09 }, + { 0x20, 0x20 } + }; + + static OnigCodePointRange MBBlank[] = { + { 0xa0, 0xa0 } + }; + + static OnigCodePointRange SBCntrl[] = { + { 0x00, 0x1f }, + { 0x7f, 0x7f } + }; + + static OnigCodePointRange MBCntrl[] = { + { 0x80, 0x9f } + }; + + static OnigCodePointRange SBDigit[] = { + { 0x30, 0x39 } + }; + + static OnigCodePointRange SBGraph[] = { + { 0x21, 0x7e } + }; + + static OnigCodePointRange MBGraph[] = { + { 0xa1, 0x220 } + }; + + static OnigCodePointRange SBLower[] = { + { 0x61, 0x7a } + }; + + static OnigCodePointRange MBLower[] = { + { 0xaa, 0xaa }, + { 0xb5, 0xb5 }, + { 0xba, 0xba }, + { 0xdf, 0xf6 }, + { 0xf8, 0xff } + }; + + static OnigCodePointRange SBPrint[] = { + { 0x20, 0x7e } + }; + + static OnigCodePointRange MBPrint[] = { + { 0xa0, 0x220 } + }; + + static OnigCodePointRange SBPunct[] = { + { 0x21, 0x23 }, + { 0x25, 0x2a }, + { 0x2c, 0x2f }, + { 0x3a, 0x3b }, + { 0x3f, 0x40 }, + { 
0x5b, 0x5d }, + { 0x5f, 0x5f }, + { 0x7b, 0x7b }, + { 0x7d, 0x7d } + }; + + static OnigCodePointRange MBPunct[] = { + { 0xa1, 0xa1 }, + { 0xab, 0xab }, + { 0xad, 0xad }, + { 0xb7, 0xb7 }, + { 0xbb, 0xbb }, + { 0xbf, 0xbf } + }; + + static OnigCodePointRange SBSpace[] = { + { 0x09, 0x0d }, + { 0x20, 0x20 } + }; + + static OnigCodePointRange MBSpace[] = { + { 0xa0, 0xa0 } + }; + + static OnigCodePointRange SBUpper[] = { + { 0x41, 0x5a } + }; + + static OnigCodePointRange MBUpper[] = { + { 0xc0, 0xd6 }, + { 0xd8, 0xde } + }; + + static OnigCodePointRange SBXDigit[] = { + { 0x30, 0x39 }, + { 0x41, 0x46 }, + { 0x61, 0x66 } + }; + + static OnigCodePointRange SBWord[] = { + { 0x30, 0x39 }, + { 0x41, 0x5a }, + { 0x5f, 0x5f }, + { 0x61, 0x7a } + }; + + static OnigCodePointRange MBWord[] = { + { 0xaa, 0xaa }, + { 0xb2, 0xb3 }, + { 0xb5, 0xb5 }, + { 0xb9, 0xba }, + { 0xbc, 0xbe }, + { 0xc0, 0xd6 }, + { 0xd8, 0xf6 }, +#if 0 + { 0xf8, 0x220 } +#else + { 0xf8, 0x7fffffff } /* all multibyte code as word */ +#endif + }; + + static OnigCodePointRange SBAscii[] = { + { 0x00, 0x7f } + }; + + static OnigCodePointRange SBAlnum[] = { + { 0x30, 0x39 }, + { 0x41, 0x5a }, + { 0x61, 0x7a } + }; + + static OnigCodePointRange MBAlnum[] = { + { 0xaa, 0xaa }, + { 0xb5, 0xb5 }, + { 0xba, 0xba }, + { 0xc0, 0xd6 }, + { 0xd8, 0xf6 }, + { 0xf8, 0x220 } + }; + + switch (ctype) { + case ONIGENC_CTYPE_ALPHA: + CR_SET(SBAlpha, MBAlpha); + break; + case ONIGENC_CTYPE_BLANK: + CR_SET(SBBlank, MBBlank); + break; + case ONIGENC_CTYPE_CNTRL: + CR_SET(SBCntrl, MBCntrl); + break; + case ONIGENC_CTYPE_DIGIT: + CR_SB_SET(SBDigit); + break; + case ONIGENC_CTYPE_GRAPH: + CR_SET(SBGraph, MBGraph); + break; + case ONIGENC_CTYPE_LOWER: + CR_SET(SBLower, MBLower); + break; + case ONIGENC_CTYPE_PRINT: + CR_SET(SBPrint, MBPrint); + break; + case ONIGENC_CTYPE_PUNCT: + CR_SET(SBPunct, MBPunct); + break; + case ONIGENC_CTYPE_SPACE: + CR_SET(SBSpace, MBSpace); + break; + case ONIGENC_CTYPE_UPPER: + CR_SET(SBUpper, 
MBUpper); + break; + case ONIGENC_CTYPE_XDIGIT: + CR_SB_SET(SBXDigit); + break; + case ONIGENC_CTYPE_WORD: + CR_SET(SBWord, MBWord); + break; + case ONIGENC_CTYPE_ASCII: + CR_SB_SET(SBAscii); + break; + case ONIGENC_CTYPE_ALNUM: + CR_SET(SBAlnum, MBAlnum); + break; + + default: + return ONIGERR_TYPE_BUG; + break; + } + + return 0; +} + +static int +utf8_get_all_fold_match_code(OnigCodePoint** codes) +{ + static OnigCodePoint list[] = { + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, + }; + + *codes = list; + return sizeof(list) / sizeof(OnigCodePoint); +} + +static int +utf8_get_fold_match_info(UChar* p, UChar* end, OnigEncFoldMatchInfo** info) +{ + + static OnigEncFoldMatchInfo xc[] = { + { 2, { 2, 2 }, { "\303\200", "\303\240" } }, /* CodePoint 0xc0 */ + { 2, { 2, 2 }, { "\303\201", "\303\241" } }, + { 2, { 2, 2 }, { "\303\202", "\303\242" } }, + { 2, { 2, 2 }, { "\303\203", "\303\243" } }, + { 2, { 2, 2 }, { "\303\204", "\303\244" } }, + { 2, { 2, 2 }, { "\303\205", "\303\245" } }, + { 2, { 2, 2 }, { "\303\206", "\303\246" } }, + { 2, { 2, 2 }, { "\303\207", "\303\247" } }, + { 2, { 2, 2 }, { "\303\210", "\303\250" } }, + { 2, { 2, 2 }, { "\303\211", "\303\251" } }, + { 2, { 2, 2 }, { "\303\212", "\303\252" } }, + { 2, { 2, 2 }, { "\303\213", "\303\253" } }, + { 2, { 2, 2 }, { "\303\214", "\303\254" } }, + { 2, { 2, 2 }, { "\303\215", "\303\255" } }, + { 2, { 2, 2 }, { "\303\216", "\303\256" } }, + { 2, { 2, 2 }, { "\303\217", "\303\257" } }, + { 2, { 2, 2 }, { "\303\220", "\303\260" } }, /* CodePoint 0xd0 */ + { 2, { 2, 2 }, { "\303\221", "\303\261" } }, + { 2, { 2, 2 }, { "\303\222", "\303\262" } }, + { 2, { 2, 2 }, { "\303\223", 
"\303\263" } }, + { 2, { 2, 2 }, { "\303\224", "\303\264" } }, + { 2, { 2, 2 }, { "\303\225", "\303\265" } }, + { 2, { 2, 2 }, { "\303\226", "\303\266" } }, + { 0, { 0 }, { "" } }, + { 2, { 2, 2 }, { "\303\230", "\303\270" } }, + { 2, { 2, 2 }, { "\303\231", "\303\271" } }, + { 2, { 2, 2 }, { "\303\232", "\303\272" } }, + { 2, { 2, 2 }, { "\303\233", "\303\273" } }, + { 2, { 2, 2 }, { "\303\234", "\303\274" } }, + { 2, { 2, 2 }, { "\303\235", "\303\275" } }, + { 2, { 2, 2 }, { "\303\236", "\303\276" } }, + { 3, { 2, 2, 2 }, { "\303\237", "ss", "SS" }} /* ess-tsett(U+00DF) */ + }; + + if (p + 1 >= end) return -1; + if (*p < 0x80) { + if ((*p == 'S' && *(p+1) == 'S') || + (*p == 's' && *(p+1) == 's')) { + *info = &(xc[0xdf - 0xc0]); + return 2; + } + } + else if (*p == 195) { /* 195 == '\303' */ + int c = *(p+1); + if (c >= 128) { + if (c <= 159) { /* upper */ + if (c == 151) return -1; /* 0xd7 */ + *info = &(xc[c - 128]); + return 2; + } + else { /* lower */ + if (c == 183) return -1; /* 0xf7 */ + *info = &(xc[c - 160]); + return 2; + } + } + } + + return -1; /* is not a fold string. 
*/ +} + + +static UChar* +utf8_left_adjust_char_head(UChar* start, UChar* s) +{ + UChar *p; + + if (s <= start) return s; + p = s; + + while (!utf8_islead(*p) && p > start) p--; + return p; +} + +static int +utf8_is_allowed_reverse_match(UChar* s, UChar* end) +{ + return TRUE; +} + +OnigEncodingType OnigEncodingUTF8 = { + { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 + }, + "UTF-8", /* name */ + 6, /* max byte length */ + TRUE, /* is_fold_match */ + ONIGENC_CTYPE_SUPPORT_LEVEL_FULL, /* ctype_support_level */ + TRUE, /* is continuous sb mb codepoint */ + utf8_mbc_to_code, + utf8_code_to_mbclen, + utf8_code_to_mbc, + utf8_mbc_to_lower, + utf8_mbc_is_case_ambig, + utf8_code_is_ctype, + utf8_get_ctype_code_range, + utf8_left_adjust_char_head, + utf8_is_allowed_reverse_match, + utf8_get_all_fold_match_code, + utf8_get_fold_match_info +}; -- cgit v1.2.3