diff options
author | ksaito <ksaito@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2004-11-04 14:31:26 +0000 |
---|---|---|
committer | ksaito <ksaito@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2004-11-04 14:31:26 +0000 |
commit | 5e853c811ce1d6d6edc187e580a14133667e1058 (patch) | |
tree | 4ecf2cb00a79a481ee5aeda802d5bb73415ca8f5 | |
parent | 67ae0fb9aced8cf56de10a1fd400a236bd753b60 (diff) | |
download | ruby-5e853c811ce1d6d6edc187e580a14133667e1058.tar.gz |
This commit was generated by cvs2svn to compensate for changes in r7203,
which included commits to RCS files with non-trunk default branches.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@7204 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- | ascii.c | 79 | ||||
-rw-r--r-- | euc_jp.c | 161 | ||||
-rw-r--r-- | oniggnu.h | 45 | ||||
-rw-r--r-- | regenc.c | 614 | ||||
-rw-r--r-- | regenc.h | 85 | ||||
-rw-r--r-- | regerror.c | 71 | ||||
-rw-r--r-- | reggnu.c | 38 | ||||
-rw-r--r-- | sjis.c | 158 | ||||
-rw-r--r-- | utf8.c | 450 |
9 files changed, 1203 insertions, 498 deletions
@@ -1,14 +1,36 @@ /********************************************************************** - ascii.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" static int -ascii_code_is_ctype(OnigCodePoint code, unsigned int ctype) +ascii_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 128) return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); @@ -17,38 +39,29 @@ ascii_code_is_ctype(OnigCodePoint code, unsigned int ctype) } OnigEncodingType OnigEncodingASCII = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 - }, + onigenc_single_byte_mbc_enc_len, "US-ASCII", /* name */ 1, /* max byte length */ - FALSE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + 1, /* min byte length */ + ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE, + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, onigenc_single_byte_mbc_to_code, onigenc_single_byte_code_to_mbclen, onigenc_single_byte_code_to_mbc, - onigenc_ascii_mbc_to_lower, - onigenc_ascii_mbc_is_case_ambig, - ascii_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + onigenc_ascii_mbc_to_normalize, + onigenc_ascii_is_mbc_ambiguous, + onigenc_ascii_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + ascii_is_code_ctype, + onigenc_not_support_get_ctype_code_range, onigenc_single_byte_left_adjust_char_head, - onigenc_single_byte_is_allowed_reverse_match, - onigenc_nothing_get_all_fold_match_code, - onigenc_nothing_get_fold_match_info + onigenc_always_true_is_allowed_reverse_match }; @@ -1,23 +1,69 @@ /********************************************************************** - euc_jp.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" #define eucjp_islead(c) ((UChar )((c) - 0xa1) > 0xfe - 0xa1) +static int EncLen_EUCJP[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 +}; + +static int +eucjp_mbc_enc_len(UChar* p) +{ + return EncLen_EUCJP[*p]; +} + static OnigCodePoint eucjp_mbc_to_code(UChar* p, UChar* end) { int c, i, len; OnigCodePoint n; - c = *p++; - len = enc_len(ONIG_ENCODING_EUC_JP, c); - n = c; + len = enc_len(ONIG_ENCODING_EUC_JP, p); + n = (OnigCodePoint )*p++; if (len == 1) return n; for (i = 1; i < len; i++) { @@ -31,11 +77,13 @@ eucjp_mbc_to_code(UChar* p, UChar* end) static int eucjp_code_to_mbclen(OnigCodePoint code) { - if ((code & 0xff0000) != 0) return 3; + if (ONIGENC_IS_CODE_ASCII(code)) return 1; + else if ((code & 0xff0000) != 0) return 3; else if ((code & 0xff00) != 0) return 2; - else return 1; + else return 0; } +#if 0 static int eucjp_code_to_mbc_first(OnigCodePoint code) { @@ -43,27 +91,16 @@ eucjp_code_to_mbc_first(OnigCodePoint code) if ((code & 0xff0000) != 0) { first = (code >> 16) & 0xff; - /* - if (enc_len(ONIG_ENCODING_EUC_JP, first) != 3) - return ONIGENCERR_INVALID_WIDE_CHAR_VALUE; - */ } else if ((code & 0xff00) != 0) { first = (code >> 8) & 0xff; - /* - if (enc_len(ONIG_ENCODING_EUC_JP, first) != 2) - return ONIGENCERR_INVALID_WIDE_CHAR_VALUE; - */ } else { - /* - if (enc_len(ONIG_ENCODING_EUC_JP, code) != 1) - return ONIGENCERR_INVALID_WIDE_CHAR_VALUE; - */ return (int )code; } return first; } +#endif static int eucjp_code_to_mbc(OnigCodePoint code, UChar *buf) @@ -75,44 +112,57 @@ eucjp_code_to_mbc(OnigCodePoint code, UChar *buf) *p++ = (UChar )(code & 0xff); #if 1 - if (enc_len(ONIG_ENCODING_EUC_JP, buf[0]) != (p - buf)) + if (enc_len(ONIG_ENCODING_EUC_JP, buf) != (p - buf)) return ONIGENCERR_INVALID_WIDE_CHAR_VALUE; #endif return p - buf; } static int -eucjp_mbc_to_lower(UChar* p, UChar* lower) +eucjp_mbc_to_normalize(OnigAmbigType flag, UChar** pp, UChar* end, + UChar* lower) { int len; + UChar* p = *pp; if (ONIGENC_IS_MBC_ASCII(p)) { - *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + + (*pp)++; return 1; } else { - len = enc_len(ONIG_ENCODING_EUC_JP, *p); + len = enc_len(ONIG_ENCODING_EUC_JP, p); if (lower != p) { - /* memcpy(lower, p, len); */ int i; for (i = 0; i < len; i++) { *lower++ = *p++; } } + (*pp) += len; return len; /* return byte length of converted char to lower */ } } static int -eucjp_code_is_ctype(OnigCodePoint code, unsigned int ctype) +eucjp_is_mbc_ambiguous(OnigAmbigType flag, UChar** pp, UChar* end) +{ + return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_EUC_JP, flag, pp, end); +} + +static int +eucjp_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if ((ctype & ONIGENC_CTYPE_WORD) != 0) { if (code < 128) return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else { - int first = eucjp_code_to_mbc_first(code); - return (enc_len(ONIG_ENCODING_EUC_JP, first) > 1 ? TRUE : FALSE); - } + else + return (eucjp_code_to_mbclen(code) > 1 ? TRUE : FALSE); ctype &= ~ONIGENC_CTYPE_WORD; if (ctype == 0) return FALSE; @@ -137,7 +187,7 @@ eucjp_left_adjust_char_head(UChar* start, UChar* s) p = s; while (!eucjp_islead(*p) && p > start) p--; - len = enc_len(ONIG_ENCODING_EUC_JP, *p); + len = enc_len(ONIG_ENCODING_EUC_JP, p); if (p + len > s) return p; p += len; return p + ((s - p) & ~1); @@ -154,38 +204,29 @@ eucjp_is_allowed_reverse_match(UChar* s, UChar* end) } OnigEncodingType OnigEncodingEUC_JP = { + eucjp_mbc_enc_len, + "EUC-JP", /* name */ + 3, /* max enc length */ + 1, /* min enc length */ + ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE, { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ }, - "EUC-JP", /* name */ - 3, /* max byte length */ - FALSE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - FALSE, /* is continuous sb mb codepoint */ + onigenc_is_mbc_newline_0x0a, eucjp_mbc_to_code, eucjp_code_to_mbclen, eucjp_code_to_mbc, - eucjp_mbc_to_lower, - onigenc_mbn_mbc_is_case_ambig, - eucjp_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + eucjp_mbc_to_normalize, + eucjp_is_mbc_ambiguous, + onigenc_ascii_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + eucjp_is_code_ctype, + onigenc_not_support_get_ctype_code_range, eucjp_left_adjust_char_head, - eucjp_is_allowed_reverse_match, - onigenc_nothing_get_all_fold_match_code, - onigenc_nothing_get_fold_match_info + eucjp_is_allowed_reverse_match }; @@ -1,12 +1,33 @@ +#ifndef ONIGGNU_H +#define ONIGGNU_H /********************************************************************** - oniggnu.h - Oniguruma (regular expression library) - - Copyright (C) 2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ -#ifndef ONIGGNU_H -#define ONIGGNU_H +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ #include "oniguruma.h" @@ -27,6 +48,18 @@ #define RE_OPTION_POSIXLINE (RE_OPTION_MULTILINE|RE_OPTION_SINGLELINE) #ifdef RUBY_PLATFORM + +#ifndef ONIG_RUBY_M17N + +ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding; + +#undef ismbchar +#define ismbchar(c) (mbclen((c)) != 1) +#define mbclen(c) \ + ONIGENC_MBC_ENC_LEN(OnigEncDefaultCharEncoding, (UChar* )(&c)) + +#endif /* ifndef ONIG_RUBY_M17N */ + #define re_mbcinit ruby_re_mbcinit #define re_compile_pattern ruby_re_compile_pattern #define re_recompile_pattern ruby_re_recompile_pattern @@ -1,10 +1,32 @@ /********************************************************************** - regenc.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" OnigEncoding OnigEncDefaultCharEncoding = ONIG_ENCODING_INIT_DEFAULT; @@ -33,7 +55,7 @@ onigenc_get_right_adjust_char_head(OnigEncoding enc, UChar* start, UChar* s) { UChar* p = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s); if (p < s) { - p += enc_len(enc, *p); + p += enc_len(enc, p); } return p; } @@ -46,7 +68,7 @@ onigenc_get_right_adjust_char_head_with_prev(OnigEncoding enc, if (p < s) { if (prev) *prev = p; - p += enc_len(enc, *p); + p += enc_len(enc, p); } else { if (prev) *prev = (UChar* )NULL; /* Sorry */ @@ -75,11 +97,114 @@ onigenc_step_back(OnigEncoding enc, UChar* start, UChar* s, int n) return s; } +extern UChar* +onigenc_step(OnigEncoding enc, UChar* p, UChar* end, int n) +{ + while (n-- > 0) { + p += ONIGENC_MBC_ENC_LEN(enc, p); + } + return (p <= end ? p : (UChar* )NULL); +} + +extern int +onigenc_strlen(OnigEncoding enc, UChar* p, UChar* end) +{ + int n = 0; + + while (p < end) { + p += ONIGENC_MBC_ENC_LEN(enc, p); + n++; + } + return n; +} + +extern int +onigenc_strlen_null(OnigEncoding enc, UChar* p) +{ + int n = 0; + + while (1) { + if (*p == '\0') { + UChar* q; + int len = ONIGENC_MBC_MINLEN(enc); + + if (len == 1) return n; + q = p + 1; + while (len > 1) { + if (*q != '\0') break; + q++; + len--; + } + if (len == 1) return n; + } + p += ONIGENC_MBC_ENC_LEN(enc, p); + n++; + } +} + +extern int +onigenc_str_bytelen_null(OnigEncoding enc, UChar* p) +{ + UChar* start = p; + + while (1) { + if (*p == '\0') { + UChar* q; + int len = ONIGENC_MBC_MINLEN(enc); + + if (len == 1) return (int )(p - start); + q = p + 1; + while (len > 1) { + if (*q != '\0') break; + q++; + len--; + } + if (len == 1) return (int )(p - start); + } + p += ONIGENC_MBC_ENC_LEN(enc, p); + } +} #ifndef ONIG_RUBY_M17N #ifndef NOT_RUBY + #define USE_APPLICATION_TO_LOWER_CASE_TABLE + +unsigned short OnigEnc_Unicode_ISO_8859_1_CtypeTable[256] = { + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x31a0, + 0x21a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x2008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, 0x0008, + 0x0284, 0x01a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x10e2, 0x01a0, 0x00a0, 0x01a0, 0x00a0, 0x00a0, + 0x00a0, 0x00a0, 0x10a0, 0x10a0, 0x00a0, 0x10e2, 0x00a0, 0x01a0, + 0x00a0, 0x10a0, 0x10e2, 0x01a0, 0x10a0, 0x10a0, 0x10a0, 0x01a0, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x00a0, + 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x14a2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x00a0, + 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2, 0x10e2 +}; #endif UChar* OnigEncAsciiToLowerCaseTable = (UChar* )0; @@ -121,23 +246,61 @@ static UChar BuiltInAsciiToLowerCaseTable[] = { }; #endif /* not USE_APPLICATION_TO_LOWER_CASE_TABLE */ +#ifdef USE_UPPER_CASE_TABLE +UChar OnigEncAsciiToUpperCaseTable[256] = { + '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', + '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', + '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', + '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', + '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', + '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', + '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', + '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', + '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107', + '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117', + '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127', + '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137', + '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107', + '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117', + '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127', + '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177', + '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', + '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', + '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', + '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', + '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', + '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', + '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', + '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', + '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307', + '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317', + '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327', + '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337', + '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', + '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', + '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', + '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377', +}; +#endif + unsigned short OnigEncAsciiCtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x18d0, - 0x10d0, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x10d0, 0x10d0, 0x10d0, 0x1004, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x220c, 0x2209, 0x2208, 0x2208, 0x2208, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, 0x2008, + 0x2284, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, 0x38b0, + 0x38b0, 0x38b0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x21a0, + 0x21a0, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x3ca2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, 0x34a2, + 0x34a2, 0x34a2, 0x34a2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x31a0, + 0x21a0, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x38e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, + 0x30e2, 0x30e2, 0x30e2, 0x21a0, 0x21a0, 0x21a0, 0x21a0, 0x2008, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, @@ -156,6 +319,78 @@ unsigned short OnigEncAsciiCtypeTable[256] = { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }; +UChar OnigEncISO_8859_1_ToLowerCaseTable[256] = { + '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', + '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', + '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', + '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', + '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', + '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', + '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', + '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', + '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147', + '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', + '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', + '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137', + '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147', + '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157', + '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167', + '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177', + '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', + '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', + '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', + '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', + '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', + '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', + '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', + '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', + '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', + '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', + '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\327', + '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\337', + '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347', + '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357', + '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367', + '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377' +}; + +#ifdef USE_UPPER_CASE_TABLE +UChar OnigEncISO_8859_1_ToUpperCaseTable[256] = { + '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', + '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017', + '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', + '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037', + '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047', + '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057', + '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067', + '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077', + '\100', '\101', '\102', '\103', '\104', '\105', '\106', '\107', + '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117', + '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127', + '\130', '\131', '\132', '\133', '\134', '\135', '\136', '\137', + '\140', '\101', '\102', '\103', '\104', '\105', '\106', '\107', + '\110', '\111', '\112', '\113', '\114', '\115', '\116', '\117', + '\120', '\121', '\122', '\123', '\124', '\125', '\126', '\127', + '\130', '\131', '\132', '\173', '\174', '\175', '\176', '\177', + '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207', + '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217', + '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227', + '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237', + '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247', + '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257', + '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267', + '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277', + '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307', + '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317', + '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327', + '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337', + '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307', + '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317', + '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\367', + '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\377', +}; +#endif + extern void onigenc_set_default_caseconv_table(UChar* table) { @@ -178,38 +413,230 @@ onigenc_get_left_adjust_char_head(OnigEncoding enc, UChar* start, UChar* s) return ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, start, s); } +OnigPairAmbigCodes OnigAsciiPairAmbigCodes[] = { + { 0x41, 0x61 }, + { 0x42, 0x62 }, + { 0x43, 0x63 }, + { 0x44, 0x64 }, + { 0x45, 0x65 }, + { 0x46, 0x66 }, + { 0x47, 0x67 }, + { 0x48, 0x68 }, + { 0x49, 0x69 }, + { 0x4a, 0x6a }, + { 0x4b, 0x6b }, + { 0x4c, 0x6c }, + { 0x4d, 0x6d }, + { 0x4e, 0x6e }, + { 0x4f, 0x6f }, + { 0x50, 0x70 }, + { 0x51, 0x71 }, + { 0x52, 0x72 }, + { 0x53, 0x73 }, + { 0x54, 0x74 }, + { 0x55, 0x75 }, + { 0x56, 0x76 }, + { 0x57, 0x77 }, + { 0x58, 0x78 }, + { 0x59, 0x79 }, + { 0x5a, 0x7a }, + + { 0x61, 0x41 }, + { 0x62, 0x42 }, + { 0x63, 0x43 }, + { 0x64, 0x44 }, + { 0x65, 0x45 }, + { 0x66, 0x46 }, + { 0x67, 0x47 }, + { 0x68, 0x48 }, + { 0x69, 0x49 }, + { 0x6a, 0x4a }, + { 0x6b, 0x4b }, + { 0x6c, 0x4c }, + { 0x6d, 0x4d }, + { 0x6e, 0x4e }, + { 0x6f, 0x4f }, + { 0x70, 0x50 }, + { 0x71, 0x51 }, + { 0x72, 0x52 }, + { 0x73, 0x53 }, + { 0x74, 0x54 }, + { 0x75, 0x55 }, + { 0x76, 0x56 }, + { 0x77, 0x57 }, + { 0x78, 0x58 }, + { 0x79, 0x59 }, + { 0x7a, 0x5a } +}; + +extern int +onigenc_ascii_get_all_pair_ambig_codes(OnigAmbigType flag, + OnigPairAmbigCodes** ccs) +{ + if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { + *ccs = OnigAsciiPairAmbigCodes; + return (sizeof(OnigAsciiPairAmbigCodes) / sizeof(OnigPairAmbigCodes)); + } + else { + return 0; + } +} + extern int -onigenc_nothing_get_all_fold_match_code(OnigCodePoint** codes) +onigenc_nothing_get_all_comp_ambig_codes(OnigAmbigType flag, + OnigCompAmbigCodes** ccs) { return 0; } extern int -onigenc_nothing_get_fold_match_info(UChar* p, UChar* end, - OnigEncFoldMatchInfo** info) +onigenc_iso_8859_1_get_all_pair_ambig_codes(OnigAmbigType flag, + OnigPairAmbigCodes** ccs) +{ + static OnigPairAmbigCodes cc[] = { + { 0xc0, 0xe0 }, + { 0xc1, 0xe1 }, + { 0xc2, 0xe2 }, + { 0xc3, 0xe3 }, + { 0xc4, 0xe4 }, + { 0xc5, 0xe5 }, + { 0xc6, 0xe6 }, + { 0xc7, 0xe7 }, + { 0xc8, 0xe8 }, + { 0xc9, 0xe9 }, + { 0xca, 0xea }, + { 0xcb, 0xeb }, + { 0xcc, 0xec }, + { 0xcd, 0xed }, + { 0xce, 0xee }, + { 0xcf, 0xef }, + + { 0xd0, 0xf0 }, + { 0xd1, 0xf1 }, + { 0xd2, 0xf2 }, + { 0xd3, 0xf3 }, + { 0xd4, 0xf4 }, + { 0xd5, 0xf5 }, + { 0xd6, 0xf6 }, + { 0xd8, 0xf8 }, + { 0xd9, 0xf9 }, + { 0xda, 0xfa }, + { 0xdb, 0xfb }, + { 0xdc, 0xfc }, + { 0xdd, 0xfd }, + { 0xde, 0xfe }, + + { 0xe0, 0xc0 }, + { 0xe1, 0xc1 }, + { 0xe2, 0xc2 }, + { 0xe3, 0xc3 }, + { 0xe4, 0xc4 }, + { 0xe5, 0xc5 }, + { 0xe6, 0xc6 }, + { 0xe7, 0xc7 }, + { 0xe8, 0xc8 }, + { 0xe9, 0xc9 }, + { 0xea, 0xca }, + { 0xeb, 0xcb }, + { 0xec, 0xcc }, + { 0xed, 0xcd }, + { 0xee, 0xce }, + { 0xef, 0xcf }, + + { 0xf0, 0xd0 }, + { 0xf1, 0xd1 }, + { 0xf2, 0xd2 }, + { 0xf3, 0xd3 }, + { 0xf4, 0xd4 }, + { 0xf5, 0xd5 }, + { 0xf6, 0xd6 }, + { 0xf8, 0xd8 }, + { 0xf9, 0xd9 }, + { 0xfa, 0xda }, + { 0xfb, 0xdb }, + { 0xfc, 0xdc }, + { 0xfd, 0xdd }, + { 0xfe, 0xde } + }; + + if (flag == ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) { + *ccs = OnigAsciiPairAmbigCodes; + return (sizeof(OnigAsciiPairAmbigCodes) / sizeof(OnigPairAmbigCodes)); + } + else if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) { + *ccs = cc; + return sizeof(cc) / sizeof(OnigPairAmbigCodes); + } + else + return 0; +} + +extern int +onigenc_ess_tsett_get_all_comp_ambig_codes(OnigAmbigType flag, + OnigCompAmbigCodes** ccs) { - return -1; + static OnigCompAmbigCodes folds[] = { + { 2, 0xdf, {{ 2, { 0x53, 0x53 } }, { 2, { 0x73, 0x73} } } } + }; + + if (flag == ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) { + *ccs = folds; + return sizeof(folds) / sizeof(OnigCompAmbigCodes); + } + else + return 0; } extern int -onigenc_nothing_get_ctype_code_range(int ctype, int* nsb, int* nmb, +onigenc_not_support_get_ctype_code_range(int ctype, int* nsb, int* nmb, OnigCodePointRange* sbr[], OnigCodePointRange* mbr[]) { - return -1; + return ONIG_NO_SUPPORT_CONFIG; +} + +extern int +onigenc_is_mbc_newline_0x0a(UChar* p, UChar* end) +{ + if (p < end) { + if (*p == 0x0a) return 1; + } + return 0; } /* for single byte encodings */ extern int -onigenc_ascii_mbc_to_lower(UChar* p, UChar* lower) +onigenc_ascii_mbc_to_normalize(OnigAmbigType flag, UChar** p, UChar*end, + UChar* lower) { - *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(**p); + } + else { + *lower = **p; + } + + (*p)++; return 1; /* return byte length of converted char to lower */ } extern int -onigenc_ascii_mbc_is_case_ambig(UChar* p) +onigenc_ascii_is_mbc_ambiguous(OnigAmbigType flag, UChar** pp, UChar* end) +{ + UChar* p = *pp; + + (*pp)++; + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); + } + else { + return FALSE; + } +} + +extern int +onigenc_single_byte_mbc_enc_len(UChar* p) { - return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); + return 1; } extern OnigCodePoint @@ -244,20 +671,25 @@ onigenc_single_byte_left_adjust_char_head(UChar* start, UChar* s) } extern int -onigenc_single_byte_is_allowed_reverse_match(UChar* s, UChar* end) +onigenc_always_true_is_allowed_reverse_match(UChar* s, UChar* end) { return TRUE; } +extern int +onigenc_always_false_is_allowed_reverse_match(UChar* s, UChar* end) +{ + return FALSE; +} + extern OnigCodePoint onigenc_mbn_mbc_to_code(OnigEncoding enc, UChar* p, UChar* end) { int c, i, len; OnigCodePoint n; - c = *p++; - len = enc_len(enc, c); - n = c; + len = enc_len(enc, p); + n = (OnigCodePoint )(*p++); if (len == 1) return n; for (i = 1; i < len; i++) { @@ -269,33 +701,52 @@ onigenc_mbn_mbc_to_code(OnigEncoding enc, UChar* p, UChar* end) } extern int -onigenc_mbn_mbc_to_lower(OnigEncoding enc, UChar* p, UChar* lower) +onigenc_mbn_mbc_to_normalize(OnigEncoding enc, OnigAmbigType flag, + UChar** pp, UChar* end, UChar* lower) { int len; + UChar *p = *pp; if (ONIGENC_IS_MBC_ASCII(p)) { - *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + (*pp)++; return 1; } else { - len = enc_len(enc, *p); + len = enc_len(enc, p); if (lower != p) { - /* memcpy(lower, p, len); */ int i; for (i = 0; i < len; i++) { *lower++ = *p++; } } + (*pp) += len; return len; /* return byte length of converted to lower char */ } } extern int -onigenc_mbn_mbc_is_case_ambig(UChar* p) +onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigAmbigType flag, + UChar** pp, UChar* end) { - if (ONIGENC_IS_MBC_ASCII(p)) - return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); + UChar* p = *pp; + if (ONIGENC_IS_MBC_ASCII(p)) { + (*pp)++; + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); + } + else { + return FALSE; + } + } + + (*pp) += enc_len(enc, p); return FALSE; } @@ -360,7 +811,7 @@ onigenc_mb2_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf) *p++ = (UChar )(code & 0xff); #if 1 - if (enc_len(enc, buf[0]) != (p - buf)) + if (enc_len(enc, buf) != (p - buf)) return ONIGENCERR_INVALID_WIDE_CHAR_VALUE; #endif return p - buf; @@ -383,23 +834,21 @@ onigenc_mb4_code_to_mbc(OnigEncoding enc, OnigCodePoint code, UChar *buf) *p++ = (UChar )(code & 0xff); #if 1 - if (enc_len(enc, buf[0]) != (p - buf)) + if (enc_len(enc, buf) != (p - buf)) return ONIGENCERR_INVALID_WIDE_CHAR_VALUE; #endif return p - buf; } extern int -onigenc_mb2_code_is_ctype(OnigEncoding enc, OnigCodePoint code, +onigenc_mb2_is_code_ctype(OnigEncoding enc, OnigCodePoint code, unsigned int ctype) { if ((ctype & ONIGENC_CTYPE_WORD) != 0) { if (code < 128) return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else { - int first = onigenc_mb2_code_to_mbc_first(code); - return (enc_len(enc, first) > 1 ? TRUE : FALSE); - } + else + return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE); ctype &= ~ONIGENC_CTYPE_WORD; if (ctype == 0) return FALSE; @@ -412,16 +861,14 @@ onigenc_mb2_code_is_ctype(OnigEncoding enc, OnigCodePoint code, } extern int -onigenc_mb4_code_is_ctype(OnigEncoding enc, OnigCodePoint code, +onigenc_mb4_is_code_ctype(OnigEncoding enc, OnigCodePoint code, unsigned int ctype) { if ((ctype & ONIGENC_CTYPE_WORD) != 0) { if (code < 128) return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); - else { - int first = onigenc_mb4_code_to_mbc_first(code); - return (enc_len(enc, first) > 1 ? TRUE : FALSE); - } + else + return (ONIGENC_CODE_TO_MBCLEN(enc, code) > 1 ? TRUE : FALSE); ctype &= ~ONIGENC_CTYPE_WORD; if (ctype == 0) return FALSE; @@ -433,6 +880,7 @@ onigenc_mb4_code_is_ctype(OnigEncoding enc, OnigCodePoint code, return FALSE; } +#if 0 extern int onigenc_get_all_fold_match_code_ss_0xdf(OnigCodePoint** codes) { @@ -440,33 +888,25 @@ onigenc_get_all_fold_match_code_ss_0xdf(OnigCodePoint** codes) *codes = list; return 1; } +#endif extern int -onigenc_get_fold_match_info_ss_0xdf(UChar* p, UChar* end, - OnigEncFoldMatchInfo** info) +onigenc_with_ascii_strncmp(OnigEncoding enc, UChar* p, UChar* end, + UChar* sascii /* ascii */, int n) { - /* German alphabet ess-tsett(U+00DF) */ - static OnigEncFoldMatchInfo ss = { - 3, - { 1, 2, 2 }, - { "\337", "ss", "SS" } /* 0337: 0xdf */ - }; + int x, c; - if (p >= end) return -1; + while (n-- > 0) { + if (p >= end) return (int )(*sascii); - if (*p == 0xdf) { - *info = &ss; - return 1; - } - else if (p + 1 < end) { - if ((*p == 'S' && *(p+1) == 'S') || - (*p == 's' && *(p+1) == 's')) { - *info = &ss; - return 2; - } - } + c = (int )ONIGENC_MBC_TO_CODE(enc, p, end); + x = *sascii - c; + if (x) return x; - return -1; /* is not a fold string. */ + sascii++; + p += enc_len(enc, p); + } + return 0; } #else /* ONIG_RUBY_M17N */ @@ -475,6 +915,10 @@ extern int onigenc_is_code_ctype(OnigEncoding enc, OnigCodePoint code, int ctype) { switch (ctype) { + case ONIGENC_CTYPE_NEWLINE: + if (code == 0x0a) return 1; + break; + case ONIGENC_CTYPE_ALPHA: return m17n_isalpha(enc, code); break; @@ -548,12 +992,22 @@ onigenc_mbc_to_lower(OnigEncoding enc, UChar* p, UChar* buf) } extern int -onigenc_mbc_is_case_ambig(OnigEncoding enc, UChar* p) +onigenc_is_mbc_ambiguous(OnigEncoding enc, OnigAmbigType flag, + UChar** pp, UChar* end) { - unsigned int c = m17n_codepoint(enc, p, p + enc_len(enc, *p)); + int len; + unsigned int c; + UChar* p = *pp; + + len = enc_len(enc, *p); + (*pp) += len; + c = m17n_codepoint(enc, p, p + len); + + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + if (m17n_isupper(enc, c) || m17n_islower(enc, c)) + return TRUE; + } - if (m17n_isupper(enc, c) || m17n_islower(enc, c)) - return TRUE; return FALSE; } @@ -1,12 +1,33 @@ +#ifndef REGENC_H +#define REGENC_H /********************************************************************** - regenc.h - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ -#ifndef REGENC_H -#define REGENC_H +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ #ifndef RUBY_PLATFORM #include "config.h" @@ -31,8 +52,6 @@ #define ONIGENCERR_INVALID_WIDE_CHAR_VALUE -400 #define ONIGENCERR_TOO_BIG_WIDE_CHAR_VALUE -401 -#define ONIG_NEWLINE '\n' -#define ONIG_IS_NEWLINE(c) ((c) == ONIG_NEWLINE) #define ONIG_IS_NULL(p) (((void*)(p)) == (void*)0) #define ONIG_IS_NOT_NULL(p) (((void*)(p)) != (void*)0) #define ONIG_CHECK_NULL_RETURN(p) if (ONIG_IS_NULL(p)) return NULL @@ -48,44 +67,72 @@ #define ONIG_ENCODING_INIT_DEFAULT ONIG_ENCODING_ASCII /* for encoding system implementation (internal) */ -ONIG_EXTERN int onigenc_nothing_get_all_fold_match_code P_((OnigCodePoint** codes)); -ONIG_EXTERN int onigenc_nothing_get_fold_match_info P_((UChar* p, UChar* end, OnigEncFoldMatchInfo** info)); -ONIG_EXTERN int onigenc_nothing_get_ctype_code_range P_((int ctype, int* nsb, int* nmb, OnigCodePointRange* sbr[], OnigCodePointRange* mbr[])); +ONIG_EXTERN int onigenc_ascii_get_all_pair_ambig_codes P_((OnigAmbigType flag, OnigPairAmbigCodes** acs)); +ONIG_EXTERN int onigenc_nothing_get_all_comp_ambig_codes P_((OnigAmbigType flag, OnigCompAmbigCodes** acs)); +ONIG_EXTERN int onigenc_iso_8859_1_get_all_pair_ambig_codes P_((OnigAmbigType flag, OnigPairAmbigCodes** acs)); +ONIG_EXTERN int onigenc_ess_tsett_get_all_comp_ambig_codes P_((OnigAmbigType flag, OnigCompAmbigCodes** acs)); +ONIG_EXTERN int onigenc_not_support_get_ctype_code_range P_((int ctype, int* nsb, int* nmb, OnigCodePointRange* sbr[], OnigCodePointRange* mbr[])); +ONIG_EXTERN int onigenc_is_mbc_newline_0x0a P_((UChar* p, UChar* end)); /* methods for single byte encoding */ -ONIG_EXTERN int onigenc_ascii_mbc_to_lower P_((UChar* p, UChar* lower)); -ONIG_EXTERN int onigenc_ascii_mbc_is_case_ambig P_((UChar* p)); +ONIG_EXTERN int onigenc_ascii_mbc_to_normalize P_((OnigAmbigType flag, UChar** p, UChar* end, UChar* lower)); +ONIG_EXTERN int onigenc_ascii_is_mbc_ambiguous P_((OnigAmbigType flag, UChar** p, UChar* end)); +ONIG_EXTERN int onigenc_single_byte_mbc_enc_len P_((UChar* p)); ONIG_EXTERN OnigCodePoint onigenc_single_byte_mbc_to_code P_((UChar* p, UChar* end)); ONIG_EXTERN int onigenc_single_byte_code_to_mbclen P_((OnigCodePoint code)); ONIG_EXTERN int onigenc_single_byte_code_to_mbc_first P_((OnigCodePoint code)); ONIG_EXTERN int onigenc_single_byte_code_to_mbc P_((OnigCodePoint code, UChar *buf)); ONIG_EXTERN UChar* onigenc_single_byte_left_adjust_char_head P_((UChar* start, UChar* s)); -ONIG_EXTERN int onigenc_single_byte_is_allowed_reverse_match P_((UChar* s, UChar* end)); +ONIG_EXTERN int onigenc_always_true_is_allowed_reverse_match P_((UChar* s, UChar* end)); +ONIG_EXTERN int onigenc_always_false_is_allowed_reverse_match P_((UChar* s, UChar* end)); /* methods for multi byte encoding */ ONIG_EXTERN OnigCodePoint onigenc_mbn_mbc_to_code P_((OnigEncoding enc, UChar* p, UChar* end)); -ONIG_EXTERN int onigenc_mbn_mbc_to_lower P_((OnigEncoding enc, UChar* p, UChar* lower)); -ONIG_EXTERN int onigenc_mbn_mbc_is_case_ambig P_((UChar* p)); +ONIG_EXTERN int onigenc_mbn_mbc_to_normalize P_((OnigEncoding enc, OnigAmbigType flag, UChar** p, UChar* end, UChar* lower)); +ONIG_EXTERN int onigenc_mbn_is_mbc_ambiguous P_((OnigEncoding enc, OnigAmbigType flag, UChar** p, UChar* end)); ONIG_EXTERN int onigenc_mb2_code_to_mbclen P_((OnigCodePoint code)); ONIG_EXTERN int onigenc_mb2_code_to_mbc_first P_((OnigCodePoint code)); ONIG_EXTERN int onigenc_mb2_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); -ONIG_EXTERN int onigenc_mb2_code_is_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); +ONIG_EXTERN int onigenc_mb2_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); ONIG_EXTERN int onigenc_mb4_code_to_mbclen P_((OnigCodePoint code)); ONIG_EXTERN int onigenc_mb4_code_to_mbc_first P_((OnigCodePoint code)); ONIG_EXTERN int onigenc_mb4_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf)); -ONIG_EXTERN int onigenc_mb4_code_is_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); +ONIG_EXTERN int onigenc_mb4_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, unsigned int ctype)); ONIG_EXTERN int onigenc_get_all_fold_match_code_ss_0xdf P_((OnigCodePoint** codes)); -ONIG_EXTERN int onigenc_get_fold_match_info_ss_0xdf P_((UChar* p, UChar* end, OnigEncFoldMatchInfo** info)); + +/* in enc/unicode.c */ +ONIG_EXTERN int onigenc_unicode_is_code_ctype P_((OnigCodePoint code, unsigned int ctype)); +ONIG_EXTERN int onigenc_unicode_get_ctype_code_range P_((int ctype, int* nsb, int* nmb, OnigCodePointRange* sbr[], OnigCodePointRange* mbr[])); + + +#define ONIGENC_ISO_8859_1_TO_LOWER_CASE(c) \ + OnigEncISO_8859_1_ToLowerCaseTable[c] +#define ONIGENC_ISO_8859_1_TO_UPPER_CASE(c) \ + OnigEncISO_8859_1_ToUpperCaseTable[c] +#define ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code,ctype) \ + ((OnigEnc_Unicode_ISO_8859_1_CtypeTable[code] & ctype) != 0) + +ONIG_EXTERN UChar OnigEncISO_8859_1_ToLowerCaseTable[]; +ONIG_EXTERN UChar OnigEncISO_8859_1_ToUpperCaseTable[]; +ONIG_EXTERN unsigned short OnigEnc_Unicode_ISO_8859_1_CtypeTable[]; +ONIG_EXTERN OnigPairAmbigCodes OnigAsciiPairAmbigCodes[]; #endif /* is not ONIG_RUBY_M17N */ +ONIG_EXTERN int +onigenc_with_ascii_strncmp P_((OnigEncoding enc, UChar* p, UChar* end, UChar* sascii /* ascii */, int n)); +ONIG_EXTERN UChar* +onigenc_step P_((OnigEncoding enc, UChar* p, UChar* end, int n)); + ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding; ONIG_EXTERN UChar* OnigEncAsciiToLowerCaseTable; +ONIG_EXTERN UChar OnigEncAsciiToUpperCaseTable[]; ONIG_EXTERN unsigned short OnigEncAsciiCtypeTable[]; #define ONIGENC_ASCII_CODE_TO_LOWER_CASE(c) OnigEncAsciiToLowerCaseTable[c] +#define ONIGENC_ASCII_CODE_TO_UPPER_CASE(c) OnigEncAsciiToUpperCaseTable[c] #define ONIGENC_IS_ASCII_CODE_CTYPE(code,ctype) \ ((OnigEncAsciiCtypeTable[code] & ctype) != 0) #define ONIGENC_IS_ASCII_CODE_CASE_AMBIG(code) \ diff --git a/regerror.c b/regerror.c index c7a2a7b7ea..50ce8fd9fe 100644 --- a/regerror.c +++ b/regerror.c @@ -1,10 +1,32 @@ /********************************************************************** - regerror.c - Oniguruma (regular expression library) - - Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regint.h" #include <stdio.h> /* for vsnprintf() */ @@ -146,6 +168,8 @@ onig_error_code_to_format(int code) p = "group number is too big for capture history"; break; case ONIGERR_INVALID_CHAR_PROPERTY_NAME: p = "invalid character property name {%n}"; break; + case ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION: + p = "not supported encoding combination"; break; case ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT: p = "over thread pass limit count"; break; @@ -219,7 +243,7 @@ onig_error_code_to_str(s, code, va_alist) default: q = onig_error_code_to_format(code); - len = strlen(q); + len = onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, q); xmemcpy(s, q, len); s[len] = '\0'; break; @@ -246,7 +270,8 @@ onig_snprintf_with_pattern(buf, bufsize, enc, pat, pat_end, fmt, va_alist) #endif { int n, need, len; - UChar *p, *s; + UChar *p, *s, *bp; + char bs[6]; va_list args; va_init_list(args, fmt); @@ -257,29 +282,41 @@ onig_snprintf_with_pattern(buf, bufsize, enc, pat, pat_end, fmt, va_alist) if (n + need < bufsize) { strcat(buf, ": /"); - s = buf + strlen(buf); + s = buf + onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, buf); p = pat; while (p < (UChar* )pat_end) { - if (*p == MC_ESC) { + if (*p == MC_ESC(enc)) { *s++ = *p++; - len = enc_len(enc, *p); + len = enc_len(enc, p); while (len-- > 0) *s++ = *p++; } else if (*p == '/') { - *s++ = MC_ESC; + *s++ = (unsigned char )MC_ESC(enc); *s++ = *p++; } - else if (ONIGENC_IS_MBC_HEAD(enc, *p)) { - len = enc_len(enc, *p); - while (len-- > 0) *s++ = *p++; + else if (ONIGENC_IS_MBC_HEAD(enc, p)) { + len = enc_len(enc, p); + if (ONIGENC_MBC_MINLEN(enc) == 1) { + while (len-- > 0) *s++ = *p++; + } + else { /* for UTF16 */ + int blen; + + while (len-- > 0) { + sprintf(bs, "\\%03o", *p++ & 0377); + blen = onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, bs); + bp = bs; + while (blen-- > 0) *s++ = *bp++; + } + } } else if (!ONIGENC_IS_CODE_PRINT(enc, *p) && !ONIGENC_IS_CODE_SPACE(enc, *p)) { - char b[5]; - sprintf(b, "\\%03o", *p & 0377); - len = strlen(b); - while (len-- > 0) *s++ = *p++; + sprintf(bs, "\\%03o", *p++ & 0377); + len = onigenc_str_bytelen_null(ONIG_ENCODING_ASCII, bs); + bp = bs; + while (len-- > 0) *s++ = *bp++; } else { *s++ = *p++; @@ -1,10 +1,32 @@ /********************************************************************** - reggnu.c - Oniguruma (regular expression library) - - Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regint.h" #ifndef ONIGGNU_H /* name changes from oniggnu.h to regex.h in ruby. */ @@ -17,10 +39,6 @@ #endif #endif -#ifndef NULL -#define NULL ((void* )0) -#endif - extern void re_free_registers(OnigRegion* r) { @@ -111,7 +129,9 @@ re_free_pattern(regex_t* reg) extern int re_alloc_pattern(regex_t** reg) { - return onig_alloc_init(reg, ONIG_OPTION_DEFAULT, OnigEncDefaultCharEncoding, + return onig_alloc_init(reg, ONIG_OPTION_DEFAULT, + ONIGENC_AMBIGUOUS_MATCH_DEFAULT, + OnigEncDefaultCharEncoding, OnigDefaultSyntax); } @@ -1,12 +1,53 @@ /********************************************************************** - sjis.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" +static int EncLen_SJIS[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1 +}; + static const char SJIS_CAN_BE_TRAIL_TABLE[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -26,17 +67,39 @@ static const char SJIS_CAN_BE_TRAIL_TABLE[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0 }; -#define SJIS_ISMB_FIRST(byte) (OnigEncodingSJIS.len_table[byte] > 1) +#define SJIS_ISMB_FIRST(byte) (EncLen_SJIS[byte] > 1) #define SJIS_ISMB_TRAIL(byte) SJIS_CAN_BE_TRAIL_TABLE[(byte)] +static int +sjis_mbc_enc_len(UChar* p) +{ + return EncLen_SJIS[*p]; +} + +extern int +sjis_code_to_mbclen(OnigCodePoint code) +{ + if (code < 256) { + if (EncLen_SJIS[(int )code] == 1) + return 1; + else + return 0; + } + else if (code <= 0xffff) { + return 2; + } + else + return 0; +} + static OnigCodePoint sjis_mbc_to_code(UChar* p, UChar* end) { int c, i, len; OnigCodePoint n; + len = enc_len(ONIG_ENCODING_SJIS, p); c = *p++; - len = enc_len(ONIG_ENCODING_SJIS, c); n = c; if (len == 1) return n; @@ -57,43 +120,57 @@ sjis_code_to_mbc(OnigCodePoint code, UChar *buf) *p++ = (UChar )(code & 0xff); #if 0 - if (enc_len(ONIG_ENCODING_SJIS, buf[0]) != (p - buf)) + if (enc_len(ONIG_ENCODING_SJIS, buf) != (p - buf)) return REGERR_INVALID_WIDE_CHAR_VALUE; #endif return p - buf; } static int -sjis_mbc_to_lower(UChar* p, UChar* lower) +sjis_mbc_to_normalize(OnigAmbigType flag, UChar** pp, UChar* end, UChar* lower) { - int len; + UChar* p = *pp; if (ONIGENC_IS_MBC_ASCII(p)) { - *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + + (*pp)++; return 1; } else { - len = enc_len(ONIG_ENCODING_SJIS, *p); + int len = enc_len(ONIG_ENCODING_SJIS, p); + if (lower != p) { - /* memcpy(lower, p, len); */ int i; for (i = 0; i < len; i++) { *lower++ = *p++; } } + (*pp) += len; return len; /* return byte length of converted char to lower */ } } static int -sjis_code_is_ctype(OnigCodePoint code, unsigned int ctype) +sjis_is_mbc_ambiguous(OnigAmbigType flag, UChar** pp, UChar* end) +{ + return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_SJIS, flag, pp, end); + +} + +static int +sjis_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if ((ctype & ONIGENC_CTYPE_WORD) != 0) { if (code < 128) return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype); else { - int first = onigenc_mb2_code_to_mbc_first(code); - return (enc_len(ONIG_ENCODING_SJIS, first) > 1 ? TRUE : FALSE); + return (sjis_code_to_mbclen(code) > 1 ? TRUE : FALSE); } ctype &= ~ONIGENC_CTYPE_WORD; @@ -123,7 +200,7 @@ sjis_left_adjust_char_head(UChar* start, UChar* s) } } } - len = enc_len(ONIG_ENCODING_SJIS, *p); + len = enc_len(ONIG_ENCODING_SJIS, p); if (p + len > s) return p; p += len; return p + ((s - p) & ~1); @@ -137,38 +214,29 @@ sjis_is_allowed_reverse_match(UChar* s, UChar* end) } OnigEncodingType OnigEncodingSJIS = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1 - }, + sjis_mbc_enc_len, "Shift_JIS", /* name */ 2, /* max byte length */ - FALSE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */ - FALSE, /* is continuous sb mb codepoint */ + 1, /* min byte length */ + ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE, + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, sjis_mbc_to_code, - onigenc_mb2_code_to_mbclen, + sjis_code_to_mbclen, sjis_code_to_mbc, - sjis_mbc_to_lower, - onigenc_mbn_mbc_is_case_ambig, - sjis_code_is_ctype, - onigenc_nothing_get_ctype_code_range, + sjis_mbc_to_normalize, + sjis_is_mbc_ambiguous, + onigenc_ascii_get_all_pair_ambig_codes, + onigenc_nothing_get_all_comp_ambig_codes, + sjis_is_code_ctype, + onigenc_not_support_get_ctype_code_range, sjis_left_adjust_char_head, - sjis_is_allowed_reverse_match, - onigenc_nothing_get_all_fold_match_code, - onigenc_nothing_get_fold_match_info + sjis_is_allowed_reverse_match }; @@ -1,60 +1,78 @@ /********************************************************************** - utf8.c - Oniguruma (regular expression library) - - Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regenc.h" +#define USE_INVALID_CODE_SCHEME + +#ifdef USE_INVALID_CODE_SCHEME +/* virtual codepoint values for invalid encoding byte 0xfe and 0xff */ +#define INVALID_CODE_FE 0xfffffffe +#define INVALID_CODE_FF 0xffffffff +#define VALID_CODE_LIMIT 0x7fffffff +#endif + #define utf8_islead(c) ((UChar )((c) & 0xc0) != 0x80) -#define ENC_IS_ISO_8859_1_CTYPE(code,ctype) \ - ((EncUnicode_ISO_8859_1_CtypeTable[code] & ctype) != 0) - -static unsigned short EncUnicode_ISO_8859_1_CtypeTable[256] = { - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1106, 0x1104, 0x1104, 0x1104, 0x1104, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x10d0, 0x10d0, 0x1050, 0x10d0, 0x10d0, 0x10d0, - 0x10d0, 0x10d0, 0x10d0, 0x1050, 0x10d0, 0x10d0, 0x10d0, 0x10d0, - 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, 0x1c58, - 0x1c58, 0x1c58, 0x10d0, 0x10d0, 0x1050, 0x1050, 0x1050, 0x10d0, - 0x10d0, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1e51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x10d0, 0x10d0, 0x10d0, 0x1050, 0x18d0, - 0x1050, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1c71, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x10d0, 0x1050, 0x10d0, 0x1050, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, 0x1004, - 0x1142, 0x10d0, 0x1050, 0x1050, 0x1050, 0x1050, 0x1050, 0x1050, - 0x1050, 0x1050, 0x1871, 0x10d0, 0x1050, 0x10d0, 0x1050, 0x1050, - 0x1050, 0x1050, 0x1850, 0x1850, 0x1050, 0x1871, 0x1050, 0x10d0, - 0x1050, 0x1850, 0x1871, 0x10d0, 0x1850, 0x1850, 0x1850, 0x10d0, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1050, - 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1a51, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1050, - 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871, 0x1871 +static int EncLen_UTF8[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 }; +static int +utf8_mbc_enc_len(UChar* p) +{ + return EncLen_UTF8[*p]; +} + static OnigCodePoint utf8_mbc_to_code(UChar* p, UChar* end) { int c, len; OnigCodePoint n; + len = enc_len(ONIG_ENCODING_UTF8, p); c = *p++; - len = enc_len(ONIG_ENCODING_UTF8, c); if (len > 1) { len--; n = c & ((1 << (6 - len)) - 1); @@ -64,8 +82,14 @@ utf8_mbc_to_code(UChar* p, UChar* end) } return n; } - else + else { +#ifdef USE_INVALID_CODE_SCHEME + if (c > 0xfd) { + return ((c == 0xfe) ? INVALID_CODE_FE : INVALID_CODE_FF); + } +#endif return (OnigCodePoint )c; + } } static int @@ -81,6 +105,10 @@ utf8_code_to_mbclen(OnigCodePoint code) else if ((code & 0xffe00000) == 0) return 4; else if ((code & 0xfc000000) == 0) return 5; else if ((code & 0x80000000) == 0) return 6; +#ifdef USE_INVALID_CODE_SCHEME + else if (code == INVALID_CODE_FE) return 1; + else if (code == INVALID_CODE_FF) return 1; +#endif else return ONIGENCERR_TOO_BIG_WIDE_CHAR_VALUE; } @@ -147,6 +175,16 @@ utf8_code_to_mbc(OnigCodePoint code, UChar *buf) *p++ = UTF8_TRAILS(code, 12); *p++ = UTF8_TRAILS(code, 6); } +#ifdef USE_INVALID_CODE_SCHEME + else if (code == INVALID_CODE_FE) { + *p = 0xfe; + return 1; + } + else if (code == INVALID_CODE_FF) { + *p = 0xff; + return 1; + } +#endif else { return ONIGENCERR_TOO_BIG_WIDE_CHAR_VALUE; } @@ -157,49 +195,129 @@ utf8_code_to_mbc(OnigCodePoint code, UChar *buf) } static int -utf8_mbc_to_lower(UChar* p, UChar* lower) +utf8_mbc_to_normalize(OnigAmbigType flag, UChar** pp, UChar* end, UChar* lower) { - int len; + UChar* p = *pp; - /* !!! U+0080 - U+00ff is treated by fold match. !!! */ if (ONIGENC_IS_MBC_ASCII(p)) { - *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + if (end > p + 1 && + (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0 && + ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S')))) { + *lower++ = '\303'; + *lower = '\237'; + (*pp) += 2; + return 2; + } + + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p); + } + else { + *lower = *p; + } + (*pp)++; return 1; /* return byte length of converted char to lower */ } else { - len = enc_len(ONIG_ENCODING_UTF8, *p); + int len; + + if (*p == 195) { /* 195 == '\303' */ + int c = *(p + 1); + if (c >= 128) { + if (c <= '\236' && /* upper */ + (flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0) { + if (c != '\227') { + *lower++ = *p; + *lower = (UChar )(c + 32); + (*pp) += 2; + return 2; + } + } +#if 0 + else if (c == '\237' && + (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + *lower++ = '\303'; + *lower = '\237'; + (*pp) += 2; + return 2; + } +#endif + } + } + + len = enc_len(ONIG_ENCODING_UTF8, p); if (lower != p) { - /* memcpy(lower, p, len); */ int i; for (i = 0; i < len; i++) { *lower++ = *p++; } } + (*pp) += len; return len; /* return byte length of converted char to lower */ } } static int -utf8_mbc_is_case_ambig(UChar* p) +utf8_is_mbc_ambiguous(OnigAmbigType flag, UChar** pp, UChar* end) { - /* !!! U+0080 - U+00ff ( 0x80[0xc2,0x80] - 0xff[0xc3,0xbf] ) - is treated by fold match. !!! */ + UChar* p = *pp; - if (ONIGENC_IS_MBC_ASCII(p)) - return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); + if (ONIGENC_IS_MBC_ASCII(p)) { + if (end > p + 1 && + (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0 && + ((*p == 's' && *(p+1) == 's') || + ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0 && + (*p == 'S' && *(p+1) == 'S')))) { + (*pp) += 2; + return TRUE; + } + + (*pp)++; + if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) { + return ONIGENC_IS_ASCII_CODE_CASE_AMBIG(*p); + } + } + else { + (*pp) += enc_len(ONIG_ENCODING_UTF8, p); + + if (*p == 195) { /* 195 == '\303' */ + int c = *(p + 1); + if (c >= 128) { + if ((flag & ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE) != 0) { + if (c <= '\236') { /* upper */ + if (c == '\227') return FALSE; + return TRUE; + } + else if (c >= '\240' && c <= '\276') { /* lower */ + if (c == '\267') return FALSE; + return TRUE; + } + } + else if (c == '\237' && + (flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + return TRUE; + } + } + } + } return FALSE; } static int -utf8_code_is_ctype(OnigCodePoint code, unsigned int ctype) +utf8_is_code_ctype(OnigCodePoint code, unsigned int ctype) { if (code < 256) { - return ENC_IS_ISO_8859_1_CTYPE(code, ctype); + return ONIGENC_IS_UNICODE_ISO_8859_1_CTYPE(code, ctype); } if ((ctype & ONIGENC_CTYPE_WORD) != 0) { - return TRUE; +#ifdef USE_INVALID_CODE_SCHEME + if (code <= VALID_CODE_LIMIT) +#endif + return TRUE; } return FALSE; @@ -223,22 +341,17 @@ utf8_get_ctype_code_range(int ctype, int* nsb, int* nmb, } while (0) static OnigCodePointRange SBAlpha[] = { - { 0x41, 0x5a }, - { 0x61, 0x7a } + { 0x41, 0x5a }, { 0x61, 0x7a } }; static OnigCodePointRange MBAlpha[] = { - { 0xaa, 0xaa }, - { 0xb5, 0xb5 }, - { 0xba, 0xba }, - { 0xc0, 0xd6 }, - { 0xd8, 0xf6 }, - { 0xf8, 0x220 } + { 0xaa, 0xaa }, { 0xb5, 0xb5 }, + { 0xba, 0xba }, { 0xc0, 0xd6 }, + { 0xd8, 0xf6 }, { 0xf8, 0x220 } }; static OnigCodePointRange SBBlank[] = { - { 0x09, 0x09 }, - { 0x20, 0x20 } + { 0x09, 0x09 }, { 0x20, 0x20 } }; static OnigCodePointRange MBBlank[] = { @@ -246,8 +359,7 @@ utf8_get_ctype_code_range(int ctype, int* nsb, int* nmb, }; static OnigCodePointRange SBCntrl[] = { - { 0x00, 0x1f }, - { 0x7f, 0x7f } + { 0x00, 0x1f }, { 0x7f, 0x7f } }; static OnigCodePointRange MBCntrl[] = { @@ -271,10 +383,8 @@ utf8_get_ctype_code_range(int ctype, int* nsb, int* nmb, }; static OnigCodePointRange MBLower[] = { - { 0xaa, 0xaa }, - { 0xb5, 0xb5 }, - { 0xba, 0xba }, - { 0xdf, 0xf6 }, + { 0xaa, 0xaa }, { 0xb5, 0xb5 }, + { 0xba, 0xba }, { 0xdf, 0xf6 }, { 0xf8, 0xff } }; @@ -287,29 +397,21 @@ utf8_get_ctype_code_range(int ctype, int* nsb, int* nmb, }; static OnigCodePointRange SBPunct[] = { - { 0x21, 0x23 }, - { 0x25, 0x2a }, - { 0x2c, 0x2f }, - { 0x3a, 0x3b }, - { 0x3f, 0x40 }, - { 0x5b, 0x5d }, - { 0x5f, 0x5f }, - { 0x7b, 0x7b }, + { 0x21, 0x23 }, { 0x25, 0x2a }, + { 0x2c, 0x2f }, { 0x3a, 0x3b }, + { 0x3f, 0x40 }, { 0x5b, 0x5d }, + { 0x5f, 0x5f }, { 0x7b, 0x7b }, { 0x7d, 0x7d } }; static OnigCodePointRange MBPunct[] = { - { 0xa1, 0xa1 }, - { 0xab, 0xab }, - { 0xad, 0xad }, - { 0xb7, 0xb7 }, - { 0xbb, 0xbb }, - { 0xbf, 0xbf } + { 0xa1, 0xa1 }, { 0xab, 0xab }, + { 0xad, 0xad }, { 0xb7, 0xb7 }, + { 0xbb, 0xbb }, { 0xbf, 0xbf } }; static OnigCodePointRange SBSpace[] = { - { 0x09, 0x0d }, - { 0x20, 0x20 } + { 0x09, 0x0d }, { 0x20, 0x20 } }; static OnigCodePointRange MBSpace[] = { @@ -321,30 +423,23 @@ utf8_get_ctype_code_range(int ctype, int* nsb, int* nmb, }; static OnigCodePointRange MBUpper[] = { - { 0xc0, 0xd6 }, - { 0xd8, 0xde } + { 0xc0, 0xd6 }, { 0xd8, 0xde } }; static OnigCodePointRange SBXDigit[] = { - { 0x30, 0x39 }, - { 0x41, 0x46 }, + { 0x30, 0x39 }, { 0x41, 0x46 }, { 0x61, 0x66 } }; static OnigCodePointRange SBWord[] = { - { 0x30, 0x39 }, - { 0x41, 0x5a }, - { 0x5f, 0x5f }, - { 0x61, 0x7a } + { 0x30, 0x39 }, { 0x41, 0x5a }, + { 0x5f, 0x5f }, { 0x61, 0x7a } }; static OnigCodePointRange MBWord[] = { - { 0xaa, 0xaa }, - { 0xb2, 0xb3 }, - { 0xb5, 0xb5 }, - { 0xb9, 0xba }, - { 0xbc, 0xbe }, - { 0xc0, 0xd6 }, + { 0xaa, 0xaa }, { 0xb2, 0xb3 }, + { 0xb5, 0xb5 }, { 0xb9, 0xba }, + { 0xbc, 0xbe }, { 0xc0, 0xd6 }, { 0xd8, 0xf6 }, #if 0 { 0xf8, 0x220 } @@ -358,18 +453,14 @@ utf8_get_ctype_code_range(int ctype, int* nsb, int* nmb, }; static OnigCodePointRange SBAlnum[] = { - { 0x30, 0x39 }, - { 0x41, 0x5a }, + { 0x30, 0x39 }, { 0x41, 0x5a }, { 0x61, 0x7a } }; static OnigCodePointRange MBAlnum[] = { - { 0xaa, 0xaa }, - { 0xb5, 0xb5 }, - { 0xba, 0xba }, - { 0xc0, 0xd6 }, - { 0xd8, 0xf6 }, - { 0xf8, 0x220 } + { 0xaa, 0xaa }, { 0xb5, 0xb5 }, + { 0xba, 0xba }, { 0xc0, 0xd6 }, + { 0xd8, 0xf6 }, { 0xf8, 0x220 } }; switch (ctype) { @@ -424,92 +515,6 @@ utf8_get_ctype_code_range(int ctype, int* nsb, int* nmb, return 0; } -static int -utf8_get_all_fold_match_code(OnigCodePoint** codes) -{ - static OnigCodePoint list[] = { - 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, - 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, - 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, - 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, - - 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, - 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, - 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, - 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, - }; - - *codes = list; - return sizeof(list) / sizeof(OnigCodePoint); -} - -static int -utf8_get_fold_match_info(UChar* p, UChar* end, OnigEncFoldMatchInfo** info) -{ - - static OnigEncFoldMatchInfo xc[] = { - { 2, { 2, 2 }, { "\303\200", "\303\240" } }, /* CodePoint 0xc0 */ - { 2, { 2, 2 }, { "\303\201", "\303\241" } }, - { 2, { 2, 2 }, { "\303\202", "\303\242" } }, - { 2, { 2, 2 }, { "\303\203", "\303\243" } }, - { 2, { 2, 2 }, { "\303\204", "\303\244" } }, - { 2, { 2, 2 }, { "\303\205", "\303\245" } }, - { 2, { 2, 2 }, { "\303\206", "\303\246" } }, - { 2, { 2, 2 }, { "\303\207", "\303\247" } }, - { 2, { 2, 2 }, { "\303\210", "\303\250" } }, - { 2, { 2, 2 }, { "\303\211", "\303\251" } }, - { 2, { 2, 2 }, { "\303\212", "\303\252" } }, - { 2, { 2, 2 }, { "\303\213", "\303\253" } }, - { 2, { 2, 2 }, { "\303\214", "\303\254" } }, - { 2, { 2, 2 }, { "\303\215", "\303\255" } }, - { 2, { 2, 2 }, { "\303\216", "\303\256" } }, - { 2, { 2, 2 }, { "\303\217", "\303\257" } }, - { 2, { 2, 2 }, { "\303\220", "\303\260" } }, /* CodePoint 0xd0 */ - { 2, { 2, 2 }, { "\303\221", "\303\261" } }, - { 2, { 2, 2 }, { "\303\222", "\303\262" } }, - { 2, { 2, 2 }, { "\303\223", "\303\263" } }, - { 2, { 2, 2 }, { "\303\224", "\303\264" } }, - { 2, { 2, 2 }, { "\303\225", "\303\265" } }, - { 2, { 2, 2 }, { "\303\226", "\303\266" } }, - { 0, { 0 }, { "" } }, - { 2, { 2, 2 }, { "\303\230", "\303\270" } }, - { 2, { 2, 2 }, { "\303\231", "\303\271" } }, - { 2, { 2, 2 }, { "\303\232", "\303\272" } }, - { 2, { 2, 2 }, { "\303\233", "\303\273" } }, - { 2, { 2, 2 }, { "\303\234", "\303\274" } }, - { 2, { 2, 2 }, { "\303\235", "\303\275" } }, - { 2, { 2, 2 }, { "\303\236", "\303\276" } }, - { 3, { 2, 2, 2 }, { "\303\237", "ss", "SS" }} /* ess-tsett(U+00DF) */ - }; - - if (p + 1 >= end) return -1; - if (*p < 0x80) { - if ((*p == 'S' && *(p+1) == 'S') || - (*p == 's' && *(p+1) == 's')) { - *info = &(xc[0xdf - 0xc0]); - return 2; - } - } - else if (*p == 195) { /* 195 == '\303' */ - int c = *(p+1); - if (c >= 128) { - if (c <= 159) { /* upper */ - if (c == 151) return -1; /* 0xd7 */ - *info = &(xc[c - 128]); - return 2; - } - else { /* lower */ - if (c == 183) return -1; /* 0xf7 */ - *info = &(xc[c - 160]); - return 2; - } - } - } - - return -1; /* is not a fold string. */ -} - - static UChar* utf8_left_adjust_char_head(UChar* start, UChar* s) { @@ -522,45 +527,32 @@ utf8_left_adjust_char_head(UChar* start, UChar* s) return p; } -static int -utf8_is_allowed_reverse_match(UChar* s, UChar* end) -{ - return TRUE; -} - OnigEncodingType OnigEncodingUTF8 = { - { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 - }, + utf8_mbc_enc_len, "UTF-8", /* name */ 6, /* max byte length */ - TRUE, /* is_fold_match */ - ONIGENC_CTYPE_SUPPORT_LEVEL_FULL, /* ctype_support_level */ - TRUE, /* is continuous sb mb codepoint */ + 1, /* min byte length */ + (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | + ONIGENC_AMBIGUOUS_MATCH_COMPOUND), + { + (OnigCodePoint )'\\' /* esc */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */ + , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */ + }, + onigenc_is_mbc_newline_0x0a, utf8_mbc_to_code, utf8_code_to_mbclen, utf8_code_to_mbc, - utf8_mbc_to_lower, - utf8_mbc_is_case_ambig, - utf8_code_is_ctype, + utf8_mbc_to_normalize, + utf8_is_mbc_ambiguous, + onigenc_iso_8859_1_get_all_pair_ambig_codes, + onigenc_ess_tsett_get_all_comp_ambig_codes, + utf8_is_code_ctype, utf8_get_ctype_code_range, utf8_left_adjust_char_head, - utf8_is_allowed_reverse_match, - utf8_get_all_fold_match_code, - utf8_get_fold_match_info + onigenc_always_true_is_allowed_reverse_match }; |