about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 10
-rw-r--r-- parse.y 2
-rw-r--r-- re.c 28
-rw-r--r-- test/ruby/test_m17n.rb 201
4 files changed, 231 insertions, 10 deletions
diff --git a/ChangeLog b/ChangeLog
index 2bd1d06064..322eb95c9a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+Fri Nov 23 15:27:43 2007 Tanaka Akira <akr@fsij.org>
+
+ * re.c (REG_CASESTATE): unused macro removed.
+ (rb_reg_prepare_re): check encoding difference.
+ (rb_reg_initialize): check 8bit byte.
+
+ * parse.y (parser_tokadd_escape): fix has8bit.
+
+ [ruby-dev:32113]
+
Fri Nov 23 13:34:08 2007 Tanaka Akira <akr@fsij.org>
* struct.c (rb_struct_define_without_accessor): new function.
diff --git a/parse.y b/parse.y
index adae072923..75d39042f4 100644
--- a/parse.y
+++ b/parse.y
@@ -5206,7 +5206,7 @@ parser_tokadd_escape(struct parser_params *parser, int term,
hex = tok_hex(&numlen);
if (numlen == 0) goto eof;
tokcopy(numlen + 2);
- if (hex >= 0x80) *has8bit = ENC_CODERANGE_UNKNOWN;
+ if (hex >= 0x80) *has8bit = 1;
}
return 0;
diff --git a/re.c b/re.c
index 4e915f6f78..8c17e98592 100644
--- a/re.c
+++ b/re.c
@@ -132,7 +132,6 @@ rb_memsearch(const void *x0, long m, const void *y0, long n)
}
#define REG_LITERAL FL_USER5
-#define REG_CASESTATE FL_USER0
#define KCODE_FIXED FL_USER4
@@ -711,15 +710,18 @@ static void
rb_reg_prepare_re(VALUE re, VALUE str)
{
int need_recompile = 0;
- int state;
rb_encoding *enc;
rb_reg_check(re);
- state = FL_TEST(re, REG_CASESTATE);
/* ignorecase status */
- if (ENCODING_GET(re) == 0 && !FL_TEST(re, KCODE_FIXED) &&
- (enc = rb_enc_get(str)) != 0 &&
- RREGEXP(re)->ptr->enc != enc) {
+ if (ENCODING_GET(re) != 0 || FL_TEST(re, KCODE_FIXED)) {
+ if (ENCODING_GET(re) != rb_enc_get_index(str) &&
+ rb_enc_str_coderange(str) != ENC_CODERANGE_SINGLE) {
+ rb_raise(rb_eArgError, "character encodings differ");
+ }
+ }
+ else if ((enc = rb_enc_get(str)) != 0 &&
+ RREGEXP(re)->ptr->enc != enc) {
need_recompile = 1;
}
@@ -755,7 +757,6 @@ rb_reg_adjust_startpos(VALUE re, VALUE str, int pos, int reverse)
OnigEncoding enc;
UChar *p, *string;
- rb_reg_check(re);
rb_reg_prepare_re(re, str);
if (reverse) {
@@ -795,7 +796,6 @@ rb_reg_search(VALUE re, VALUE str, int pos, int reverse)
return -1;
}
- rb_reg_check(re);
rb_reg_prepare_re(re, str);
if (reverse) {
@@ -1231,6 +1231,8 @@ rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
int options, onig_errmsg_buffer err)
{
struct RRegexp *re = RREGEXP(obj);
+ int raw8bit;
+ long i;
if (!OBJ_TAINTED(obj) && rb_safe_level() >= 4)
rb_raise(rb_eSecurityError, "Insecure: can't modify regexp");
@@ -1242,8 +1244,16 @@ rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
re->ptr = 0;
re->str = 0;
+ raw8bit = 0;
+ for (i = 0; i < len; i++) {
+ if (s[i] & 0x80) {
+ raw8bit = 1;
+ break;
+ }
+ }
+
rb_enc_associate((VALUE)re, enc);
- if (options & ARG_ENCODING_FIXED) {
+ if (options & ARG_ENCODING_FIXED || raw8bit) {
re->basic.flags |= KCODE_FIXED;
}
re->ptr = make_regexp(s, len, enc, options & ARG_REG_OPTION_MASK, err);
diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb
new file mode 100644
index 0000000000..c50c6b8384
--- /dev/null
+++ b/test/ruby/test_m17n.rb
@@ -0,0 +1,201 @@
+require 'test/unit'
+
+class TestM17N < Test::Unit::TestCase
+ def assert_encoding(encname, actual, message=nil) # asserts that actual equals the Encoding looked up by the name encname
+ assert_equal(Encoding.find(encname), actual, message)
+ end
+
+ def a(str) str.force_encoding("ASCII-8BIT") end # helper: calls force_encoding to mark str as ASCII-8BIT
+ def e(str) str.force_encoding("EUC-JP") end # helper: calls force_encoding to mark str as EUC-JP
+ def s(str) str.force_encoding("Shift_JIS") end # helper: calls force_encoding to mark str as Shift_JIS
+ def u(str) str.force_encoding("UTF-8") end # helper: calls force_encoding to mark str as UTF-8
+
+ def test_string_ascii_literal # source tagged ASCII-8BIT: empty and ASCII-only string literals are expected to be ASCII-8BIT
+ assert_encoding("ASCII-8BIT", eval(a(%{""})).encoding)
+ assert_encoding("ASCII-8BIT", eval(a(%{"a"})).encoding)
+ end
+
+ def test_string_euc_literal # source tagged EUC-JP: ASCII-only literals expect ASCII-8BIT; literals with raw or \x-escaped bytes >= 0x80 expect EUC-JP
+ assert_encoding("ASCII-8BIT", eval(e(%{""})).encoding)
+ assert_encoding("ASCII-8BIT", eval(e(%{"a"})).encoding)
+ assert_encoding("EUC-JP", eval(e(%{"\xa1\xa1"})).encoding)
+ assert_encoding("EUC-JP", eval(e(%{"\\xa1\\xa1"})).encoding)
+ assert_encoding("ASCII-8BIT", eval(e(%{"\\x20"})).encoding)
+ assert_encoding("ASCII-8BIT", eval(e(%{"\\n"})).encoding)
+ assert_encoding("EUC-JP", eval(e(%{"\\x80"})).encoding)
+ end
+
+ def test_regexp_too_short_multibyte_character # regexp literals with /e, /s or /u whose \x escapes stop short of a full multibyte character are expected to raise SyntaxError
+ assert_raise(SyntaxError) { eval('/\xfe/e') }
+ assert_raise(SyntaxError) { eval('/\x8e/e') }
+ assert_raise(SyntaxError) { eval('/\x8f/e') }
+ assert_raise(SyntaxError) { eval('/\x8f\xa1/e') }
+ assert_raise(SyntaxError) { eval('/\xef/s') }
+ assert_raise(SyntaxError) { eval('/\xc0/u') }
+ assert_raise(SyntaxError) { eval('/\xe0\x80/u') }
+ assert_raise(SyntaxError) { eval('/\xf0\x80\x80/u') }
+ assert_raise(SyntaxError) { eval('/\xf8\x80\x80\x80/u') }
+ assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') }
+
+ # raw 8bit: same check with raw bytes in the source instead of \x escapes (assertions currently disabled)
+ #assert_raise(SyntaxError) { eval("/\xfe/e") }
+ #assert_raise(SyntaxError) { eval("/\xc0/u") }
+
+ # invalid suffix: lead byte followed by a byte that cannot continue it (assertions currently disabled)
+ #assert_raise(SyntaxError) { eval('/\xc0\xff/u') }
+ #assert_raise(SyntaxError) { eval('/\xc0\x20/u') }
+ end
+
+ def test_regexp_generic # regexps written without an encoding option: an ASCII-only pattern is matched against strings in all four encodings; a pattern with raw 8bit bytes is expected to raise ArgumentError for 8bit strings that are not ASCII-8BIT
+ r = /a/
+ assert_encoding("ASCII-8BIT", r.encoding)
+ assert_equal(0, r =~ a("a"))
+ assert_equal(0, r =~ e("a"))
+ assert_equal(0, r =~ s("a"))
+ assert_equal(0, r =~ u("a"))
+
+ # "\xc0\xa1" is treated here as a valid sequence for ASCII-8BIT, EUC-JP, Shift_JIS and UTF-8. NOTE(review): strict UTF-8 (RFC 3629) disallows lead byte 0xC0 -- confirm this still holds.
+ assert_equal(nil, r =~ a("\xc0\xa1"))
+ assert_equal(nil, r =~ e("\xc0\xa1"))
+ assert_equal(nil, r =~ s("\xc0\xa1"))
+ assert_equal(nil, r =~ u("\xc0\xa1"))
+
+ r = eval(a(%{/\xc0\xa1/}))
+ assert_encoding("ASCII-8BIT", r.encoding)
+ assert_equal(nil, r =~ a("a"))
+ assert_equal(nil, r =~ e("a"))
+ assert_equal(nil, r =~ s("a"))
+ assert_equal(nil, r =~ u("a"))
+ assert_equal(0, r =~ a("\xc0\xa1"))
+ assert_raise(ArgumentError) { r =~ e("\xc0\xa1") }
+ assert_raise(ArgumentError) { r =~ s("\xc0\xa1") }
+ assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
+
+ # xxx: /\xc0\xa1/ should be restricted only for ASCII-8BIT? (open question; the block below is disabled until it is settled)
+ # r = /\xc0\xa1/
+ # assert_encoding("ASCII-8BIT", r.encoding)
+ # assert_equal(nil, r =~ a("a"))
+ # assert_equal(nil, r =~ e("a"))
+ # assert_equal(nil, r =~ s("a"))
+ # assert_equal(nil, r =~ u("a"))
+ # assert_equal(0, r =~ a("\xc0\xa1"))
+ # assert_equal(0, r =~ e("\xc0\xa1")) # xxx
+ # assert_equal(0, r =~ s("\xc0\xa1")) # xxx
+ # assert_equal(0, r =~ u("\xc0\xa1")) # xxx
+ end
+
+ def test_regexp_ascii # regexps with /n or built from ASCII-8BIT source: expected to report ASCII-8BIT, match ASCII-only strings in any encoding, and raise ArgumentError for 8bit strings in other encodings
+ r = /a/n
+ assert_encoding("ASCII-8BIT", r.encoding)
+ assert_equal(0, r =~ a("a"))
+ assert_equal(0, r =~ e("a"))
+ assert_equal(0, r =~ s("a"))
+ assert_equal(0, r =~ u("a"))
+ assert_equal(nil, r =~ a("\xc0\xa1"))
+ assert_raise(ArgumentError) { r =~ e("\xc0\xa1") }
+ assert_raise(ArgumentError) { r =~ s("\xc0\xa1") }
+ assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
+
+ r = /\xc0\xa1/n
+ assert_encoding("ASCII-8BIT", r.encoding)
+ assert_equal(nil, r =~ a("a"))
+ assert_equal(nil, r =~ e("a"))
+ assert_equal(nil, r =~ s("a"))
+ assert_equal(nil, r =~ u("a"))
+ assert_equal(0, r =~ a("\xc0\xa1"))
+ assert_raise(ArgumentError) { r =~ e("\xc0\xa1") }
+ assert_raise(ArgumentError) { r =~ s("\xc0\xa1") }
+ assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
+
+ r = eval(%{/\xc0\xa1/n}.force_encoding("ASCII-8BIT"))
+ assert_encoding("ASCII-8BIT", r.encoding)
+ assert_equal(nil, r =~ a("a"))
+ assert_equal(nil, r =~ e("a"))
+ assert_equal(nil, r =~ s("a"))
+ assert_equal(nil, r =~ u("a"))
+ assert_equal(0, r =~ a("\xc0\xa1"))
+ assert_raise(ArgumentError) { r =~ e("\xc0\xa1") }
+ assert_raise(ArgumentError) { r =~ s("\xc0\xa1") }
+ assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
+
+ r = eval(%q{/\xc0\xa1/}.force_encoding("ASCII-8BIT"))
+ assert_encoding("ASCII-8BIT", r.encoding)
+ assert_equal(nil, r =~ a("a"))
+ assert_equal(nil, r =~ e("a"))
+ assert_equal(nil, r =~ s("a"))
+ assert_equal(nil, r =~ u("a"))
+ assert_equal(0, r =~ a("\xc0\xa1"))
+ # assert_raise(ArgumentError) { r =~ e("\xc0\xa1") } # disabled: \x-escaped pattern without /n
+ # assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } # disabled, same case
+ # assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } # disabled, same case
+
+ end
+
+ def test_regexp_euc # regexps with /e or built from EUC-JP source: expected to report EUC-JP, match ASCII-only strings in any encoding, and raise ArgumentError for 8bit strings that are not EUC-JP
+ r = /a/e
+ assert_encoding("EUC-JP", r.encoding)
+ assert_equal(0, r =~ a("a"))
+ assert_equal(0, r =~ e("a"))
+ assert_equal(0, r =~ s("a"))
+ assert_equal(0, r =~ u("a"))
+ assert_raise(ArgumentError) { r =~ a("\xc0\xa1") }
+ assert_equal(nil, r =~ e("\xc0\xa1"))
+ assert_raise(ArgumentError) { r =~ s("\xc0\xa1") }
+ assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
+
+ r = /\xc0\xa1/e
+ assert_encoding("EUC-JP", r.encoding)
+ assert_equal(nil, r =~ a("a"))
+ assert_equal(nil, r =~ e("a"))
+ assert_equal(nil, r =~ s("a"))
+ assert_equal(nil, r =~ u("a"))
+ assert_raise(ArgumentError) { r =~ a("\xc0\xa1") }
+ assert_equal(0, r =~ e("\xc0\xa1"))
+ assert_raise(ArgumentError) { r =~ s("\xc0\xa1") }
+ assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
+
+ r = eval(%{/\xc0\xa1/}.force_encoding("EUC-JP"))
+ assert_encoding("EUC-JP", r.encoding)
+ assert_equal(nil, r =~ a("a"))
+ assert_equal(nil, r =~ e("a"))
+ assert_equal(nil, r =~ s("a"))
+ assert_equal(nil, r =~ u("a"))
+ assert_raise(ArgumentError) { r =~ a("\xc0\xa1") }
+ assert_equal(0, r =~ e("\xc0\xa1"))
+ assert_raise(ArgumentError) { r =~ s("\xc0\xa1") }
+ assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
+
+ r = eval(%q{/\xc0\xa1/}.force_encoding("EUC-JP"))
+ assert_encoding("EUC-JP", r.encoding)
+ assert_equal(nil, r =~ a("a"))
+ assert_equal(nil, r =~ e("a"))
+ assert_equal(nil, r =~ s("a"))
+ assert_equal(nil, r =~ u("a"))
+ assert_raise(ArgumentError) { r =~ a("\xc0\xa1") }
+ assert_equal(0, r =~ e("\xc0\xa1"))
+ assert_raise(ArgumentError) { r =~ s("\xc0\xa1") }
+ assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
+ end
+
+ def test_begin_end_offset # EUC-JP string of ten 8bit bytes followed by a: begin/end/offset are expected to be 5 and 6, i.e. not byte positions
+ str = e("\244\242\244\244\244\246\244\250\244\252a")
+ assert(/(a)/ =~ str)
+ assert_equal("a", $&)
+ assert_equal(5, $~.begin(0))
+ assert_equal(6, $~.end(0))
+ assert_equal([5,6], $~.offset(0))
+ assert_equal(5, $~.begin(1))
+ assert_equal(6, $~.end(1))
+ assert_equal([5,6], $~.offset(1))
+ end
+
+ def test_begin_end_offset_sjis # Shift_JIS string: the match is expected to skip the @ that directly follows \x81 and land on the second @ at offset [1,2]
+ str = s("\x81@@")
+ assert(/@/ =~ str)
+ assert_equal(s("\x81@"), $`)
+ assert_equal("@", $&)
+ assert_equal("", $')
+ assert_equal([1,2], $~.offset(0))
+ end
+
+end