From 800f04c6a521c007e9837813b3564ae6b491e31c Mon Sep 17 00:00:00 2001 From: naruse Date: Sun, 8 Jan 2012 20:42:45 +0000 Subject: * numeric.c (rb_enc_uint_chr): raise RangeError when added codepoint is invalid. [Feature #5855] [Bug #5863] [Bug #5864] * string.c (rb_str_concat): ditto. * string.c (rb_str_concat): set encoding as ASCII-8BIT when the string is US-ASCII and the argument is an integer greater than 127. * regenc.c (onigenc_mb2_code_to_mbclen): rearrange error code. * enc/euc_jp.c (code_to_mbclen): ditto. * enc/shift_jis.c (code_to_mbclen): ditto. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@34236 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 16 +++++++++++++++ enc/euc_jp.c | 7 ++++--- enc/shift_jis.c | 4 ++-- numeric.c | 11 ++++++++++- regenc.c | 5 +++-- string.c | 43 ++++++++++++++++++++++++++++++++--------- test/ruby/enc/test_shift_jis.rb | 2 +- test/ruby/test_m17n.rb | 9 +++++++++ test/ruby/test_regexp.rb | 2 -- 9 files changed, 79 insertions(+), 20 deletions(-) diff --git a/ChangeLog b/ChangeLog index 886dae702b..1a52fcbdc6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,19 @@ +Mon Jan 9 01:12:35 2012 NARUSE, Yui + + * numeric.c (rb_enc_uint_chr): raise RangeError when added codepoint + is invalid. [Feature #5855] [Bug #5863] [Bug #5864] + + * string.c (rb_str_concat): ditto. + + * string.c (rb_str_concat): set encoding as ASCII-8BIT when the string + is US-ASCII and the argument is an integer greater than 127. + + * regenc.c (onigenc_mb2_code_to_mbclen): rearrange error code. + + * enc/euc_jp.c (code_to_mbclen): ditto. + + * enc/shift_jis.c (code_to_mbclen): ditto. + Sun Jan 8 20:31:45 2012 Narihiro Nakamura * gc.c : consider header bytes which are used by malloc. 
diff --git a/enc/euc_jp.c b/enc/euc_jp.c index 2666e60ae0..7667c5800e 100644 --- a/enc/euc_jp.c +++ b/enc/euc_jp.c @@ -154,9 +154,10 @@ static int code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED) { if (ONIGENC_IS_CODE_ASCII(code)) return 1; - else if (code > 0xffffff) return 0; - else if ((code & 0xff0000) >= 0x800000) return 3; - else if ((code & 0xff00) >= 0x8000) return 2; + else if (code > 0xffffff) + return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; + else if (code & 0x800000) return 3; + else if (code & 0x8000) return 2; else return ONIGERR_INVALID_CODE_POINT_VALUE; } diff --git a/enc/shift_jis.c b/enc/shift_jis.c index d1357b3212..9dcacb584d 100644 --- a/enc/shift_jis.c +++ b/enc/shift_jis.c @@ -135,13 +135,13 @@ code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED) if (EncLen_SJIS[(int )code] == 1) return 1; else - return 0; + return ONIGERR_INVALID_CODE_POINT_VALUE; } else if (code <= 0xffff) { return 2; } else - return ONIGERR_INVALID_CODE_POINT_VALUE; + return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; } static OnigCodePoint diff --git a/numeric.c b/numeric.c index 53d85829bb..3b09fea17b 100644 --- a/numeric.c +++ b/numeric.c @@ -2281,11 +2281,20 @@ rb_enc_uint_chr(unsigned int code, rb_encoding *enc) { int n; VALUE str; - if ((n = rb_enc_codelen(code, enc)) <= 0) { + switch (n = rb_enc_codelen(code, enc)) { + case ONIGERR_INVALID_CODE_POINT_VALUE: + rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc)); + break; + case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE: + case 0: rb_raise(rb_eRangeError, "%u out of char range", code); + break; } str = rb_enc_str_new(0, n, enc); rb_enc_mbcput(code, RSTRING_PTR(str), enc); + if (rb_enc_precise_mbclen(RSTRING_PTR(str), RSTRING_END(str), enc) != n) { + rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc)); + } return str; } diff --git a/regenc.c b/regenc.c index 32d24e76d9..70b56ef727 100644 --- a/regenc.c +++ b/regenc.c @@ -732,8 +732,9 @@ 
onigenc_mbn_is_mbc_ambiguous(OnigEncoding enc, OnigCaseFoldType flag, extern int onigenc_mb2_code_to_mbclen(OnigCodePoint code, OnigEncoding enc ARG_UNUSED) { - if ((code & 0xff00) != 0) return 2; - else return 1; + if (code <= 0xff) return 1; + if (code <= 0xffff) return 2; + return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE; } extern int diff --git a/string.c b/string.c index 606fef7a42..55691d017c 100644 --- a/string.c +++ b/string.c @@ -2074,10 +2074,11 @@ rb_str_append(VALUE str, VALUE str2) VALUE rb_str_concat(VALUE str1, VALUE str2) { - unsigned int lc; + unsigned int code; + rb_encoding *enc = STR_ENC_GET(str1); if (FIXNUM_P(str2) || RB_TYPE_P(str2, T_BIGNUM)) { - if (rb_num_to_uint(str2, &lc) == 0) { + if (rb_num_to_uint(str2, &code) == 0) { } else if (FIXNUM_P(str2)) { rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2)); @@ -2089,22 +2090,46 @@ rb_str_concat(VALUE str1, VALUE str2) else { return rb_str_append(str1, str2); } - { - rb_encoding *enc = STR_ENC_GET(str1); + + if (enc == rb_usascii_encoding()) { + /* US-ASCII automatically extended to ASCII-8BIT */ + char buf[1] = {(char)code}; + if (code > 0xFF) { + rb_raise(rb_eRangeError, "%u out of char range", code); + } + rb_str_cat(str1, buf, 1); + if (code > 127) { + rb_enc_associate(str1, rb_ascii8bit_encoding()); + ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID); + } + } + else { long pos = RSTRING_LEN(str1); int cr = ENC_CODERANGE(str1); int len; + char *buf; - if ((len = rb_enc_codelen(lc, enc)) <= 0) { - rb_raise(rb_eRangeError, "%u invalid char", lc); + switch (len = rb_enc_codelen(code, enc)) { + case ONIGERR_INVALID_CODE_POINT_VALUE: + rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc)); + break; + case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE: + case 0: + rb_raise(rb_eRangeError, "%u out of char range", code); + break; + } + buf = ALLOCA_N(char, len + 1); + rb_enc_mbcput(code, buf, enc); + if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) { + 
rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc)); } rb_str_resize(str1, pos+len); - rb_enc_mbcput(lc, RSTRING_PTR(str1)+pos, enc); - if (cr == ENC_CODERANGE_7BIT && lc > 127) + strncpy(RSTRING_PTR(str1) + pos, buf, len); + if (cr == ENC_CODERANGE_7BIT && code > 127) cr = ENC_CODERANGE_VALID; ENC_CODERANGE_SET(str1, cr); - return str1; } + return str1; } /* diff --git a/test/ruby/enc/test_shift_jis.rb b/test/ruby/enc/test_shift_jis.rb index f81cb7801c..54ef67dd44 100644 --- a/test/ruby/enc/test_shift_jis.rb +++ b/test/ruby/enc/test_shift_jis.rb @@ -22,6 +22,6 @@ class TestShiftJIS < Test::Unit::TestCase s = "あいうえお" s << 0x82a9 assert_equal("あいうえおか", s) - assert_raise(ArgumentError) { s << 0x82 } + assert_raise(RangeError) { s << 0x82 } end end diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb index fffcab3def..7c261138ba 100644 --- a/test/ruby/test_m17n.rb +++ b/test/ruby/test_m17n.rb @@ -1161,6 +1161,7 @@ class TestM17N < Test::Unit::TestCase def test_str_concat assert_equal(1, "".concat(0xA2).size) + assert_equal(Encoding::ASCII_8BIT, "".force_encoding("US-ASCII").concat(0xA2).encoding) assert_equal("A\x84\x31\xA4\x39".force_encoding("GB18030"), "A".force_encoding("GB18030") << 0x8431A439) end @@ -1220,6 +1221,14 @@ class TestM17N < Test::Unit::TestCase 2206368128.chr(Encoding::UTF_8) } assert_not_match(/-\d+ out of char range/, e.message) + + assert_raise(RangeError){ 0x80.chr("US-ASCII") } + assert_raise(RangeError){ 0x80.chr("SHIFT_JIS") } + assert_raise(RangeError){ 0xE0.chr("SHIFT_JIS") } + assert_raise(RangeError){ 0x100.chr("SHIFT_JIS") } + assert_raise(RangeError){ 0xA0.chr("EUC-JP") } + assert_raise(RangeError){ 0x100.chr("EUC-JP") } + assert_raise(RangeError){ 0xA1A0.chr("EUC-JP") } end def test_marshal diff --git a/test/ruby/test_regexp.rb b/test/ruby/test_regexp.rb index 1104647730..05ec477cba 100644 --- a/test/ruby/test_regexp.rb +++ b/test/ruby/test_regexp.rb @@ -808,8 +808,6 @@ class TestRegexp < 
Test::Unit::TestCase #assert_match(/^(\ufb05)\1\1$/i, "\ufb05\ufb06st") # this must be bug... assert_match(/^\ufb05{3}$/i, "\ufb05\ufb06st") assert_match(/^\u03b9\u0308\u0301$/i, "\u0390") - assert_nothing_raised { 0x03ffffff.chr("utf-8").size } - assert_nothing_raised { 0x7fffffff.chr("utf-8").size } end def test_unicode_age -- cgit v1.2.3