From 394d5dfa9ba625c99a1e6a411f81b628bfebd60a Mon Sep 17 00:00:00 2001 From: naruse Date: Fri, 19 Apr 2013 17:50:38 +0000 Subject: * string.c (str_scrub): add ruby method String#scrub which verifies and fixes invalid byte sequences. * string.c (str_compat_and_valid): check that the given string is compatible and valid with the given encoding. * transcode.c (str_transcode0): If invalid: :replace is specified for String#encode, replace invalid byte sequences even if the destination encoding is equal to the source encoding. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@40390 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- test/ruby/test_m17n.rb | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'test/ruby') diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb index a8d56a4a56..60834bb9c6 100644 --- a/test/ruby/test_m17n.rb +++ b/test/ruby/test_m17n.rb @@ -1489,4 +1489,38 @@ class TestM17N < Test::Unit::TestCase s.untrust assert_equal(true, s.b.untrusted?) end + + def test_scrub + assert_equal("\uFFFD\uFFFD\uFFFD", u("\x80\x80\x80").scrub) + assert_equal("\uFFFDA", u("\xF4\x80\x80A").scrub) + + # examples in Unicode 6.1.0 D93b + assert_equal("\x41\uFFFD\uFFFD\x41\uFFFD\x41", + u("\x41\xC0\xAF\x41\xF4\x80\x80\x41").scrub) + assert_equal("\x41\uFFFD\uFFFD\uFFFD\x41", + u("\x41\xE0\x9F\x80\x41").scrub) + assert_equal("\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064", + u("\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64").scrub) + assert_equal("abcdefghijklmnopqrstuvwxyz\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064", + u("abcdefghijklmnopqrstuvwxyz\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64").scrub) + + assert_equal("\u3042\u3013", u("\xE3\x81\x82\xE3\x81").scrub("\u3013")) + assert_raise(Encoding::CompatibilityError){ u("\xE3\x81\x82\xE3\x81").scrub(e("\xA4\xA2")) } + assert_raise(TypeError){ u("\xE3\x81\x82\xE3\x81").scrub(1) } + assert_raise(ArgumentError){ u("\xE3\x81\x82\xE3\x81\x82\xE3\x81").scrub(u("\x81")) 
} + assert_equal(e("\xA4\xA2\xA2\xAE"), e("\xA4\xA2\xA4").scrub(e("\xA2\xAE"))) + + assert_equal("\u3042<e381>", u("\xE3\x81\x82\xE3\x81").scrub{|x|'<'+x.unpack('H*')[0]+'>'}) + assert_raise(Encoding::CompatibilityError){ u("\xE3\x81\x82\xE3\x81").scrub{e("\xA4\xA2")} } + assert_raise(TypeError){ u("\xE3\x81\x82\xE3\x81").scrub{1} } + assert_raise(ArgumentError){ u("\xE3\x81\x82\xE3\x81\x82\xE3\x81").scrub{u("\x81")} } + assert_equal(e("\xA4\xA2\xA2\xAE"), e("\xA4\xA2\xA4").scrub{e("\xA2\xAE")}) + + assert_equal("\uFFFD\u3042".encode("UTF-16BE"), + "\xD8\x00\x30\x42".force_encoding(Encoding::UTF_16BE). + scrub) + assert_equal("\uFFFD\u3042".encode("UTF-16LE"), + "\x00\xD8\x42\x30".force_encoding(Encoding::UTF_16LE). + scrub) + end end -- cgit v1.2.3