diff options
author | akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2007-12-08 04:31:26 +0000 |
---|---|---|
committer | akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2007-12-08 04:31:26 +0000 |
commit | 9667f7953e1ca78e5acfa40e9573d1fcdbd3d1ea (patch) | |
tree | 87aac01c31519e4d21627350f0d8c1a435e96923 /test/ruby | |
parent | f1b7e60cb90a7e1a392d4ffccd07dd06eeff5345 (diff) | |
download | ruby-9667f7953e1ca78e5acfa40e9573d1fcdbd3d1ea.tar.gz |
add test for UTF-8 bit pattern.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14132 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'test/ruby')
-rw-r--r-- | test/ruby/test_m17n.rb | 59 |
1 files changed, 59 insertions, 0 deletions
```diff
diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb
index df59c88770..ff009d3a01 100644
--- a/test/ruby/test_m17n.rb
+++ b/test/ruby/test_m17n.rb
@@ -68,6 +68,65 @@ class TestM17N < Test::Unit::TestCase
     assert_equal('"\374"', u("\xfc").inspect)
   end
 
+  def test_validate_redundant_utf8
+    bits_0x10ffff = "11110100 10001111 10111111 10111111"
+    [
+      "0xxxxxxx",
+      "110XXXXx 10xxxxxx",
+      "1110XXXX 10Xxxxxx 10xxxxxx",
+      "11110XXX 10XXxxxx 10xxxxxx 10xxxxxx",
+      "111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx",
+      "1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx",
+      "11111110 10XXXXXx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx",
+      "11111111 10XXXXXX 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx",
+    ].each {|pat0|
+      [
+        pat0.gsub(/x/, '1'),
+        pat0.gsub(/x/, '0')
+      ].each {|pat1|
+        [
+          pat1.sub(/X([^X]*)\z/, '1\1').gsub(/X/, "0"),
+          pat1.gsub(/X/, "1"),
+        ].each {|pat2|
+          s = [pat2.gsub(/ /, "")].pack("B*").force_encoding("utf-8")
+          if pat2 <= bits_0x10ffff
+            assert(s.valid_encoding?, "#{pat2}")
+          else
+            assert(!s.valid_encoding?, "#{pat2}")
+          end
+        }
+        if / / =~ pat0
+          pat3 = pat1.gsub(/X/, "0")
+          s = [pat3.gsub(/ /, "")].pack("B*").force_encoding("utf-8")
+          assert(!s.valid_encoding?, "#{pat3}")
+        end
+      }
+    }
+  end
+
+  def test_validate_surrogate
+    # 1110XXXX 10Xxxxxx 10xxxxxx : 3 bytes UTF-8
+    pats = [
+      "11101101 10011111 10111111", # just before surrogate high
+      "11101101 1010xxxx 10xxxxxx", # surrogate high
+      "11101101 1011xxxx 10xxxxxx", # surrogate low
+      "11101110 10000000 10000000", # just after surrogate low
+    ]
+    pats.values_at(1,2).each {|pat0|
+      [
+        pat0.gsub(/x/, '0'),
+        pat0.gsub(/x/, '1'),
+      ].each {|pat1|
+        s = [pat1.gsub(/ /, "")].pack("B*").force_encoding("utf-8")
+        assert(!s.valid_encoding?, "#{pat1}")
+      }
+    }
+    pats.values_at(0,3).each {|pat|
+      s = [pat.gsub(/ /, "")].pack("B*").force_encoding("utf-8")
+      assert(s.valid_encoding?, "#{pat}")
+    }
+  end
+
   def test_regexp_too_short_multibyte_character
     assert_raise(SyntaxError) { eval('/\xfe/e') }
     assert_raise(SyntaxError) { eval('/\x8e/e') }
```