aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorakr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2007-12-08 04:31:26 +0000
committerakr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2007-12-08 04:31:26 +0000
commit9667f7953e1ca78e5acfa40e9573d1fcdbd3d1ea (patch)
tree87aac01c31519e4d21627350f0d8c1a435e96923
parentf1b7e60cb90a7e1a392d4ffccd07dd06eeff5345 (diff)
downloadruby-9667f7953e1ca78e5acfa40e9573d1fcdbd3d1ea.tar.gz
add test for UTF-8 bit pattern.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14132 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r--test/ruby/test_m17n.rb59
1 files changed, 59 insertions, 0 deletions
diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb
index df59c88770..ff009d3a01 100644
--- a/test/ruby/test_m17n.rb
+++ b/test/ruby/test_m17n.rb
@@ -68,6 +68,65 @@ class TestM17N < Test::Unit::TestCase
assert_equal('"\374"', u("\xfc").inspect)
end
+ def test_validate_redundant_utf8
+ bits_0x10ffff = "11110100 10001111 10111111 10111111"
+ [
+ "0xxxxxxx",
+ "110XXXXx 10xxxxxx",
+ "1110XXXX 10Xxxxxx 10xxxxxx",
+ "11110XXX 10XXxxxx 10xxxxxx 10xxxxxx",
+ "111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx",
+ "1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx",
+ "11111110 10XXXXXx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx",
+ "11111111 10XXXXXX 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx",
+ ].each {|pat0|
+ [
+ pat0.gsub(/x/, '1'),
+ pat0.gsub(/x/, '0')
+ ].each {|pat1|
+ [
+ pat1.sub(/X([^X]*)\z/, '1\1').gsub(/X/, "0"),
+ pat1.gsub(/X/, "1"),
+ ].each {|pat2|
+ s = [pat2.gsub(/ /, "")].pack("B*").force_encoding("utf-8")
+ if pat2 <= bits_0x10ffff
+ assert(s.valid_encoding?, "#{pat2}")
+ else
+ assert(!s.valid_encoding?, "#{pat2}")
+ end
+ }
+ if / / =~ pat0
+ pat3 = pat1.gsub(/X/, "0")
+ s = [pat3.gsub(/ /, "")].pack("B*").force_encoding("utf-8")
+ assert(!s.valid_encoding?, "#{pat3}")
+ end
+ }
+ }
+ end
+
+ def test_validate_surrogate
+ # 1110XXXX 10Xxxxxx 10xxxxxx : 3 bytes UTF-8
+ pats = [
+ "11101101 10011111 10111111", # just before surrogate high
+ "11101101 1010xxxx 10xxxxxx", # surrogate high
+ "11101101 1011xxxx 10xxxxxx", # surrogate low
+ "11101110 10000000 10000000", # just after surrogate low
+ ]
+ pats.values_at(1,2).each {|pat0|
+ [
+ pat0.gsub(/x/, '0'),
+ pat0.gsub(/x/, '1'),
+ ].each {|pat1|
+ s = [pat1.gsub(/ /, "")].pack("B*").force_encoding("utf-8")
+ assert(!s.valid_encoding?, "#{pat1}")
+ }
+ }
+ pats.values_at(0,3).each {|pat|
+ s = [pat.gsub(/ /, "")].pack("B*").force_encoding("utf-8")
+ assert(s.valid_encoding?, "#{pat}")
+ }
+ end
+
def test_regexp_too_short_multibyte_character
assert_raise(SyntaxError) { eval('/\xfe/e') }
assert_raise(SyntaxError) { eval('/\x8e/e') }