Wed Mar 5 17:43:43 2008 Martin Duerst <duerst@it.aoyama.ac.jp>

* transcode.c (transcode_loop): Adjusted detection of invalid (ill-formed) UTF-8 sequences. Fixing potential security issue, see http://www.unicode.org/versions/Unicode5.1.0/#Notable_Changes.

* test/ruby/test_transcode.rb: Added two tests for above fix.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@15692 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
author: duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2008-03-05 08:45:51 +0000
committer: duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2008-03-05 08:45:51 +0000
commit: 08631278ada7a6fd2bafb3ab0f0447b1f6d58790 (patch)
tree: 05eda73587c46a0d7cbedad281da8c6a07fa2b53
parent: 39787ea14db33dd4265d7f6271cd2d59ccaeff37 (diff)
download: ruby-08631278ada7a6fd2bafb3ab0f0447b1f6d58790.tar.gz
3 files changed, 29 insertions, 10 deletions
diff --git a/ChangeLog b/ChangeLog
index a77d84fa43..fbe8b29e5c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+Web Mar  5 17:43:43 2008  Martin Duerst  <duerst@it.aoyama.ac.jp>
+
+	* transcode.c (transcode_loop): Adjusted detection of invalid
+	  (ill-formed) UTF-8 sequences. Fixing potential security issue, see
+	  http://www.unicode.org/versions/Unicode5.1.0/#Notable_Changes.
+
+	* test/ruby/test_transcode.rb: Added two tests for above fix.
+
 Wed Mar  5 14:00:49 2008  Yukihiro Matsumoto  <matz@ruby-lang.org>
 
 	* numeric.c (fix_to_s): avoid rb_scan_args() when no argument
diff --git a/test/ruby/test_transcode.rb b/test/ruby/test_transcode.rb
index 9edf30882b..5a704fd364 100644
--- a/test/ruby/test_transcode.rb
+++ b/test/ruby/test_transcode.rb
@@ -242,6 +242,11 @@ class TestTranscode < Test::Unit::TestCase
   
   def test_invalid_ignore
     # arguments only
-    'abc'.encode('utf-8', invalid: :ignore)
+    assert_nothing_raised { 'abc'.encode('utf-8', invalid: :ignore) }
+    # check handling of UTF-8 ill-formed subsequences
+    assert_equal("\x00\x41\x00\x3E\x00\x42".force_encoding('UTF-16BE'),
+      "\x41\xC2\x3E\x42".encode('UTF-16BE', 'UTF-8', invalid: :ignore))
+    assert_equal("\x00\x41\x00\xF1\x00\x42".force_encoding('UTF-16BE'),
+      "\x41\xC2\xC3\xB1\x42".encode('UTF-16BE', 'UTF-8', invalid: :ignore))
   end
 end
diff --git a/transcode.c b/transcode.c
index ed01374f5b..a4c066a5ca 100644
--- a/transcode.c
+++ b/transcode.c
@@ -177,8 +177,10 @@ transcode_loop(unsigned char **in_pos, unsigned char **out_pos,
 	    if (from_utf8) {
 		if ((next_byte&0xC0) == 0x80)
 		    next_byte -= 0x80;
-		else
+		else {
+		    in_p--; /* may need to add more code later to revert other things */
 		    goto invalid;
+		}
 	    }
 	    next_table = (const BYTE_LOOKUP *)next_info;
 	    goto follow_byte;
@@ -390,13 +392,15 @@ str_transcode(int argc, VALUE *argv, VALUE *self)
 
 /*
  *  call-seq:
- *     str.encode!(encoding)   => str
- *     str.encode!(to_encoding, from_encoding)   => str
+ *     str.encode!(encoding [, options] )   => str
+ *     str.encode!(to_encoding, from_encoding [, options] )   => str
  *
- *  With one argument, transcodes the contents of <i>str</i> from
+ *  The first form transcodes the contents of <i>str</i> from
  *  str.encoding to +encoding+.
- *  With two arguments, transcodes the contents of <i>str</i> from
+ *  The second form transcodes the contents of <i>str</i> from
  *  from_encoding to to_encoding.
+ *  The options Hash gives details for conversion. See String#encode
+ *  for details.
  *  Returns the string even if no changes were made.
  */
 
@@ -414,13 +418,15 @@ rb_str_transcode_bang(int argc, VALUE *argv, VALUE str)
 
 /*
  *  call-seq:
- *     str.encode(encoding)   => str
- *     str.encode(to_encoding, from_encoding)   => str
+ *     str.encode(encoding [, options] )   => str
+ *     str.encode(to_encoding, from_encoding [, options] )   => str
  *
- *  With one argument, returns a copy of <i>str</i> transcoded
+ *  The first form returns a copy of <i>str</i> transcoded
  *  to encoding +encoding+.
- *  With two arguments, returns a copy of <i>str</i> transcoded
+ *  The second form returns a copy of <i>str</i> transcoded
  *  from from_encoding to to_encoding.
+ *  The options Hash gives details for conversion. Details
+ *  to be added.
  */
 
 static VALUE
author	duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>	2008-03-05 08:45:51 +0000
committer	duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>	2008-03-05 08:45:51 +0000
commit	08631278ada7a6fd2bafb3ab0f0447b1f6d58790 (patch)
tree	05eda73587c46a0d7cbedad281da8c6a07fa2b53
parent	39787ea14db33dd4265d7f6271cd2d59ccaeff37 (diff)
download	ruby-08631278ada7a6fd2bafb3ab0f0447b1f6d58790.tar.gz