diff options
author | Martin Dürst <duerst@it.aoyama.ac.jp> | 2019-07-14 10:58:50 +0900 |
---|---|---|
committer | Martin Dürst <duerst@it.aoyama.ac.jp> | 2019-07-14 10:58:50 +0900 |
commit | 369ff79394765ce198ac7cee872a8c739d895aaa (patch) | |
tree | d373a8e2a3b835f981a85dbb0e25e93730fba776 /test/ruby | |
parent | ac2866005b96baf986072f86ecd3dfd887f2bda3 (diff) | |
download | ruby-369ff79394765ce198ac7cee872a8c739d895aaa.tar.gz |
add encoding conversion from/to CESU-8
Add encoding conversion (transcoding) from UTF-8 to CESU-8
and back. CESU-8 is an encoding similar to UTF-8, but encodes
codepoints above U+FFFF as two surrogates, these surrogates
again being encoded as if they were UTF-8 codepoints. This
preserves the same binary sorting order as in UTF-16. It is
also somewhat similar (although not exactly identical) to an
encoding used internally by Java.
This completes issue #15995.
enc/trans/cesu_8.trans: Add encoding conversion from/to CESU-8
test/ruby/test_transcode.rb: Add tests for above
Diffstat (limited to 'test/ruby')
-rw-r--r-- | test/ruby/test_transcode.rb | 22 |
1 files changed, 22 insertions, 0 deletions
diff --git a/test/ruby/test_transcode.rb b/test/ruby/test_transcode.rb index 44d238ffd2..f405877dd5 100644 --- a/test/ruby/test_transcode.rb +++ b/test/ruby/test_transcode.rb @@ -2116,6 +2116,28 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("D\u00FCrst", "\xC4\xDC\x99\xA2\xA3", 'IBM037') # Dürst end + def test_CESU_8 + check_both_ways("aijrszAIJRSZ09", "aijrszAIJRSZ09", 'CESU-8') # single bytes + + # check NULL explicitly + # this is different in CESU-8 and in Java modified UTF-8 strings + check_both_ways("\0", "\0", 'CESU-8') + + # U+0080 U+00FC U+00FF U+0100 U+0400 U+0700 U+07FF + two_byte_chars = "\xC2\x80\x20\xC3\xBC\x20\xC3\xBF\x20\xC4\x80\x20\xD0\x80\x20\xDC\x80\x20\xDF\xBF" + check_both_ways(two_byte_chars, two_byte_chars, 'CESU-8') + + # U+0800 U+2200 U+4E00 U+D7FF U+E000 U+FFFF + three_byte_chars = "\xE0\xA0\x80\x20\xE2\x88\x80\x20\xE4\xB8\x80\x20\xED\x9F\xBF\x20\xEE\x80\x80\x20\xEF\xBF\xBF" + check_both_ways(three_byte_chars, three_byte_chars, 'CESU-8') + + # characters outside BMP (double surrogates in CESU-8) + # U+10000 U+20000 U+50000 U+10FFFF + utf8 = "\xF0\x90\x80\x80 \xF0\xA0\x80\x80 \xF1\x90\x80\x80 \xF4\x8F\xBF\xBF" + cesu = "\xED\xA0\x80\xED\xB0\x80 \xED\xA1\x80\xED\xB0\x80 \xED\xA4\x80\xED\xB0\x80 \xED\xAF\xBF\xED\xBF\xBF" + check_both_ways(utf8, cesu, 'CESU-8') + end + def test_nothing_changed a = "James".force_encoding("US-ASCII") b = a.encode("Shift_JIS") |