From 40e5b3933662ef3cd41f57fe8e8003b3f7c04b05 Mon Sep 17 00:00:00 2001 From: jeg2 Date: Tue, 16 Nov 2010 23:55:29 +0000 Subject: * lib/csv.rb: Upgrading output encoding as needed. [ruby-core:33135] git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@29808 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 4 +++ lib/csv.rb | 28 +++++++++++++------ test/csv/test_encodings.rb | 68 ++++++++++++++++++++++++++-------------------- 3 files changed, 62 insertions(+), 38 deletions(-) diff --git a/ChangeLog b/ChangeLog index f99c1245e4..678e842a2f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +Tue Nov 17 08:54:04 2010 James Edward Gray II + + * lib/csv.rb: Upgrading output encoding as needed. [ruby-core:33135] + Tue Nov 16 22:30:39 2010 Yusuke Endoh * vm_insnhelper.c (vm_throw): remove fear of undefined behavior :-) diff --git a/lib/csv.rb b/lib/csv.rb index fed35d2a81..fe7309009b 100644 --- a/lib/csv.rb +++ b/lib/csv.rb @@ -1562,9 +1562,10 @@ class CSV options = DEFAULT_OPTIONS.merge(options) # create the IO object we will read from - @io = if data.is_a? String then StringIO.new(data) else data end + @io = data.is_a?(String) ? StringIO.new(data) : data # honor the IO encoding if we can, otherwise default to ASCII-8BIT - @encoding = raw_encoding || Encoding.default_internal || Encoding.default_external + @encoding = raw_encoding || Encoding.default_internal || + Encoding.default_external # # prepare for building safe regular expressions in the target encoding, # if we can transcode the needed characters @@ -1711,7 +1712,15 @@ class CSV @headers = row if header_row? @lineno += 1 - @io << row.map(&@quote).join(@col_sep) + @row_sep # quote and separate + output = row.map(&@quote).join(@col_sep) + @row_sep # quote and separate + if @io.is_a?(StringIO) and + output.encoding != raw_encoding and + ( compatible_encoding = Encoding.compatible?( @io.string.encoding, + output.encoding ) ) + @io = StringIO.new(@io.string.force_encoding(compatible_encoding)) + @io.seek(0, IO::SEEK_END) + end + @io << output self # for chaining end @@ -2043,11 +2052,13 @@ class CSV @row_sep = @row_sep.to_s.encode(@encoding) # establish quoting rules - @force_quotes = options.delete(:force_quotes) - do_quote = lambda do |field| - @quote_char + - String(field).gsub(@quote_char, @quote_char * 2) + - @quote_char + @force_quotes = options.delete(:force_quotes) + do_quote = lambda do |field| + field = String(field) + encoded_quote = @quote_char.encode(field.encoding) + encoded_quote + + field.gsub(encoded_quote, encoded_quote * 2) + + encoded_quote end quotable_chars = encode_str("\r\n", @col_sep, @quote_char) @quote = if @force_quotes @@ -2297,6 +2308,7 @@ class CSV end private + def raw_encoding if @io.respond_to? :internal_encoding @io.internal_encoding || @io.external_encoding diff --git a/test/csv/test_encodings.rb b/test/csv/test_encodings.rb index 5d29ac1fa9..8d7de2e728 100644 --- a/test/csv/test_encodings.rb +++ b/test/csv/test_encodings.rb @@ -15,43 +15,43 @@ class TestEncodings < Test::Unit::TestCase def setup @temp_csv_path = File.join(File.dirname(__FILE__), "temp.csv") end - + def teardown File.unlink(@temp_csv_path) if File.exist? @temp_csv_path end - + ######################################## ### Hand Test Some Popular Encodings ### ######################################## - + def test_parses_utf8_encoding assert_parses( [ %w[ one two … ], %w[ 1 … 3 ], %w[ … 5 6 ] ], "UTF-8" ) end - + def test_parses_latin1_encoding assert_parses( [ %w[ one two Résumé ], %w[ 1 Résumé 3 ], %w[ Résumé 5 6 ] ], "ISO-8859-1" ) end - + def test_parses_utf16be_encoding assert_parses( [ %w[ one two … ], %w[ 1 … 3 ], %w[ … 5 6 ] ], "UTF-16BE" ) end - + def test_parses_shift_jis_encoding assert_parses( [ %w[ 一 二 三 ], %w[ 四 五 六 ], %w[ 七 八 九 ] ], "Shift_JIS" ) end - + ########################################################### ### Try Simple Reading for All Non-dummy Ruby Encodings ### ########################################################### - + def test_reading_with_most_encodings each_encoding do |encoding| begin @@ -62,7 +62,7 @@ class TestEncodings < Test::Unit::TestCase end end end - + def test_regular_expression_escaping each_encoding do |encoding| begin @@ -73,18 +73,18 @@ class TestEncodings < Test::Unit::TestCase end end end - + ####################################################################### ### Stress Test ASCII Compatible and Non-ASCII Compatible Encodings ### ####################################################################### - + def test_auto_line_ending_detection # arrange data to place a \r at the end of CSV's read ahead point encode_for_tests([["a" * 509]], row_sep: "\r\n") do |data| assert_equal("\r\n".encode(data.encoding), CSV.new(data).row_sep) end end - + def test_csv_chars_are_transcoded encode_for_tests([%w[abc def]]) do |data| %w[col_sep row_sep quote_char].each do |csv_char| @@ -93,7 +93,7 @@ class TestEncodings < Test::Unit::TestCase end end end - + def test_parser_works_with_encoded_headers encode_for_tests([%w[one two three], %w[1 2 3]]) do |data| parsed = CSV.parse(data, headers: true) @@ -105,7 +105,7 @@ class TestEncodings < Test::Unit::TestCase end end end - + def test_built_in_converters_transcode_to_utf_8_then_convert encode_for_tests([%w[one two three], %w[1 2 3]]) do |data| parsed = CSV.parse(data, converters: :integer) @@ -114,7 +114,7 @@ class TestEncodings < Test::Unit::TestCase assert_equal([1, 2, 3], parsed[1]) end end - + def test_built_in_header_converters_transcode_to_utf_8_then_convert encode_for_tests([%w[one two three], %w[1 2 3]]) do |data| parsed = CSV.parse( data, headers: true, @@ -125,7 +125,7 @@ class TestEncodings < Test::Unit::TestCase "Wrong data encoding." ) end end - + def test_open_allows_you_to_set_encodings encode_for_tests([%w[abc def]]) do |data| # read and write in encoding @@ -136,7 +136,7 @@ class TestEncodings < Test::Unit::TestCase "Wrong data encoding." ) end end - + # read and write with transcoding File.open(@temp_csv_path, "wb:UTF-32BE:#{data.encoding.name}") do |f| f << data @@ -149,7 +149,7 @@ class TestEncodings < Test::Unit::TestCase end end end - + def test_foreach_allows_you_to_set_encodings encode_for_tests([%w[abc def]]) do |data| # read and write in encoding @@ -158,7 +158,7 @@ class TestEncodings < Test::Unit::TestCase assert( row.all? { |f| f.encoding == data.encoding }, "Wrong data encoding." ) end - + # read and write with transcoding File.open(@temp_csv_path, "wb:UTF-32BE:#{data.encoding.name}") do |f| f << data @@ -170,7 +170,7 @@ class TestEncodings < Test::Unit::TestCase end end end - + def test_read_allows_you_to_set_encodings encode_for_tests([%w[abc def]]) do |data| # read and write in encoding @@ -178,7 +178,7 @@ class TestEncodings < Test::Unit::TestCase rows = CSV.read(@temp_csv_path, encoding: data.encoding.name) assert( rows.flatten.all? { |f| f.encoding == data.encoding }, "Wrong data encoding." ) - + # read and write with transcoding File.open(@temp_csv_path, "wb:UTF-32BE:#{data.encoding.name}") do |f| f << data @@ -189,11 +189,11 @@ class TestEncodings < Test::Unit::TestCase "Wrong data encoding." ) end end - + ################################# ### Write CSV in any Encoding ### ################################# - + def test_can_write_csv_in_any_encoding each_encoding do |encoding| # test generate_line with encoding hint @@ -204,11 +204,11 @@ class TestEncodings < Test::Unit::TestCase next end assert_equal(encoding, csv.encoding) - + # test generate_line with encoding guessing from fields csv = %w[abc d|ef].map { |f| f.encode(encoding) }.to_csv(col_sep: "|") assert_equal(encoding, csv.encoding) - + # writing to files data = encode_ary([%w[abc d,ef], %w[123 456 ]], encoding) CSV.open(@temp_csv_path, "wb:#{encoding.name}") do |f| @@ -217,9 +217,17 @@ class TestEncodings < Test::Unit::TestCase assert_equal(data, CSV.read(@temp_csv_path, encoding: encoding.name)) end end + + def test_encoding_is_upgraded_during_writing_as_needed + data = ["foo".force_encoding("US-ASCII"), "\u3042"] + assert_equal("US-ASCII", data.first.encoding.name) + assert_equal("UTF-8", data.last.encoding.name) + assert_equal("UTF-8", data.join.encoding.name) + assert_equal("UTF-8", data.to_csv.encoding.name) + end private - + def assert_parses(fields, encoding, options = { }) encoding = Encoding.find(encoding) unless encoding.is_a? Encoding fields = encode_ary(fields, encoding) @@ -229,11 +237,11 @@ class TestEncodings < Test::Unit::TestCase assert_equal(encoding, field.encoding, "Field[#{i + 1}] was transcoded.") end end - + def encode_ary(ary, encoding) ary.map { |row| row.map { |field| field.encode(encoding) } } end - + def ary_to_data(ary, options = { }) encoding = ary.flatten.first.encoding quote_char = (options[:quote_char] || '"').encode(encoding) @@ -245,12 +253,12 @@ class TestEncodings < Test::Unit::TestCase }.join(col_sep) + row_sep }.join.encode(encoding) end - + def encode_for_tests(data, options = { }) yield ary_to_data(encode_ary(data, "UTF-8"), options) yield ary_to_data(encode_ary(data, "UTF-16BE"), options) end - + def each_encoding Encoding.list.each do |encoding| next if encoding.dummy? # skip "dummy" encodings -- cgit v1.2.3