diff options
author | jeg2 <jeg2@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2010-03-23 14:59:25 +0000 |
---|---|---|
committer | jeg2 <jeg2@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2010-03-23 14:59:25 +0000 |
commit | 1bd1128989a339790a658c43ec3d9769b49265da (patch) | |
tree | 6efae89644fcd74cdab3f28209243b8e9c057cf1 | |
parent | 0471422beb4d786ede528612c8486296761a59ee (diff) | |
download | ruby-1bd1128989a339790a658c43ec3d9769b49265da.tar.gz |
* lib/csv.rb: Incorporating the fixes from the recent
FasterCSV releases: 1.5.2 and 1.5.3. [ruby-core:25038]
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@27025 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- | ChangeLog | 5 | ||||
-rw-r--r-- | lib/csv.rb | 172 | ||||
-rw-r--r-- | test/csv/test_csv_parsing.rb | 12 | ||||
-rw-r--r-- | test/csv/test_interface.rb | 7 | ||||
-rw-r--r-- | test/csv/test_table.rb | 2 |
5 files changed, 112 insertions, 86 deletions
@@ -1,3 +1,8 @@ +Tue Mar 23 23:58:51 2010 James Edward Gray II <jeg2@ruby-lang.org> + + * lib/csv.rb: Incorporating the fixes from the recent + FasterCSV releases: 1.5.2 and 1.5.3. [ruby-core:25038] + Tue Mar 23 18:35:46 2010 Shugo Maeda <shugo@ruby-lang.org> * object.c (rb_obj_singleton_class): new method diff --git a/lib/csv.rb b/lib/csv.rb index b9066f9cc8..d3a295b3a8 100644 --- a/lib/csv.rb +++ b/lib/csv.rb @@ -826,8 +826,12 @@ class CSV # Returns the table as a complete CSV String. Headers will be listed first, # then all of the field rows. # + # This method assumes you want the Table.headers(), unless you explicitly + # pass <tt>:write_headers => false</tt>. + # def to_csv(options = Hash.new) - @table.inject([headers.to_csv(options)]) do |rows, row| + wh = options.fetch(:write_headers, true) + @table.inject(wh ? [headers.to_csv(options)] : [ ]) do |rows, row| if row.header_row? rows else @@ -1808,66 +1812,95 @@ class CSV # it can take multiple calls to <tt>@io.gets()</tt> to get a full line, # because of \r and/or \n characters embedded in quoted fields # + in_extended_col = false + csv = Array.new + loop do # add another read to the line - (line += @io.gets(@row_sep)) rescue return nil - # copy the line so we can chop it up in parsing - parse = line.dup + unless parse = @io.gets(@row_sep) + return nil + end + parse.sub!(@parsers[:line_end], "") - # - # I believe a blank line should be an <tt>Array.new</tt>, not Ruby 1.8 - # CSV's <tt>[nil]</tt> - # - if parse.empty? - @lineno += 1 - if @skip_blanks - line = "" - next - elsif @unconverted_fields - return add_unconverted_fields(Array.new, Array.new) - elsif @use_headers - return self.class::Row.new(Array.new, Array.new) - else - return Array.new + if csv.empty? + # + # I believe a blank line should be an <tt>Array.new</tt>, not Ruby 1.8 + # CSV's <tt>[nil]</tt> + # + if parse.empty? + @lineno += 1 + if @skip_blanks + next + elsif @unconverted_fields + return add_unconverted_fields(Array.new, Array.new) + elsif @use_headers + return self.class::Row.new(Array.new, Array.new) + else + return Array.new + end end end - # - # shave leading empty fields if needed, because the main parser chokes - # on these - # - csv = if parse.sub!(@parsers[:leading_fields], "") - [nil] * ($&.length / @col_sep.length) - else - Array.new - end - # - # then parse the main fields with a hyper-tuned Regexp from - # Mastering Regular Expressions, Second Edition - # - parse.gsub!(@parsers[:csv_row]) do - csv << if $1.nil? # we found an unquoted field - if $2.empty? # switch empty unquoted fields to +nil+... - nil # for Ruby 1.8 CSV compatibility + parts = parse.split(@col_sep, -1) + csv << nil if parts.empty? + + # This loop is the hot path of csv parsing. Some things may be non-dry + # for a reason. Make sure to benchmark when refactoring. + parts.each do |part| + if in_extended_col + # If we are continuing a previous column + if part[-1] == @quote_char && part.count(@quote_char) % 2 != 0 + # extended column ends + csv.last << part[0..-2] + raise MalformedCSVError if csv.last =~ @parsers[:stray_quote] + csv.last.gsub!(@quote_char * 2, @quote_char) + in_extended_col = false else - # I decided to take a strict approach to CSV parsing... - if $2.count(@parsers[:return_newline]).zero? # verify correctness - $2 - else - # or throw an Exception - raise MalformedCSVError, "Unquoted fields do not allow " + - "\\r or \\n (line #{lineno + 1})." - end + csv.last << part + csv.last << @col_sep + end + elsif part[0] == @quote_char + # If we are staring a new quoted column + if part[-1] != @quote_char || part.count(@quote_char) % 2 != 0 + # start an extended column + csv << part[1..-1] + csv.last << @col_sep + in_extended_col = true + else + # regular quoted column + csv << part[1..-2] + raise MalformedCSVError if csv.last =~ @parsers[:stray_quote] + csv.last.gsub!(@quote_char * 2, @quote_char) + end + elsif part =~ @parsers[:quote_or_nl] + # Unquoted field with bad characters. + if part =~ @parsers[:nl_or_lf] + raise MalformedCSVError, "Unquoted fields do not allow " + + "\\r or \\n (line #{lineno + 1})." + else + raise MalformedCSVError, "Illegal quoting on line #{lineno + 1}." end - else # we found a quoted field... - $1.gsub(@quote_char * 2, @quote_char) # unescape contents + else + # Regular ole unquoted field. + csv << (part.empty? ? nil : part) end - "" # gsub!'s replacement, clear the field end - # if parse is empty?(), we found all the fields on the line... - if parse.empty? + # Replace tacked on @col_sep with @row_sep if we are still in an extended + # column. + csv[-1][-1] = @row_sep if in_extended_col + + if in_extended_col + # if we're at eof?(), a quoted field wasn't closed... + if @io.eof? + raise MalformedCSVError, + "Unclosed quoted field on line #{lineno + 1}." + elsif @field_size_limit and csv.last.size >= @field_size_limit + raise MalformedCSVError, "Field size exceeded on line #{lineno + 1}." + end + # otherwise, we need to loop and pull some more data to complete the row + else @lineno += 1 # save fields unconverted fields, if needed... @@ -1886,15 +1919,6 @@ class CSV # return the results break csv end - # if we're not empty?() but at eof?(), a quoted field wasn't closed... - if @io.eof? - raise MalformedCSVError, "Unclosed quoted field on line #{lineno + 1}." - elsif parse =~ @parsers[:bad_field] - raise MalformedCSVError, "Illegal quoting on line #{lineno + 1}." - elsif @field_size_limit and parse.length >= @field_size_limit - raise MalformedCSVError, "Field size exceeded on line #{lineno + 1}." - end - # otherwise, we need to loop and pull some more data to complete the row end end alias_method :gets, :shift @@ -2046,33 +2070,11 @@ class CSV esc_row_sep = escape_re(@row_sep) esc_quote = escape_re(@quote_char) @parsers = { - # for empty leading fields - leading_fields: encode_re("\\A(?:", esc_col_sep, ")+"), - # The Primary Parser - csv_row: encode_re( - "\\G(?:\\A|", esc_col_sep, ")", # anchor the match - "(?:", esc_quote, # find quoted fields - "((?>[^", esc_quote, "]*)", # "unrolling the loop" - "(?>", esc_quote * 2, # double for escaping - "[^", esc_quote, "]*)*)", - esc_quote, - "|", # ... or ... - "([^", esc_quote, esc_col_sep, "]*))", # unquoted fields - "(?=", esc_col_sep, "|\\z)" # ensure field is ended - ), - # a test for unescaped quotes - bad_field: encode_re( - "\\A", esc_col_sep, "?", # an optional comma - "(?:", esc_quote, # a quoted field - "(?>[^", esc_quote, "]*)", # "unrolling the loop" - "(?>", esc_quote * 2, # double for escaping - "[^", esc_quote, "]*)*", - esc_quote, # the closing quote - "[^", esc_quote, "]", # an extra character - "|", # ... or ... - "[^", esc_quote, esc_col_sep, "]+", # an unquoted field - esc_quote, ")" # an extra quote - ), + # for detecting parse errors + quote_or_nl: encode_re("[", esc_quote, "\r\n]"), + nl_or_lf: encode_re("[\r\n]"), + stray_quote: encode_re( "[^", esc_quote, "]", esc_quote, + "[^", esc_quote, "]" ), # safer than chomp!() line_end: encode_re(esc_row_sep, "\\z"), # illegal unquoted characters diff --git a/test/csv/test_csv_parsing.rb b/test/csv/test_csv_parsing.rb index a9e6cd400c..e3609b7648 100644 --- a/test/csv/test_csv_parsing.rb +++ b/test/csv/test_csv_parsing.rb @@ -115,6 +115,18 @@ class TestCSVParsing < Test::Unit::TestCase assert_equal(Array.new, CSV.parse_line("\n1,2,3\n")) end + def test_non_regex_edge_cases + # An early version of the non-regex parser fails this test + [ [ "foo,\"foo,bar,baz,foo\",\"foo\"", + ["foo", "foo,bar,baz,foo", "foo"] ] ].each do |edge_case| + assert_equal(edge_case.last, CSV.parse_line(edge_case.first)) + end + + assert_raise(CSV::MalformedCSVError) do + CSV.parse_line("1,\"23\"4\"5\", 6") + end + end + def test_malformed_csv assert_raise(CSV::MalformedCSVError) do CSV.parse_line("1,2\r,3", row_sep: "\n") diff --git a/test/csv/test_interface.rb b/test/csv/test_interface.rb index 5aedbbb075..04d0edc5f1 100644 --- a/test/csv/test_interface.rb +++ b/test/csv/test_interface.rb @@ -75,6 +75,11 @@ class TestCSVInterface < Test::Unit::TestCase assert_equal(%w{1 2 3}, row) end + def test_parse_line_with_empty_lines + assert_equal(nil, CSV.parse_line("")) # to signal eof + assert_equal(Array.new, CSV.parse_line("\n1,2,3")) + end + def test_read_and_readlines assert_equal( @expected, CSV.read(@path, col_sep: "\t", row_sep: "\r\n") ) @@ -167,7 +172,7 @@ class TestCSVInterface < Test::Unit::TestCase csv << lines.first.keys lines.each { |line| csv << line } end - CSV.open( @path, "w", headers: true, + CSV.open( @path, "r", headers: true, converters: :all, header_converters: :symbol ) do |csv| csv.each { |line| assert_equal(lines.shift, line.to_hash) } diff --git a/test/csv/test_table.rb b/test/csv/test_table.rb index b7c72b8fcb..d0b421750f 100644 --- a/test/csv/test_table.rb +++ b/test/csv/test_table.rb @@ -253,6 +253,8 @@ class TestCSVTable < Test::Unit::TestCase # with options assert_equal( csv.gsub(",", "|").gsub("\n", "\r\n"), @table.to_csv(col_sep: "|", row_sep: "\r\n") ) + assert_equal( csv.lines.to_a[1..-1].join, + @table.to_csv(:write_headers => false) ) # with headers assert_equal(csv, @header_table.to_csv) |