diff options
-rw-r--r-- | ChangeLog | 5 | ||||
-rw-r--r-- | lib/rexml/source.rb | 39 | ||||
-rw-r--r-- | test/rexml/test_document.rb | 14 |
3 files changed, 44 insertions, 14 deletions
@@ -1,3 +1,8 @@
+Wed Apr 14 22:09:28 2010 NARUSE, Yui <naruse@ruby-lang.org>
+
+	* lib/rexml/source.rb: force_encoding("UTF-8") when the input
+	  is already UTF-8. patched by Kouhei Sutou [ruby-core:23404]
+
 Wed Apr 14 18:23:00 2010 Kenta Murata <mrkn@mrkn.jp>
 
 	* configure.in (signbit): signbit is a macro in C99.
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 3f6d4ffa26..1206150b16 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -162,6 +162,15 @@ module REXML
         @line_break = ">"
       end
       super( @source.eof? ? str : str+@source.readline( @line_break ) )
+
+      if !@to_utf and
+          @buffer.respond_to?(:force_encoding) and
+          @source.respond_to?(:external_encoding) and
+          @source.external_encoding != ::Encoding::UTF_8
+        @force_utf8 = true
+      else
+        @force_utf8 = false
+      end
     end
 
     def scan(pattern, cons=false)
@@ -174,11 +183,7 @@ module REXML
       if rv.size == 0
         until @buffer =~ pattern or @source.nil?
           begin
-            # READLINE OPT
-            #str = @source.read(@block_size)
-            str = @source.readline(@line_break)
-            str = decode(str) if @to_utf and str
-            @buffer << str
+            @buffer << readline
           rescue Iconv::IllegalSequence
             raise
           rescue
@@ -193,12 +198,7 @@ module REXML
 
     def read
       begin
-        str = @source.readline(@line_break)
-        str = decode(str) if @to_utf and str
-        @buffer << str
-        if not @to_utf and @buffer.respond_to? :force_encoding
-          @buffer.force_encoding Encoding::UTF_8
-        end
+        @buffer << readline
       rescue Exception, NameError
         @source = nil
       end
@@ -213,9 +213,7 @@ module REXML
       @buffer = $' if cons and rv
       while !rv and @source
         begin
-          str = @source.readline(@line_break)
-          str = decode(str) if @to_utf and str
-          @buffer << str
+          @buffer << readline
           rv = pattern.match(@buffer)
           @buffer = $' if cons and rv
         rescue
@@ -254,5 +252,18 @@ module REXML
       end
       [pos, lineno, line]
     end
+
+    private
+    def readline
+      str = @source.readline(@line_break)
+      return nil if str.nil?
+
+      if @to_utf
+        decode(str)
+      else
+        str.force_encoding(::Encoding::UTF_8) if @force_utf8
+        str
+      end
+    end
   end
 end
diff --git a/test/rexml/test_document.rb b/test/rexml/test_document.rb
index 1ee1a1d414..ab0b1e4e96 100644
--- a/test/rexml/test_document.rb
+++ b/test/rexml/test_document.rb
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 require "rexml/document"
 require "test/unit"
 
@@ -83,6 +85,18 @@ EOF
     REXML::Document.entity_expansion_limit = 10000
   end
 
+  def test_tag_in_cdata_with_not_ascii_only_but_ascii8bit_encoding_source
+    tag = "<b>...</b>"
+    message = "こんにちは、世界!" # Hello world! in Japanese
+    xml = <<EOX
+<?xml version="1.0" encoding="UTF-8"?>
+<message><![CDATA[#{tag}#{message}]]></message>
+EOX
+    xml.force_encoding(Encoding::ASCII_8BIT)
+    doc = REXML::Document.new(xml)
+    assert_equal("#{tag}#{message}", doc.root.children.first.value)
+  end
+
   def test_xml_declaration_standalone
     bug2539 = '[ruby-core:27345]'
     doc = REXML::Document.new('<?xml version="1.0" standalone="no" ?>')