From f25ff846f6884e202d13ab28e3e10c917b9cdf31 Mon Sep 17 00:00:00 2001 From: naruse Date: Wed, 2 Mar 2011 15:36:48 +0000 Subject: * lib/rexml/encoding.rb (REXML::Encoding#encoding=): store @encoding a String which means the name of the encoding. this partially revert r29646. * lib/rexml/document.rb: follow above. * lib/rexml/output.rb: ditto. * lib/rexml/parsers/baseparser.rb: ditto. * lib/rexml/source.rb: ditto. * lib/rexml/xmldecl.rb: ditto. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@31008 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 16 +++++ lib/rexml/document.rb | 2 +- lib/rexml/encoding.rb | 42 ++++++------ lib/rexml/formatters/default.rb | 2 +- lib/rexml/output.rb | 2 +- lib/rexml/parsers/baseparser.rb | 2 +- lib/rexml/source.rb | 6 +- lib/rexml/xmldecl.rb | 13 +--- test/rexml/test_contrib.rb | 16 ++--- test/rexml/test_core.rb | 138 ++++++++++++++++------------------------ test/rexml/test_encoding.rb | 22 +++---- 11 files changed, 114 insertions(+), 147 deletions(-) diff --git a/ChangeLog b/ChangeLog index a5859815f3..fa167ddf2b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,19 @@ +Thu Mar 3 00:36:29 2011 NARUSE, Yui + + * lib/rexml/encoding.rb (REXML::Encoding#encoding=): store @encoding + a String which means the name of the encoding. + this partially revert r29646. + + * lib/rexml/document.rb: follow above. + + * lib/rexml/output.rb: ditto. + + * lib/rexml/parsers/baseparser.rb: ditto. + + * lib/rexml/source.rb: ditto. + + * lib/rexml/xmldecl.rb: ditto. + Wed Mar 2 23:19:56 2011 Nobuyoshi Nakada * string.c (str_byte_substr): return nil for negative length. diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index 68a744d9e5..790a1c78db 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -184,7 +184,7 @@ module REXML # that IE's limited abilities can handle. This hack inserts a space # before the /> on empty tags. Defaults to false def write( output=$stdout, indent=-1, transitive=false, ie_hack=false ) - if xml_decl.encoding != ::Encoding::UTF_8 && !output.kind_of?(Output) + if xml_decl.encoding != 'UTF-8' && !output.kind_of?(Output) output = Output.new( output, xml_decl.encoding ) end formatter = if indent > -1 diff --git a/lib/rexml/encoding.rb b/lib/rexml/encoding.rb index 3e7bdfb6aa..d1d5172841 100644 --- a/lib/rexml/encoding.rb +++ b/lib/rexml/encoding.rb @@ -1,8 +1,9 @@ module REXML module Encoding - # ID ---> Encoding object + # ID ---> Encoding name attr_reader :encoding def encoding=(encoding) + encoding = encoding.name if encoding.is_a?(Encoding) if encoding.is_a?(String) original_encoding = encoding encoding = find_encoding(encoding) @@ -11,35 +12,25 @@ module REXML end end return false if defined?(@encoding) and encoding == @encoding - if encoding and encoding != ::Encoding::UTF_8 - @encoding = encoding + if encoding + @encoding = encoding.upcase else - @encoding = ::Encoding::UTF_8 + @encoding = 'UTF-8' end true end def check_encoding(xml) - # We have to recognize UTF-16, LSB UTF-16, and UTF-8 + # We have to recognize UTF-16BE, UTF-16LE, and UTF-8 if xml[0, 2] == "\xfe\xff" xml[0, 2] = "" - ::Encoding::UTF_16BE + return 'UTF-16BE' elsif xml[0, 2] == "\xff\xfe" xml[0, 2] = "" - ::Encoding::UTF_16LE - else - if /\A\s*<\?xml\s+version\s*=\s*(['"]).*?\1 - \s+encoding\s*=\s*(["'])(.*?)\2/mx =~ xml - encoding_name = $3 - if /\Autf-16\z/i =~ encoding_name - ::Encoding::UTF_16BE - else - find_encoding(encoding_name) - end - else - ::Encoding::UTF_8 - end + return 'UTF-16LE' end + xml =~ /^\s*<\?xml\s+version\s*=\s*(['"]).*?\1\s+encoding\s*=\s*(["'])(.*?)\2/m + return $3 ? $3.upcase : 'UTF-8' end def encode(string) @@ -53,14 +44,19 @@ module REXML private def find_encoding(name) case name - when "UTF-16" - name = "UTF-16BE" when /\Ashift-jis\z/i - name = "Shift_JIS" + return "SHIFT_JIS" when /\ACP-(\d+)\z/ name = "CP#{$1}" + when /\AUTF-8\z/i + return name + end + begin + ::Encoding::Converter.search_convpath(name, 'UTF-8') + rescue ::Encoding::ConverterNotFoundError + return nil end - ::Encoding.find(name) + name end end end diff --git a/lib/rexml/formatters/default.rb b/lib/rexml/formatters/default.rb index ec4149047d..574c821f96 100644 --- a/lib/rexml/formatters/default.rb +++ b/lib/rexml/formatters/default.rb @@ -22,7 +22,7 @@ module REXML case node when Document - if node.xml_decl.encoding != ::Encoding::UTF_8 && !output.kind_of?(Output) + if node.xml_decl.encoding != 'UTF-8' && !output.kind_of?(Output) output = Output.new( output, node.xml_decl.encoding ) end write_document( node, output ) diff --git a/lib/rexml/output.rb b/lib/rexml/output.rb index 752f6e1d40..50333ba177 100644 --- a/lib/rexml/output.rb +++ b/lib/rexml/output.rb @@ -10,7 +10,7 @@ module REXML @output = real_IO self.encoding = encd - @to_utf = (@encoding != ::Encoding::UTF_8) + @to_utf = encd != 'UTF-8' end def <<( content ) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index ee8b160ce5..0f1480b07d 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -248,7 +248,7 @@ module REXML @document_status = :after_doctype @source.read if @source.buffer.size<2 md = @source.match(/\s*/um, true) - if @source.encoding == ::Encoding::UTF_8 + if @source.encoding == "UTF-8" @source.buffer.force_encoding(::Encoding::UTF_8) end end diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 227b0c56c4..112393cfd4 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -54,14 +54,12 @@ module REXML def encoding=(enc) return unless super @line_break = encode( '>' ) - if @encoding != ::Encoding::UTF_8 + if @encoding != 'UTF-8' @buffer = decode(@buffer) @to_utf = true else @to_utf = false - if @buffer.respond_to? :force_encoding - @buffer.force_encoding ::Encoding::UTF_8 - end + @buffer.force_encoding ::Encoding::UTF_8 end end diff --git a/lib/rexml/xmldecl.rb b/lib/rexml/xmldecl.rb index 81d3057732..361e4b7106 100644 --- a/lib/rexml/xmldecl.rb +++ b/lib/rexml/xmldecl.rb @@ -109,20 +109,9 @@ module REXML end private - def normalized_encoding_name(_encoding) - if _encoding == ::Encoding::UTF_16BE - "UTF-16" - else - return _encoding.name - end - end - def content(enc) rv = "version='#@version'" - if @writeencoding || enc.to_s !~ /\Autf-8\z/i - encoding_name = normalized_encoding_name(enc) - rv << " encoding='#{encoding_name}'" - end + rv << " encoding='#{enc}'" if @writeencoding || enc !~ /utf-8/i rv << " standalone='#@standalone'" if @standalone rv end diff --git a/test/rexml/test_contrib.rb b/test/rexml/test_contrib.rb index 3bbaa083b3..eb16b8946c 100644 --- a/test/rexml/test_contrib.rb +++ b/test/rexml/test_contrib.rb @@ -241,7 +241,7 @@ DELIMITER end doc = REXML::Document.new(source_iso) - assert_equal('ISO-8859-1', doc.xml_decl.encoding.to_s) + assert_equal('ISO-8859-1', doc.xml_decl.encoding) assert_equal(koln_utf, doc.root.text) doc.write(out="") assert_equal(source_iso, out ) @@ -255,23 +255,21 @@ DELIMITER Technik Technik -Die Technik ist das Rückgrat der meisten Geschäftsprozesse bei Home of the Brave. Deshalb sollen hier alle relevanten technischen Abläufe, Daten und Einrichtungen beschrieben werden, damit jeder im Bedarfsfall die nötigen Informationen, Anweisungen und Verhaltensempfehlungen nachlesen und/oder abrufen kann. +Die Technik ist das R\xFCckgrat der meisten Gesch\xFCftsprozesse bei Home of the Brave. Deshalb sollen hier alle relevanten technischen Abl\xFCufe, Daten und Einrichtungen beschrieben werden, damit jeder im Bedarfsfall die n\xFCtigen Informationen, Anweisungen und Verhaltensempfehlungen nachlesen und/oder abrufen kann. Flash - Nützliches von Flashern für Flasher. + N\xFCtzliches von Flashern f\xFCr Flasher. CVS-FAQ FAQ zur Benutzung von CVS bei HOB EOF tn = XPath.first(doc, "//nebenspalte/text()[2]") - expected_iso = "Nützliches von Flashern für Flasher." - expected_utf = expected_iso.unpack('C*').pack('U*') - if expected_utf.respond_to? :encode - expected_iso.force_encoding("iso-8859-1") - expected_utf.force_encoding(::Encoding::UTF_8) - end + expected_iso = "N\xFCtzliches von Flashern f\xFCr Flasher." + expected_utf = expected_iso.unpack('C*').pack('U*') + expected_iso.force_encoding(::Encoding::ISO_8859_1) + expected_utf.force_encoding(::Encoding::UTF_8) assert_equal(expected_utf, tn.to_s.strip) f = REXML::Formatters::Default.new f.write( tn, Output.new(o = "", "ISO-8859-1") ) diff --git a/test/rexml/test_core.rb b/test/rexml/test_core.rb index 90d83f11a5..d10c1bcc36 100644 --- a/test/rexml/test_core.rb +++ b/test/rexml/test_core.rb @@ -20,7 +20,7 @@ class Tester < Test::Unit::TestCase - @@ -80,7 +80,7 @@ class Tester < Test::Unit::TestCase # This because of a reported bug in attribute handling in 1.0a8 source = 'blah' doc = Document.new source - doc.elements.each do |a| + doc.elements.each do |a| a.attributes['att'] << 'B' assert_equal "AB", a.attributes['att'] a.attributes['att'] = 'C' @@ -155,11 +155,11 @@ class Tester < Test::Unit::TestCase assert_equal 3, doc.root.size assert_equal 1, doc.root.elements.size - text = " This is text + text = " This is text with a lot of whitespace " source = "#{text}#{text}#{text}#{text}" - doc = Document.new( source, { + doc = Document.new( source, { :respect_whitespace => %w{ a c } } ) assert_equal text, doc.elements["//c"].text @@ -207,8 +207,8 @@ class Tester < Test::Unit::TestCase doc.write(test="") assert_equal(correct, test) - multi_line_source = ' ' d = Document.new( multi_line_source ) @@ -217,8 +217,8 @@ class Tester < Test::Unit::TestCase doc.write(test="") assert_equal(correct, test) - odd_space_source = ' ' d = Document.new( odd_space_source ) dt = d.doctype @@ -230,34 +230,12 @@ class Tester < Test::Unit::TestCase doc = Document.new(docin) doc.write(test="") assert_equal(31, doc.doctype.size) - - # Here's a little ditty from Tobias... - src = <<-EOL - --> - - - ' - > - ] - > - EOL end def test_document # Testing cloning source = "" doc = Document.new source - doc2 = Document.new doc # Testing Root assert_equal doc.root.name.to_s, "element" @@ -266,7 +244,7 @@ class Tester < Test::Unit::TestCase source = @xsa_source doc = Document.new source assert_instance_of XMLDecl, doc.xml_decl - assert_instance_of DocType, doc.doctype + assert_instance_of DocType, doc.doctype assert_equal doc.version, "1.0" source = File.new(fixture_path("dash.xml")) @@ -383,7 +361,7 @@ class Tester < Test::Unit::TestCase assert_equal(string, text.to_s) string2 = "#{string}" - doc = Document.new( string2, { + doc = Document.new( string2, { :raw => %w{ a b } } ) f.write(doc,out="") @@ -463,7 +441,7 @@ class Tester < Test::Unit::TestCase assert_equal "Datasets", child.name } each_test(doc, "Project/Datasets/link", 2 ) - each_test(doc.root, "/Project/Description", 1) {|child| + each_test(doc.root, "/Project/Description", 1) {|child| assert_equal "Description", child.name } each_test(doc.root, "./Description",1 ) { |child| @@ -642,11 +620,10 @@ class Tester < Test::Unit::TestCase end def test_line - doc = Document.new File.new(fixture_path("bad.xml")) + Document.new File.new(fixture_path("bad.xml")) assert_fail "There should have been an error" rescue Exception # We should get here - er = $! assert($!.line == 5, "Should have been an error on line 5, "+ "but was reported as being on line #{$!.line}" ) end @@ -664,13 +641,11 @@ class Tester < Test::Unit::TestCase def test_exception source = SourceFactory.create_from "" p = ParseException.new( "dummy message", source ) - s = p.to_s begin raise "dummy" rescue Exception p.continued_exception = $! end - s = p.to_s end def test_bad_content @@ -682,7 +657,7 @@ class Tester < Test::Unit::TestCase assert_equal "content>content", tree_gt.elements[1].text # This isn't begin - tree_lt = Document.new in_lt + Document.new in_lt assert_fail "Should have gotten a parse error" rescue ParseException end @@ -766,7 +741,7 @@ class Tester < Test::Unit::TestCase doc.root.each_element_with_text( nil, 0, 'd', &block ) assert_equal 0, arry.size end - + def test_element_parse_stream s = Source.new( "some text" ) l = Listener.new @@ -810,7 +785,7 @@ EOL assert_equal('eeü'.force_encoding("UTF-8"), a.root.text) end - def test_element_decl + def test_element_decl element_decl = Source.new(" ]>") @@ -824,7 +799,7 @@ EOL - @@ -855,17 +830,15 @@ EOL def test_attlist_write file=File.new(fixture_path("foo.xml")) - doc=Document.new file - root = doc.root - + doc=Document.new file out = '' - doc.write(out) + doc.write(out) end def test_more_namespaces - assert_raise( REXML::UndefinedNamespaceException, + assert_raise( REXML::UndefinedNamespaceException, %Q{Should have gotten an Undefined Namespace error} ) { - doc1 = Document.new("

") + Document.new("

") } doc2 = Document.new("

") es = XPath.match(doc2, '//c') @@ -916,7 +889,7 @@ EOL end def test_oses_with_bad_EOLs - d = Document.new("\n\n\n\n\n\n\n\n") + Document.new("\n\n\n\n\n\n\n\n") end # Contributed (with patch to fix bug) by Kouhei @@ -955,25 +928,25 @@ EOL end def test_accents - docs = [ + docs = [ %Q{ -}, +}, ' -', +', %Q{ -}, +}, %Q{ @@ -1024,7 +997,6 @@ EOL document.write(s) end - def test_write_cdata src = "A" doc = REXML::Document.new( src ) @@ -1045,15 +1017,15 @@ EOL EOL - d = REXML::Document.new( source ) + d = Document.new( source ) assert_equal( 'foo', REXML::XPath.first(d.root, "//x:b/@x:n").value ) assert_equal( nil, REXML::XPath.first(d.root, "//x:b/@x:n", {})) end def test_null_element_name - a = REXML::Document.new + a = REXML::Document.new assert_raise( RuntimeError ) { - a.add_element( nil ) + a.add_element( nil ) } end @@ -1095,22 +1067,22 @@ EOL # Ticket #44 t = REXML::Text.new( "&", false, nil, true ) assert_equal( "&", t.to_s ) - + t = REXML::Text.new("&", false, false) assert_equal( "&amp;", t.to_s ) end def test_to_xpath - doc = REXML::Document.new( %q{ - - - }) + doc = REXML::Document.new( %q{ + + + }) names = %w{ /tag1/tag2[1] /tag1/tag2[2] } - doc.root.elements.each_with_index {|el, i| + doc.root.elements.each_with_index {|el, i| assert_equal( names[i], el.xpath ) - } + } end - + def test_transitive doc = REXML::Document.new( "") s = "" @@ -1181,19 +1153,19 @@ EOL doc = REXML::Document.new doc << REXML::XMLDecl.default doc << REXML::Element.new("a") - + str = "" doc.write(str) - + assert_equal("", str) doc = REXML::Document.new doc << REXML::XMLDecl.new("1.0", "UTF-8") doc << REXML::Element.new("a") - + str = "" doc.write(str) - + assert_equal("", str) end @@ -1212,7 +1184,7 @@ EOL def test_ticket_52 source = "" - d = REXML::Document.new(source) + d = REXML::Document.new(source) d.write(k="") assert_equal( source, k ) @@ -1233,17 +1205,17 @@ EOL def test_ticket_21 src = "" assert_raise( ParseException, "invalid XML should be caught" ) { - d = REXML::Document.new(src) + Document.new(src) } begin - d = REXML::Document.new(src) + Document.new(src) rescue assert_match( /missing attribute quote/, $!.message ) end end def test_ticket_63 - d = REXML::Document.new(File.new(fixture_path("t63-1.xml"))) + Document.new(File.new(fixture_path("t63-1.xml"))) end def test_ticket_75 @@ -1256,7 +1228,7 @@ EOL #- rexml sanity check (bugs in ruby 1.8.4, ruby 1.8.6) xmldoc = Document.new("") xmldoc << XMLDecl.new(XMLDecl::DEFAULT_VERSION, "UTF-8") - content = ['61c3a927223c3e26'].pack("H*") + content = ['61c3a927223c3e26'].pack("H*") content.force_encoding('UTF-8') if content.respond_to?(:force_encoding) #- is some UTF-8 text but just to make sure my editor won't magically convert.. xmldoc.root.add_attribute('attr', content) @@ -1272,14 +1244,14 @@ EOL assert_equal( sanity1, sanity2 ) end - + def test_ticket_88 doc = REXML::Document.new("") - assert_equal("", doc.to_s) + assert_equal("", doc.to_s) doc = REXML::Document.new("") - assert_equal("", doc.to_s) + assert_equal("", doc.to_s) end - + def test_ticket_85 xml = < @@ -1295,8 +1267,6 @@ ENDXML " - zml = "" - # The pretty printer ignores all whitespace, anyway so output1 == output2 f = REXML::Formatters::Pretty.new( 2 ) d = Document.new( xml, :ignore_whitespace_nodes=>:all ) @@ -1311,7 +1281,7 @@ ENDXML # The base case. d = Document.new(yml) f.write( d, output3="" ) - + assert_equal( output3.strip, output2.strip ) d = Document.new(yml) @@ -1355,10 +1325,10 @@ ENDXML end def test_ticket_14 - # Per .2.5 Node Tests of XPath spec - assert_raise( REXML::UndefinedNamespaceException, + # Per .2.5 Node Tests of XPath spec + assert_raise( REXML::UndefinedNamespaceException, %Q{Should have gotten an Undefined Namespace error} ) { - d = Document.new("") + Document.new("") } end @@ -1395,7 +1365,7 @@ ENDXML doc.add_element(bean_element) REXML::Formatters::Pretty.new(3).write( doc, out = "" ) - + assert_equal "\n \n (&#38;(|(memberof=CN=somegroupabcdefgh,OU=OUsucks,DC=hookemhorns,DC=com)(mail=*someco.com))(acct=%u)(!(extraparameter:2.2.222.222222.2.2.222:=2)))\n \n", out end diff --git a/test/rexml/test_encoding.rb b/test/rexml/test_encoding.rb index e1f9296821..e359914b36 100644 --- a/test/rexml/test_encoding.rb +++ b/test/rexml/test_encoding.rb @@ -18,7 +18,7 @@ class EncodingTester < Test::Unit::TestCase def test_encoded_in_encoded_out doc = Document.new( @encoded ) doc.write( out="" ) - out.force_encoding('binary') if out.respond_to? :force_encoding + out.force_encoding(::Encoding::ASCII_8BIT) assert_equal( @encoded, out ) end @@ -26,12 +26,12 @@ class EncodingTester < Test::Unit::TestCase def test_encoded_in_change_out doc = Document.new( @encoded ) doc.xml_decl.encoding = "UTF-8" - assert_equal( ::Encoding::UTF_8, doc.encoding ) + assert_equal("UTF-8", doc.encoding) REXML::Formatters::Default.new.write( doc.root, out="" ) - out.force_encoding('binary') if out.respond_to? :force_encoding + out.force_encoding(::Encoding::ASCII_8BIT) assert_equal( @not_encoded, out ) char = XPath.first( doc, "/a/b/text()" ).to_s - char.force_encoding('binary') if char.respond_to? :force_encoding + char.force_encoding(::Encoding::ASCII_8BIT) assert_equal( "ĉ", char ) end @@ -39,7 +39,7 @@ class EncodingTester < Test::Unit::TestCase def test_encoded_in_different_out doc = Document.new( @encoded ) REXML::Formatters::Default.new.write( doc.root, Output.new( out="", "UTF-8" ) ) - out.force_encoding('binary') if out.respond_to? :force_encoding + out.force_encoding(::Encoding::ASCII_8BIT) assert_equal( @not_encoded, out ) end @@ -47,9 +47,9 @@ class EncodingTester < Test::Unit::TestCase def test_in_change_out doc = Document.new( @not_encoded ) doc.xml_decl.encoding = "ISO-8859-3" - assert_equal( ::Encoding::ISO_8859_3, doc.encoding ) + assert_equal("ISO-8859-3", doc.encoding) doc.write( out="" ) - out.force_encoding('binary') if out.respond_to? :force_encoding + out.force_encoding(::Encoding::ASCII_8BIT) assert_equal( @encoded, out ) end @@ -57,7 +57,7 @@ class EncodingTester < Test::Unit::TestCase def test_in_different_out doc = Document.new( @not_encoded ) doc.write( Output.new( out="", "ISO-8859-3" ) ) - out.force_encoding('binary') if out.respond_to? :force_encoding + out.force_encoding(::Encoding::ASCII_8BIT) assert_equal( @encoded, out ) end @@ -66,10 +66,10 @@ class EncodingTester < Test::Unit::TestCase def test_in_different_access doc = Document.new <<-EOL - ÿ + \xFF EOL expect = "\303\277" - expect.force_encoding('UTF-8') if expect.respond_to? :force_encoding + expect.force_encoding(::Encoding::UTF_8) assert_equal( expect, doc.elements['a'].attributes['a'] ) assert_equal( expect, doc.elements['a'].text ) end @@ -86,7 +86,7 @@ class EncodingTester < Test::Unit::TestCase def test_ticket_110 utf16 = REXML::Document.new(File.new(fixture_path("ticket_110_utf16.xml"))) - assert_equal( ::Encoding::UTF_16BE, utf16.encoding ) + assert_equal(utf16.encoding, "UTF-16") assert( utf16[0].kind_of?(REXML::XMLDecl)) end end -- cgit v1.2.3