diff options
author | ser <ser@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2007-01-20 03:56:02 +0000 |
---|---|---|
committer | ser <ser@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2007-01-20 03:56:02 +0000 |
commit | fa4bfa6af585589e4465831f1489fee83ce26f09 (patch) | |
tree | fabaa77b102a6a2b93bdb79b70c2fe5dfe21763e | |
parent | f700c1354f19ca5ad73f4e119dcbff493a3e6e00 (diff) | |
download | ruby-fa4bfa6af585589e4465831f1489fee83ce26f09.tar.gz |
Merged from REXML main repository:
Fixes ticket:68.
NOTE that this involves an API change! Entity declarations in the doctype
now generate events that carry two, not one, arguments.
Implements ticket:15, using gwrite's suggestion. This allows Element to be
subclassed.
Two unrelated changes, because subversion is retarded and doesn't do
block-level commits:
1) Fixed a typo bug in previous change for ticket:15
2) Fixed namespaces handling in XPath and element.
***** Note that this is an API change!!! *****
Element.namespaces() now returns a hash of namespace mappings which are
relevant for that node.
Fixes a bug in multiple decodings
The changeset 1230:1231 was bad. The default behavior is *not* to use the
native REXML encodings by default, but rather to use ICONV by default. I know
that this will piss some people off, but defaulting to the pure Ruby version
isn't the correct solution, and it breaks other encodings, so I've reverted it.
* Fixes ticket:61 (xpath_parser)
* Fixes ticket:63 (UTF-16; UNILE decoding was bad)
* Cleans up some tests, removing opportunities for test corruption
* Improves parsing error messages a little
* Adds the ability to override the encoding detection in Source construction
* Fixes an edge case in Functions::string, where document nodes weren't
correctly converted
* Fixes Functions::string() for Element and Document nodes
* Fixes some problems in entity handling
Addresses ticket:66
Fixes ticket:71
Addresses ticket:78
NOTE: that this also fixes what is technically another bug in REXML. REXML's
XPath parser used to allow exponential notation in numbers. The XPath spec
is specific about what a number is, and scientific notation is not included.
Therefore, this has been fixed.
Cross-ported a fix for ticket:88 from CVS.
Fixes ticket:80
Documentation cleanup. Ticket:84
Applied Kou's fix for an un-trac'ed bug.
------------------------------------------------------------------------
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@11548 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- | lib/rexml/document.rb | 5 | ||||
-rw-r--r-- | lib/rexml/element.rb | 25 | ||||
-rw-r--r-- | lib/rexml/encoding.rb | 26 | ||||
-rw-r--r-- | lib/rexml/encodings/UNILE.rb | 2 | ||||
-rw-r--r-- | lib/rexml/functions.rb | 25 | ||||
-rw-r--r-- | lib/rexml/node.rb | 6 | ||||
-rw-r--r-- | lib/rexml/parsers/baseparser.rb | 4 | ||||
-rw-r--r-- | lib/rexml/parsers/sax2parser.rb | 4 | ||||
-rw-r--r-- | lib/rexml/parsers/treeparser.rb | 3 | ||||
-rw-r--r-- | lib/rexml/sax2listener.rb | 2 | ||||
-rw-r--r-- | lib/rexml/source.rb | 23 | ||||
-rw-r--r-- | lib/rexml/text.rb | 47 | ||||
-rw-r--r-- | lib/rexml/xpath_parser.rb | 53 |
13 files changed, 142 insertions, 83 deletions
diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index 619a844257..ee3e58dd2b 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -157,8 +157,9 @@ module REXML # document will be written. # indent:: # An integer. If -1, no indenting will be used; otherwise, the - # indentation will be this number of spaces, and children will be - # indented an additional amount. Defaults to -1 + # indentation will be twice this number of spaces, and children will be + # indented an additional amount. For a value of 3, every item will be + # indented 3 more levels, or 6 more spaces (2 * 3). Defaults to -1 # transitive:: # If transitive is true and indent is >= 0, then the output will be # pretty-printed in such a way that the added whitespace does not affect diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index 435076420a..11e2039609 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -94,7 +94,7 @@ module REXML # new_a = d.root.clone # puts new_a # => "<a/>" def clone - Element.new self + self.class.new self end # Evaluates to the root node of the document that this element @@ -200,9 +200,9 @@ module REXML end def namespaces - namespaces = [] + namespaces = {} namespaces = parent.namespaces if parent - namespaces |= attributes.namespaces + namespaces = namespaces.merge( attributes.namespaces ) return namespaces end @@ -494,13 +494,12 @@ module REXML # doc.root.add_element 'c' #-> '<a><b/>Elliott<c/></a>' # doc.root.text = 'Russell' #-> '<a><b/>Russell<c/></a>' # doc.root.text = nil #-> '<a><b/><c/></a>' - def text=( text ) + def text=( text ) if text.kind_of? String text = Text.new( text, whitespace(), nil, raw() ) elsif text and !text.kind_of? Text text = Text.new( text.to_s, whitespace(), nil, raw() ) end - old_text = get_text if text.nil? old_text.remove unless old_text.nil? @@ -557,13 +556,9 @@ module REXML ################################################# def attribute( name, namespace=nil ) - prefix = '' - if namespace - prefix = attributes.prefixes.each { |prefix| - return "#{prefix}:" if namespace( prefix ) == namespace - } || '' - end - attributes.get_attribute( "#{prefix}#{name}" ) + prefix = nil + prefix = namespaces.index(namespace) if namespace + attributes.get_attribute( "#{prefix ? prefix + ':' : ''}#{name}" ) end # Evaluates to +true+ if this element has any attributes set, false @@ -1172,16 +1167,16 @@ module REXML end def namespaces - namespaces = [] + namespaces = {} each_attribute do |attribute| - namespaces << attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' + namespaces[attribute.name] = attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' end if @element.document and @element.document.doctype expn = @element.expanded_name expn = @element.document.doctype.name if expn.size == 0 @element.document.doctype.attributes_of(expn).each { |attribute| - namespaces << attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' + namespaces[attribute.name] = attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' } end namespaces diff --git a/lib/rexml/encoding.rb b/lib/rexml/encoding.rb index 3a92080b13..7c23f678bb 100644 --- a/lib/rexml/encoding.rb +++ b/lib/rexml/encoding.rb @@ -24,20 +24,20 @@ module REXML old_verbosity = $VERBOSE begin $VERBOSE = false - enc = enc.nil? ? nil : enc.upcase + enc = enc.nil? ? nil : enc.upcase return false if defined? @encoding and enc == @encoding if enc and enc != UTF_8 - @encoding = enc - raise ArgumentError, "Bad encoding name #@encoding" unless @encoding =~ /^[\w-]+$/ - @encoding.untaint - enc_file = File.join( "rexml", "encodings", "#@encoding.rb" ) - begin - require enc_file - Encoding.apply(self, @encoding) + @encoding = enc + raise ArgumentError, "Bad encoding name #@encoding" unless @encoding =~ /^[\w-]+$/ + @encoding.untaint + begin + require 'rexml/encodings/ICONV.rb' + Encoding.apply(self, "ICONV") rescue LoadError, Exception - begin - require 'rexml/encodings/ICONV.rb' - Encoding.apply(self, "ICONV") + begin + enc_file = File.join( "rexml", "encodings", "#@encoding.rb" ) + require enc_file + Encoding.apply(self, @encoding) rescue LoadError => err puts err.message raise ArgumentError, "No decoder found for encoding #@encoding. Please install iconv." @@ -51,14 +51,14 @@ module REXML ensure $VERBOSE = old_verbosity end - true + true end def check_encoding str # We have to recognize UTF-16, LSB UTF-16, and UTF-8 return UTF_16 if /\A\xfe\xff/n =~ str return UNILE if /\A\xff\xfe/n =~ str - str =~ /^\s*<?xml\s*version\s*=\s*(['"]).*?\2\s*encoding\s*=\s*(["'])(.*?)\2/um + str =~ /^\s*<?xml\s*version=(['"]).*?\2\s*encoding=(["'])(.*?)\2/um return $1.upcase if $1 return UTF_8 end diff --git a/lib/rexml/encodings/UNILE.rb b/lib/rexml/encodings/UNILE.rb index 0560a08361..d054140c40 100644 --- a/lib/rexml/encodings/UNILE.rb +++ b/lib/rexml/encodings/UNILE.rb @@ -18,7 +18,7 @@ module REXML def decode_unile(str) array_enc=str.unpack('C*') array_utf8 = [] - 2.step(array_enc.size-1, 2){|i| + 0.step(array_enc.size-1, 2){|i| array_utf8 << (array_enc.at(i) + array_enc.at(i+1)*0x100) } array_utf8.pack('U*') diff --git a/lib/rexml/functions.rb b/lib/rexml/functions.rb index d741dbdab7..8293e9c5ac 100644 --- a/lib/rexml/functions.rb +++ b/lib/rexml/functions.rb @@ -117,16 +117,30 @@ module REXML elsif defined? object.node_type if object.node_type == :attribute object.value - elsif object.node_type == :element - object.text + elsif object.node_type == :element || object.node_type == :document + string_value(object) else object.to_s end + elsif object.nil? + return "" else object.to_s end end + def Functions::string_value( o ) + rv = "" + o.children.each { |e| + if e.node_type == :text + rv << e.to_s + elsif e.node_type == :element + rv << string_value( e ) + end + } + rv + end + # UNTESTED def Functions::concat( *objects ) objects.join @@ -139,7 +153,7 @@ module REXML # Fixed by Mike Stok def Functions::contains( string, test ) - string(string).include? string(test) + string(string).include?(string(test)) end # Kouhei fixed this @@ -325,8 +339,9 @@ module REXML object.to_f else str = string( object ) - #puts "STRING OF #{object.inspect} = #{str}" - if str =~ /^-?\.?\d/ + # If XPath ever gets scientific notation... + #if str =~ /^\s*-?(\d*\.?\d+|\d+\.)([Ee]\d*)?\s*$/ + if str =~ /^\s*-?(\d*\.?\d+|\d+\.)\s*$/ str.to_f else (0.0 / 0.0) diff --git a/lib/rexml/node.rb b/lib/rexml/node.rb index e5dec72a9d..7226e5be6c 100644 --- a/lib/rexml/node.rb +++ b/lib/rexml/node.rb @@ -55,10 +55,8 @@ module REXML return nil end - # Returns the index that +self+ has in its parent's elements array, so that - # the following equation holds true: - # - # node == node.parent.elements[node.index_in_parent] + # Returns the position that +self+ holds in its parent's array, indexed + # from 1. def index_in_parent parent.index(self)+1 end diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index c57ea58dc7..fecd801d6f 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -146,8 +146,6 @@ module REXML # Returns true if there are no more events def empty? - #STDERR.puts "@source.empty? = #{@source.empty?}" - #STDERR.puts "@stack.empty? = #{@stack.empty?}" return (@source.empty? and @stack.empty?) end @@ -365,8 +363,6 @@ module REXML else md = @source.match( TEXT_PATTERN, true ) if md[0].length == 0 - puts "EMPTY = #{empty?}" - puts "BUFFER = \"#{@source.buffer}\"" @source.match( /(\s+)/, true ) end #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0 diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index 61a216cec1..6c7fbe000a 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -16,6 +16,10 @@ module REXML @tag_stack = [] @entities = {} end + + def source + @parser.source + end def add_listener( listener ) @parser.add_listener( listener ) diff --git a/lib/rexml/parsers/treeparser.rb b/lib/rexml/parsers/treeparser.rb index 500a53f426..a53fa41925 100644 --- a/lib/rexml/parsers/treeparser.rb +++ b/lib/rexml/parsers/treeparser.rb @@ -23,7 +23,8 @@ module REXML case event[0] when :end_document unless tag_stack.empty? - raise ParseException.new("No close tag for #{tag_stack.inspect}") + #raise ParseException.new("No close tag for #{tag_stack.inspect}") + raise ParseException.new("No close tag for #{@build_context.xpath}") end return when :start_element diff --git a/lib/rexml/sax2listener.rb b/lib/rexml/sax2listener.rb index 9a992917e6..8db1389d06 100644 --- a/lib/rexml/sax2listener.rb +++ b/lib/rexml/sax2listener.rb @@ -70,7 +70,7 @@ module REXML # ["open-hatch", "PUBLIC", "\"-//Textuality//TEXT Standard open-hatch boilerplate//EN\"", "\"http://www.textuality.com/boilerplate/OpenHatch.xml\""] # <!ENTITY hatch-pic SYSTEM "../grafix/OpenHatch.gif" NDATA gif> # ["hatch-pic", "SYSTEM", "\"../grafix/OpenHatch.gif\"", "\n\t\t\t\t\t\t\tNDATA gif", "gif"] - def entitydecl content + def entitydecl name, decl end # <!NOTATION ...> def notationdecl content diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 3b6e813baf..2fee99c0e9 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -6,7 +6,7 @@ module REXML # Generates a Source object # @param arg Either a String, or an IO # @return a Source, or nil if a bad argument was given - def SourceFactory::create_from arg#, slurp=true + def SourceFactory::create_from(arg) if arg.kind_of? String Source.new(arg) elsif arg.respond_to? :read and @@ -35,12 +35,19 @@ module REXML # Constructor # @param arg must be a String, and should be a valid XML document - def initialize(arg) + # @param encoding if non-null, sets the encoding of the source to this + # value, overriding all encoding detection + def initialize(arg, encoding=nil) @orig = @buffer = arg - self.encoding = check_encoding( @buffer ) + if encoding + self.encoding = encoding + else + self.encoding = check_encoding( @buffer ) + end @line = 0 end + # Inherited from Encoding # Overridden to support optimized en/decoding def encoding=(enc) @@ -124,7 +131,7 @@ module REXML #attr_reader :block_size # block_size has been deprecated - def initialize(arg, block_size=500) + def initialize(arg, block_size=500, encoding=nil) @er_source = @source = arg @to_utf = false # Determining the encoding is a deceptively difficult issue to resolve. @@ -134,10 +141,12 @@ module REXML # if there is one. If there isn't one, the file MUST be UTF-8, as per # the XML spec. If there is one, we can determine the encoding from # it. + @buffer = "" str = @source.read( 2 ) - if /\A(?:\xfe\xff|\xff\xfe)/n =~ str + if encoding + self.encoding = encoding + elsif /\A(?:\xfe\xff|\xff\xfe)/n =~ str self.encoding = check_encoding( str ) - @line_break = encode( '>' ) else @line_break = '>' end @@ -159,6 +168,8 @@ module REXML str = @source.readline(@line_break) str = decode(str) if @to_utf and str @buffer << str + rescue Iconv::IllegalSequence + raise rescue @source = nil end diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 55bc9f50f8..3de9170623 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -42,6 +42,7 @@ module REXML # Use this field if you have entities defined for some text, and you don't # want REXML to escape that text in output. # Text.new( "<&", false, nil, false ) #-> "<&" + # Text.new( "<&", false, nil, false ) #-> "&lt;&amp;" # Text.new( "<&", false, nil, true ) #-> Parse exception # Text.new( "<&", false, nil, true ) #-> "<&" # # Assume that the entity "s" is defined to be "sean" @@ -172,17 +173,6 @@ module REXML end @unnormalized = Text::unnormalize( @string, doctype ) end - - def wrap(string, width, addnewline=false) - # Recursivly wrap string at width. - return string if string.length <= width - place = string.rindex(' ', width) # Position in string with last ' ' before cutoff - if addnewline then - return "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width) - else - return string[0,place] + "\n" + wrap(string[place+1..-1], width) - end - end # Sets the contents of this text node. This expects the text to be # unnormalized. It returns self. @@ -198,17 +188,28 @@ module REXML @raw = false end - def indent_text(string, level=1, style="\t", indentfirstline=true) - return string if level < 0 - new_string = '' - string.each { |line| - indent_string = style * level - new_line = (indent_string + line).sub(/[\s]+$/,'') - new_string << new_line - } - new_string.strip! unless indentfirstline - return new_string + def wrap(string, width, addnewline=false) + # Recursivly wrap string at width. + return string if string.length <= width + place = string.rindex(' ', width) # Position in string with last ' ' before cutoff + if addnewline then + return "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width) + else + return string[0,place] + "\n" + wrap(string[place+1..-1], width) + end end + + def indent_text(string, level=1, style="\t", indentfirstline=true) + return string if level < 0 + new_string = '' + string.each { |line| + indent_string = style * level + new_line = (indent_string + line).sub(/[\s]+$/,'') + new_string << new_line + } + new_string.strip! unless indentfirstline + return new_string + end def write( writer, indent=-1, transitive=false, ie_hack=false ) s = to_s() @@ -286,9 +287,10 @@ module REXML def Text::normalize( input, doctype=nil, entity_filter=nil ) copy = input # Doing it like this rather than in a loop improves the speed + #copy = copy.gsub( EREFERENCE, '&' ) + copy = copy.gsub( "&", "&" ) if doctype # Replace all ampersands that aren't part of an entity - copy = copy.gsub( EREFERENCE, '&' ) doctype.entities.each_value do |entity| copy = copy.gsub( entity.value, "&#{entity.name};" ) if entity.value and @@ -296,7 +298,6 @@ module REXML end else # Replace all ampersands that aren't part of an entity - copy = copy.gsub( EREFERENCE, '&' ) DocType::DEFAULT_ENTITIES.each_value do |entity| copy = copy.gsub(entity.value, "&#{entity.name};" ) end diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index a813236e10..453d57a88d 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -160,8 +160,13 @@ module REXML node_types = ELEMENTS return nodeset if path_stack.length == 0 || nodeset.length == 0 while path_stack.length > 0 + #puts "#"*5 #puts "Path stack = #{path_stack.inspect}" #puts "Nodeset is #{nodeset.inspect}" + if nodeset.length == 0 + path_stack.clear + return [] + end case (op = path_stack.shift) when :document nodeset = [ nodeset[0].root_node ] @@ -235,9 +240,11 @@ module REXML name = path_stack.shift for element in nodeset if element.node_type == :element - #puts element.name - attr = element.attribute( name, get_namespace(element, prefix) ) - new_nodeset << attr if attr + #puts "Element name = #{element.name}" + #puts "get_namespace( #{element.inspect}, #{prefix} ) = #{get_namespace(element, prefix)}" + attrib = element.attribute( name, get_namespace(element, prefix) ) + #puts "attrib = #{attrib.inspect}" + new_nodeset << attrib if attrib end end when :any @@ -299,8 +306,10 @@ module REXML #puts "Adding node #{node.inspect}" if result == (index+1) new_nodeset << node if result == (index+1) elsif result.instance_of? Array - #puts "Adding node #{node.inspect}" if result.size > 0 - new_nodeset << node if result.size > 0 + if result.size > 0 and result.inject(false) {|k,s| s or k} + #puts "Adding node #{node.inspect}" if result.size > 0 + new_nodeset << node if result.size > 0 + end else #puts "Adding node #{node.inspect}" if result new_nodeset << node if result @@ -381,9 +390,25 @@ module REXML node_types = ELEMENTS when :namespace - new_set = [] + #puts "In :namespace" + new_nodeset = [] + prefix = path_stack.shift for node in nodeset - new_nodeset << node.namespace if node.node_type == :element or node.node_type == :attribute + if (node.node_type == :element or node.node_type == :attribute) + if @namespaces + namespaces = @namespaces + elsif (node.node_type == :element) + namespaces = node.namespaces + else + namespaces = node.element.namesapces + end + #puts "Namespaces = #{namespaces.inspect}" + #puts "Prefix = #{prefix.inspect}" + #puts "Node.namespace = #{node.namespace}" + if (node.namespace == namespaces[prefix]) + new_nodeset << node + end + end end nodeset = new_nodeset @@ -404,6 +429,18 @@ module REXML #puts "RES => #{res.inspect}" return res + when :and + left = expr( path_stack.shift, nodeset.dup, context ) + #puts "LEFT => #{left.inspect} (#{left.class.name})" + if left == false || left.nil? || !left.inject(false) {|a,b| a | b} + return [] + end + right = expr( path_stack.shift, nodeset.dup, context ) + #puts "RIGHT => #{right.inspect} (#{right.class.name})" + res = equality_relational_compare( left, op, right ) + #puts "RES => #{res.inspect}" + return res + when :div left = Functions::number(expr(path_stack.shift, nodeset, context)).to_f right = Functions::number(expr(path_stack.shift, nodeset, context)).to_f @@ -477,7 +514,7 @@ module REXML # The next two methods are BAD MOJO! # This is my achilles heel. If anybody thinks of a better # way of doing this, be my guest. This really sucks, but - # it took me three days to get it to work at all. + # it is a wonder it works at all. # ######################################################## def descendant_or_self( path_stack, nodeset ) |