From 66aeb2f7080dea92703f10546fb3cbcc946f6fa3 Mon Sep 17 00:00:00 2001 From: ser Date: Sun, 20 Jan 2008 04:31:57 +0000 Subject: r1479@bean: ser | 2008-01-19 14:26:31 -0500 r1483@bean: ser | 2008-01-19 14:47:23 -0500 Sam's fixes: * Don't blow up on empty documents * Add a test case for sorted attributes * Making the output predictable simplifies unit tests, and doesn't cost much given that most xml element have few attributes * Ruby 1.9 revision 14922 is more strict * Complete Ticket #134 * Fix for ticket #121 * Fix for ticket #124 * Fix for ticket #128 * Fix ticket #133 * Ticket #131 (Support Ruby 1.9) * Fix for ticket #127 * Fix for ticket #123 * Add missing data needed by test case r1481@bean (orig r1303): ser | 2008-01-19 17:22:32 -0500 Tagged for release r1482@bean (orig r1304): ser | 2008-01-19 17:27:10 -0500 Version bump git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@15141 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- lib/rexml/attribute.rb | 23 +++++---- lib/rexml/cdata.rb | 2 +- lib/rexml/element.rb | 23 +++++++-- lib/rexml/formatters/default.rb | 2 +- lib/rexml/parsers/baseparser.rb | 38 +++++++++++--- lib/rexml/rexml.rb | 10 ++-- lib/rexml/source.rb | 6 +-- lib/rexml/text.rb | 110 +++++++++++++++++++++++++++++++++------- 8 files changed, 163 insertions(+), 51 deletions(-) (limited to 'lib/rexml') diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index 89c1ada36c..17ced44c45 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -17,6 +17,8 @@ module REXML attr_writer :normalized PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\2/um + NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um + # Constructor. 
# FIXME: The parser doesn't catch illegal characters in attributes # @@ -110,15 +112,16 @@ module REXML end end - # Returns the attribute value, with entities replaced - def to_s - return @normalized if @normalized - - doctype = nil + def doctype if @element doc = @element.document doctype = doc.doctype if doc end + end + + # Returns the attribute value, with entities replaced + def to_s + return @normalized if @normalized @normalized = Text::normalize( @unnormalized, doctype ) @unnormalized = nil @@ -129,11 +132,6 @@ module REXML # have been expanded to their values def value return @unnormalized if @unnormalized - doctype = nil - if @element - doc = @element.document - doctype = doc.doctype if doc - end @unnormalized = Text::unnormalize( @normalized, doctype ) @normalized = nil @unnormalized @@ -150,6 +148,11 @@ module REXML # Returns this attribute def element=( element ) @element = element + + if @normalized + Text.check( @normalized, NEEDS_A_SECOND_CHECK, doctype ) + end + self end diff --git a/lib/rexml/cdata.rb b/lib/rexml/cdata.rb index efcb71160a..856b9ef8b2 100644 --- a/lib/rexml/cdata.rb +++ b/lib/rexml/cdata.rb @@ -13,7 +13,7 @@ module REXML # CData.new( "Here is some CDATA" ) # CData.new( "Some unprocessed data", respect_whitespace_TF, parent_element ) def initialize( first, whitespace=true, parent=nil ) - super( first, whitespace, parent, true, true, ILLEGAL ) + super( first, whitespace, parent, false, true, ILLEGAL ) end # Make a copy of this object diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index 55094111e6..ecd10de965 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -558,7 +558,19 @@ module REXML prefix = namespaces.index(namespace) if namespace end prefix = nil if prefix == 'xmlns' - attributes.get_attribute( "#{prefix ? prefix + ':' : ''}#{name}" ) + + ret_val = + attributes.get_attribute( "#{prefix ? prefix + ':' : ''}#{name}" ) + + return ret_val unless ret_val.nil? + return nil if prefix.nil? 
+ + # now check that prefix'es namespace is not the same as the + # default namespace + return nil unless ( namespaces[ prefix ] == namespaces[ 'xmlns' ] ) + + attributes.get_attribute( name ) + end # Evaluates to +true+ if this element has any attributes set, false @@ -675,7 +687,7 @@ module REXML # out = '' # doc.write( out ) #-> doc is written to the string 'out' # doc.write( $stdout ) #-> doc written to the console - def write(writer=$stdout, indent=-1, transitive=false, ie_hack=false) + def write(output=$stdout, indent=-1, transitive=false, ie_hack=false) Kernel.warn("#{self.class.name}.write is deprecated. See REXML::Formatters") formatter = if indent > -1 if transitive @@ -1217,14 +1229,17 @@ module REXML # # Method contributed by Henrik Martensson def get_attribute_ns(namespace, name) + result = nil each_attribute() { |attribute| if name == attribute.name && namespace == attribute.namespace() && ( !namespace.empty? || !attribute.fully_expanded_name.index(':') ) - return attribute + # foo will match xmlns:foo, but only if foo isn't also an attribute + result = attribute if !result or !namespace.empty? or + !attribute.fully_expanded_name.index(':') end } - nil + result end end end diff --git a/lib/rexml/formatters/default.rb b/lib/rexml/formatters/default.rb index 77381bdf84..b4d63bc5b5 100644 --- a/lib/rexml/formatters/default.rb +++ b/lib/rexml/formatters/default.rb @@ -63,7 +63,7 @@ module REXML def write_element( node, output ) output << "<#{node.expanded_name}" - node.attributes.each_attribute do |attr| + node.attributes.to_a.sort_by {|attr| attr.name}.each do |attr| output << " " attr.write( output ) end unless node.attributes.empty? diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 854e707fae..85f2c4e46d 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -25,7 +25,20 @@ module REXML # # Nat Price gave me some good ideas for the API. 
class BaseParser - NCNAME_STR= '[\w:][\-\w\d.]*' + if String.method_defined? :encode + # Oniguruma / POSIX [understands unicode] + LETTER = '[[:alpha:]]' + DIGIT = '[[:digit:]]' + else + # Ruby < 1.9 [doesn't understand unicode] + LETTER = 'a-zA-Z' + DIGIT = '\d' + end + + COMBININGCHAR = '' # TODO + EXTENDER = '' # TODO + + NCNAME_STR= "[#{LETTER}_:][-#{LETTER}#{DIGIT}._:#{COMBININGCHAR}#{EXTENDER}]*" NAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})" UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}" @@ -33,7 +46,7 @@ module REXML NAME = "([\\w:]#{NAMECHAR}*)" NMTOKEN = "(?:#{NAMECHAR})+" NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*" - REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)" + REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)" REFERENCE_RE = /#{REFERENCE}/ DOCTYPE_START = /\A\s* -# Version:: 3.1.7.2 -# Date:: 2007/275 +# Date:: 2008/019 +# Version:: 3.1.7.3 # # This API documentation can be downloaded from the REXML home page, or can # be accessed online[http://www.germane-software.com/software/rexml_doc] @@ -21,9 +21,9 @@ # or can be accessed # online[http://www.germane-software.com/software/rexml/docs/tutorial.html] module REXML - COPYRIGHT = "Copyright © 2001-2007 Sean Russell " - DATE = "2007/275" - VERSION = "3.1.7.2" + COPYRIGHT = "Copyright © 2001-2008 Sean Russell " + DATE = "2008/019" + VERSION = "3.1.7.3" REVISION = "$Revision$".gsub(/\$Revision:|\$/,'').strip Copyright = COPYRIGHT diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 3f14239a35..d4335138a1 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -147,7 +147,7 @@ module REXML # the XML spec. If there is one, we can determine the encoding from # it. @buffer = "" - str = @source.read( 2 ) + str = @source.read( 2 ) || '' if encoding self.encoding = encoding elsif str[0,2] == "\xfe\xff" @@ -161,7 +161,7 @@ module REXML else @line_break = ">" end - super str+@source.readline( @line_break ) + super( @source.eof? ? 
str : str+@source.readline( @line_break ) ) end def scan(pattern, cons=false) @@ -231,7 +231,7 @@ module REXML end def position - @er_source.stat.pipe? ? 0 : @er_source.pos + @er_source.pos rescue 0 end # @return the current line in the source diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 8058157605..c23cd17c02 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -18,8 +18,40 @@ module REXML # If +raw+ is true, then REXML leaves the value alone attr_accessor :raw - ILLEGAL = /(<|&(?!(#{Entity::NAME})|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));))/um + NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ + VALID_CHAR = [ + 0x9, 0xA, 0xD, + (0x20..0xD7FF), + (0xE000..0xFFFD), + (0x10000..0x10FFFF) + ] + + if String.method_defined? :encode + VALID_XML_CHARS = Regexp.new('^['+ + VALID_CHAR.map { |item| + case item + when Fixnum + [item].pack('U').force_encoding('utf-8') + when Range + [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8') + end + }.join + + ']*$') + else + VALID_XML_CHARS = /^( + [\x09\x0A\x0D\x20-\x7E] # ASCII + | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte + | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs + | [\xE1-\xEC\xEE][\x80-\xBF]{2} # straight 3-byte + | \xEF[\x80-\xBE]{2} # + | \xEF\xBF[\x80-\xBD] # excluding U+fffe and U+ffff + | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates + | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 + | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15 + | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 + )*$/x; + end # Constructor # +arg+ if a String, the content is set to the String. 
If a Text, @@ -58,7 +90,7 @@ module REXML # # +pattern+ INTERNAL USE ONLY def initialize(arg, respect_whitespace=false, parent=nil, raw=nil, - entity_filter=nil, illegal=ILLEGAL ) + entity_filter=nil, illegal=NEEDS_A_SECOND_CHECK ) @raw = false @@ -85,10 +117,54 @@ module REXML @string.gsub!( /\r\n?/, "\n" ) - # check for illegal characters - if @raw - if @string =~ illegal - raise "Illegal character '#{$1}' in raw string \"#{@string}\"" + Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent + end + + def parent= parent + super(parent) + Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent + end + + # check for illegal characters + def Text.check string, pattern, doctype + + # illegal anywhere + if string !~ VALID_XML_CHARS + if String.method_defined? :encode + string.chars.each do |c| + case c.ord + when *VALID_CHAR + else + raise "Illegal character #{c.inspect} in raw string \"#{string}\"" + end + end + else + string.scan(/[\x00-\x7F]|[\x80-\xBF][\xC0-\xF0]*|[\xC0-\xF0]/) do |c| + case c.unpack('U') + when *VALID_CHAR + else + raise "Illegal character #{c.inspect} in raw string \"#{string}\"" + end + end + end + end + + # context sensitive + string.scan(pattern).each do + if $1[-1] != ?; + raise "Illegal character '#{$1}' in raw string \"#{string}\"" + elsif $1[0] == ?& + if $5 and $5[0] == ?# + case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i) + when *VALID_CHAR + else + raise "Illegal character '#{$1}' in raw string \"#{string}\"" + end + elsif $3 and !SUBSTITUTES.include?($1) + if !doctype or !doctype.entities.has_key?($3) + raise "Undeclared entity '#{$1}' in raw string \"#{string}\"" + end + end end end end @@ -120,6 +196,13 @@ module REXML to_s() <=> other.to_s end + def doctype + if @parent + doc = @parent.document + doc.doctype if doc + end + end + REFERENCE = /#{Entity::REFERENCE}/ # Returns the string value of this text node. 
This string is always # escaped, meaning that it is a valid XML text node string, and all @@ -138,12 +221,6 @@ module REXML return @string if @raw return @normalized if @normalized - doctype = nil - if @parent - doc = @parent.document - doctype = doc.doctype if doc - end - @normalized = Text::normalize( @string, doctype, @entity_filter ) end @@ -165,12 +242,7 @@ module REXML # u = Text.new( "sean russell", false, nil, true ) # u.value #-> "sean russell" def value - @unnormalized if @unnormalized - doctype = nil - if @parent - doc = @parent.document - doctype = doc.doctype if doc - end + return @unnormalized if @unnormalized @unnormalized = Text::unnormalize( @string, doctype ) end @@ -286,7 +358,7 @@ module REXML EREFERENCE = /&(?!#{Entity::NAME};)/ # Escapes all possible entities def Text::normalize( input, doctype=nil, entity_filter=nil ) - copy = input + copy = input.to_s # Doing it like this rather than in a loop improves the speed #copy = copy.gsub( EREFERENCE, '&amp;' ) copy = copy.gsub( "&", "&amp;" ) -- cgit v1.2.3