From 66aeb2f7080dea92703f10546fb3cbcc946f6fa3 Mon Sep 17 00:00:00 2001 From: ser Date: Sun, 20 Jan 2008 04:31:57 +0000 Subject: r1479@bean: ser | 2008-01-19 14:26:31 -0500 r1483@bean: ser | 2008-01-19 14:47:23 -0500 Sam's fixes: * Don't blow up on empty documents * Add a test case for sorted attributes * Making the output predictable simplifies unit tests, and doesn't cost much given that most xml element have few attributes * Ruby 1.9 revision 14922 is more strict * Complete Ticket #134 * Fix for ticket #121 * Fix for ticket #124 * Fix for ticket #128 * Fix ticket #133 * Ticket #131 (Support Ruby 1.9) * Fix for ticket #127 * Fix for ticket #123 * Add missing data needed by test case r1481@bean (orig r1303): ser | 2008-01-19 17:22:32 -0500 Tagged for release r1482@bean (orig r1304): ser | 2008-01-19 17:27:10 -0500 Version bump git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@15141 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- lib/rexml/text.rb | 110 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 91 insertions(+), 19 deletions(-) (limited to 'lib/rexml/text.rb') diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 8058157605..c23cd17c02 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -18,8 +18,40 @@ module REXML # If +raw+ is true, then REXML leaves the value alone attr_accessor :raw - ILLEGAL = /(<|&(?!(#{Entity::NAME})|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));))/um + NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um NUMERICENTITY = /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ + VALID_CHAR = [ + 0x9, 0xA, 0xD, + (0x20..0xD7FF), + (0xE000..0xFFFD), + (0x10000..0x10FFFF) + ] + + if String.method_defined? :encode + VALID_XML_CHARS = Regexp.new('^['+ + VALID_CHAR.map { |item| + case item + when Fixnum + [item].pack('U').force_encoding('utf-8') + when Range + [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8') + end + }.join + + ']*$') + else + VALID_XML_CHARS = /^( + [\x09\x0A\x0D\x20-\x7E] # ASCII + | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte + | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs + | [\xE1-\xEC\xEE][\x80-\xBF]{2} # straight 3-byte + | \xEF[\x80-\xBE]{2} # + | \xEF\xBF[\x80-\xBD] # excluding U+fffe and U+ffff + | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates + | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 + | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15 + | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 + )*$/x; + end # Constructor # +arg+ if a String, the content is set to the String. If a Text, @@ -58,7 +90,7 @@ module REXML # # +pattern+ INTERNAL USE ONLY def initialize(arg, respect_whitespace=false, parent=nil, raw=nil, - entity_filter=nil, illegal=ILLEGAL ) + entity_filter=nil, illegal=NEEDS_A_SECOND_CHECK ) @raw = false @@ -85,10 +117,54 @@ module REXML @string.gsub!( /\r\n?/, "\n" ) - # check for illegal characters - if @raw - if @string =~ illegal - raise "Illegal character '#{$1}' in raw string \"#{@string}\"" + Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent + end + + def parent= parent + super(parent) + Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent + end + + # check for illegal characters + def Text.check string, pattern, doctype + + # illegal anywhere + if string !~ VALID_XML_CHARS + if String.method_defined? :encode + string.chars.each do |c| + case c.ord + when *VALID_CHAR + else + raise "Illegal character #{c.inspect} in raw string \"#{string}\"" + end + end + else + string.scan(/[\x00-\x7F]|[\x80-\xBF][\xC0-\xF0]*|[\xC0-\xF0]/) do |c| + case c.unpack('U') + when *VALID_CHAR + else + raise "Illegal character #{c.inspect} in raw string \"#{string}\"" + end + end + end + end + + # context sensitive + string.scan(pattern).each do + if $1[-1] != ?; + raise "Illegal character '#{$1}' in raw string \"#{string}\"" + elsif $1[0] == ?& + if $5 and $5[0] == ?# + case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i) + when *VALID_CHAR + else + raise "Illegal character '#{$1}' in raw string \"#{string}\"" + end + elsif $3 and !SUBSTITUTES.include?($1) + if !doctype or !doctype.entities.has_key?($3) + raise "Undeclared entity '#{$1}' in raw string \"#{string}\"" + end + end end end end @@ -120,6 +196,13 @@ module REXML to_s() <=> other.to_s end + def doctype + if @parent + doc = @parent.document + doc.doctype if doc + end + end + REFERENCE = /#{Entity::REFERENCE}/ # Returns the string value of this text node. This string is always # escaped, meaning that it is a valid XML text node string, and all @@ -138,12 +221,6 @@ module REXML return @string if @raw return @normalized if @normalized - doctype = nil - if @parent - doc = @parent.document - doctype = doc.doctype if doc - end - @normalized = Text::normalize( @string, doctype, @entity_filter ) end @@ -165,12 +242,7 @@ module REXML # u = Text.new( "sean russell", false, nil, true ) # u.value #-> "sean russell" def value - @unnormalized if @unnormalized - doctype = nil - if @parent - doc = @parent.document - doctype = doc.doctype if doc - end + return @unnormalized if @unnormalized @unnormalized = Text::unnormalize( @string, doctype ) end @@ -286,7 +358,7 @@ module REXML EREFERENCE = /&(?!#{Entity::NAME};)/ # Escapes all possible entities def Text::normalize( input, doctype=nil, entity_filter=nil ) - copy = input + copy = input.to_s # Doing it like this rather than in a loop improves the speed #copy = copy.gsub( EREFERENCE, '&' ) copy = copy.gsub( "&", "&" ) -- cgit v1.2.3