1 files changed, 279 insertions, 0 deletions
diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb
new file mode 100644
index 0000000000..906f4d41fc
--- /dev/null
+++ b/lib/rexml/text.rb
@@ -0,0 +1,279 @@
+require 'rexml/entity'
+
+module REXML
+	# Represents text nodes in an XML document
+	class Text < Child
+		include Comparable
+		# Patterns escaped by write_with_substitution, listed in the order the
+		# substitutions are applied.  SPECIALS[i] is replaced by SUBSTITUTES[i].
+		# The tables are shared by every Text, so they are frozen to rule out
+		# accidental modification.
+		SPECIALS = [ /&(?!#?[\w-]+;)/u, /</u, />/u, /"/u, /'/u, /\r/u ].freeze
+		SUBSTITUTES = ['&amp;', '&lt;', '&gt;', '&quot;', '&apos;', '&#13;'].freeze
+		# Characters restored by Text::read_with_substitution, paired by index
+		# with the entity patterns in SETUTITSBUS (the names are SPECIALS and
+		# SUBSTITUTES spelled backwards).  '&amp;' comes last in both tables.
+		SLAICEPS = [ '<', '>', '"', "'", '&' ].freeze
+		SETUTITSBUS = [ /&lt;/u, /&gt;/u, /&quot;/u, /&apos;/u, /&amp;/u ].freeze
+
+		# If +raw+ is true, then REXML leaves the value alone
+		attr_accessor :raw
+
+		# Default pattern the constructor checks a raw string against: a '<', or
+		# an '&' that does not start an entity or numeric character reference.
+		# Capture group 1 is the offending text, which the constructor quotes in
+		# its error message.
+		# NOTE(review): inside the lookahead the trailing ';' is part of the
+		# numeric alternative only, so an '&' followed by an entity name passes
+		# even without a closing ';' -- confirm this is intended.
+		ILLEGAL = /(<|&(?!(#{Entity::NAME})|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));))/um
+		# Matches a decimal (&#60;) or hexadecimal (&#x3c;) character reference.
+		# Group 1 is the number with leading zeros skipped; hexadecimal values
+		# keep their 'x' prefix.
+		NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ 
+
+		# Constructor
+		# +arg+ if a String, the content is set to the String.  If a Text,
+		# the object is shallowly cloned.  
+		#
+		# +respect_whitespace+ (boolean, false) if true, whitespace is
+		# respected
+		#
+		# +parent+ (nil) if this is a Parent object, the parent
+		# will be set to this.  
+		#
+		# +raw+ (nil) This argument can be given three values.
+		# If true, then the value used to construct this object is expected to 
+		# contain no unescaped XML markup, and REXML will not change the text. If 
+		# this value is false, the string may contain any characters, and REXML will
+		# escape any and all defined entities whose values are contained in the
+		# text.  If this value is nil (the default), then the raw value of the 
+		# parent will be used as the raw value for this node.  If there is no raw
+		# value for the parent, and no value is supplied, the default is false.
+		#   Text.new( "<&", false, nil, false ) #-> "&lt;&amp;"
+		#   Text.new( "<&", false, nil, true )  #-> raises Exception (illegal character)
+		#   Text.new( "&lt;&amp;", false, nil, true )  #-> "&lt;&amp;"
+		#   # Assume that the entity "s" is defined to be "sean"
+		#   # and that the entity    "r" is defined to be "russell"
+		#   Text.new( "sean russell" )          #-> "&s; &r;"
+		#   Text.new( "sean russell", false, nil, true ) #-> "sean russell"
+		#
+		# +entity_filter+ (nil) This can be an array of entities to match in the
+		# supplied text.  This argument is only useful if +raw+ is set to false.
+		#   Text.new( "sean russell", false, nil, false, ["s"] ) #-> "&s; russell"
+		#   Text.new( "sean russell", false, nil, true, ["s"] ) #-> "sean russell"
+		# In the last example, the +entity_filter+ argument is ignored.
+		#
+		# +illegal+ INTERNAL USE ONLY.  The pattern a raw string is checked against.
+		def initialize(arg, respect_whitespace=false, parent=nil, raw=nil, 
+			entity_filter=nil, illegal=ILLEGAL )
+
+			@raw = false
+
+			if parent
+				super( parent )
+				@raw = parent.raw 
+			else
+				@parent = nil
+			end
+
+			@raw = raw unless raw.nil?
+			@entity_filter = entity_filter
+			@normalized = @unnormalized = nil
+
+			if arg.kind_of? String
+				@string = arg.clone
+				@string.squeeze!(" \n\t") unless respect_whitespace
+			elsif arg.kind_of? Text
+				@string = arg.to_s
+				@raw = arg.raw
+			elsif
+				raise Exception.new( "Illegal argument of type #{arg.type} for Text constructor (#{arg})" )
+			end
+
+			@string.gsub!( /\r\n?/, "\n" )
+
+			# check for illegal characters
+			if @raw
+				if @string =~ illegal
+					raise Exception.new(
+						"Illegal character '#{$1}' in raw string \"#{@string}\""
+					)
+				end
+			end
+		end
+
+		def node_type
+			:text
+		end
+
+		def empty?
+			@string.size==0
+		end
+
+
+		def clone
+			return Text.new(self)
+		end
+
+
+		# Appends text to this text node.  The text is appended in the +raw+ mode
+		# of this text node.
+		def <<( to_append )
+			@string << to_append.gsub( /\r\n?/, "\n" )
+		end
+
+
+		# +other+ a String or a Text
+		# +returns+ the result of (to_s <=> arg.to_s)
+		def <=>( other )
+			to_s() <=> other.to_s
+		end
+
+		REFERENCE = /#{Entity::REFERENCE}/
+		# Returns the string value of this text node.  This string is always
+		# escaped, meaning that it is a valid XML text node string, and all
+		# entities that can be escaped, have been inserted.  This method respects
+		# the entity filter set in the constructor.
+		#   
+		#   # Assume that the entity "s" is defined to be "sean", and that the 
+		#   # entity "r" is defined to be "russell"
+		#   t = Text.new( "< & sean russell", false, nil, false, ['s'] ) 
+		#   t.to_s   #-> "&lt; &amp; &s; russell"
+		#   t = Text.new( "< & &s; russell", false, nil, false ) 
+		#   t.to_s   #-> "&lt; &amp; &s; russell"
+		#   u = Text.new( "sean russell", false, nil, true )
+		#   u.to_s   #-> "sean russell"
+		def to_s
+			return @string if @raw
+			return @normalized if @normalized
+
+			doctype = nil
+			if @parent
+				doc = @parent.document
+				doctype = doc.doctype if doc
+			end
+
+			@normalized = Text::normalize( @string, doctype, @entity_filter )
+		end
+
+		# Returns the string value of this text.  This is the text without
+		# entities, as it might be used programmatically, or printed to the
+		# console.  This ignores the 'raw' attribute setting, and any
+		# entity_filter.
+		#
+		#   # Assume that the entity "s" is defined to be "sean", and that the 
+		#   # entity "r" is defined to be "russell"
+		#   t = Text.new( "< & sean russell", false, nil, false, ['s'] ) 
+		#   t.string   #-> "< & sean russell"
+		#   t = Text.new( "< & &s; russell", false, nil, false )
+		#   t.string   #-> "< & sean russell"
+		#   u = Text.new( "sean russell", false, nil, true )
+		#   u.string   #-> "sean russell"
+		def value
+			@unnormalized if @unnormalized
+			doctype = nil
+			if @parent
+				doc = @parent.document
+				doctype = doc.doctype if doc
+			end
+			@unnormalized = Text::unnormalize( @string, doctype )
+		end
+
+		def write( writer, indent=-1, transitive=false, ie_hack=false ) 
+			writer << to_s()
+		end
+
+		# Writes out text, substituting special characters beforehand.
+		# +out+ A String, IO, or any other object supporting <<( String )
+		# +input+ the text to substitute and then write out
+		#
+		#   z=utf8.unpack("U*")
+		#   ascOut=""
+		#   z.each{|r|
+		#     if r <  0x100
+		#       ascOut.concat(r.chr)
+		#     else
+		#       ascOut.concat(sprintf("&#x%x;", r))
+		#     end
+		#   }
+		#   puts ascOut
+		def write_with_substitution out, input
+			copy = input.clone
+			# Doing it like this rather than in a loop improves the speed
+			copy.gsub!( SPECIALS[0], SUBSTITUTES[0] )
+			copy.gsub!( SPECIALS[1], SUBSTITUTES[1] )
+			copy.gsub!( SPECIALS[2], SUBSTITUTES[2] )
+			copy.gsub!( SPECIALS[3], SUBSTITUTES[3] )
+			copy.gsub!( SPECIALS[4], SUBSTITUTES[4] )
+			copy.gsub!( SPECIALS[5], SUBSTITUTES[5] )
+			out << copy
+		end
+
+		# Reads text, substituting entities
+		def Text::read_with_substitution( input, illegal=nil )
+			copy = input.clone
+
+			if copy =~ illegal
+				raise ParseException.new( "malformed text: Illegal character #$& in \"#{copy}\"" )
+			end if illegal
+			
+			copy.gsub!( /\r\n?/, "\n" )
+			if copy.include? ?&
+				copy.gsub!( SETUTITSBUS[0], SLAICEPS[0] )
+				copy.gsub!( SETUTITSBUS[1], SLAICEPS[1] )
+				copy.gsub!( SETUTITSBUS[2], SLAICEPS[2] )
+				copy.gsub!( SETUTITSBUS[3], SLAICEPS[3] )
+				copy.gsub!( SETUTITSBUS[4], SLAICEPS[4] )
+				copy.gsub!( /&#0*((?:\d+)|(?:x[a-f0-9]+));/ ) {|m|
+					m=$1
+					#m='0' if m==''
+					m = "0#{m}" if m[0] == ?x
+					[Integer(m)].pack('U*')
+				}
+			end
+			copy
+		end
+
+		EREFERENCE = /&(?!#{Entity::NAME};)/
+		# Escapes all possible entities
+		def Text::normalize( input, doctype=nil, entity_filter=nil )
+			copy = input.clone
+			# Doing it like this rather than in a loop improves the speed
+			if doctype
+				copy.gsub!( EREFERENCE, '&amp;' )
+				doctype.entities.each_value do |entity|
+					copy.gsub!( entity.value, 
+						"&#{entity.name};" ) if entity.value and 
+							not( entity_filter and entity_filter.include?(entity) )
+				end
+			else
+				copy.gsub!( EREFERENCE, '&amp;' )
+				DocType::DEFAULT_ENTITIES.each_value do |entity|
+					copy.gsub!(entity.value, "&#{entity.name};" )
+				end
+			end
+			copy
+		end
+
+		# Unescapes all possible entities
+		def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil )
+			rv = string.clone
+			rv.gsub!( /\r\n?/, "\n" )
+			matches = rv.scan REFERENCE
+			return rv if matches.size == 0
+			rv.gsub!( NUMERICENTITY ) {|m|
+				m=$1
+				m = "0#{m}" if m[0] == ?x
+				[Integer(m)].pack('U*')
+			}
+			matches.collect!{|x|x[0]}.compact!
+			if matches.size > 0
+				if doctype
+					matches.each do |entity_reference|
+						unless filter and filter.include?(entity_reference)
+							entity_value = doctype.entity( entity_reference )
+							re = /&#{entity_reference};/
+							rv.gsub!( re, entity_value ) if entity_value
+						end
+					end
+				else
+					matches.each do |entity_reference|
+						unless filter and filter.include?(entity_reference)
+							entity_value = DocType::DEFAULT_ENTITIES[ entity_reference ]
+							re = /&#{entity_reference};/
+							rv.gsub!( re, entity_value.value ) if entity_value
+						end
+					end
+				end
+				rv.gsub!( /&amp;/, '&' )
+			end
+			rv
+		end
+	end
+end