From 06f2b5b1d890253cdc4de78a326369a10d22595b Mon Sep 17 00:00:00 2001 From: ser Date: Sun, 4 Nov 2007 04:52:08 +0000 Subject: Fixes ticket:110 (more UTF-16 problems) Missing include for UndefinedNamespaceException was causing errors in some cases. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@13816 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- lib/rexml/encoding.rb | 9 +++++++-- lib/rexml/parsers/baseparser.rb | 2 ++ lib/rexml/parsers/treeparser.rb | 1 + lib/rexml/source.rb | 12 ++++++++---- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/lib/rexml/encoding.rb b/lib/rexml/encoding.rb index 6cae6b644d..a01763be99 100644 --- a/lib/rexml/encoding.rb +++ b/lib/rexml/encoding.rb @@ -56,8 +56,13 @@ module REXML def check_encoding str # We have to recognize UTF-16, LSB UTF-16, and UTF-8 - return UTF_16 if /\A\xfe\xff/n =~ str - return UNILE if /\A\xff\xfe/n =~ str + if str[0] == 0xfe && str[1] == 0xff + str[0,2] = "" + return UTF_16 + elsif str[0] == 0xff && str[1] == 0xfe + str[0,2] = "" + return UNILE + end str =~ /^\s*<\?xml\s+version\s*=\s*(['"]).*?\1\s+encoding\s*=\s*(["'])(.*?)\2/um return $3.upcase if $3 return UTF_8 diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 5f7a5ec43b..fc2354a67f 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -1,4 +1,5 @@ require 'rexml/parseexception' +require 'rexml/undefinednamespaceexception' require 'rexml/source' require 'set' @@ -191,6 +192,7 @@ module REXML end return [ :end_document ] if empty? return @stack.shift if @stack.size > 0 + #STDERR.puts @source.encoding @source.read if @source.buffer.size<2 #STDERR.puts "BUFFER = #{@source.buffer.inspect}" if @document_status == nil diff --git a/lib/rexml/parsers/treeparser.rb b/lib/rexml/parsers/treeparser.rb index ff8261cedf..5c3e142ea7 100644 --- a/lib/rexml/parsers/treeparser.rb +++ b/lib/rexml/parsers/treeparser.rb @@ -1,4 +1,5 @@ require 'rexml/validation/validationexception' +require 'rexml/undefinednamespaceexception' module REXML module Parsers diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 659bcdc195..e05460fea1 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -135,6 +135,7 @@ module REXML def initialize(arg, block_size=500, encoding=nil) @er_source = @source = arg @to_utf = false + # Determining the encoding is a deceptively difficult issue to resolve. # First, we check the first two bytes for UTF-16. Then we # assume that the encoding is at least ASCII enough for the '>', and @@ -146,13 +147,16 @@ module REXML str = @source.read( 2 ) if encoding self.encoding = encoding - elsif /\A(?:\xfe\xff|\xff\xfe)/n =~ str - self.encoding = check_encoding( str ) - elsif (0xef == str[0] && 0xbb == str[1]) + elsif 0xfe == str[0] && 0xff == str[1] + @line_break = "\000>" + elsif 0xff == str[0] && 0xfe == str[1] + @line_break = ">\000" + elsif 0xef == str[0] && 0xbb == str[1] str += @source.read(1) str = '' if (0xbf == str[2]) + @line_break = ">" else - @line_break = '>' + @line_break = ">" end super str+@source.readline( @line_break ) end -- cgit v1.2.3