diff options
author | nahi <nahi@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2003-09-24 15:18:44 +0000 |
---|---|---|
committer | nahi <nahi@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2003-09-24 15:18:44 +0000 |
commit | db9445103c082a306ba085f7677da02ea94b8841 (patch) | |
tree | a311d59f031ae5def87f68be71ed1f58abadc031 /lib/xsd/charset.rb | |
parent | 8c2fb77787d1f20b4c19c9c52552856c339b86e9 (diff) | |
download | ruby-db9445103c082a306ba085f7677da02ea94b8841.tar.gz |
* lib/soap/* (29 files): SOAP4R added.
* lib/wsdl/* (42 files): WSDL4R added.
* lib/xsd/* (12 files): XSD4R added.
* test/soap/* (16 files): added.
* test/wsdl/* (2 files): added.
* test/xsd/* (3 files): added.
* sample/soap/* (27 files): added.
* sample/wsdl/* (13 files): added.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@4591 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'lib/xsd/charset.rb')
-rw-r--r-- | lib/xsd/charset.rb | 175 |
1 files changed, 175 insertions, 0 deletions
diff --git a/lib/xsd/charset.rb b/lib/xsd/charset.rb new file mode 100644 index 0000000000..6dda959155 --- /dev/null +++ b/lib/xsd/charset.rb @@ -0,0 +1,175 @@ +=begin +XSD4R - Charset handling library. +Copyright (C) 2001, 2003 NAKAMURA, Hiroshi. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; either version 2 of the License, or (at your option) any later +version. + +This program is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +PRATICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 675 Mass +Ave, Cambridge, MA 02139, USA. +=end + + +module XSD + + +module Charset + @encoding = $KCODE + + class XSDError < StandardError; end + class CharsetError < XSDError; end + class UnknownCharsetError < CharsetError; end + class CharsetConversionError < CharsetError; end + +public + + ### + ## Maps + # + EncodingConvertMap = {} + def Charset.init + begin + require 'xsd/iconvcharset' + @encoding = 'UTF8' + EncodingConvertMap[['UTF8', 'EUC' ]] = Proc.new { |str| IconvCharset.safe_iconv("euc-jp", "utf-8", str) } + EncodingConvertMap[['EUC' , 'UTF8']] = Proc.new { |str| IconvCharset.safe_iconv("utf-8", "euc-jp", str) } + EncodingConvertMap[['EUC' , 'SJIS']] = Proc.new { |str| IconvCharset.safe_iconv("shift-jis", "euc-jp", str) } + if /(mswin|bccwin|mingw|cygwin)/ =~ RUBY_PLATFORM + EncodingConvertMap[['UTF8', 'SJIS']] = Proc.new { |str| IconvCharset.safe_iconv("cp932", "utf-8", str) } + EncodingConvertMap[['SJIS', 'UTF8']] = Proc.new { |str| IconvCharset.safe_iconv("utf-8", "cp932", str) } + EncodingConvertMap[['SJIS', 'EUC' ]] = Proc.new { |str| IconvCharset.safe_iconv("euc-jp", "cp932", str) } + else + EncodingConvertMap[['UTF8', 'SJIS']] = Proc.new { |str| IconvCharset.safe_iconv("shift-jis", "utf-8", str) } + EncodingConvertMap[['SJIS', 'UTF8']] = Proc.new { |str| IconvCharset.safe_iconv("utf-8", "shift-jis", str) } + EncodingConvertMap[['SJIS', 'EUC' ]] = Proc.new { |str| IconvCharset.safe_iconv("euc-jp", "shift-jis", str) } + end + rescue LoadError + begin + require 'nkf' + EncodingConvertMap[['EUC' , 'SJIS']] = Proc.new { |str| NKF.nkf('-sXm0', str) } + EncodingConvertMap[['SJIS', 'EUC' ]] = Proc.new { |str| NKF.nkf('-eXm0', str) } + rescue LoadError + end + + begin + require 'uconv' + @encoding = 'UTF8' + EncodingConvertMap[['UTF8', 'EUC' ]] = Uconv.method(:u8toeuc) + EncodingConvertMap[['UTF8', 'SJIS']] = Uconv.method(:u8tosjis) + EncodingConvertMap[['EUC' , 'UTF8']] = Uconv.method(:euctou8) + EncodingConvertMap[['SJIS', 'UTF8']] = Uconv.method(:sjistou8) + rescue LoadError + end + end + end + self.init + + CharsetMap = { + 'NONE' => 'us-ascii', + 'EUC' => 'euc-jp', + 'SJIS' => 'shift_jis', + 'UTF8' => 'utf-8', + } + + + ### + ## handlers + # + def Charset.encoding + @encoding + end + + def Charset.encoding_label + charset_label(@encoding) + end + + def Charset.encoding_to_xml(str, charset) + encoding_conv(str, @encoding, charset_str(charset)) + end + + def Charset.encoding_from_xml(str, charset) + encoding_conv(str, charset_str(charset), @encoding) + end + + def Charset.encoding_conv(str, enc_from, enc_to) + if enc_from == enc_to or enc_from == 'NONE' or enc_to == 'NONE' + str + elsif converter = EncodingConvertMap[[enc_from, enc_to]] + converter.call(str) + else + raise CharsetConversionError.new( + "Converter not found: #{ enc_from } -> #{ enc_to }") + end + end + + def Charset.charset_label(encoding) + CharsetMap[encoding.upcase] + end + + def Charset.charset_str(label) + CharsetMap.index(label.downcase) + end + + # Original regexps: http://www.din.or.jp/~ohzaki/perl.htm + # ascii_euc = '[\x00-\x7F]' + ascii_euc = '[\x9\xa\xd\x20-\x7F]' # XML 1.0 restricted. + twobytes_euc = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])' + threebytes_euc = '(?:\x8F[\xA1-\xFE][\xA1-\xFE])' + character_euc = "(?:#{ ascii_euc }|#{ twobytes_euc }|#{ threebytes_euc })" + EUCRegexp = Regexp.new("\\A#{ character_euc }*\\z", nil, "NONE") + + # onebyte_sjis = '[\x00-\x7F\xA1-\xDF]' + onebyte_sjis = '[\x9\xa\xd\x20-\x7F\xA1-\xDF]' # XML 1.0 restricted. + twobytes_sjis = '(?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC])' + character_sjis = "(?:#{ onebyte_sjis }|#{ twobytes_sjis })" + SJISRegexp = Regexp.new("\\A#{ character_sjis }*\\z", nil, "NONE") + + # 0xxxxxxx + #ascii_utf8 = '[\0-\x7F]' + ascii_utf8 = '[\x9\xA\xD\x20-\x7F]' # XML 1.0 restricted. + # 110yyyyy 10xxxxxx + twobytes_utf8 = '(?:[\xC0-\xDF][\x80-\xBF])' + # 1110zzzz 10yyyyyy 10xxxxxx + threebytes_utf8 = '(?:[\xE0-\xEF][\x80-\xBF][\x80-\xBF])' + # 11110uuu 10uuuzzz 10yyyyyy 10xxxxxx + fourbytes_utf8 = '(?:[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])' + character_utf8 = "(?:#{ ascii_utf8 }|#{ twobytes_utf8 }|#{ threebytes_utf8 }|#{ fourbytes_utf8 })" + UTF8Regexp = Regexp.new("\\A#{ character_utf8 }*\\z", nil, "NONE") + + def Charset.is_utf8(str) + UTF8Regexp =~ str + end + + def Charset.is_euc(str) + EUCRegexp =~ str + end + + def Charset.is_sjis(str) + SJISRegexp =~ str + end + + def Charset.is_ces(str, code = $KCODE) + case code + when 'NONE' + true + when 'UTF8' + is_utf8(str) + when 'EUC' + is_euc(str) + when 'SJIS' + is_sjis(str) + else + raise UnknownCharsetError.new("Unknown charset: #{ code }") + end + end +end + + +end |