diff options
author | nahi <nahi@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2003-09-28 09:33:59 +0000 |
---|---|---|
committer | nahi <nahi@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2003-09-28 09:33:59 +0000 |
commit | eb8ee5e401cf49a31d4dc5ba2c8e74f379bb2408 (patch) | |
tree | 74edbf3a969aa75179115b876623f3cdaa24e803 /lib/xsd | |
parent | d57fc5a48922bbdcc6ab8dc0c2e02fe796c70afc (diff) | |
download | ruby-eb8ee5e401cf49a31d4dc5ba2c8e74f379bb2408.tar.gz |
* forgot to add this file in the previous commit.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@4615 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'lib/xsd')
-rw-r--r-- | lib/xsd/charset.rb | 19 |
1 file changed, 11 insertions, 8 deletions
diff --git a/lib/xsd/charset.rb b/lib/xsd/charset.rb
index 6dda959155..88d4f99043 100644
--- a/lib/xsd/charset.rb
+++ b/lib/xsd/charset.rb
@@ -117,12 +117,13 @@ public
     CharsetMap.index(label.downcase)
   end
 
-  # Original regexps: http://www.din.or.jp/~ohzaki/perl.htm
-  # ascii_euc = '[\x00-\x7F]'
-  ascii_euc = '[\x9\xa\xd\x20-\x7F]' # XML 1.0 restricted.
+  # us_ascii = '[\x00-\x7F]'
+  us_ascii = '[\x9\xa\xd\x20-\x7F]' # XML 1.0 restricted.
+  USASCIIRegexp = Regexp.new("\\A#{ us_ascii }*\\z", nil, "NONE")
+
   twobytes_euc = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])'
   threebytes_euc = '(?:\x8F[\xA1-\xFE][\xA1-\xFE])'
-  character_euc = "(?:#{ ascii_euc }|#{ twobytes_euc }|#{ threebytes_euc })"
+  character_euc = "(?:#{ us_ascii }|#{ twobytes_euc }|#{ threebytes_euc })"
   EUCRegexp = Regexp.new("\\A#{ character_euc }*\\z", nil, "NONE")
 
   # onebyte_sjis = '[\x00-\x7F\xA1-\xDF]'
@@ -132,17 +133,19 @@ public
   SJISRegexp = Regexp.new("\\A#{ character_sjis }*\\z", nil, "NONE")
 
   # 0xxxxxxx
-  #ascii_utf8 = '[\0-\x7F]'
-  ascii_utf8 = '[\x9\xA\xD\x20-\x7F]' # XML 1.0 restricted.
   # 110yyyyy 10xxxxxx
   twobytes_utf8 = '(?:[\xC0-\xDF][\x80-\xBF])'
   # 1110zzzz 10yyyyyy 10xxxxxx
   threebytes_utf8 = '(?:[\xE0-\xEF][\x80-\xBF][\x80-\xBF])'
   # 11110uuu 10uuuzzz 10yyyyyy 10xxxxxx
   fourbytes_utf8 = '(?:[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])'
-  character_utf8 = "(?:#{ ascii_utf8 }|#{ twobytes_utf8 }|#{ threebytes_utf8 }|#{ fourbytes_utf8 })"
+  character_utf8 = "(?:#{ us_ascii }|#{ twobytes_utf8 }|#{ threebytes_utf8 }|#{ fourbytes_utf8 })"
   UTF8Regexp = Regexp.new("\\A#{ character_utf8 }*\\z", nil, "NONE")
 
+  def Charset.is_us_ascii(str)
+    USASCIIRegexp =~ str
+  end
+
   def Charset.is_utf8(str)
     UTF8Regexp =~ str
   end
@@ -158,7 +161,7 @@ public
   def Charset.is_ces(str, code = $KCODE)
     case code
     when 'NONE'
-      true
+      is_us_ascii(str)
     when 'UTF8'
       is_utf8(str)
     when 'EUC'