diff options
author | muraken <muraken@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2010-03-09 09:15:42 +0000 |
---|---|---|
committer | muraken <muraken@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2010-03-09 09:15:42 +0000 |
commit | 62f8df2d3c95816028c006ecbe70bc51704eec4b (patch) | |
tree | 2eb080aa7b5d235cf934014ebbafa17c3f598718 /tool | |
parent | c4636043cc2afe5a5fec48850115e8aa0aa3c2de (diff) | |
download | ruby-62f8df2d3c95816028c006ecbe70bc51704eec4b.tar.gz |
* enc/trans/EMOJI/*.src, enc/trans/emoji*, enc/x-emoji.c, test/ruby/enc/test_emoji.rb, tool/enc-emoji-citrus-gen.rb, tool/enc-emoji4unicode.rb, tool/jisx0208.rb, tool/test/test_jisx0208.rb: new encodings to support emoji charsets, which are used by Japanese mobile phones [ruby-dev:40528]. Thanks to Yoji Shidara for many contributions.
* tool/transcode-tblgen.rb: modified for enc-emoji4unicode.rb.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@26856 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'tool')
-rw-r--r-- | tool/enc-emoji-citrus-gen.rb | 131 | ||||
-rw-r--r-- | tool/enc-emoji4unicode.rb | 133 | ||||
-rw-r--r-- | tool/jisx0208.rb | 84 | ||||
-rw-r--r-- | tool/test/test_jisx0208.rb | 40 | ||||
-rwxr-xr-x | tool/transcode-tblgen.rb | 176 |
5 files changed, 480 insertions, 84 deletions
diff --git a/tool/enc-emoji-citrus-gen.rb b/tool/enc-emoji-citrus-gen.rb new file mode 100644 index 0000000000..5037cbde1e --- /dev/null +++ b/tool/enc-emoji-citrus-gen.rb @@ -0,0 +1,131 @@ +require File.expand_path('../jisx0208', __FILE__) + +ENCODES = [ + { + :name => "SHIFT_JIS-DOCOMO", + :src_zone => [0xF8..0xFC, 0x40..0xFC, 8], + :dst_ilseq => 0xFFFE, + :map => [ + [0xE63E..0xE757, JISX0208::Char.from_sjis(0xF89F)], + ], + }, + { + :name => "ISO-2022-JP-KDDI", + :src_zone => [0x21..0x7E, 0x21..0x7E, 8], + :dst_ilseq => 0xFFFE, + :map => [ + [0xE468..0xE5B4, JISX0208::Char.new(0x7521)], + [0xE5B5..0xE5CC, JISX0208::Char.new(0x7867)], + [0xE5CD..0xE5DF, JISX0208::Char.new(0x7921)], + [0xEA80..0xEAFA, JISX0208::Char.new(0x7934)], + [0xEAFB..0xEB0D, JISX0208::Char.new(0x7854)], + [0xEB0E..0xEB8E, JISX0208::Char.new(0x7A51)], + ], + }, + { + :name => "SHIFT_JIS-KDDI", + :src_zone => [0xF3..0xFC, 0x40..0xFC, 8], + :dst_ilseq => 0xFFFE, + :map => [ + [0xE468..0xE5B4, JISX0208::Char.from_sjis(0xF640)], + [0xE5B5..0xE5CC, JISX0208::Char.from_sjis(0xF7E5)], + [0xE5CD..0xE5DF, JISX0208::Char.from_sjis(0xF340)], + [0xEA80..0xEAFA, JISX0208::Char.from_sjis(0xF353)], + [0xEAFB..0xEB0D, JISX0208::Char.from_sjis(0xF7D2)], + [0xEB0E..0xEB8E, JISX0208::Char.from_sjis(0xF3CF)], + ], + }, + { + :name => "SHIFT_JIS-SOFTBANK", + :src_zone => [0xF3..0xFC, 0x40..0xFC, 8], + :dst_ilseq => 0xFFFE, + :map => [ + [0xE001..0xE05A, JISX0208::Char.from_sjis(0xF941)], + [0xE101..0xE15A, JISX0208::Char.from_sjis(0xF741)], + [0xE201..0xE25A, JISX0208::Char.from_sjis(0xF7A1)], + [0xE301..0xE34D, JISX0208::Char.from_sjis(0xF9A1)], + [0xE401..0xE44C, JISX0208::Char.from_sjis(0xFB41)], + [0xE501..0xE53E, JISX0208::Char.from_sjis(0xFBA1)], + ], + }, +] + +def zone(*args) + bits = args.pop + [*args.map{|range| "0x%02X-0x%02X" % [range.begin, range.end] }, bits].join(' / ') +end + +def header(params) + (<<END_HEADER_TEMPLATE % [params[:name], zone(*params[:src_zone]), params[:dst_ilseq]]) +# DO NOT 
EDIT THIS FILE DIRECTLY + +TYPE ROWCOL +NAME %s +SRC_ZONE %s +OOB_MODE ILSEQ +DST_ILSEQ 0x%04X +DST_UNIT_BITS 16 +END_HEADER_TEMPLATE +end + +def generate_to_ucs(params, pairs) + pairs.sort_by! {|u, c| c } + name = "EMOJI_#{params[:name]}%UCS" + open("#{name}.src", "w") do |io| + io.print header(params.merge(name: name.tr('%', '/'))) + io.puts + io.puts "BEGIN_MAP" + io.print pairs.inject("") {|acc, uc| acc += "0x%04X = 0x%04X\n" % uc.reverse } + io.puts "END_MAP" + end +end + +def generate_from_ucs(params, pairs) + pairs.sort_by! {|u, c| u } + name = "UCS%EMOJI_#{params[:name]}" + open("#{name}.src", "w") do |io| + io.print header(params.merge(name: name.tr('%', '/'))) + io.puts + io.puts "BEGIN_MAP" + io.print pairs.inject("") {|acc, uc| acc += "0x%04X = 0x%04X\n" % uc } + io.puts "END_MAP" + end +end + +def make_pairs(code_map) + pairs = code_map.inject([]) {|acc, (range, ch)| + acc += range.map{|uni| pair = [uni, Integer(ch)]; ch = ch.succ; next pair } + } +end + +ENCODES.each do |params| + pairs = make_pairs(params[:map], &params[:conv]) + generate_to_ucs(params, pairs) + generate_from_ucs(params, pairs) +end + +# generate KDDI-UNDOC for Shift_JIS-KDDI +kddi_sjis_map = ENCODES.select{|enc| enc[:name] == "SHIFT_JIS-KDDI"}.first[:map] +pairs = kddi_sjis_map.inject([]) {|acc, (range, ch)| + acc += range.map{|uni| pair = [ch.to_sjis - 0x700, Integer(ch)]; ch = ch.succ; next pair } +} +params = { + :name => "SHIFT_JIS-KDDI-UNDOC", + :src_zone => [0xF3..0xFC, 0x40..0xFC, 8], + :dst_ilseq => 0xFFFE, +} +generate_from_ucs(params, pairs) +generate_to_ucs(params, pairs) + +# generate KDDI-UNDOC for ISO-2022-JP-KDDI +kddi_2022_map = ENCODES.select{|enc| enc[:name] == "ISO-2022-JP-KDDI"}.first[:map] +pairs = kddi_2022_map.each_with_index.inject([]) {|acc, ((range, ch), i)| + sjis = kddi_sjis_map[i][1] + acc += range.map{|uni| pair = [sjis.to_sjis - 0x700, Integer(ch)]; ch = ch.succ; sjis = sjis.succ; next pair } +} +params = { + :name => "ISO-2022-JP-KDDI-UNDOC", + :src_zone 
=> [0x21..0x7E, 0x21..0x7E, 8], + :dst_ilseq => 0xFFFE, +} +generate_from_ucs(params, pairs) diff --git a/tool/enc-emoji4unicode.rb b/tool/enc-emoji4unicode.rb new file mode 100644 index 0000000000..1e7d45901f --- /dev/null +++ b/tool/enc-emoji4unicode.rb @@ -0,0 +1,133 @@ +#!/usr/bin/env ruby + +# example: +# ./enc-emoji4unicode.rb emoji4unicode.xml > ../enc/trans/emoji-exchange-tbl.rb + +require 'rexml/document' +require File.expand_path("../transcode-tblgen", __FILE__) + +class EmojiTable + VERBOSE_MODE = false + + def initialize(xml_path) + @doc = REXML::Document.new File.open(xml_path) + @kddi_undoc = make_kddi_undoc_map() + end + + def conversion(from_carrier, to_carrier, &block) + REXML::XPath.each(@doc.root, '//e') do |e| + from = e.attribute(from_carrier.downcase).to_s + to = e.attribute(to_carrier.downcase).to_s + text_fallback = e.attribute('text_fallback').to_s + name = e.attribute('name').to_s + if from =~ /^(?:\*|\+)(.+)$/ # proposed or unified + from = $1 + end + if from.empty? || from !~ /^[0-9A-F]+$/ + # do nothing + else + from_utf8 = [from.hex].pack("U").unpack("H*").first + if to =~ /^(?:>|\*)?([0-9A-F\+]+)$/ + str_to = $1 + if str_to =~ /^\+/ # unicode "proposed" begins at "+" + proposal = true + str_to.sub!(/^\+/, '') + else + proposal = false + end + tos = str_to.split('+') + to_utf8 = tos.map(&:hex).pack("U*").unpack("H*").first + comment = "[%s] U+%X -> %s" % [name, from.hex, tos.map{|c| "U+%X"%c.hex}.join(' ')] + block.call(:from => from_utf8, + :to => to_utf8, + :comment => comment, + :fallback => false, + :proposal => proposal) + elsif to.empty? + if text_fallback.empty? 
+ comment = "[%s] U+%X -> U+3013 (GETA)" % [name, from.hex] + block.call(:from => from_utf8, + :to => "\u{3013}".unpack("H*").first, + :comment => comment, # geta + :fallback => true, + :proposal => false) + else + to_utf8 = text_fallback.unpack("H*").first + comment = %([%s] U+%X -> "%s") % [name, from.hex, text_fallback] + block.call(:from => from_utf8, + :to => to_utf8, + :comment => comment, + :fallback => true, + :proposal => false) + end + else + raise "something wrong: %s -> %s" % [from, to] + end + end + end + end + + def generate(io, from_carrier, to_carrier) + from_encoding = (from_carrier == "Unicode") ? "UTF-8" : "UTF8-"+from_carrier + to_encoding = (to_carrier == "Unicode" ) ? "UTF-8" : "UTF8-"+to_carrier + io.puts "EMOJI_EXCHANGE_TBL['#{from_encoding}']['#{to_encoding}'] = [" + io.puts " # for documented codepoints" if from_carrier == "KDDI" + self.conversion(from_carrier, to_carrier) do |params| + from, to = params[:from], %Q{"#{params[:to]}"} + to = ":undef" if params[:fallback] || params[:proposal] + io.puts %{ ["#{from}", #{to}], # #{params[:comment]}} + end + if from_carrier == "KDDI" + io.puts " # for undocumented codepoints" + self.conversion(from_carrier, to_carrier) do |params| + from, to = params[:from], %Q{"#{params[:to]}"} + to = ":undef" if params[:fallback] || params[:proposal] + unicode = utf8_to_ucs(from) + undoc = ucs_to_utf8(@kddi_undoc[unicode]) + io.puts %{ ["#{undoc}", #{to}], # #{params[:comment]}} + end + end + io.puts "]" + io.puts + end + + private + + def utf8_to_ucs(cp) + return [cp].pack("H*").unpack("U*").first + end + + def ucs_to_utf8(cp) + return [cp].pack("U*").unpack("H*").first + end + + def make_kddi_undoc_map() + pub_to_sjis = citrus_decode_mapsrc( + "mskanji", 2, "UCS/EMOJI_SHIFT_JIS-KDDI").sort_by{|u, s| s} + sjis_to_undoc = citrus_decode_mapsrc( + "mskanji", 2, "EMOJI_SHIFT_JIS-KDDI-UNDOC/UCS").sort_by{|s, u| s} + return pub_to_sjis.zip(sjis_to_undoc).inject({}) {|h, rec| + raise "no match sjis codepoint" if 
rec[0][1] != rec[1][0] + h[rec[0][0]] = rec[1][1] + next h + } + end +end + +if ARGV.empty? + puts "usage: #$0 [emoji4unicode.xml]" + exit 1 +end +$srcdir = File.expand_path("../../enc/trans", __FILE__) +emoji_table = EmojiTable.new(ARGV[0]) + +companies = %w(DoCoMo KDDI SoftBank Unicode) + +io = STDOUT +io.puts "EMOJI_EXCHANGE_TBL = Hash.new{|h,k| h[k] = {}}" +companies.each do |from_company| + companies.each do |to_company| + next if from_company == to_company + emoji_table.generate(io, from_company, to_company) + end +end diff --git a/tool/jisx0208.rb b/tool/jisx0208.rb new file mode 100644 index 0000000000..921f574816 --- /dev/null +++ b/tool/jisx0208.rb @@ -0,0 +1,84 @@ +module JISX0208 + class Char + class << self + def from_sjis(sjis) + unless 0x8140 <= sjis && sjis <= 0xFCFC + raise ArgumentError, "out of the range of JIS X 0208: 0x#{sjis.to_s(16)}" + end + sjis_hi, sjis_lo = sjis >> 8, sjis & 0xFF + sjis_hi = (sjis_hi - ((sjis_hi <= 0x9F) ? 0x80 : 0xC0)) << 1 + if sjis_lo <= 0x9E + sjis_hi -= 1 + sjis_lo -= (sjis_lo <= 0x7E) ? 0x3F : 0x40 + else + sjis_lo -= 0x9E + end + return self.new(sjis_hi, sjis_lo) + end + end + + def initialize(row, cell=nil) + if cell + @code = row_cell_to_code(row, cell) + else + @code = row.to_int + end + end + + def ==(other) + if self.class === other + return Integer(self) == Integer(other) + end + return super(other) + end + + def to_int + return @code + end + + def hi + Integer(self) >> 8 + end + + def lo + Integer(self) & 0xFF + end + + def row + self.hi - 0x20 + end + + def cell + self.lo - 0x20 + end + + def succ + succ_hi, succ_lo = self.hi, self.lo + 1 + if succ_lo > 0x7E + succ_lo = 0x21 + succ_hi += 1 + end + return self.class.new(succ_hi << 8 | succ_lo) + end + + def to_sjis + h, l = self.hi, self.lo + h = (h + 1) / 2 + ((0x21..0x5E).include?(h) ? 0x70 : 0xB0) + l += self.hi.odd? ? 0x1F + ((l >= 0x60) ? 
1 : 0) : 0x7E + return h << 8 | l + end + + def inspect + "#<JISX0208::Char:#{self.object_id.to_s(16)} sjis=#{self.to_sjis.to_s(16)} jis=#{self.to_int.to_s(16)}>" + end + + private + + def row_cell_to_code(row, cell) + unless 0 < row && (1..94).include?(cell) + raise ArgumentError, "out of row-cell range: #{row}-#{cell}" + end + return (row + 0x20) << 8 | (cell + 0x20) + end + end +end diff --git a/tool/test/test_jisx0208.rb b/tool/test/test_jisx0208.rb new file mode 100644 index 0000000000..d323c84745 --- /dev/null +++ b/tool/test/test_jisx0208.rb @@ -0,0 +1,40 @@ +require 'test/unit' + +require '../tool/jisx0208' + +class Test_JISX0208_Char < Test::Unit::TestCase + def test_create_with_row_cell + assert_equal JISX0208::Char.new(0x2121), JISX0208::Char.new(1, 1) + end + + def test_succ + assert_equal JISX0208::Char.new(0x2221), JISX0208::Char.new(0x217e).succ + assert_equal JISX0208::Char.new(2, 1), JISX0208::Char.new(1, 94).succ + assert_equal JISX0208::Char.new(0x7f21), JISX0208::Char.new(0x7e7e).succ + end + + def test_to_shift_jis + assert_equal 0x895C, JISX0208::Char.new(0x313D).to_sjis + assert_equal 0x895C, JISX0208::Char.from_sjis(0x895C).to_sjis + assert_equal 0xF3DE, JISX0208::Char.from_sjis(0xF3DE).to_sjis + assert_equal 0xFC40, JISX0208::Char.from_sjis(0xFC40).to_sjis + end + + def test_from_sjis + assert_raise(ArgumentError) { JISX0208::Char.from_sjis(-1) } + assert_raise(ArgumentError) { JISX0208::Char.from_sjis(0x10000) } + assert_nothing_raised { JISX0208::Char.from_sjis(0x8140) } + assert_nothing_raised { JISX0208::Char.from_sjis(0xFCFC) } + assert_equal JISX0208::Char.new(0x313D), JISX0208::Char.from_sjis(0x895C) + end + + def test_row + assert_equal 1, JISX0208::Char.new(0x2121).row + assert_equal 94, JISX0208::Char.new(0x7E7E).row + end + + def test_cell + assert_equal 1, JISX0208::Char.new(0x2121).cell + assert_equal 94, JISX0208::Char.new(0x7E7E).cell + end +end diff --git a/tool/transcode-tblgen.rb b/tool/transcode-tblgen.rb index 
fa80ceacf7..227914b635 100755 --- a/tool/transcode-tblgen.rb +++ b/tool/transcode-tblgen.rb @@ -524,16 +524,22 @@ def citrus_euc_cstomb(csid, index) end.to_s(16) end +def citrus_stateless_iso_cstomb(csid, index) + (index | 0x8080 | (csid << 16)).to_s(16) +end + def citrus_cstomb(ces, csid, index) case ces when 'mskanji' citrus_mskanji_cstomb(csid, index) when 'euc' citrus_euc_cstomb(csid, index) + when 'stateless_iso' + citrus_stateless_iso_cstomb(csid, index) end end -SUBDIR = %w/APPLE AST BIG5 CNS CP EBCDIC GB GEORGIAN ISO646 ISO-8859 JIS KAZAKH KOI KS MISC TCVN/ +SUBDIR = %w/APPLE AST BIG5 CNS CP EBCDIC EMOJI GB GEORGIAN ISO646 ISO-8859 JIS KAZAKH KOI KS MISC TCVN/ def citrus_decode_mapsrc(ces, csid, mapsrcs) @@ -766,95 +772,97 @@ def make_signature(filename, src) "src=#{filename.dump}, len=#{src.length}, checksum=#{src.sum}" end -output_filename = nil -verbose_mode = false -force_mode = false - -op = OptionParser.new -op.def_option("--help", "show help message") { puts op; exit 0 } -op.def_option("--verbose", "verbose mode") { verbose_mode = true } -op.def_option("--force", "force table generation") { force_mode = true } -op.def_option("--output=FILE", "specify output file") {|arg| output_filename = arg } -op.parse! - -VERBOSE_MODE = verbose_mode - -OUTPUT_FILENAME = output_filename -OUTPUT_PREFIX = output_filename ? File.basename(output_filename)[/\A[A-Za-z0-9_]*/] : "" -OUTPUT_PREFIX.sub!(/\A_+/, '') -OUTPUT_PREFIX.sub!(/_*\z/, '_') - -TRANSCODE_GENERATED_BYTES_CODE = ArrayCode.new("unsigned char", "#{OUTPUT_PREFIX}byte_array") -TRANSCODE_GENERATED_WORDS_CODE = ArrayCode.new("unsigned int", "#{OUTPUT_PREFIX}word_array") - -arg = ARGV.shift -$srcdir = File.dirname(arg) -$:.unshift $srcdir unless $:.include? $srcdir -src = File.read(arg) -src.force_encoding("ascii-8bit") if src.respond_to? :force_encoding -this_script = File.read(__FILE__) -this_script.force_encoding("ascii-8bit") if this_script.respond_to? :force_encoding - -base_signature = "/* autogenerated. 
*/\n" -base_signature << "/* #{make_signature(File.basename(__FILE__), this_script)} */\n" -base_signature << "/* #{make_signature(File.basename(arg), src)} */\n" - -if !force_mode && output_filename && File.readable?(output_filename) - old_signature = File.open(output_filename) {|f| f.gets("").chomp } - chk_signature = base_signature.dup - old_signature.each_line {|line| - if %r{/\* src="([0-9a-z_.-]+)",} =~ line - name = $1 - next if name == File.basename(arg) || name == File.basename(__FILE__) - path = File.join($srcdir, name) - if File.readable? path - chk_signature << "/* #{make_signature(name, File.read(path))} */\n" +if __FILE__ == $0 + output_filename = nil + verbose_mode = false + force_mode = false + + op = OptionParser.new + op.def_option("--help", "show help message") { puts op; exit 0 } + op.def_option("--verbose", "verbose mode") { verbose_mode = true } + op.def_option("--force", "force table generation") { force_mode = true } + op.def_option("--output=FILE", "specify output file") {|arg| output_filename = arg } + op.parse! + + VERBOSE_MODE = verbose_mode + + OUTPUT_FILENAME = output_filename + OUTPUT_PREFIX = output_filename ? File.basename(output_filename)[/\A[A-Za-z0-9_]*/] : "" + OUTPUT_PREFIX.sub!(/\A_+/, '') + OUTPUT_PREFIX.sub!(/_*\z/, '_') + + TRANSCODE_GENERATED_BYTES_CODE = ArrayCode.new("unsigned char", "#{OUTPUT_PREFIX}byte_array") + TRANSCODE_GENERATED_WORDS_CODE = ArrayCode.new("unsigned int", "#{OUTPUT_PREFIX}word_array") + + arg = ARGV.shift + $srcdir = File.dirname(arg) + $:.unshift $srcdir unless $:.include? $srcdir + src = File.read(arg) + src.force_encoding("ascii-8bit") if src.respond_to? :force_encoding + this_script = File.read(__FILE__) + this_script.force_encoding("ascii-8bit") if this_script.respond_to? :force_encoding + + base_signature = "/* autogenerated. 
*/\n" + base_signature << "/* #{make_signature(File.basename(__FILE__), this_script)} */\n" + base_signature << "/* #{make_signature(File.basename(arg), src)} */\n" + + if !force_mode && output_filename && File.readable?(output_filename) + old_signature = File.open(output_filename) {|f| f.gets("").chomp } + chk_signature = base_signature.dup + old_signature.each_line {|line| + if %r{/\* src="([0-9a-z_.-]+)",} =~ line + name = $1 + next if name == File.basename(arg) || name == File.basename(__FILE__) + path = File.join($srcdir, name) + if File.readable? path + chk_signature << "/* #{make_signature(name, File.read(path))} */\n" + end end + } + if old_signature == chk_signature + now = Time.now + File.utime(now, now, output_filename) + STDERR.puts "already up-to-date: #{output_filename}" if VERBOSE_MODE + exit end - } - if old_signature == chk_signature - now = Time.now - File.utime(now, now, output_filename) - STDERR.puts "already up-to-date: #{output_filename}" if VERBOSE_MODE - exit end -end -if VERBOSE_MODE - if output_filename - STDERR.puts "generating #{output_filename} ..." + if VERBOSE_MODE + if output_filename + STDERR.puts "generating #{output_filename} ..." + end end -end -libs1 = $".dup -erb = ERB.new(src, nil, '%') -erb.filename = arg -erb_result = erb.result(binding) -libs2 = $".dup + libs1 = $".dup + erb = ERB.new(src, nil, '%') + erb.filename = arg + erb_result = erb.result(binding) + libs2 = $".dup -libs = libs2 - libs1 -lib_sigs = '' -libs.each {|lib| - lib = File.basename(lib) - path = File.join($srcdir, lib) - if File.readable? path - lib_sigs << "/* #{make_signature(lib, File.read(path))} */\n" - end -} + libs = libs2 - libs1 + lib_sigs = '' + libs.each {|lib| + lib = File.basename(lib) + path = File.join($srcdir, lib) + if File.readable? 
path + lib_sigs << "/* #{make_signature(lib, File.read(path))} */\n" + end + } -result = '' -result << base_signature -result << lib_sigs -result << "\n" -result << erb_result -result << "\n" - -if output_filename - new_filename = output_filename + ".new" - FileUtils.mkdir_p(File.dirname(output_filename)) - File.open(new_filename, "wb") {|f| f << result } - File.rename(new_filename, output_filename) - STDERR.puts "done." if VERBOSE_MODE -else - print result + result = '' + result << base_signature + result << lib_sigs + result << "\n" + result << erb_result + result << "\n" + + if output_filename + new_filename = output_filename + ".new" + FileUtils.mkdir_p(File.dirname(output_filename)) + File.open(new_filename, "wb") {|f| f << result } + File.rename(new_filename, output_filename) + STDERR.puts "done." if VERBOSE_MODE + else + print result + end end |