diff options
author | nobu <nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2014-05-30 23:49:54 +0000 |
---|---|---|
committer | nobu <nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2014-05-30 23:49:54 +0000 |
commit | 0148bd15e4928582adebc4afe4e18db30b68a5a6 (patch) | |
tree | fe08705855c410d8bd15f4b69f22c0e987e259c8 /enc | |
parent | 40ec5528612ad426fac80b8b0ea97009fec7458d (diff) | |
download | ruby-0148bd15e4928582adebc4afe4e18db30b68a5a6.tar.gz |
case-folding.rb: conversion script
* enc/unicode/case-folding.rb: script to convert CaseFolding.txt,
translated from CaseFolding.py.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@46266 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'enc')
-rwxr-xr-x | enc/unicode/case-folding.rb | 107 |
1 files changed, 107 insertions, 0 deletions
#!/usr/bin/ruby

# Converts Unicode's CaseFolding.txt into the C tables of casefold.h,
# written to standard output.
#
# Usage:
#   $ wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
#   $ ruby enc/unicode/case-folding.rb CaseFolding.txt > enc/unicode/casefold.h

# Formats a list of code points as comma-separated C hex literals,
# e.g. [0x69, 0x307] => "0x0069, 0x0307".
def hex_seq(v)
  v.map {|i| "0x%04x" % i}.join(", ")
end

# Prints one C array definition to stdout.
#
# table:: declaration text placed between "static const " and "[]".
# data::  Hash mapping a key to a list of code points. A key is either a
#         single code point (an Integer or a one-element Array) or an
#         Array of several code points. Entries are emitted in sorted
#         key order.
def print_table(table, data)
  print("static const #{table}[] = {\n")
  data.sort.each do |k, v|
    sk =
      if k.is_a?(Array) && k.length > 1
        "{#{hex_seq(k)}}"
      else
        # String#% treats an Array as its argument list, so a one-element
        # Array formats exactly like a bare Integer here.
        "0x%04x" % k
      end
    print(" {#{sk}, {#{v.length}, {#{hex_seq(v)}}}},\n")
  end
  print("};\n\n")
end

# Reads +filename+ (in CaseFolding.txt format) and prints the folding and
# unfolding tables, followed by the table-size macros, to stdout.
#
# Only lines with status C, F or T are considered. C/F lines populate the
# fold table (code point => folded sequence) and the unfold tables
# (folded sequence => originating code points, grouped by sequence length
# 1..3). For every code point that also has a T (Turkic) line, its C/F
# entry is moved into the separate "*_Locale" tables.
#
# Raises whatever File.foreach raises when the file cannot be read.
def print_case_folding_data(filename)
  # Captures: 1 = source code point, 2 = status, 3..5 = up to three
  # mapped code points.
  pattern = /([0-9A-F]{4,6}); ([CFT]); ([0-9A-F]{4,6})(?: ([0-9A-F]{4,6}))?(?: ([0-9A-F]{4,6}))?;/

  fold = {}
  unfold = [{}, {}, {}]
  turkic = []

  # File.foreach (rather than IO.foreach) so that a name beginning with
  # "|" is always opened as a file and never run as a command.
  File.foreach(filename) do |line|
    next unless res = pattern.match(line)
    ch_from = res[1].to_i(16)

    if res[2] == 'T'
      # Turkic case folding: only remember the code point for now; the
      # T mapping itself is not stored.
      turkic << ch_from
      next
    end

    # store folding data (unmatched optional groups are nil and dropped)
    ch_to = res.values_at(3, 4, 5).compact.map {|s| s.to_i(16)}
    fold[ch_from] = ch_to

    # store unfolding data, bucketed by the length of the folded sequence
    (unfold[ch_to.length - 1][ch_to] ||= []) << ch_from
  end

  # move locale dependent data to (un)fold_locale
  fold_locale = {}
  unfold_locale = [{}, {}]
  turkic.each do |ch_from|
    key = fold[ch_from]
    i = key.length - 1
    # Single code point keys are stored as a bare Integer in the locale
    # table; longer sequences keep the Array as key.
    unfold_locale[i][i == 0 ? key[0] : key] = unfold[i].delete(key)
    fold_locale[ch_from] = fold.delete(ch_from)
  end

  # print the header
  print("/* DO NOT EDIT THIS FILE. */\n")
  print("/* Generated by enc/unicode/case-folding.rb */\n\n")

  # print folding data, then unfolding data
  [
    ["CaseFold_11_Type CaseFold", fold],
    ["CaseFold_11_Type CaseFold_Locale", fold_locale],
    ["CaseUnfold_11_Type CaseUnfold_11", unfold[0]],
    ["CaseUnfold_11_Type CaseUnfold_11_Locale", unfold_locale[0]],
    ["CaseUnfold_12_Type CaseUnfold_12", unfold[1]],
    ["CaseUnfold_12_Type CaseUnfold_12_Locale", unfold_locale[1]],
    ["CaseUnfold_13_Type CaseUnfold_13", unfold[2]],
  ].each do |declaration, data|
    print_table(declaration, data)
  end

  # table sizes
  # NOTE(review): the 1.2 / 1.5 / 1.7 factors presumably leave headroom
  # for the consumer's hash tables -- confirm against the code that
  # includes casefold.h.
  fold_table_size = fold.size + fold_locale.size
  printf("#define FOLD_TABLE_SIZE\t\t%d\n", (fold_table_size * 1.2))
  unfold1_table_size = unfold[0].size + unfold_locale[0].size
  printf("#define UNFOLD1_TABLE_SIZE\t%d\n", (unfold1_table_size * 1.2))
  unfold2_table_size = unfold[1].size + unfold_locale[1].size
  printf("#define UNFOLD2_TABLE_SIZE\t%d\n", (unfold2_table_size * 1.5))
  unfold3_table_size = unfold[2].size
  printf("#define UNFOLD3_TABLE_SIZE\t%d\n", (unfold3_table_size * 1.7))
end

# Input defaults to CaseFolding.txt in the current directory.
filename = ARGV[0] || 'CaseFolding.txt'
print_case_folding_data(filename)