about | summary | refs | log | tree | commit | diff | stats
path: root/test/ruby/enc/test_regex_casefold.rb
diff options
context:
space:
mode:
author: duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2016-02-06 05:51:33 +0000
committer: duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2016-02-06 05:51:33 +0000
commit: b217dc81f3e08207a381b9c7cdb86d4909c579e6 (patch)
tree: 5fdf853557a4575444a023bd9aad36109c60d4ee /test/ruby/enc/test_regex_casefold.rb
parent: 81515b2381dcd325ca57c0272a551bba4f112afd (diff)
download: ruby-b217dc81f3e08207a381b9c7cdb86d4909c579e6.tar.gz
* test/ruby/enc/test_regex_casefold.rb: Tests for three case-folding
primitives (mbc_case_fold, get_case_fold_codes_by_str, apply_all_case_fold) in the various encodings. Currently, only known-good encodings are tested, to avoid test failures. For bug hunting, start by adding more encodings with "generate_test_casefold encoding". (with Kimihito Matsui)

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53748 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'test/ruby/enc/test_regex_casefold.rb')
-rw-r--r-- test/ruby/enc/test_regex_casefold.rb 101
1 files changed, 101 insertions, 0 deletions
diff --git a/test/ruby/enc/test_regex_casefold.rb b/test/ruby/enc/test_regex_casefold.rb
new file mode 100644
index 0000000000..825a02ae06
--- /dev/null
+++ b/test/ruby/enc/test_regex_casefold.rb
@@ -0,0 +1,101 @@
+# Copyright Kimihito Matsui (松井 仁人) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
+
+require "test/unit"
+require 'unicode_normalize/normalize' # only for UNICODE_VERSION
+
+class TestCaseFold < Test::Unit::TestCase
+
+ UNICODE_VERSION = UnicodeNormalize::UNICODE_VERSION
+ CaseTest = Struct.new :source, :target, :kind, :line
+
+ def read_tests
+ IO.readlines(File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}/CaseFolding.txt", __dir__))
+ .collect.with_index { |linedata, linenumber| [linenumber.to_i+1, linedata.chomp] }
+ .reject { |number, data| data =~ /^(#|$)/ }
+ .collect do |linenumber, linedata|
+ data, name = linedata.split /#\s*/
+ code, kind, result, _ = data.split /;\s*/
+ CaseTest.new code.to_i(16).chr('UTF-8'),
+ result.split(/ /).collect { |hex| hex.to_i(16) }.pack('U*'),
+ kind, linenumber
+ end.select { |test| test.kind=='C' }
+ end
+
+ def to_codepoints(string)
+ string.codepoints.collect { |cp| cp.to_s(16).upcase.rjust(4, '0') }
+ end
+
+ def setup
+ @@tests ||= read_tests
+ rescue Errno::ENOENT => e
+ @@tests ||= []
+ puts e.message
+ end
+
+ def self.generate_test_casefold(encoding)
+ define_method "test_mbc_case_fold_#{encoding}" do
+ @@tests.each do |test|
+ begin
+ source = test.source.encode encoding
+ target = test.target.encode encoding
+ assert_equal 5, "12345#{target}67890" =~ /#{source}/i,
+ "12345#{to_codepoints(target)}67890 and /#{to_codepoints(source)}/ do not match case-insensitive " +
+ "(CaseFolding.txt line #{test[:line]})"
+ rescue Encoding::UndefinedConversionError
+ end
+ end
+ end
+
+ define_method "test_get_case_fold_codes_by_str_#{encoding}" do
+ @@tests.each do |test|
+ begin
+ source = test.source.encode encoding
+ target = test.target.encode encoding
+ assert_equal 5, "12345#{source}67890" =~ /#{target}/i,
+ "12345#{to_codepoints(source)}67890 and /#{to_codepoints(target)}/ do not match case-insensitive " +
+ "(CaseFolding.txt line #{test[:line]}), " +
+ "error may also be triggered by mbc_case_fold"
+ rescue Encoding::UndefinedConversionError
+ end
+ end
+ end
+
+ define_method "test_apply_all_case_fold_#{encoding}" do
+ @@tests.each do |test|
+ begin
+ source = test.source.encode encoding
+ target = test.target.encode encoding
+ reg = '\p{Upper}'
+ regexp = Regexp.compile reg.encode(encoding)
+ regexpi = Regexp.compile reg.encode(encoding), Regexp::IGNORECASE
+ assert_equal 5, "12345#{target}67890" =~ regexpi,
+ "12345#{to_codepoints(target)}67890 and /#{reg}/i do not match " +
+ "(CaseFolding.txt line #{test[:line]})"
+ rescue Encoding::UndefinedConversionError
+ end
+ end
+ end
+
+ end
+
+ # start with good encodings only
+ generate_test_casefold 'US-ASCII'
+ generate_test_casefold 'ISO-8859-1'
+ generate_test_casefold 'ISO-8859-2'
+ generate_test_casefold 'ISO-8859-3'
+ generate_test_casefold 'ISO-8859-4'
+ generate_test_casefold 'ISO-8859-5'
+ generate_test_casefold 'ISO-8859-6'
+ # generate_test_casefold 'ISO-8859-7'
+ generate_test_casefold 'ISO-8859-8'
+ generate_test_casefold 'ISO-8859-9'
+ generate_test_casefold 'ISO-8859-10'
+ generate_test_casefold 'ISO-8859-11'
+ generate_test_casefold 'ISO-8859-13'
+ generate_test_casefold 'ISO-8859-14'
+ generate_test_casefold 'ISO-8859-15'
+ generate_test_casefold 'ISO-8859-16'
+ generate_test_casefold 'Windows-1250'
+ generate_test_casefold 'Windows-1252'
+ #generate_test_casefold 'EUC-JP'
+end