From b217dc81f3e08207a381b9c7cdb86d4909c579e6 Mon Sep 17 00:00:00 2001 From: duerst Date: Sat, 6 Feb 2016 05:51:33 +0000 Subject: * test/ruby/enc/test_regex_casefold.rb: Tests for three case folding primitives (mbc_case_fold, get_case_fold_codes_by_str, apply_all_case_fold) in the various encodings. Currently only known good encodings are tested to avoid test failures. For bug hunting, start by adding more encodings with generate_test_casefold encoding (with Kimihito Matsui) git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53748 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- test/ruby/enc/test_regex_casefold.rb | 101 +++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 test/ruby/enc/test_regex_casefold.rb (limited to 'test/ruby/enc') diff --git a/test/ruby/enc/test_regex_casefold.rb b/test/ruby/enc/test_regex_casefold.rb new file mode 100644 index 0000000000..825a02ae06 --- /dev/null +++ b/test/ruby/enc/test_regex_casefold.rb @@ -0,0 +1,101 @@ +# Copyright Kimihito Matsui (松井 仁人) and Martin J. Dürst (duerst@it.aoyama.ac.jp) + +require "test/unit" +require 'unicode_normalize/normalize' # only for UNICODE_VERSION + +class TestCaseFold < Test::Unit::TestCase + + UNICODE_VERSION = UnicodeNormalize::UNICODE_VERSION + CaseTest = Struct.new :source, :target, :kind, :line + + def read_tests + IO.readlines(File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}/CaseFolding.txt", __dir__)) + .collect.with_index { |linedata, linenumber| [linenumber.to_i+1, linedata.chomp] } + .reject { |number, data| data =~ /^(#|$)/ } + .collect do |linenumber, linedata| + data, name = linedata.split /#\s*/ + code, kind, result, _ = data.split /;\s*/ + CaseTest.new code.to_i(16).chr('UTF-8'), + result.split(/ /).collect { |hex| hex.to_i(16) }.pack('U*'), + kind, linenumber + end.select { |test| test.kind=='C' } + end + + def to_codepoints(string) + string.codepoints.collect { |cp| cp.to_s(16).upcase.rjust(4, '0') } + end + + def setup + @@tests ||= read_tests + rescue Errno::ENOENT => e + @@tests ||= [] + puts e.message + end + + def self.generate_test_casefold(encoding) + define_method "test_mbc_case_fold_#{encoding}" do + @@tests.each do |test| + begin + source = test.source.encode encoding + target = test.target.encode encoding + assert_equal 5, "12345#{target}67890" =~ /#{source}/i, + "12345#{to_codepoints(target)}67890 and /#{to_codepoints(source)}/ do not match case-insensitive " + + "(CaseFolding.txt line #{test[:line]})" + rescue Encoding::UndefinedConversionError + end + end + end + + define_method "test_get_case_fold_codes_by_str_#{encoding}" do + @@tests.each do |test| + begin + source = test.source.encode encoding + target = test.target.encode encoding + assert_equal 5, "12345#{source}67890" =~ /#{target}/i, + "12345#{to_codepoints(source)}67890 and /#{to_codepoints(target)}/ do not match case-insensitive " + + "(CaseFolding.txt line #{test[:line]}), " + + "error may also be triggered by mbc_case_fold" + rescue Encoding::UndefinedConversionError + end + end + end + + define_method "test_apply_all_case_fold_#{encoding}" do + @@tests.each do |test| + begin + source = test.source.encode encoding + target = test.target.encode encoding + reg = '\p{Upper}' + regexp = Regexp.compile reg.encode(encoding) + regexpi = Regexp.compile reg.encode(encoding), Regexp::IGNORECASE + assert_equal 5, "12345#{target}67890" =~ regexpi, + "12345#{to_codepoints(target)}67890 and /#{reg}/i do not match " + + "(CaseFolding.txt line #{test[:line]})" + rescue Encoding::UndefinedConversionError + end + end + end + + end + + # start with good encodings only + generate_test_casefold 'US-ASCII' + generate_test_casefold 'ISO-8859-1' + generate_test_casefold 'ISO-8859-2' + generate_test_casefold 'ISO-8859-3' + generate_test_casefold 'ISO-8859-4' + generate_test_casefold 'ISO-8859-5' + generate_test_casefold 'ISO-8859-6' + # generate_test_casefold 'ISO-8859-7' + generate_test_casefold 'ISO-8859-8' + generate_test_casefold 'ISO-8859-9' + generate_test_casefold 'ISO-8859-10' + generate_test_casefold 'ISO-8859-11' + generate_test_casefold 'ISO-8859-13' + generate_test_casefold 'ISO-8859-14' + generate_test_casefold 'ISO-8859-15' + generate_test_casefold 'ISO-8859-16' + generate_test_casefold 'Windows-1250' + generate_test_casefold 'Windows-1252' + #generate_test_casefold 'EUC-JP' +end -- cgit v1.2.3