From 78f540019a394421e1875cacaf956e8c23b18cc0 Mon Sep 17 00:00:00 2001 From: duerst Date: Tue, 29 Mar 2016 07:53:43 +0000 Subject: * enc/unicode/case-folding.rb, casefold.h: Tweaked handling of 6 special cases in CaseUnfold_11_Table. * enc/unicode.c: Adjustments for above. * test/ruby/enc/test_case_mapping.rb: Tests for the above: Some tests in test_titlecase activated; test_greek added. A test in test_cherokee fixed. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@54383 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 10 ++++++ enc/unicode.c | 15 +++++--- enc/unicode/case-folding.rb | 73 +++++++++++++++++++++++++------------- enc/unicode/casefold.h | 18 ++++------ test/ruby/enc/test_case_mapping.rb | 17 +++++---- 5 files changed, 85 insertions(+), 48 deletions(-) diff --git a/ChangeLog b/ChangeLog index 9a40fc09a2..19e575973d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +Tue Mar 29 16:53:44 2016 Martin Duerst + + * enc/unicode/case-folding.rb, casefold.h: Tweaked handling of 6 + special cases in CaseUnfold_11_Table. + + * enc/unicode.c: Adjustments for above. + + * test/ruby/enc/test_case_mapping.rb: Tests for the above: Some tests in + test_titlecase activated; test_greek added. A test in test_cherokee fixed. + Tue Mar 29 13:31:00 2016 Martin Duerst * enc/unicode.c: Cleaned up some comments. diff --git a/enc/unicode.c b/enc/unicode.c index 87ebb0d8a8..eebf060dd9 100644 --- a/enc/unicode.c +++ b/enc/unicode.c @@ -750,12 +750,17 @@ onigenc_unicode_case_map(OnigCaseFoldType* flagP, } } else if ((folded = onigenc_unicode_unfold1_lookup(code)) != 0) { /* data about character found in CaseUnfold_11_Table */ - if (flags&OnigCaseFoldFlags(folded->n)) { - int count = OnigCodePointCount(folded->n); - const OnigCodePoint *next = folded->code; + if (flags&OnigCaseFoldFlags(folded->n)) { /* needs and data availability match */ MODIFIED; - if (count==1) - code = *next; + if (flags&OnigCaseFoldFlags(folded->n)&ONIGENC_CASE_TITLECASE) + code = folded->code[1]; + else + code = folded->code[0]; + } + else if ((flags&(ONIGENC_CASE_UPCASE)) + && (code==0x03B9||code==0x03BC)) { /* GREEK SMALL LETTERs IOTA/MU */ + MODIFIED; + code = folded->code[1]; } } } diff --git a/enc/unicode/case-folding.rb b/enc/unicode/case-folding.rb index d3738be4e8..2df430185f 100755 --- a/enc/unicode/case-folding.rb +++ b/enc/unicode/case-folding.rb @@ -230,38 +230,61 @@ class CaseMapping def flags(from, type, to) # types: CaseFold_11, CaseUnfold_11, CaseUnfold_12, CaseUnfold_13 flags = "" - flags += '|F' if type=='CaseFold_11' from = Array(from).map {|i| "%04X" % i}.join(" ") to = Array(to).map {|i| "%04X" % i}.join(" ") - to = to.split(/ /).first if type=='CaseUnfold_11' item = @mappings[from] - if item - flags += '|U' if to==item.upper - flags += '|D' if to==item.lower - specials_index = nil - specials = [] - unless item.upper == item.title - if item.code == item.title - flags += '|IT' - else - flags += '|ST' - specials << item.title + specials_index = nil + specials = [] + case type + when 'CaseFold_11' + flags += '|F' + if item + flags += '|U' if to==item.upper + flags += '|D' if to==item.lower + unless item.upper == item.title + if item.code == item.title + flags += '|IT' + else + flags += '|ST' + specials << item.title + end + end + unless item.lower.nil? or item.lower==from or item.lower==to + specials << item.lower + flags += '|SL' + end + unless item.upper.nil? or item.upper==from or item.upper==to + specials << item.upper + flags += '|SU' end end - unless item.lower.nil? or item.lower==from or item.lower==to - specials << item.lower - flags += '|SL' - end - unless item.upper.nil? or item.upper==from or item.upper==to - specials << item.upper - flags += '|SU' - end - if specials.first - flags += "|I(#{@specials_length})" - @specials_length += specials.map { |s| s.split(/ /).length }.reduce(:+) - @specials << specials + when 'CaseUnfold_11' + to = to.split(/ /) + if item + case to.first + when item.upper then flags += '|U' + when item.lower then flags += '|D' + else + unless from=='03B9' or from=='03BC' + warn 'Unpredicted case 0; check data or adjust program (enc/unicode/case_folding.rb).' + end + end + unless item.upper == item.title + if item.code == item.title + warn 'Unpredicted case 1; check data or adjust program (enc/unicode/case_folding.rb).' + elsif item.title==to[1] + flags += '|ST' + else + warn 'Unpredicted case 2; check data or adjust program (enc/unicode/case_folding.rb).' + end + end end end + unless specials.empty? + flags += "|I(#{@specials_length})" + @specials_length += specials.map { |s| s.split(/ /).length }.reduce(:+) + @specials << specials + end flags end diff --git a/enc/unicode/casefold.h b/enc/unicode/casefold.h index c6c5d0d387..27beb5469c 100644 --- a/enc/unicode/casefold.h +++ b/enc/unicode/casefold.h @@ -3298,9 +3298,9 @@ static const CaseUnfold_11_Type CaseUnfold_11_Table[] = { {0x01b9, {1|U, {0x01b8}}}, {0x01bd, {1|U, {0x01bc}}}, {0x01bf, {1|U, {0x01f7}}}, - {0x01c6, {2|U|ST|I(347), {0x01c4, 0x01c5}}}, - {0x01c9, {2|U|ST|I(348), {0x01c7, 0x01c8}}}, - {0x01cc, {2|U|ST|I(349), {0x01ca, 0x01cb}}}, + {0x01c6, {2|U|ST, {0x01c4, 0x01c5}}}, + {0x01c9, {2|U|ST, {0x01c7, 0x01c8}}}, + {0x01cc, {2|U|ST, {0x01ca, 0x01cb}}}, {0x01ce, {1|U, {0x01cd}}}, {0x01d0, {1|U, {0x01cf}}}, {0x01d2, {1|U, {0x01d1}}}, @@ -3319,7 +3319,7 @@ static const CaseUnfold_11_Type CaseUnfold_11_Table[] = { {0x01eb, {1|U, {0x01ea}}}, {0x01ed, {1|U, {0x01ec}}}, {0x01ef, {1|U, {0x01ee}}}, - {0x01f3, {2|U|ST|I(350), {0x01f1, 0x01f2}}}, + {0x01f3, {2|U|ST, {0x01f1, 0x01f2}}}, {0x01f5, {1|U, {0x01f4}}}, {0x01f9, {1|U, {0x01f8}}}, {0x01fb, {1|U, {0x01fa}}}, @@ -3412,10 +3412,10 @@ static const CaseUnfold_11_Type CaseUnfold_11_Table[] = { {0x03b6, {1|U, {0x0396}}}, {0x03b7, {1|U, {0x0397}}}, {0x03b8, {3|U, {0x0398, 0x03d1, 0x03f4}}}, - {0x03b9, {3|SU|I(351), {0x0345, 0x0399, 0x1fbe}}}, + {0x03b9, {3, {0x0345, 0x0399, 0x1fbe}}}, {0x03ba, {2|U, {0x039a, 0x03f0}}}, {0x03bb, {1|U, {0x039b}}}, - {0x03bc, {2|SU|I(352), {0x00b5, 0x039c}}}, + {0x03bc, {2, {0x00b5, 0x039c}}}, {0x03bd, {1|U, {0x039d}}}, {0x03be, {1|U, {0x039e}}}, {0x03bf, {1|U, {0x039f}}}, @@ -6371,10 +6371,4 @@ OnigCodePoint CaseMappingSpecials[] = { L(2)|0x0544, 0x056B, L(2)|0x0544, 0x053B, L(2)|0x054E, 0x0576, L(2)|0x054E, 0x0546, L(2)|0x0544, 0x056D, L(2)|0x0544, 0x053D, - L(1)|0x01C5, - L(1)|0x01C8, - L(1)|0x01CB, - L(1)|0x01F2, - L(1)|0x0399, - L(1)|0x039C, }; diff --git a/test/ruby/enc/test_case_mapping.rb b/test/ruby/enc/test_case_mapping.rb index 5d028d9186..b52d86b57a 100644 --- a/test/ruby/enc/test_case_mapping.rb +++ b/test/ruby/enc/test_case_mapping.rb @@ -74,7 +74,7 @@ class TestCaseMappingPreliminary < Test::Unit::TestCase check_downcase_properties "\uab70\uab71\uab72\uab73\uab74\uab75\uab76\uab77\uab78\uab79", 'ᎠᎡᎢᎣᎤᎥᎦᎧᎨᎩ', :lithuanian check_upcase_properties 'ᎠᎡᎢᎣᎤᎥᎦᎧᎨᎩ', "\uab70\uab71\uab72\uab73\uab74\uab75\uab76\uab77\uab78\uab79", :lithuanian check_capitalize_suffixes "\uab70\uab71\uab72\uab73\uab74\uab75\uab76\uab77\uab78\uab79", 'ᎠᎡᎢᎣᎤᎥᎦᎧᎨᎩ' - assert_equal 'ᎠᎡᎢᎣᎤᎥᎦᎧᎨᎩ', 'ᎠᎡᎢᎣᎤᎥᎦᎧᎨᎩ', :fold + assert_equal 'ᎠᎡᎢᎣᎤᎥᎦᎧᎨᎩ', 'ᎠᎡᎢᎣᎤᎥᎦᎧᎨᎩ'.downcase(:fold) assert_equal 'ᎠᎡᎢᎣᎤᎥᎦᎧᎨᎩ', "\uab70\uab71\uab72\uab73\uab74\uab75\uab76\uab77\uab78\uab79".downcase(:fold) end @@ -82,15 +82,15 @@ class TestCaseMappingPreliminary < Test::Unit::TestCase check_downcase_properties 'dz dž lj nj', 'Dz Dž Lj Nj', :lithuanian check_downcase_properties 'dz dž lj nj', 'DZ DŽ LJ NJ', :lithuanian check_upcase_properties 'DZ DŽ LJ NJ', 'Dz Dž Lj Nj', :lithuanian - # check_upcase_properties 'DZ DŽ LJ NJ', 'dz dž lj nj', :lithuanian + check_upcase_properties 'DZ DŽ LJ NJ', 'dz dž lj nj', :lithuanian check_capitalize_properties 'Dz', 'DZ', :lithuanian check_capitalize_properties 'Dž', 'DŽ', :lithuanian check_capitalize_properties 'Lj', 'LJ', :lithuanian check_capitalize_properties 'Nj', 'NJ', :lithuanian - # check_capitalize_properties 'Dz', 'dz', :lithuanian - # check_capitalize_properties 'Dž', 'dž', :lithuanian - # check_capitalize_properties 'Lj', 'lj', :lithuanian - # check_capitalize_properties 'Nj', 'nj', :lithuanian + check_capitalize_properties 'Dz', 'dz', :lithuanian + check_capitalize_properties 'Dž', 'dž', :lithuanian + check_capitalize_properties 'Lj', 'lj', :lithuanian + check_capitalize_properties 'Nj', 'nj', :lithuanian end def test_ascii_option @@ -116,6 +116,11 @@ class TestCaseMappingPreliminary < Test::Unit::TestCase check_downcase_properties "yuki\u0307hi\u0307ro matsumoto (matz)", 'YUKİHİRO MATSUMOTO (MATZ)', :lithuanian end + def test_greek + check_downcase_properties 'αβγδεζηθικλμνξοπρστυφχψω', 'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ', :lithuanian + check_upcase_properties 'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ', 'αβγδεζηθικλμνξοπρστυφχψω', :lithuanian + end + def no_longer_a_test_buffer_allocations assert_equal 'TURKISH*ı'*10, ('I'*10).downcase(:turkic, :lithuanian) assert_equal 'TURKISH*ı'*100, ('I'*100).downcase(:turkic, :lithuanian) -- cgit v1.2.3