diff options
author | duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2016-01-17 08:42:16 +0000 |
---|---|---|
committer | duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2016-01-17 08:42:16 +0000 |
commit | 959bbb6f7202676f2da1ef5e134e6152e8613b54 (patch) | |
tree | 4a37adcb5edd3d2cc60a0e9a87ac107614babd79 | |
parent | 0bc53416909fe4470b9cac34072b0b3c555218a3 (diff) | |
download | ruby-959bbb6f7202676f2da1ef5e134e6152e8613b54.tar.gz |
* enc/unicode.c: Removed artificial expansion for Turkic,
added hand-coded support for Turkic, fixed logic for swapcase.
* string.c: Made use of new case mapping code possible from upcase,
capitalize, and swapcase (with :lithuanian as a guard).
* test/ruby/enc/test_case_mapping.rb: Adjusted for above.
(with Kimihito Matsui)
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53562 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- | ChangeLog | 9 |
-rw-r--r-- | enc/unicode.c | 61 |
-rw-r--r-- | string.c | 42 |
-rw-r--r-- | test/ruby/enc/test_case_mapping.rb | 19 |
4 files changed, 92 insertions, 39 deletions
@@ -1,3 +1,12 @@ +Sun Jan 17 17:41:41 2016 Martin Duerst <duerst@it.aoyama.ac.jp> + + * enc/unicode.c: Removed artificial expansion for Turkic, + added hand-coded support for Turkic, fixed logic for swapcase. + * string.c: Made use of new case mapping code possible from upcase, + capitalize, and swapcase (with :lithuanian as a guard). + * test/ruby/enc/test_case_mapping.rb: Adjusted for above. + (with Kimihito Matsui) + Sun Jan 17 15:30:57 2016 Nobuyoshi Nakada <nobu@ruby-lang.org> * ext/socket/option.c (sockopt_bool): relax boolean size to be one diff --git a/enc/unicode.c b/enc/unicode.c index e877c99925..e61611801c 100644 --- a/enc/unicode.c +++ b/enc/unicode.c @@ -606,9 +606,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, /* length in bytes for three characters in UTF-32; e.g. needed for ffi (U+FB03) */ #define CASE_MAPPING_SLACK 12 -/* The following declaration should be moved to an include file rather than - be duplicated here (and in string.c), but we'll wait for this because we - want this to become a primitive anyway. 
*/ +#define MODIFIED (flags |= ONIGENC_CASE_MODIFIED) extern int onigenc_unicode_case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, const OnigUChar* end, @@ -620,29 +618,52 @@ onigenc_unicode_case_map(OnigCaseFoldType* flagP, OnigCaseFoldType flags = *flagP; to_end -= CASE_MAPPING_SLACK; - /* hopelessly preliminary implementation, just dealing with ASCII, - * and just for downcase */ + /* hopelessly preliminary implementation, just dealing with ASCII and Turkic */ while (*pp<end && to<=to_end) { code = ONIGENC_MBC_TO_CODE(enc, *pp, end); *pp += enclen(enc, *pp, end); - /* using :turcic to test buffer expansion */ - if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code==0x0049) { /* I */ - to += ONIGENC_CODE_TO_MBC(enc, 'T', to); - to += ONIGENC_CODE_TO_MBC(enc, 'U', to); - to += ONIGENC_CODE_TO_MBC(enc, 'R', to); - to += ONIGENC_CODE_TO_MBC(enc, 'K', to); - to += ONIGENC_CODE_TO_MBC(enc, 'I', to); - to += ONIGENC_CODE_TO_MBC(enc, 'S', to); - to += ONIGENC_CODE_TO_MBC(enc, 'H', to); - to += ONIGENC_CODE_TO_MBC(enc, '*', to); - code = 0x0131; - flags |= ONIGENC_CASE_MODIFIED; + if (code<='z') { /* ASCII comes first */ + if (code>='a' && code<='z') { + if (flags&ONIGENC_CASE_UPCASE) { + if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code==0x0069) /* i → İ */ + code = 0x0130; + else + code += 'A'-'a'; + MODIFIED; + } + } + else if (code>='A' && code<='Z') { + if (flags&ONIGENC_CASE_DOWNCASE) { + if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code==0x0049) /* I → ı */ + code = 0x0131; + else + code += 'a'-'A'; + MODIFIED; + } + } } - else if (code>='A' && code<='Z') { - code += 'a'-'A'; - flags |= ONIGENC_CASE_MODIFIED; + else if (code>=0x00C0) { /* deal with non-ASCII; nothing relevant below U+00C0 */ + if (code==0x0130) { /* İ → i */ + if (flags&ONIGENC_CASE_UPCASE) { + if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI) + code = 0x0069; + else { /* make dot above explicit */ + to += ONIGENC_CODE_TO_MBC(enc, 0x0069, to); + code = 0x0307; /* dot above */ + } + MODIFIED; + } + } + 
/* the following case can be removed once we rely on data, + * because the mapping is always the same */ + else if (code==0x0131 && flags&ONIGENC_CASE_UPCASE) { /* ı → I */ + code = 0x0049; MODIFIED; + } } to += ONIGENC_CODE_TO_MBC(enc, code, to); + /* switch from titlecase to lowercase for capitalize */ + if (flags & ONIGENC_CASE_TITLECASE) + flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_TITLECASE|ONIGENC_CASE_DOWNCASE); } *flagP = flags; return (int)(to-to_start); @@ -5734,7 +5734,11 @@ rb_str_upcase_bang(int argc, VALUE *argv, VALUE str) enc = STR_ENC_GET(str); rb_str_check_dummy_enc(enc); s = RSTRING_PTR(str); send = RSTRING_END(str); - if (single_byte_optimizable(str)) { + if (enc==rb_utf8_encoding() && flags&ONIGENC_CASE_FOLD_LITHUANIAN) { /* lithuanian temporarily used as a guard for debugging */ + str_shared_replace(str, rb_str_casemap(str, &flags, enc)); + modify = ONIGENC_CASE_MODIFIED & flags; + } + else if (single_byte_optimizable(str)) { while (s < send) { unsigned int c = *(unsigned char*)s; @@ -5817,7 +5821,7 @@ rb_str_downcase_bang(int argc, VALUE *argv, VALUE str) enc = STR_ENC_GET(str); rb_str_check_dummy_enc(enc); s = RSTRING_PTR(str); send = RSTRING_END(str); - if (/*enc==rb_utf8_encoding() &&*/ flags&ONIGENC_CASE_FOLD_LITHUANIAN) { /* lithuanian temporarily used as a guard for debugging */ + if (enc==rb_utf8_encoding() && flags&ONIGENC_CASE_FOLD_LITHUANIAN) { /* lithuanian temporarily used as a guard for debugging */ str_shared_replace(str, rb_str_casemap(str, &flags, enc)); modify = ONIGENC_CASE_MODIFIED & flags; } @@ -5906,29 +5910,33 @@ rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str) int modify = 0; unsigned int c; int n; - OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | - ONIGENC_CASE_TITLECASE | ONIGENC_CASE_ONCEONLY; + OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE; flags = check_case_options(argc, argv, flags); str_modify_keep_cr(str); enc = STR_ENC_GET(str); rb_str_check_dummy_enc(enc); if (RSTRING_LEN(str) 
== 0 || !RSTRING_PTR(str)) return Qnil; - s = RSTRING_PTR(str); send = RSTRING_END(str); - - c = rb_enc_codepoint_len(s, send, &n, enc); - if (rb_enc_islower(c, enc)) { - rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); - modify = 1; + if (enc==rb_utf8_encoding() && flags&ONIGENC_CASE_FOLD_LITHUANIAN) { /* lithuanian temporarily used as a guard for debugging */ + str_shared_replace(str, rb_str_casemap(str, &flags, enc)); + modify = ONIGENC_CASE_MODIFIED & flags; } - s += n; - while (s < send) { + else { + s = RSTRING_PTR(str); send = RSTRING_END(str); c = rb_enc_codepoint_len(s, send, &n, enc); - if (rb_enc_isupper(c, enc)) { - rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); + if (rb_enc_islower(c, enc)) { + rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); modify = 1; } s += n; + while (s < send) { + c = rb_enc_codepoint_len(s, send, &n, enc); + if (rb_enc_isupper(c, enc)) { + rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); + modify = 1; + } + s += n; + } } if (modify) return str; @@ -5981,7 +5989,11 @@ rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str) enc = STR_ENC_GET(str); rb_str_check_dummy_enc(enc); s = RSTRING_PTR(str); send = RSTRING_END(str); - while (s < send) { + if (enc==rb_utf8_encoding() && flags&ONIGENC_CASE_FOLD_LITHUANIAN) { /* lithuanian temporarily used as a guard for debugging */ + str_shared_replace(str, rb_str_casemap(str, &flags, enc)); + modify = ONIGENC_CASE_MODIFIED & flags; + } + else while (s < send) { unsigned int c = rb_enc_codepoint_len(s, send, &n, enc); if (rb_enc_isupper(c, enc)) { diff --git a/test/ruby/enc/test_case_mapping.rb b/test/ruby/enc/test_case_mapping.rb index d42b6b1cb6..eb36d7d665 100644 --- a/test/ruby/enc/test_case_mapping.rb +++ b/test/ruby/enc/test_case_mapping.rb @@ -5,14 +5,25 @@ require "test/unit" # preliminary tests, using :lithuanian as a guard # to test new implementation strategy class TestCaseMappingPreliminary < Test::Unit::TestCase - def test_case_mapping_preliminary + def test_ascii assert_equal 'yukihiro 
matsumoto (matz)', 'Yukihiro MATSUMOTO (MATZ)'.downcase(:lithuanian) - assert_equal 'matsumoto yukTURKISH*ıhTURKISH*ıro (matz)', - 'MATSUMOTO YUKIHIRO (MATZ)'.downcase(:turkic, :lithuanian) + assert_equal 'YUKIHIRO MATSUMOTO (MATZ)', + 'yukihiro matsumoto (matz)'.upcase(:lithuanian) + assert_equal 'Yukihiro matsumoto (matz)', + 'yukihiro MATSUMOTO (MATZ)'.capitalize(:lithuanian) + assert_equal 'yUKIHIRO matsumoto (MAtz)', + 'Yukihiro MATSUMOTO (maTZ)'.swapcase(:lithuanian) end - def test_buffer_allocations + def test_turcic + assert_equal 'yukihiro matsumoto (matz)', + 'Yukihiro MATSUMOTO (MATZ)'.downcase(:turkic, :lithuanian) + assert_equal 'YUKİHİRO MATSUMOTO (MATZ)', + 'Yukihiro Matsumoto (matz)'.upcase(:turkic, :lithuanian) + end + + def no_longer_a_test_buffer_allocations assert_equal 'TURKISH*ı'*10, ('I'*10).downcase(:turkic, :lithuanian) assert_equal 'TURKISH*ı'*100, ('I'*100).downcase(:turkic, :lithuanian) assert_equal 'TURKISH*ı'*1_000, ('I'*1_000).downcase(:turkic, :lithuanian) |