From 959bbb6f7202676f2da1ef5e134e6152e8613b54 Mon Sep 17 00:00:00 2001 From: duerst Date: Sun, 17 Jan 2016 08:42:16 +0000 Subject: * enc/unicode.c: Removed artificial expansion for Turkic, added hand-coded support for Turkic, fixed logic for swapcase. * string.c: Made use of new case mapping code possible from upcase, capitalize, and swapcase (with :lithuanian as a guard). * test/ruby/enc/test_case_mapping.rb: Adjusted for above. (with Kimihito Matsui) git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53562 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- enc/unicode.c | 61 +++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 20 deletions(-) (limited to 'enc') diff --git a/enc/unicode.c b/enc/unicode.c index e877c99925..e61611801c 100644 --- a/enc/unicode.c +++ b/enc/unicode.c @@ -606,9 +606,7 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, /* length in bytes for three characters in UTF-32; e.g. needed for ffi (U+FB03) */ #define CASE_MAPPING_SLACK 12 -/* The following declaration should be moved to an include file rather than - be duplicated here (and in string.c), but we'll wait for this because we - want this to become a primitive anyway. 
*/ +#define MODIFIED (flags |= ONIGENC_CASE_MODIFIED) extern int onigenc_unicode_case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, const OnigUChar* end, @@ -620,29 +618,52 @@ onigenc_unicode_case_map(OnigCaseFoldType* flagP, OnigCaseFoldType flags = *flagP; to_end -= CASE_MAPPING_SLACK; - /* hopelessly preliminary implementation, just dealing with ASCII, - * and just for downcase */ + /* hopelessly preliminary implementation, just dealing with ASCII and Turkic */ while (*pp<end [... the rest of the loop condition and the diff lines that followed it, up to the start of the lowercase-ASCII test, were lost when this page was extracted ...] code>='a' && code<='z') { + if (flags&ONIGENC_CASE_UPCASE) { + if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code==0x0069) /* i → İ */ + code = 0x0130; + else + code += 'A'-'a'; + MODIFIED; + } + } + else if (code>='A' && code<='Z') { + if (flags&ONIGENC_CASE_DOWNCASE) { + if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code==0x0049) /* I → ı */ + code = 0x0131; + else + code += 'a'-'A'; + MODIFIED; + } + } } - else if (code>='A' && code<='Z') { - code += 'a'-'A'; - flags |= ONIGENC_CASE_MODIFIED; + else if (code>=0x00C0) { /* deal with non-ASCII; nothing relevant below U+00C0 */ + if (code==0x0130) { /* İ → i */ + if (flags&ONIGENC_CASE_UPCASE) { + if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI) + code = 0x0069; + else { /* make dot above explicit */ + to += ONIGENC_CODE_TO_MBC(enc, 0x0069, to); + code = 0x0307; /* dot above */ + } + MODIFIED; + } + } + /* the following case can be removed once we rely on data, + * because the mapping is always the same */ + else if (code==0x0131 && flags&ONIGENC_CASE_UPCASE) { /* ı → I */ + code = 0x0049; MODIFIED; + } } to += ONIGENC_CODE_TO_MBC(enc, code, to); + /* switch from titlecase to lowercase for capitalize */ + if (flags & ONIGENC_CASE_TITLECASE) + flags ^= (ONIGENC_CASE_UPCASE|ONIGENC_CASE_TITLECASE|ONIGENC_CASE_DOWNCASE); } *flagP = flags; return (int)(to-to_start); -- cgit v1.2.3