From c12af76763a1bff53ed77bc4d236f441d8679880 Mon Sep 17 00:00:00 2001 From: duerst Date: Sat, 16 Jan 2016 08:24:58 +0000 Subject: * enc/unicode.c: Artificial mapping to test buffer expansion code. * string.c: Fixed buffer expansion logic. * test/ruby/enc/test_case_mapping.rb: Tests for above. (with Kimihito Matsui) git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53554 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 8 ++++++++ enc/unicode.c | 21 ++++++++++++++++++--- string.c | 12 +++++++++++- test/ruby/enc/test_case_mapping.rb | 14 +++++++++++++- 4 files changed, 50 insertions(+), 5 deletions(-) diff --git a/ChangeLog b/ChangeLog index f3427789c1..5aeb27a079 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +Sat Jan 16 17:24:24 2016 Martin Duerst + + * enc/unicode.c: Artificial mapping to test buffer expansion code. + * string.c: Fixed buffer expansion logic. + * test/ruby/enc/test_case_mapping.rb: Tests for above. + (with Kimihito Matsui) + Sat Jan 16 16:47:14 2016 SHIBATA Hiroshi * ext/openssl/lib/openssl/pkey.rb: Added 2048 bit DH parameter. @@ -24,6 +31,7 @@ Sat Jan 16 10:23:23 2016 Martin Duerst option to avoid accidental problems in daily use. * test/ruby/enc/test_case_mapping.rb: Test for above. * string.c: function 'check_case_options': fixed logical errors + (with Kimihito Matsui) Fri Jan 15 20:20:20 2016 Naohisa Goto diff --git a/enc/unicode.c b/enc/unicode.c index 365283e591..e877c99925 100644 --- a/enc/unicode.c +++ b/enc/unicode.c @@ -610,13 +610,14 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc, be duplicated here (and in string.c), but we'll wait for this because we want this to become a primitive anyway. 
*/ extern int -onigenc_unicode_case_map(OnigCaseFoldType* flags, +onigenc_unicode_case_map(OnigCaseFoldType* flagP, const OnigUChar** pp, const OnigUChar* end, OnigUChar* to, OnigUChar* to_end, const struct OnigEncodingTypeST* enc) { OnigCodePoint code; OnigUChar *to_start = to; + OnigCaseFoldType flags = *flagP; to_end -= CASE_MAPPING_SLACK; /* hopelessly preliminary implementation, just dealing with ASCII, @@ -624,11 +625,25 @@ onigenc_unicode_case_map(OnigCaseFoldType* flags, while (*pp<end && to<=to_end) { code = ONIGENC_MBC_TO_CODE(enc, *pp, end); *pp += enclen(enc, *pp); - if (code>='A' && code<='Z') { + /* using :turcic to test buffer expansion */ + if (flags&ONIGENC_CASE_FOLD_TURKISH_AZERI && code==0x0049) { /* I */ + to += ONIGENC_CODE_TO_MBC(enc, 'T', to); + to += ONIGENC_CODE_TO_MBC(enc, 'U', to); + to += ONIGENC_CODE_TO_MBC(enc, 'R', to); + to += ONIGENC_CODE_TO_MBC(enc, 'K', to); + to += ONIGENC_CODE_TO_MBC(enc, 'I', to); + to += ONIGENC_CODE_TO_MBC(enc, 'S', to); + to += ONIGENC_CODE_TO_MBC(enc, 'H', to); + to += ONIGENC_CODE_TO_MBC(enc, '*', to); + code = 0x0131; + flags |= ONIGENC_CASE_MODIFIED; + } + else if (code>='A' && code<='Z') { code += 'a'-'A'; - *flags |= ONIGENC_CASE_MODIFIED; + flags |= ONIGENC_CASE_MODIFIED; } to += ONIGENC_CODE_TO_MBC(enc, code, to); } + *flagP = flags; return (int)(to-to_start); } diff --git a/string.c b/string.c index c78d30fe63..20c9e34e35 100644 --- a/string.c +++ b/string.c @@ -5673,6 +5673,7 @@ rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc) while (source_current < source_end) { /* increase multiplier using buffer count to converge quickly */ int capa = (int)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH; +/* fprintf(stderr, "Buffer allocation, capa is %d\n", capa); *//* for tuning */ current_buffer->next = (mapping_buffer*)ALLOC_N(char, sizeof(mapping_buffer)+capa); current_buffer = current_buffer->next; current_buffer->next = NULL; @@ -5684,13 +5685,22 @@ rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc) 
current_buffer->space+current_buffer->capa, enc); } +/* fprintf(stderr, "Buffer count is %d\n", buffer_count); *//* for tuning */ if (buffer_count==1) target = rb_str_new_with_class(source, (const char*)current_buffer->space, target_length); else { char *target_current = RSTRING_PTR(target = rb_str_new_with_class(source, 0, target_length)); - for (current_buffer=pre_buffer.next; current_buffer; current_buffer=current_buffer->next) + mapping_buffer *previous_buffer; + + current_buffer=pre_buffer.next; + while (current_buffer) { memcpy(target_current, current_buffer->space, current_buffer->used); + target_current += current_buffer->used; + previous_buffer = current_buffer; + current_buffer=current_buffer->next; + xfree(previous_buffer); + } } /* TODO: check about string terminator character */ diff --git a/test/ruby/enc/test_case_mapping.rb b/test/ruby/enc/test_case_mapping.rb index 529e86fbaa..d42b6b1cb6 100644 --- a/test/ruby/enc/test_case_mapping.rb +++ b/test/ruby/enc/test_case_mapping.rb @@ -6,6 +6,18 @@ require "test/unit" # to test new implementation strategy class TestCaseMappingPreliminary < Test::Unit::TestCase def test_case_mapping_preliminary - assert_equal "yukihiro matsumoto (matz)", "Yukihiro MATSUMOTO (MATZ)".downcase(:lithuanian) + assert_equal 'yukihiro matsumoto (matz)', + 'Yukihiro MATSUMOTO (MATZ)'.downcase(:lithuanian) + assert_equal 'matsumoto yukTURKISH*ıhTURKISH*ıro (matz)', + 'MATSUMOTO YUKIHIRO (MATZ)'.downcase(:turkic, :lithuanian) + end + + def test_buffer_allocations + assert_equal 'TURKISH*ı'*10, ('I'*10).downcase(:turkic, :lithuanian) + assert_equal 'TURKISH*ı'*100, ('I'*100).downcase(:turkic, :lithuanian) + assert_equal 'TURKISH*ı'*1_000, ('I'*1_000).downcase(:turkic, :lithuanian) + assert_equal 'TURKISH*ı'*10_000, ('I'*10_000).downcase(:turkic, :lithuanian) + assert_equal 'TURKISH*ı'*100_000, ('I'*100_000).downcase(:turkic, :lithuanian) + assert_equal 'TURKISH*ı'*1_000_000, ('I'*1_000_000).downcase(:turkic, :lithuanian) end end -- 
cgit v1.2.3