aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorduerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2016-01-16 01:24:03 +0000
committerduerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2016-01-16 01:24:03 +0000
commitbe897c2507a9d7710f218ccf377e6ea67d6d47bf (patch)
tree1ced89fb677d813b33a1b30db54d141934c88299
parent2fd11c760ca2f903092c461566bd522636ea45cc (diff)
downloadruby-be897c2507a9d7710f218ccf377e6ea67d6d47bf.tar.gz
* string.c, enc/unicode.c: New code path as a preparation for Unicode-wide
  case mapping. The code path is currently guarded by the :lithuanian
  option to avoid accidental problems in daily use.
* test/ruby/enc/test_case_mapping.rb: Test for above.
* string.c: function 'check_case_options': fixed logical errors

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@53548 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r--ChangeLog10
-rw-r--r--enc/unicode.c29
-rw-r--r--string.c88
-rw-r--r--test/ruby/enc/test_case_mapping.rb11
4 files changed, 129 insertions, 9 deletions
diff --git a/ChangeLog b/ChangeLog
index 5b448df4e0..e7e74556f9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+Sat Jan 16 10:23:23 2016 Martin Duerst <duerst@it.aoyama.ac.jp>
+
+ * string.c, enc/unicode.c: New code path as a preparation for Unicode-wide
+ case mapping. The code path is currently guarded by the :lithuanian
+ option to avoid accidental problems in daily use.
+ * test/ruby/enc/test_case_mapping.rb: Test for above.
+ * string.c: function 'check_case_options': fixed logical errors
+
Fri Jan 15 20:20:20 2016 Naohisa Goto <ngotogenome@gmail.com>
* regint.h (PLATFORM_UNALIGNED_WORD_ACCESS): The value of
@@ -8,7 +16,7 @@ Fri Jan 15 20:20:20 2016 Naohisa Goto <ngotogenome@gmail.com>
Fri Jan 15 16:12:10 2016 Nobuyoshi Nakada <nobu@ruby-lang.org>
- * parse.y (string1): reset heredoc indent fore each string leteral
+ * parse.y (string1): reset heredoc indent for each string literal
so that concatenated string would not be dedented.
[ruby-core:72857] [Bug #11990]
diff --git a/enc/unicode.c b/enc/unicode.c
index 9c0b326d0b..2f45f2f88c 100644
--- a/enc/unicode.c
+++ b/enc/unicode.c
@@ -603,3 +603,32 @@ onigenc_unicode_get_case_fold_codes_by_str(OnigEncoding enc,
return n;
}
+
+/* Headroom, in bytes, kept free at the end of the output buffer: enough for
+   three characters in UTF-32; e.g. needed for ffi (U+FB03). */
+#define CASE_MAPPING_SLACK 12
+/* This function is also declared by hand in string.c.  The declaration
+   belongs in an include file, but that is postponed because the function
+   is meant to become a primitive anyway. */
+extern int
+onigenc_unicode_case_map(OnigCaseFoldType* flags,
+                         const OnigUChar** pp, const OnigUChar* end,
+                         OnigUChar* to, OnigUChar* to_end,
+                         const struct OnigEncodingTypeST* enc)
+{
+    /* Start of the output, kept to compute the number of bytes written. */
+    OnigUChar *const out_begin = to;
+    /* Last position at which another character may still be started. */
+    const OnigUChar *const out_limit = to_end - CASE_MAPPING_SLACK;
+
+    /* hopelessly preliminary implementation, just dealing with ASCII,
+     * and just for downcase */
+    for (;;) {
+        OnigCodePoint ch;
+
+        /* Stop when the input is consumed or the output headroom is reached. */
+        if (*pp >= end || to > out_limit)
+            break;
+
+        /* Decode one character, then advance the caller's input pointer. */
+        ch = ONIGENC_MBC_TO_CODE(enc, *pp, end);
+        *pp += enclen(enc, *pp, end);
+
+        if ('A' <= ch && ch <= 'Z') {
+            ch = ch + ('a' - 'A');
+            /* Tell the caller that at least one character was changed. */
+            *flags |= ONIGENC_CASE_MODIFIED;
+        }
+        to += ONIGENC_CODE_TO_MBC(enc, ch, to);
+    }
+    return to - out_begin;
+}
diff --git a/string.c b/string.c
index e4b323db3f..895ec37b47 100644
--- a/string.c
+++ b/string.c
@@ -5600,19 +5600,19 @@ check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
if (argc>2)
rb_raise(rb_eArgError, "too many options");
if (argv[0]==sym_turkic) {
- flags &= ONIGENC_CASE_FOLD_TURKISH_AZERI;
+ flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
if (argc==2) {
if (argv[1]==sym_lithuanian)
- flags &= ONIGENC_CASE_FOLD_LITHUANIAN;
+ flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
else
rb_raise(rb_eArgError, "invalid second option");
}
}
else if (argv[0]==sym_lithuanian) {
- flags &= ONIGENC_CASE_FOLD_LITHUANIAN;
+ flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
if (argc==2) {
if (argv[1]==sym_turkic)
- flags &= ONIGENC_CASE_FOLD_TURKISH_AZERI;
+ flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
else
rb_raise(rb_eArgError, "invalid second option");
}
@@ -5620,10 +5620,10 @@ check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
else if (argc>1)
rb_raise(rb_eArgError, "too many options");
else if (argv[0]==sym_ascii)
- flags &= ONIGENC_CASE_ASCII_ONLY;
+ flags |= ONIGENC_CASE_ASCII_ONLY;
else if (argv[0]==sym_fold) {
if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
- flags &= ONIGENC_CASE_FOLD;
+ flags |= ONIGENC_CASE_FOLD;
else
rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
}
@@ -5632,6 +5632,75 @@ check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
return flags;
}
+/* The following declaration should be moved to an include file rather than
+ be duplicated here (and in enc/unicode.c), but we'll wait for this because
+ we want this to become a primitive anyway. */
+extern int
+onigenc_unicode_case_map(OnigCaseFoldType* flag,
+ const OnigUChar** pp,
+ const OnigUChar* end,
+ OnigUChar* to,
+ OnigUChar* to_end,
+ const struct OnigEncodingTypeST* enc);
+
+/* 20 extra bytes per buffer should be long enough to absorb any kind of single character length increase */
+#define CASE_MAPPING_ADDITIONAL_LENGTH 20
+
+struct mapping_buffer; /* one chunk of case-mapped output; chunks form a singly linked list */
+typedef struct mapping_buffer {
+ size_t capa; /* number of bytes available in space */
+ size_t used; /* number of bytes of space actually written */
+ struct mapping_buffer *next; /* following chunk, or NULL for the last one */
+ OnigUChar space[0]; /* output bytes; storage is allocated together with the header */
+} mapping_buffer;
+
+/*
+ * Case-maps source with onigenc_unicode_case_map() and returns the result
+ * as a new string created by rb_str_new_with_class(source, ...).
+ *
+ * source: string to map; an empty string is simply duplicated.
+ * flags:  in/out case mapping flags, handed to onigenc_unicode_case_map(),
+ *         which may update them.
+ * enc:    encoding handed to onigenc_unicode_case_map().
+ *
+ * The final length is not known in advance, so the output is collected in a
+ * linked list of heap buffers.  Each buffer is released with xfree() as soon
+ * as its contents have been copied into the result string.
+ */
+static VALUE
+rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
+{
+    VALUE target;
+
+    OnigUChar *source_current, *source_end;
+    int target_length = 0;
+    mapping_buffer pre_buffer, /* only next pointer used */
+                   *current_buffer = &pre_buffer;
+    int buffer_count = 0;
+
+    if (RSTRING_LEN(source) == 0) return rb_str_dup(source);
+
+    source_current = (OnigUChar*)RSTRING_PTR(source);
+    source_end = (OnigUChar*)RSTRING_END(source);
+
+    /* Each pass maps as much input as fits into one freshly allocated buffer. */
+    while (source_current < source_end) {
+        /* increase multiplier using buffer count to converge quickly */
+        int capa = (source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
+        current_buffer->next = (mapping_buffer*)ALLOC_N(char, sizeof(mapping_buffer)+capa);
+        current_buffer = current_buffer->next;
+        current_buffer->next = NULL;
+        current_buffer->capa = capa;
+        current_buffer->used
+            = onigenc_unicode_case_map(flags,
+                                       (const OnigUChar**)&source_current, source_end,
+                                       current_buffer->space,
+                                       current_buffer->space+current_buffer->capa,
+                                       enc);
+        target_length += current_buffer->used;
+    }
+
+    if (buffer_count==1) {
+        /* Single buffer: build the result directly from it, then release it. */
+        target = rb_str_new_with_class(source, (const char*)current_buffer->space, target_length);
+        xfree(current_buffer);
+    }
+    else {
+        char *target_current;
+        mapping_buffer *finished_buffer;
+
+        target = rb_str_new_with_class(source, 0, target_length);
+        target_current = RSTRING_PTR(target);
+        current_buffer = pre_buffer.next;
+        while (current_buffer) {
+            memcpy(target_current, current_buffer->space, current_buffer->used);
+            /* Advance the write position so the next chunk is appended
+               instead of overwriting the start of the result. */
+            target_current += current_buffer->used;
+            /* Read the link before freeing the buffer that holds it. */
+            finished_buffer = current_buffer;
+            current_buffer = current_buffer->next;
+            xfree(finished_buffer);
+        }
+    }
+
+    /* TODO: check about string terminator character */
+    OBJ_INFECT_RAW(target, source);
+    str_enc_copy(target, source);
+    /*ENC_CODERANGE_SET(mapped, cr);*/
+
+    return target;
+}
+
/*
* call-seq:
* str.upcase! -> str or nil
@@ -5716,7 +5785,6 @@ rb_str_upcase(int argc, VALUE *argv, VALUE str)
return str;
}
-
/*
* call-seq:
* str.downcase! -> str or nil
@@ -5739,7 +5807,11 @@ rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
enc = STR_ENC_GET(str);
rb_str_check_dummy_enc(enc);
s = RSTRING_PTR(str); send = RSTRING_END(str);
- if (single_byte_optimizable(str)) {
+ if (/*enc==rb_utf8_encoding() &&*/ flags&ONIGENC_CASE_FOLD_LITHUANIAN) { /* lithuanian temporarily used as a guard for debugging */
+ str_shared_replace(str, rb_str_casemap(str, &flags, enc));
+ modify = ONIGENC_CASE_MODIFIED & flags;
+ }
+ else if (single_byte_optimizable(str)) {
while (s < send) {
unsigned int c = *(unsigned char*)s;
diff --git a/test/ruby/enc/test_case_mapping.rb b/test/ruby/enc/test_case_mapping.rb
new file mode 100644
index 0000000000..529e86fbaa
--- /dev/null
+++ b/test/ruby/enc/test_case_mapping.rb
@@ -0,0 +1,11 @@
+# Copyright © 2016 Kimihito Matsui (松井 仁人) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
+
+require "test/unit"
+
+# preliminary tests, using :lithuanian as a guard
+# to test new implementation strategy
+class TestCaseMappingPreliminary < Test::Unit::TestCase # exercises the code path guarded by :lithuanian
+ def test_case_mapping_preliminary
+ assert_equal "yukihiro matsumoto (matz)", "Yukihiro MATSUMOTO (MATZ)".downcase(:lithuanian) # ASCII-only input; letters are downcased, other characters unchanged
+ end
+end