aboutsummaryrefslogtreecommitdiffstats
path: root/encoding.c
diff options
context:
space:
mode:
author: ko1 <ko1@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2015-10-29 09:10:32 +0000
committer: ko1 <ko1@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2015-10-29 09:10:32 +0000
commit: d02b10a3ea42eb17c6a8bf8ba61d557a33b0da48 (patch)
tree: 6ed8059cdc4d5ef94f8cc4b73510bdf581be3105 /encoding.c
parent: 5651a91e2c78dcb78b6c109e0d7d1a775d83d596 (diff)
download: ruby-d02b10a3ea42eb17c6a8bf8ba61d557a33b0da48.tar.gz
* encoding.c (rb_enc_check_str): add for performance.
This function only accepts T_STRING (and T_REGEXP). This patch improves the performance of a tiny_segmenter benchmark (num=2) from 2.54 sec to 2.42 sec on my machine. https://github.com/chezou/TinySegmenter.jl/blob/master/benchmark/benchmark.rb
* encoding.c: add ENC_DEBUG and ENC_ASSERT() macros.
* internal.h: add a decl. of rb_enc_check_str().
* string.c (rb_str_plus): use rb_enc_check_str().
* string.c (rb_str_subpat_set): ditto.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@52350 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'encoding.c')
-rw-r--r-- encoding.c 105
1 files changed, 77 insertions, 28 deletions
diff --git a/encoding.c b/encoding.c
index 57c89ee8ce..d11e6f6e59 100644
--- a/encoding.c
+++ b/encoding.c
@@ -15,6 +15,12 @@
#include <ctype.h>
#include "ruby/util.h"
+#include <assert.h>
+#ifndef ENC_DEBUG
+#define ENC_DEBUG 0
+#endif
+#define ENC_ASSERT(expr) do { if (ENC_DEBUG) {assert(expr);} } while (0)
+
#undef rb_ascii8bit_encindex
#undef rb_utf8_encindex
#undef rb_usascii_encindex
@@ -743,6 +749,19 @@ rb_id_encoding(void)
return id_encoding;
}
+static int
+enc_get_index_str(VALUE str)
+{
+ int i = ENCODING_GET_INLINED(str);
+ if (i == ENCODING_INLINE_MAX) {
+ VALUE iv;
+
+ iv = rb_ivar_get(str, rb_id_encoding());
+ i = NUM2INT(iv);
+ }
+ return i;
+}
+
int
rb_enc_get_index(VALUE obj)
{
@@ -758,13 +777,7 @@ rb_enc_get_index(VALUE obj)
default:
case T_STRING:
case T_REGEXP:
- i = ENCODING_GET_INLINED(obj);
- if (i == ENCODING_INLINE_MAX) {
- VALUE iv;
-
- iv = rb_ivar_get(obj, rb_id_encoding());
- i = NUM2INT(iv);
- }
+ i = enc_get_index_str(obj);
break;
case T_FILE:
tmp = rb_funcallv(obj, rb_intern("internal_encoding"), 0, 0);
@@ -842,6 +855,21 @@ rb_enc_get(VALUE obj)
return rb_enc_from_index(rb_enc_get_index(obj));
}
+static rb_encoding* enc_compatible_str(VALUE str1, VALUE str2);
+
+rb_encoding*
+rb_enc_check_str(VALUE str1, VALUE str2)
+{
+ rb_encoding *enc = enc_compatible_str(str1, str2);
+ ENC_ASSERT(TYPE(str1) == T_STRING);
+ ENC_ASSERT(TYPE(str2) == T_STRING);
+ if (!enc)
+ rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
+ rb_enc_name(rb_enc_get(str1)),
+ rb_enc_name(rb_enc_get(str2)));
+ return enc;
+}
+
rb_encoding*
rb_enc_check(VALUE str1, VALUE str2)
{
@@ -853,40 +881,28 @@ rb_enc_check(VALUE str1, VALUE str2)
return enc;
}
-rb_encoding*
-rb_enc_compatible(VALUE str1, VALUE str2)
+static rb_encoding*
+enc_compatible_latter(VALUE str1, VALUE str2, int idx1, int idx2)
{
- int idx1, idx2;
- rb_encoding *enc1, *enc2;
int isstr1, isstr2;
-
- idx1 = rb_enc_get_index(str1);
- idx2 = rb_enc_get_index(str2);
-
- if (idx1 < 0 || idx2 < 0)
- return 0;
-
- if (idx1 == idx2) {
- return rb_enc_from_index(idx1);
- }
- enc1 = rb_enc_from_index(idx1);
- enc2 = rb_enc_from_index(idx2);
+ rb_encoding *enc1 = rb_enc_from_index(idx1);
+ rb_encoding *enc2 = rb_enc_from_index(idx2);
isstr2 = RB_TYPE_P(str2, T_STRING);
if (isstr2 && RSTRING_LEN(str2) == 0)
- return enc1;
+ return enc1;
isstr1 = RB_TYPE_P(str1, T_STRING);
if (isstr1 && RSTRING_LEN(str1) == 0)
- return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? enc1 : enc2;
+ return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? enc1 : enc2;
if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) {
return 0;
}
/* objects whose encoding is the same of contents */
if (!isstr2 && idx2 == ENCINDEX_US_ASCII)
- return enc1;
+ return enc1;
if (!isstr1 && idx1 == ENCINDEX_US_ASCII)
- return enc2;
+ return enc2;
if (!isstr1) {
VALUE tmp = str1;
@@ -915,11 +931,44 @@ rb_enc_compatible(VALUE str1, VALUE str2)
}
}
if (cr1 == ENC_CODERANGE_7BIT)
- return enc2;
+ return enc2;
}
return 0;
}
+static rb_encoding*
+enc_compatible_str(VALUE str1, VALUE str2)
+{
+ int idx1 = enc_get_index_str(str1);
+ int idx2 = enc_get_index_str(str2);
+
+ if (idx1 < 0 || idx2 < 0)
+ return 0;
+
+ if (idx1 == idx2) {
+ return rb_enc_from_index(idx1);
+ }
+ else {
+ return enc_compatible_latter(str1, str2, idx1, idx2);
+ }
+}
+
+rb_encoding*
+rb_enc_compatible(VALUE str1, VALUE str2)
+{
+ int idx1 = rb_enc_get_index(str1);
+ int idx2 = rb_enc_get_index(str2);
+
+ if (idx1 < 0 || idx2 < 0)
+ return 0;
+
+ if (idx1 == idx2) {
+ return rb_enc_from_index(idx1);
+ }
+
+ return enc_compatible_latter(str1, str2, idx1, idx2);
+}
+
void
rb_enc_copy(VALUE obj1, VALUE obj2)
{