about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog | 14
-rw-r--r-- encoding.c | 6
-rw-r--r-- include/ruby/encoding.h | 3
-rw-r--r-- string.c | 93
4 files changed, 91 insertions, 25 deletions
diff --git a/ChangeLog b/ChangeLog
index f6dde0ce69..922ee1ea84 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+Wed May 20 06:25:29 2009 Yukihiro Matsumoto <matz@ruby-lang.org>
+
+ * encoding.c (rb_enc_fast_mbclen): faster mbclen for strings known
+ to be valid.
+
+ * string.c (enc_strlen): coderange specified version of
+ rb_enc_strlen(). use rb_enc_fast_mbclen() if coderange is 7bit
+ or valid.
+
+ * string.c (str_gsub): use rb_enc_fast_mbclen().
+
+ * string.c (rb_str_reverse, rb_str_split_m, rb_str_each_char,
+ scan_once): ditto.
+
Wed May 20 06:20:05 2009 Yukihiro Matsumoto <matz@ruby-lang.org>
* lib/tempfile.rb (Tempfile#unlink): close first for Windows. a
diff --git a/encoding.c b/encoding.c
index 223eace375..4ef45cd65e 100644
--- a/encoding.c
+++ b/encoding.c
@@ -727,6 +727,12 @@ rb_obj_encoding(VALUE obj)
}
int
+rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
+{
+ return ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
+}
+
+int
rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
{
int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h
index 4f6608cb1b..9be412ddf2 100644
--- a/include/ruby/encoding.h
+++ b/include/ruby/encoding.h
@@ -112,6 +112,9 @@ rb_encoding * rb_enc_find(const char *name);
/* -> mbclen (no error notification: 0 < ret <= e-p, no exception) */
int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc);
+/* -> mbclen (only for valid encoding) */
+int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc);
+
/* -> chlen, invalid or needmore */
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc);
#define MBCLEN_CHARFOUND_P(ret) ONIGENC_MBCLEN_CHARFOUND_P(ret)
diff --git a/string.c b/string.c
index 8ec30cc0db..405500b1d2 100644
--- a/string.c
+++ b/string.c
@@ -851,8 +851,8 @@ rb_str_init(int argc, VALUE *argv, VALUE str)
return str;
}
-long
-rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
+static inline long
+enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
{
long c;
const char *q;
@@ -862,17 +862,32 @@ rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
}
else if (rb_enc_asciicompat(enc)) {
c = 0;
- while (p < e) {
- if (ISASCII(*p)) {
- q = search_nonascii(p, e);
- if (!q)
- return c + (e - p);
- c += q - p;
- p = q;
- }
- p += rb_enc_mbclen(p, e, enc);
- c++;
- }
+ if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
+ while (p < e) {
+ if (ISASCII(*p)) {
+ q = search_nonascii(p, e);
+ if (!q)
+ return c + (e - p);
+ c += q - p;
+ p = q;
+ }
+ p += rb_enc_fast_mbclen(p, e, enc);
+ c++;
+ }
+ }
+ else {
+ while (p < e) {
+ if (ISASCII(*p)) {
+ q = search_nonascii(p, e);
+ if (!q)
+ return c + (e - p);
+ c += q - p;
+ p = q;
+ }
+ p += rb_enc_mbclen(p, e, enc);
+ c++;
+ }
+ }
return c;
}
@@ -883,6 +898,12 @@ rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
}
long
+rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
+{
+ return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
+}
+
+long
rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
{
long c;
@@ -964,10 +985,12 @@ str_strlen(VALUE str, rb_encoding *enc)
if (!enc) enc = STR_ENC_GET(str);
p = RSTRING_PTR(str);
e = RSTRING_END(str);
+ cr = ENC_CODERANGE(str);
#ifdef NONASCII_MASK
if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
enc == rb_utf8_encoding()) {
- VALUE len = 0;
+
+ VALUE len = 0;
if ((int)sizeof(VALUE) * 2 < e - p) {
const VALUE *s, *t;
const VALUE lowbits = sizeof(VALUE) - 1;
@@ -1419,7 +1442,7 @@ rb_str_sublen(VALUE str, long pos)
return pos;
else {
char *p = RSTRING_PTR(str);
- return rb_enc_strlen(p, p + pos, STR_ENC_GET(str));
+ return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
}
}
@@ -3721,7 +3744,7 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
* in order to prevent infinite loops.
*/
if (RSTRING_LEN(str) <= end0) break;
- len = rb_enc_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
+ len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
offset = end0 + len;
}
@@ -3955,6 +3978,16 @@ rb_str_reverse(VALUE str)
*--p = *s++;
}
}
+ else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
+ while (s < e) {
+ int clen = rb_enc_fast_mbclen(s, e, enc);
+
+ if (clen > 1 || (*s & 0x80)) single = 0;
+ p -= clen;
+ memcpy(p, s, clen);
+ s += clen;
+ }
+ }
else {
while (s < e) {
int clen = rb_enc_mbclen(s, e, enc);
@@ -5610,16 +5643,16 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str)
}
else if (last_null == 1) {
rb_ary_push(result, rb_str_subseq(str, beg,
- rb_enc_mbclen(RSTRING_PTR(str)+beg,
- RSTRING_END(str),
- enc)));
+ rb_enc_fast_mbclen(RSTRING_PTR(str)+beg,
+ RSTRING_END(str),
+ enc)));
beg = start;
}
else {
if (RSTRING_PTR(str)+start == RSTRING_END(str))
start++;
else
- start += rb_enc_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc);
+ start += rb_enc_fast_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc);
last_null = 1;
continue;
}
@@ -5889,9 +5922,19 @@ rb_str_each_char(VALUE str)
ptr = RSTRING_PTR(str);
len = RSTRING_LEN(str);
enc = rb_enc_get(str);
- for (i = 0; i < len; i += n) {
- n = rb_enc_mbclen(ptr + i, ptr + len, enc);
- rb_yield(rb_str_subseq(str, i, n));
+ switch (ENC_CODERANGE(str)) {
+ case ENC_CODERANGE_VALID:
+ case ENC_CODERANGE_7BIT:
+ for (i = 0; i < len; i += n) {
+ n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
+ rb_yield(rb_str_subseq(str, i, n));
+ }
+ break;
+ default:
+ for (i = 0; i < len; i += n) {
+ n = rb_enc_mbclen(ptr + i, ptr + len, enc);
+ rb_yield(rb_str_subseq(str, i, n));
+ }
}
return str;
}
@@ -6340,8 +6383,8 @@ scan_once(VALUE str, VALUE pat, long *start)
* Always consume at least one character of the input string
*/
if (RSTRING_LEN(str) > END(0))
- *start = END(0)+rb_enc_mbclen(RSTRING_PTR(str)+END(0),
- RSTRING_END(str), enc);
+ *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
+ RSTRING_END(str), enc);
else
*start = END(0)+1;
}