From 0980fe7903b253e8a67a45eea6bd9856d0d85cf0 Mon Sep 17 00:00:00 2001 From: matz Date: Wed, 20 May 2009 04:44:36 +0000 Subject: * encoding.c (rb_enc_fast_mbclen): faster mbclen for strings known to be valid. * string.c (enc_strlen): coderange specified version of rb_enc_strlen(). use rb_enc_fast_mbclen() if coderange is 7bit or valid. * string.c (str_gsub): use rb_enc_fast_mbclen(). * string.c (rb_str_reverse, rb_str_split_m, rb_str_each_char, scan_once): ditto. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@23495 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 14 ++++++++ encoding.c | 6 ++++ include/ruby/encoding.h | 3 ++ string.c | 93 ++++++++++++++++++++++++++++++++++++------------- 4 files changed, 91 insertions(+), 25 deletions(-) diff --git a/ChangeLog b/ChangeLog index f6dde0ce69..922ee1ea84 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,17 @@ +Wed May 20 06:25:29 2009 Yukihiro Matsumoto + + * encoding.c (rb_enc_fast_mbclen): faster mbclen for strings known + to be valid. + + * string.c (enc_strlen): coderange specified version of + rb_enc_strlen(). use rb_enc_fast_mbclen() if coderange is 7bit + or valid. + + * string.c (str_gsub): use rb_enc_fast_mbclen(). + + * string.c (rb_str_reverse, rb_str_split_m, rb_str_each_char, + scan_once): ditto. + Wed May 20 06:20:05 2009 Yukihiro Matsumoto * lib/tempfile.rb (Tempfile#unlink): close first for Windows. a diff --git a/encoding.c b/encoding.c index 223eace375..4ef45cd65e 100644 --- a/encoding.c +++ b/encoding.c @@ -726,6 +726,12 @@ rb_obj_encoding(VALUE obj) return rb_enc_from_encoding(enc); } +int +rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc) +{ + return ONIGENC_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e); +} + int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc) { diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index 4f6608cb1b..9be412ddf2 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -112,6 +112,9 @@ rb_encoding * rb_enc_find(const char *name); /* -> mbclen (no error notification: 0 < ret <= e-p, no exception) */ int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc); +/* -> mbclen (only for valid encoding) */ +int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc); + /* -> chlen, invalid or needmore */ int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc); #define MBCLEN_CHARFOUND_P(ret) ONIGENC_MBCLEN_CHARFOUND_P(ret) diff --git a/string.c b/string.c index 8ec30cc0db..405500b1d2 100644 --- a/string.c +++ b/string.c @@ -851,8 +851,8 @@ rb_str_init(int argc, VALUE *argv, VALUE str) return str; } -long -rb_enc_strlen(const char *p, const char *e, rb_encoding *enc) +static inline long +enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr) { long c; const char *q; @@ -862,17 +862,32 @@ rb_enc_strlen(const char *p, const char *e, rb_encoding *enc) } else if (rb_enc_asciicompat(enc)) { c = 0; - while (p < e) { - if (ISASCII(*p)) { - q = search_nonascii(p, e); - if (!q) - return c + (e - p); - c += q - p; - p = q; - } - p += rb_enc_mbclen(p, e, enc); - c++; - } + if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) { + while (p < e) { + if (ISASCII(*p)) { + q = search_nonascii(p, e); + if (!q) + return c + (e - p); + c += q - p; + p = q; + } + p += rb_enc_fast_mbclen(p, e, enc); + c++; + } + } + else { + while (p < e) { + if (ISASCII(*p)) { + q = search_nonascii(p, e); + if (!q) + return c + (e - p); + c += q - p; + p = q; + } + p += rb_enc_mbclen(p, e, enc); + c++; + } + } return c; } @@ -882,6 +897,12 @@ rb_enc_strlen(const char *p, const char *e, rb_encoding *enc) return c; } +long +rb_enc_strlen(const char *p, const char *e, rb_encoding *enc) +{ + return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN); +} + long rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr) { @@ -964,10 +985,12 @@ str_strlen(VALUE str, rb_encoding *enc) if (!enc) enc = STR_ENC_GET(str); p = RSTRING_PTR(str); e = RSTRING_END(str); + cr = ENC_CODERANGE(str); #ifdef NONASCII_MASK if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) { - VALUE len = 0; + + VALUE len = 0; if ((int)sizeof(VALUE) * 2 < e - p) { const VALUE *s, *t; const VALUE lowbits = sizeof(VALUE) - 1; @@ -1419,7 +1442,7 @@ rb_str_sublen(VALUE str, long pos) return pos; else { char *p = RSTRING_PTR(str); - return rb_enc_strlen(p, p + pos, STR_ENC_GET(str)); + return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str)); } } @@ -3721,7 +3744,7 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang) * in order to prevent infinite loops. */ if (RSTRING_LEN(str) <= end0) break; - len = rb_enc_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc); + len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc); rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc); offset = end0 + len; } @@ -3955,6 +3978,16 @@ rb_str_reverse(VALUE str) *--p = *s++; } } + else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) { + while (s < e) { + int clen = rb_enc_fast_mbclen(s, e, enc); + + if (clen > 1 || (*s & 0x80)) single = 0; + p -= clen; + memcpy(p, s, clen); + s += clen; + } + } else { while (s < e) { int clen = rb_enc_mbclen(s, e, enc); @@ -5610,16 +5643,16 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) } else if (last_null == 1) { rb_ary_push(result, rb_str_subseq(str, beg, - rb_enc_mbclen(RSTRING_PTR(str)+beg, - RSTRING_END(str), - enc))); + rb_enc_fast_mbclen(RSTRING_PTR(str)+beg, + RSTRING_END(str), + enc))); beg = start; } else { if (RSTRING_PTR(str)+start == RSTRING_END(str)) start++; else - start += rb_enc_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc); + start += rb_enc_fast_mbclen(RSTRING_PTR(str)+start,RSTRING_END(str),enc); last_null = 1; continue; } @@ -5889,9 +5922,19 @@ rb_str_each_char(VALUE str) ptr = RSTRING_PTR(str); len = RSTRING_LEN(str); enc = rb_enc_get(str); - for (i = 0; i < len; i += n) { - n = rb_enc_mbclen(ptr + i, ptr + len, enc); - rb_yield(rb_str_subseq(str, i, n)); + switch (ENC_CODERANGE(str)) { + case ENC_CODERANGE_VALID: + case ENC_CODERANGE_7BIT: + for (i = 0; i < len; i += n) { + n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc); + rb_yield(rb_str_subseq(str, i, n)); + } + break; + default: + for (i = 0; i < len; i += n) { + n = rb_enc_mbclen(ptr + i, ptr + len, enc); + rb_yield(rb_str_subseq(str, i, n)); + } } return str; } @@ -6340,8 +6383,8 @@ scan_once(VALUE str, VALUE pat, long *start) * Always consume at least one character of the input string */ if (RSTRING_LEN(str) > END(0)) - *start = END(0)+rb_enc_mbclen(RSTRING_PTR(str)+END(0), - RSTRING_END(str), enc); + *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0), + RSTRING_END(str), enc); else *start = END(0)+1; } -- cgit v1.2.3