diff options
author | akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2008-02-16 07:16:36 +0000 |
---|---|---|
committer | akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2008-02-16 07:16:36 +0000 |
commit | a47e8e776c8e91cdf17f4cb79569c8b20c88b845 (patch) | |
tree | f06290f7d8901f0ac32d7525344cd30e796cf91e | |
parent | c2459f35acbef17644d706f8f6386e1368040f96 (diff) | |
download | ruby-a47e8e776c8e91cdf17f4cb79569c8b20c88b845.tar.gz |
* string.c (rb_enc_strlen): UTF-8 character count moved to str_strlen.
(str_strlen): UTF-8 character count is only applicable for valid
UTF-8 string.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@15504 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- | ChangeLog | 6 | ||||
-rw-r--r-- | string.c | 61 |
2 files changed, 38 insertions, 29 deletions
```diff
@@ -1,3 +1,9 @@
+Sat Feb 16 16:14:35 2008 Tanaka Akira <akr@fsij.org>
+
+        * string.c (rb_enc_strlen): UTF-8 character count moved to str_strlen.
+          (str_strlen): UTF-8 character count is only applicable for valid
+          UTF-8 string.
+
 Sat Feb 16 13:16:49 2008 Tanaka Akira <akr@fsij.org>
 
         * string.c (rb_str_sub_bang): stringize replacing hash values.
@@ -597,35 +597,7 @@ rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
         return (e - p) / rb_enc_mbminlen(enc);
     }
-#ifdef NONASCII_MASK
-    else if (enc == rb_utf8_encoding()) {
-        if (sizeof(long) * 2 < e - p) {
-            const unsigned long *s, *t;
-            const VALUE lowbits = sizeof(unsigned long) - 1;
-            s = (const unsigned long*)(~lowbits & ((VALUE)p + lowbits));
-            t = (const unsigned long*)(~lowbits & (VALUE)e);
-            for (c=0; p<(const char *)s; p++) {
-                if (((*p)&0xC0) != 0x80) c++;
-            }
-            while (s < t) {
-                unsigned long d = *s;
-                d = (~d ^ (d&(d<<1)))&NONASCII_MASK;
-                d = (d>>7) + (d>>15);
-                d = d + (d>>16);
-#if NONASCII_MASK == 0x8080808080808080UL
-                d = d + (d>>32);
-#endif
-                c += (long)(d&0xF);
-                s++;
-            }
-            p = (const char *)t;
-        }
-        for (; p<e; p++) {
-            if (((*p)&0xC0) != 0x80) c++;
-        }
-        return c;
-    }
-#endif
+
     else if (rb_enc_asciicompat(enc)) {
         c = 0;
         while (p < e) {
@@ -658,6 +630,37 @@ str_strlen(VALUE str, rb_encoding *enc)
     if (!enc) enc = STR_ENC_GET(str);
     p = RSTRING_PTR(str);
     e = RSTRING_END(str);
+#ifdef NONASCII_MASK
+    if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
+        enc == rb_utf8_encoding()) {
+        len = 0;
+        if (sizeof(long) * 2 < e - p) {
+            const unsigned long *s, *t;
+            const VALUE lowbits = sizeof(unsigned long) - 1;
+            s = (const unsigned long*)(~lowbits & ((VALUE)p + lowbits));
+            t = (const unsigned long*)(~lowbits & (VALUE)e);
+            for (len=0; p<(const char *)s; p++) {
+                if (((*p)&0xC0) != 0x80) len++;
+            }
+            while (s < t) {
+                unsigned long d = *s;
+                d = (~d ^ (d&(d<<1)))&NONASCII_MASK;
+                d = (d>>7) + (d>>15);
+                d = d + (d>>16);
+#if NONASCII_MASK == 0x8080808080808080UL
+                d = d + (d>>32);
+#endif
+                len += (long)(d&0xF);
+                s++;
+            }
+            p = (const char *)t;
+        }
+        for (; p<e; p++) {
+            if (((*p)&0xC0) != 0x80) len++;
+        }
+    }
+    else
+#endif
     len = rb_enc_strlen(p, e, enc);
     if (len < 0) {
         rb_raise(rb_eArgError, "invalid mbstring sequence");
```