diff options
author | naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2014-04-18 06:42:51 +0000 |
---|---|---|
committer | naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2014-04-18 06:42:51 +0000 |
commit | 0ab9abe06939b41b151c95b287f6d283d5e5e9ea (patch) | |
tree | 03fb450054a1a576df48247fb95f727d7919e213 /string.c | |
parent | b97de74ee6f34327176295d8a8e89932461269f7 (diff) | |
download | ruby-0ab9abe06939b41b151c95b287f6d283d5e5e9ea.tar.gz |
* string.c (enc_strlen): move UTF-8 optimization from str_strlen to
enc_strlen.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@45617 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'string.c')
-rw-r--r-- | string.c | 122 |
1 files changed, 61 insertions, 61 deletions
```diff
@@ -1075,6 +1075,41 @@ rb_str_init(int argc, VALUE *argv, VALUE str)
     return str;
 }
 
+#ifdef NONASCII_MASK
+#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
+
+/*
+ * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
+ * bit representation. (see http://en.wikipedia.org/wiki/UTF-8)
+ * Therefore, following pseudo code can detect UTF-8 leading byte.
+ *
+ * if (!(byte & 0x80))
+ *   byte |= 0x40;          // turn on bit6
+ * return ((byte>>6) & 1);  // bit6 represent it's leading byte or not.
+ *
+ * This function calculate every bytes in the argument word `s'
+ * using the above logic concurrently. and gather every bytes result.
+ */
+static inline VALUE
+count_utf8_lead_bytes_with_word(const VALUE *s)
+{
+    VALUE d = *s;
+
+    /* Transform into bit0 represent UTF-8 leading or not. */
+    d |= ~(d>>1);
+    d >>= 6;
+    d &= NONASCII_MASK >> 7;
+
+    /* Gather every bytes. */
+    d += (d>>8);
+    d += (d>>16);
+#if SIZEOF_VALUE == 8
+    d += (d>>32);
+#endif
+    return (d&0xF);
+}
+#endif
+
 static inline long
 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
 {
@@ -1084,6 +1119,31 @@ enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
     if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
         return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
     }
+#ifdef NONASCII_MASK
+    else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
+        VALUE len = 0;
+        if ((int)sizeof(VALUE) * 2 < e - p) {
+            const VALUE *s, *t;
+            const VALUE lowbits = sizeof(VALUE) - 1;
+            s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
+            t = (const VALUE*)(~lowbits & (VALUE)e);
+            while (p < (const char *)s) {
+                if (is_utf8_lead_byte(*p)) len++;
+                p++;
+            }
+            while (s < t) {
+                len += count_utf8_lead_bytes_with_word(s);
+                s++;
+            }
+            p = (const char *)s;
+        }
+        while (p < e) {
+            if (is_utf8_lead_byte(*p)) len++;
+            p++;
+        }
+        return (long)len;
+    }
+#endif
     else if (rb_enc_asciicompat(enc)) {
         c = 0;
         if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
@@ -1183,41 +1243,7 @@ rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
     return c;
 }
 
-#ifdef NONASCII_MASK
-#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
-
-/*
- * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
- * bit representation. (see http://en.wikipedia.org/wiki/UTF-8)
- * Therefore, following pseudo code can detect UTF-8 leading byte.
- *
- * if (!(byte & 0x80))
- *   byte |= 0x40;          // turn on bit6
- * return ((byte>>6) & 1);  // bit6 represent it's leading byte or not.
- *
- * This function calculate every bytes in the argument word `s'
- * using the above logic concurrently. and gather every bytes result.
- */
-static inline VALUE
-count_utf8_lead_bytes_with_word(const VALUE *s)
-{
-    VALUE d = *s;
-
-    /* Transform into bit0 represent UTF-8 leading or not. */
-    d |= ~(d>>1);
-    d >>= 6;
-    d &= NONASCII_MASK >> 7;
-
-    /* Gather every bytes. */
-    d += (d>>8);
-    d += (d>>16);
-#if SIZEOF_VALUE == 8
-    d += (d>>32);
-#endif
-    return (d&0xF);
-}
-#endif
-
+/* enc must be compatible with str's enc */
 static long
 str_strlen(VALUE str, rb_encoding *enc)
 {
@@ -1230,33 +1256,7 @@ str_strlen(VALUE str, rb_encoding *enc)
     p = RSTRING_PTR(str);
     e = RSTRING_END(str);
     cr = ENC_CODERANGE(str);
-#ifdef NONASCII_MASK
-    if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
-        enc == rb_utf8_encoding()) {
 
-        VALUE len = 0;
-        if ((int)sizeof(VALUE) * 2 < e - p) {
-            const VALUE *s, *t;
-            const VALUE lowbits = sizeof(VALUE) - 1;
-            s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
-            t = (const VALUE*)(~lowbits & (VALUE)e);
-            while (p < (const char *)s) {
-                if (is_utf8_lead_byte(*p)) len++;
-                p++;
-            }
-            while (s < t) {
-                len += count_utf8_lead_bytes_with_word(s);
-                s++;
-            }
-            p = (const char *)s;
-        }
-        while (p < e) {
-            if (is_utf8_lead_byte(*p)) len++;
-            p++;
-        }
-        return (long)len;
-    }
-#endif
     n = rb_enc_strlen_cr(p, e, enc, &cr);
     if (cr) {
         ENC_CODERANGE_SET(str, cr);
```