diff options
-rw-r--r-- | ChangeLog | 20 | ||||
-rw-r--r-- | common.mk | 2 | ||||
-rw-r--r-- | include/ruby/encoding.h | 1 | ||||
-rw-r--r-- | include/ruby/intern.h | 1 | ||||
-rw-r--r-- | re.c | 156 | ||||
-rw-r--r-- | string.c | 2 | ||||
-rw-r--r-- | version.h | 6 |
7 files changed, 157 insertions, 31 deletions
@@ -1,3 +1,23 @@ +Tue Mar 18 04:00:27 2008 NARUSE, Yui <naruse@ruby-lang.org> + + * re.c (rb_memsearch_ss): simple shift search. + + * re.c (rb_memsearch_qs): quick search. + + * re.c (rb_memsearch_qs_utf8): quick search for UTF-8 string. + + * re.c (rb_memsearch_qs_utf8_hash): hash functions for above. + + * re.c (rb_memsearch): use above functions. + + * string.c (rb_str_index): give enc to rb_memsearch. + + * include/ruby/intern.h (rb_memsearch): move to encoding.h. + + * include/ruby/encoding.h (rb_memsearch): move from intern.h. + + * common.mk (PREP): add dependency. + Mon Mar 17 22:23:54 2008 Yusuke Endoh <mame@tsg.ne.jp> * array.c (rb_ary_take, rb_ary_take_while, rb_ary_drop, @@ -110,6 +110,8 @@ all: $(MKFILES) $(PREP) encdb transdb $(RBCONFIG) $(LIBRUBY) encs @$(MINIRUBY) $(srcdir)/ext/extmk.rb --make="$(MAKE)" $(EXTMK_ARGS) prog: $(PROGRAM) $(WPROGRAM) +$(PREP): $(MKFILES) + miniruby$(EXEEXT): config.status $(NORMALMAINOBJ) $(MINIOBJS) $(COMMONOBJS) $(DMYEXT) $(ARCHFILE) GORUBY = go$(RUBY_INSTALL_NAME) diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index 6bc4031ab4..2dd2f93b18 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -176,5 +176,6 @@ int rb_usascii_encindex(void); VALUE rb_enc_default_external(void); void rb_enc_set_default_external(VALUE encoding); VALUE rb_locale_charmap(VALUE klass); +long rb_memsearch(const void*,long,const void*,long,rb_encoding*); #endif /* RUBY_ENCODING_H */ diff --git a/include/ruby/intern.h b/include/ruby/intern.h index d51bc67636..01ea5285bb 100644 --- a/include/ruby/intern.h +++ b/include/ruby/intern.h @@ -466,7 +466,6 @@ double rb_genrand_real(void); /* re.c */ #define rb_memcmp memcmp int rb_memcicmp(const void*,const void*,long); -long rb_memsearch(const void*,long,const void*,long); VALUE rb_reg_nth_defined(int, VALUE); VALUE rb_reg_nth_match(int, VALUE); VALUE rb_reg_last_match(VALUE); @@ -95,41 +95,145 @@ rb_memcmp(const void *p1, const void *p2, long len) return memcmp(p1, p2, len); } -long -rb_memsearch(const void *x0, long m, const void *y0, long n) -{ - const unsigned char *x = x0, *y = y0; - const unsigned char *s, *e; - long i; - int d; - unsigned long hx, hy; +static inline long +rb_memsearch_ss(const unsigned char *xs, long m, const unsigned char *ys, long n) +{ + const unsigned char *x = xs, *xe = xs + m; + const unsigned char *y = ys, *ye = ys + n; +#ifndef VALUE_MAX +# if SIZEOF_VALUE == 8 +# define VALUE_MAX 0xFFFFFFFFFFFFFFFFULL +# elif SIZEOF_VALUE == 4 +# define VALUE_MAX 0xFFFFFFFFUL +# endif +#endif + VALUE hx, hy, mask = VALUE_MAX >> ((SIZEOF_VALUE - m) * CHAR_BIT); -#define KR_REHASH(a, b, h) (((h) << 1) - (((unsigned long)(a))<<d) + (b)) + if (m > SIZEOF_VALUE) + rb_bug("!!too long pattern string!!"); - if (m > n) return -1; - s = y; e = s + n - m; + /* Prepare hash value */ + for (hx = *x++, hy = *y++; x < xe; ++x, ++y) { + hx <<= CHAR_BIT; + hy <<= CHAR_BIT; + hx |= *x; + hy |= *y; + } + /* Searching */ + while (hx != hy) { + if (y == ye) + return -1; + hy <<= CHAR_BIT; + hy |= *y; + hy &= mask; + y++; + } + return y - ys - m; +} + +static inline long +rb_memsearch_qs(const unsigned char *xs, long m, const unsigned char *ys, long n) +{ + const unsigned char *x = xs, *xe = xs + m; + const unsigned char *y = ys, *ye = ys + n; + VALUE i, qstable[256]; /* Preprocessing */ - /* computes d = 2^(m-1) with - the left-shift operator */ - d = sizeof(hx) * CHAR_BIT - 1; - if (d > m) d = m; + for (i = 0; i < 256; ++i) + qstable[i] = m + 1; + for (; x < xe; ++x) + qstable[*x] = xe - x; + /* Searching */ + for (; y < ye; y += *(qstable + y[m])) { + if (*xs == *y && memcmp(xs, y, m) == 0) + return y - ys; + } + return -1; +} + +static inline unsigned int +rb_memsearch_qs_utf8_hash(const unsigned char *x) +{ + register const unsigned int mix = 8353; + register unsigned int h = *x; + if (h < 0xC0) { + return h + 256; + } + else if (h < 0xE0) { + h *= mix; + h += x[1]; + } + else if (h < 0xF0) { + h *= mix; + h += x[1]; + h *= mix; + h += x[2]; + } + else if (h < 0xF5) { + h *= mix; + h += x[1]; + h *= mix; + h += x[2]; + h *= mix; + h += x[3]; + } + else { + return h + 256; + } + return (unsigned char)h; +} + +static inline long +rb_memsearch_qs_utf8(const unsigned char *xs, long m, const unsigned char *ys, long n) +{ + const unsigned char *x = xs, *xe = xs + m; + const unsigned char *y = ys, *ye = ys + n; + VALUE i, qstable[512]; - if (n == m) { - return memcmp(x, s, m) == 0 ? 0 : -1; + /* Preprocessing */ + for (i = 0; i < 512; ++i) { + qstable[i] = m + 1; } - /* Prepare hash value */ - for (hy = hx = i = 0; i < d; ++i) { - hx = KR_REHASH(0, x[i], hx); - hy = KR_REHASH(0, s[i], hy); + for (; x < xe; ++x) { + qstable[rb_memsearch_qs_utf8_hash(x)] = xe - x; } /* Searching */ - while (hx != hy || memcmp(x, s, m)) { - if (s >= e) return -1; - hy = KR_REHASH(*s, *(s+d), hy); - s++; + for (; y < ye; y += qstable[rb_memsearch_qs_utf8_hash(y+m)]) { + if (*xs == *y && memcmp(xs, y, m) == 0) + return y - ys; + } + return -1; +} + +long +rb_memsearch(const void *x0, long m, const void *y0, long n, rb_encoding *enc) +{ + const unsigned char *x = x0, *y = y0; + + if (m > n) return -1; + else if (m == n) { + return memcmp(x0, y0, m) == 0 ? 0 : -1; + } + else if (m < 1) { + return 0; + } + else if (m == 1) { + const unsigned char *ys = y, *ye = ys + n; + for (; y < ye; ++y) { + if (*x == *y) + return y - ys; + } + return -1; + } + else if (m <= SIZEOF_VALUE) { + return rb_memsearch_ss(x0, m, y0, n); + } + else if (enc == rb_utf8_encoding()){ + return rb_memsearch_qs_utf8(x0, m, y0, n); + } + else { + return rb_memsearch_qs(x0, m, y0, n); } - return s-y; } #define REG_LITERAL FL_USER5 @@ -2088,7 +2088,7 @@ rb_str_index(VALUE str, VALUE sub, long offset) len = RSTRING_LEN(str) - offset; for (;;) { char *t; - pos = rb_memsearch(sptr, slen, s, len); + pos = rb_memsearch(sptr, slen, s, len, enc); if (pos < 0) return pos; t = rb_enc_right_char_head(s, s+pos, enc); if (t == s + pos) break; @@ -1,7 +1,7 @@ #define RUBY_VERSION "1.9.0" -#define RUBY_RELEASE_DATE "2008-03-17" +#define RUBY_RELEASE_DATE "2008-03-18" #define RUBY_VERSION_CODE 190 -#define RUBY_RELEASE_CODE 20080317 +#define RUBY_RELEASE_CODE 20080318 #define RUBY_PATCHLEVEL 0 #define RUBY_VERSION_MAJOR 1 @@ -9,7 +9,7 @@ #define RUBY_VERSION_TEENY 0 #define RUBY_RELEASE_YEAR 2008 #define RUBY_RELEASE_MONTH 3 -#define RUBY_RELEASE_DAY 17 +#define RUBY_RELEASE_DAY 18 #ifdef RUBY_EXTERN RUBY_EXTERN const char ruby_version[]; |