* regparse.c (PINC): use optimized enclen() instead of ONIGENC_MBC_ENC_LEN().

* regparse.c (PFETCH): ditto.
* regparse.c (PFETCH): small optimization.
* regexec.c (slow_search): single byte encoding optimization.
* regenc.h (enclen): avoid calling function when encoding's min_len == max_len.
* re.c (rb_reg_regsub): rb_enc_ascget() optimization for single byte encoding.
* re.c (rb_reg_search): avoid allocating new re_registers if we already have MatchData.
* re.c (match_init_copy): avoid unnecessary onig_region_free() before onig_region_copy.
* encoding.c (rb_enc_get_index): remove implicit enc_capable check each time.
* encoding.c (rb_enc_set_index): ditto.
* encoding.c (enc_compatible_p): small refactoring.
* include/ruby/encoding.h (rb_enc_dummy_p): inline rb_enc_dummy_p() and export related code.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@16477 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
author: matz <matz@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2008-05-19 08:25:03 +0000
committer: matz <matz@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2008-05-19 08:25:03 +0000
commit: 44cd8e457b808173147c499408ffc5e908f236dc (patch)
tree: 357c92a92120e9e79054dbc58171b956e3d37e7f
parent: 48a42a0387a0924955c8cf31f880bdecbf6023a8 (diff)
download: ruby-44cd8e457b808173147c499408ffc5e908f236dc.tar.gz
9 files changed, 115 insertions, 69 deletions
diff --git a/ChangeLog b/ChangeLog
index fc406cd112..397a4a59b0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,36 @@
+Mon May 19 17:23:55 2008  Yukihiro Matsumoto  <matz@ruby-lang.org>
+
+	* regparse.c (PINC): use optimized enclen() instead of
+	  ONIGENC_MBC_ENC_LEN().
+
+	* regparse.c (PFETCH): ditto.
+
+	* regparse.c (PFETCH): small optimization.
+
+	* regexec.c (slow_search): single byte encoding optimization.
+
+	* regenc.h (enclen): avoid calling function when encoding's
+	  min_len == max_len.
+
+	* re.c (rb_reg_regsub): rb_enc_ascget() optimization for single
+	  byte encoding.
+
+	* re.c (rb_reg_search): avoid allocating new re_registers if we
+	  already have MatchData.
+
+	* re.c (match_init_copy): avoid unnecessary onig_region_free()
+	  before onig_region_copy. 
+
+	* encoding.c (rb_enc_get_index): remove implicit enc_capable check
+	  each time.
+
+	* encoding.c (rb_enc_set_index): ditto.
+
+	* encoding.c (enc_compatible_p): small refactoring.
+
+	* include/ruby/encoding.h (rb_enc_dummy_p): inline
+	  rb_enc_dummy_p() and export related code.
+
 Mon May 19 14:32:03 2008  Koichi Sasada  <ko1@atdot.net>
 
 	* version.h: fix strange change by version.h update tool.
diff --git a/common.mk b/common.mk
index 2a76c7bfa2..8e62efae2e 100644
--- a/common.mk
+++ b/common.mk
@@ -615,7 +615,7 @@ cont.$(OBJEXT): {$(VPATH)}cont.c {$(VPATH)}ruby.h {$(VPATH)}config.h \
   {$(VPATH)}eval_intern.h {$(VPATH)}util.h {$(VPATH)}dln.h
 time.$(OBJEXT): {$(VPATH)}time.c {$(VPATH)}ruby.h {$(VPATH)}config.h \
   {$(VPATH)}defines.h {$(VPATH)}missing.h {$(VPATH)}intern.h \
-  {$(VPATH)}st.h
+  {$(VPATH)}st.h {$(VPATH)}encoding.h 
 util.$(OBJEXT): {$(VPATH)}util.c {$(VPATH)}ruby.h {$(VPATH)}config.h \
   {$(VPATH)}defines.h {$(VPATH)}missing.h {$(VPATH)}intern.h \
   {$(VPATH)}st.h {$(VPATH)}util.h
diff --git a/encoding.c b/encoding.c
index e243451569..bb400e1ef0 100644
--- a/encoding.c
+++ b/encoding.c
@@ -18,7 +18,7 @@
 #endif
 
 static ID id_encoding, id_base_encoding;
-static VALUE rb_cEncoding;
+VALUE rb_cEncoding;
 
 struct rb_encoding_entry {
     const char *name;
@@ -38,14 +38,6 @@ void rb_enc_init(void);
 
 #define enc_autoload_p(enc) (!rb_enc_mbmaxlen(enc))
 
-#define ENC_UNINITIALIZED (&rb_cEncoding)
-#define enc_initialized_p(enc) ((enc)->auxiliary_data != &rb_cEncoding)
-#define ENC_FROM_ENCODING(enc) ((VALUE)(enc)->auxiliary_data)
-
-#define ENC_DUMMY_FLAG FL_USER2
-#define ENC_DUMMY_P(enc) (RBASIC(enc)->flags & ENC_DUMMY_FLAG)
-#define ENC_SET_DUMMY(enc) (RBASIC(enc)->flags |= ENC_DUMMY_FLAG)
-
 static int load_encoding(const char *name);
 static VALUE enc_base_encoding(VALUE self);
 
@@ -318,15 +310,6 @@ rb_encdb_dummy(const char *name)
     return index;
 }
 
-int
-rb_enc_dummy_p(rb_encoding *enc)
-{
-    VALUE encoding;
-    if (!enc_initialized_p(enc)) return Qfalse;
-    encoding = rb_enc_from_encoding(enc);
-    return ENC_DUMMY_P(encoding);
-}
-
 /*
  * call-seq:
  *   enc.dummy? => true or false
@@ -343,7 +326,7 @@ rb_enc_dummy_p(rb_encoding *enc)
 static VALUE
 enc_dummy_p(VALUE enc)
 {
-    return rb_enc_dummy_p(rb_to_encoding(enc)) ? Qtrue : Qfalse;
+    return ENC_DUMMY_P(enc) ? Qtrue : Qfalse;
 }
 
 static int
@@ -555,7 +538,7 @@ rb_id_encoding(void)
 }
 
 int
-rb_enc_internal_get_index(VALUE obj)
+rb_enc_get_index(VALUE obj)
 {
     int i;
 
@@ -570,7 +553,7 @@ rb_enc_internal_get_index(VALUE obj)
 }
 
 void
-rb_enc_internal_set_index(VALUE obj, int idx)
+rb_enc_set_index(VALUE obj, int idx)
 {
     if (idx < ENCODING_INLINE_MAX) {
 	ENCODING_SET_INLINED(obj, idx);
@@ -584,14 +567,14 @@ rb_enc_internal_set_index(VALUE obj, int idx)
 void
 rb_enc_associate_index(VALUE obj, int idx)
 {
-    enc_check_capable(obj);
-    if (rb_enc_internal_get_index(obj) == idx)
+//    enc_check_capable(obj);
+    if (rb_enc_get_index(obj) == idx)
     	return;
     if (!ENC_CODERANGE_ASCIIONLY(obj) ||
 	!rb_enc_asciicompat(rb_enc_from_index(idx))) {
 	ENC_CODERANGE_CLEAR(obj);
     }
-    rb_enc_internal_set_index(obj, idx);
+    rb_enc_set_index(obj, idx);
 }
 
 void
@@ -600,13 +583,6 @@ rb_enc_associate(VALUE obj, rb_encoding *enc)
     rb_enc_associate_index(obj, rb_enc_to_index(enc));
 }
 
-int
-rb_enc_get_index(VALUE obj)
-{
-    if (!enc_capable(obj)) return -1;
-    return rb_enc_internal_get_index(obj);
-}
-
 rb_encoding*
 rb_enc_get(VALUE obj)
 {
@@ -906,11 +882,13 @@ enc_find(VALUE klass, VALUE enc)
 static VALUE
 enc_compatible_p(VALUE klass, VALUE str1, VALUE str2)
 {
-    rb_encoding *enc = rb_enc_compatible(str1, str2);
-    VALUE encoding = Qnil;
-    if (!enc || !(encoding = rb_enc_from_encoding(enc)))
-	encoding = Qnil;
-    return encoding;
+    rb_encoding *enc;
+
+    if (!enc_capable(str1)) return Qnil;
+    if (!enc_capable(str2)) return Qnil;
+    enc = rb_enc_compatible(str1, str2);
+    if (!enc) return Qnil;
+    return rb_enc_from_encoding(enc);
 }
 
 /* :nodoc: */
diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h
index 2dd2f93b18..0a6b7c18e2 100644
--- a/include/ruby/encoding.h
+++ b/include/ruby/encoding.h
@@ -33,14 +33,14 @@
     if (encoding_set_enc_index < ENCODING_INLINE_MAX) \
         ENCODING_SET_INLINED(rb_encoding_set_obj, encoding_set_enc_index); \
     else \
-        rb_enc_internal_set_index(rb_encoding_set_obj, encoding_set_enc_index); \
+        rb_enc_set_index(rb_encoding_set_obj, encoding_set_enc_index); \
 } while (0)
 
 #define ENCODING_GET_INLINED(obj) ((RBASIC(obj)->flags & ENCODING_MASK)>>ENCODING_SHIFT)
 #define ENCODING_GET(obj) \
     (ENCODING_GET_INLINED(obj) != ENCODING_INLINE_MAX ? \
      ENCODING_GET_INLINED(obj) : \
-     rb_enc_internal_get_index(obj))
+     rb_enc_get_index(obj))
 
 #define ENCODING_IS_ASCII8BIT(obj) (ENCODING_GET_INLINED(obj) == 0)
 
@@ -74,9 +74,9 @@ typedef OnigEncodingType rb_encoding;
 
 int rb_enc_replicate(const char *, rb_encoding *);
 int rb_define_dummy_encoding(const char *);
-int rb_enc_dummy_p(rb_encoding *);
 #define rb_enc_to_index(enc) ((enc) ? ((enc)->ruby_encoding_index) : 0)
 int rb_enc_get_index(VALUE obj);
+void rb_enc_set_index(VALUE obj, int encindex);
 int rb_enc_find_index(const char *name);
 int rb_to_encoding_index(VALUE);
 rb_encoding* rb_to_encoding(VALUE);
@@ -86,8 +86,6 @@ rb_encoding* rb_enc_check(VALUE,VALUE);
 void rb_enc_associate_index(VALUE, int);
 void rb_enc_associate(VALUE, rb_encoding*);
 void rb_enc_copy(VALUE dst, VALUE src);
-int rb_enc_internal_get_index(VALUE obj);
-void rb_enc_internal_set_index(VALUE obj, int encindex);
 
 VALUE rb_enc_str_new(const char*, long, rb_encoding*);
 VALUE rb_enc_reg_new(const char*, long, rb_encoding*, int);
@@ -154,7 +152,7 @@ int rb_enc_codelen(int code, rb_encoding *enc);
 #define rb_enc_isspace(c,enc) ONIGENC_IS_CODE_SPACE(enc,c)
 #define rb_enc_isdigit(c,enc) ONIGENC_IS_CODE_DIGIT(enc,c)
 
-#define rb_enc_asciicompat(enc) (!rb_enc_dummy_p(enc) && rb_enc_mbminlen(enc)==1)
+#define rb_enc_asciicompat(enc) (rb_enc_mbminlen(enc)==1 && !rb_enc_dummy_p(enc))
 
 int rb_enc_casefold(char *to, const char *p, const char *e, rb_encoding *enc);
 int rb_enc_toupper(int c, rb_encoding *enc);
@@ -178,4 +176,21 @@ void rb_enc_set_default_external(VALUE encoding);
 VALUE rb_locale_charmap(VALUE klass);
 long rb_memsearch(const void*,long,const void*,long,rb_encoding*);
 
+RUBY_EXTERN VALUE rb_cEncoding;
+
+#define ENC_UNINITIALIZED (&rb_cEncoding)
+#define enc_initialized_p(enc) ((enc)->auxiliary_data != &rb_cEncoding)
+#define ENC_FROM_ENCODING(enc) ((VALUE)(enc)->auxiliary_data)
+
+#define ENC_DUMMY_FLAG FL_USER2
+#define ENC_DUMMY_P(enc) (RBASIC(enc)->flags & ENC_DUMMY_FLAG)
+#define ENC_SET_DUMMY(enc) (RBASIC(enc)->flags |= ENC_DUMMY_FLAG)
+
+static inline int
+rb_enc_dummy_p(rb_encoding *enc)
+{
+    if (!enc_initialized_p(enc)) return Qfalse;
+    return ENC_DUMMY_P(ENC_FROM_ENCODING(enc));
+}
+
 #endif /* RUBY_ENCODING_H */
diff --git a/re.c b/re.c
index 4d9a26363f..c5b47c46b7 100644
--- a/re.c
+++ b/re.c
@@ -881,9 +881,6 @@ match_init_copy(VALUE obj, VALUE orig)
     RMATCH(obj)->regexp = RMATCH(orig)->regexp;
 
     rm = RMATCH(obj)->rmatch;
-    onig_region_free(&rm->regs, 0);
-    rm->regs.allocated = 0;
-
     onig_region_copy(&rm->regs, RMATCH_REGS(orig));
 
     if (!RMATCH(orig)->rmatch->char_offset_updated) {
@@ -1265,7 +1262,7 @@ rb_reg_search(VALUE re, VALUE str, int pos, int reverse)
 {
     int result;
     VALUE match;
-    struct re_registers regs;
+    struct re_registers *regs, regi;
     char *range = RSTRING_PTR(str);
     regex_t *reg0 = RREGEXP(re)->ptr, *reg;
     int busy = FL_TEST(re, REG_BUSY);
@@ -1277,17 +1274,29 @@ rb_reg_search(VALUE re, VALUE str, int pos, int reverse)
 
     reg = rb_reg_prepare_re(re, str);
 
+    match = rb_backref_get();
+    if (!NIL_P(match)) {
+	if (FL_TEST(match, MATCH_BUSY)) {
+	    match = Qnil;
+	}
+	else {
+	    regs = RMATCH_REGS(match);
+	}
+    }
+    if (NIL_P(match)) {
+	regs = &regi;
+	MEMZERO(regs, struct re_registers, 1);
+    }
     FL_SET(re, REG_BUSY);
     if (!reverse) {
 	range += RSTRING_LEN(str);
     }
-    MEMZERO(&regs, struct re_registers, 1);
     result = onig_search(reg,
 			 (UChar*)(RSTRING_PTR(str)),
 			 ((UChar*)(RSTRING_PTR(str)) + RSTRING_LEN(str)),
 			 ((UChar*)(RSTRING_PTR(str)) + pos),
 			 ((UChar*)range),
-			 &regs, ONIG_OPTION_NONE);
+			 regs, ONIG_OPTION_NONE);
 
     if (RREGEXP(re)->ptr != reg) {
 	if (busy) {
@@ -1300,7 +1309,8 @@ rb_reg_search(VALUE re, VALUE str, int pos, int reverse)
     }
     if (!busy) FL_UNSET(re, REG_BUSY);
     if (result < 0) {
-	onig_region_free(&regs, 0);
+	if (regs == &regi)
+	    onig_region_free(regs, 0);
 	if (result == ONIG_MISMATCH) {
 	    rb_backref_set(Qnil);
 	    return result;
@@ -1312,9 +1322,10 @@ rb_reg_search(VALUE re, VALUE str, int pos, int reverse)
 	}
     }
 
-    match = rb_backref_get();
-    if (NIL_P(match) || FL_TEST(match, MATCH_BUSY)) {
+    if (NIL_P(match)) {
 	match = match_alloc(rb_cMatch);
+	onig_region_copy(RMATCH_REGS(match), regs);
+	onig_region_free(regs, 0);
     }
     else {
 	if (rb_safe_level() >= 3)
@@ -1323,8 +1334,6 @@ rb_reg_search(VALUE re, VALUE str, int pos, int reverse)
 	    FL_UNSET(match, FL_TAINT);
     }
 
-    onig_region_copy(RMATCH_REGS(match), &regs);
-    onig_region_free(&regs, 0);
     RMATCH(match)->str = rb_str_new4(str);
     RMATCH(match)->regexp = re;
     RMATCH(match)->rmatch->char_offset_updated = 0;
@@ -3088,12 +3097,14 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
     int no, clen;
     rb_encoding *str_enc = rb_enc_get(str);
     rb_encoding *src_enc = rb_enc_get(src);
+    int acompat = rb_enc_asciicompat(str_enc);
+#define ASCGET(s,e,cl) (acompat ? (*cl=1,s[0]) : rb_enc_ascget(s, e, cl, str_enc))
 
     p = s = RSTRING_PTR(str);
     e = s + RSTRING_LEN(str);
 
     while (s < e) {
-        int c = rb_enc_ascget(s, e, &clen, str_enc);
+        int c = ASCGET(s, e, &clen);
 	char *ss;
 
 	if (c == -1) {
@@ -3110,7 +3121,7 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
 	}
         rb_enc_str_buf_cat(val, p, ss-p, str_enc);
 
-        c = rb_enc_ascget(s, e, &clen, str_enc);
+        c = ASCGET(s, e, &clen);
         if (c == -1) {
             s += mbclen(s, e, str_enc);
 	    rb_enc_str_buf_cat(val, ss, s-ss, str_enc);
@@ -3132,12 +3143,12 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp)
 	    break;
 
           case 'k':
-            if (s < e && rb_enc_ascget(s, e, &clen, str_enc) == '<') {
+            if (s < e && ASCGET(s, e, &clen) == '<') {
                 char *name, *name_end;
                
                 name_end = name = s + clen;
                 while (name_end < e) {
-                    c = rb_enc_ascget(name_end, e, &clen, str_enc);
+                    c = ASCGET(name_end, e, &clen);
                     if (c == '>') break;
                     name_end += c == -1 ? mbclen(name_end, e, str_enc) : clen;
                 }
diff --git a/regenc.h b/regenc.h
index 09c6da4783..317175f5df 100644
--- a/regenc.h
+++ b/regenc.h
@@ -70,7 +70,7 @@ typedef struct {
 #define ONIG_CHECK_NULL_RETURN(p)          if (ONIG_IS_NULL(p)) return NULL
 #define ONIG_CHECK_NULL_RETURN_VAL(p,val)  if (ONIG_IS_NULL(p)) return (val)
 
-#define enclen(enc,p,e)      ONIGENC_MBC_ENC_LEN(enc,p,e)
+#define enclen(enc,p,e) ((enc->max_enc_len == enc->min_enc_len) ? enc->min_enc_len : ONIGENC_MBC_ENC_LEN(enc,p,e))
 
 /* character types bit flag */
 #define BIT_CTYPE_NEWLINE  (1<< ONIGENC_CTYPE_NEWLINE)
diff --git a/regexec.c b/regexec.c
index b9947d8f93..a2d6993d08 100644
--- a/regexec.c
+++ b/regexec.c
@@ -2758,16 +2758,25 @@ slow_search(OnigEncoding enc, UChar* target, UChar* target_end,
 
   s = (UChar* )text;
 
+  if (enc->max_enc_len == enc->min_enc_len) {
+    int n = enc->max_enc_len;
+
+    while (s < end) {
+      if (*s == *target) {
+	p = s + 1;
+	t = target + 1;
+	if (memcmp(t, p, target_end - t) == 0)
+	  return s;
+      }
+      s += n;
+    }
+    return (UChar*)NULL;
+  }
   while (s < end) {
     if (*s == *target) {
       p = s + 1;
       t = target + 1;
-      while (t < target_end) {
-	if (*t != *p++)
-	  break;
-	t++;
-      }
-      if (t == target_end)
+      if (memcmp(t, p, target_end - t) == 0)
 	return s;
     }
     s += enclen(enc, s, end);
diff --git a/regparse.c b/regparse.c
index 1b2a0830ae..8d74efafea 100644
--- a/regparse.c
+++ b/regparse.c
@@ -253,12 +253,12 @@ strdup_with_null(OnigEncoding enc, UChar* s, UChar* end)
 #define PUNFETCH     p = pfetch_prev
 #define PINC       do { \
   pfetch_prev = p; \
-  p += ONIGENC_MBC_ENC_LEN(enc, p, end); \
+  p += enclen(enc, p, end); \
 } while (0)
 #define PFETCH(c)  do { \
-  c = ONIGENC_MBC_TO_CODE(enc, p, end); \
+  c = ((enc->max_enc_len == 1) ? *p : ONIGENC_MBC_TO_CODE(enc, p, end)); \
   pfetch_prev = p; \
-  p += ONIGENC_MBC_ENC_LEN(enc, p, end); \
+  p += enclen(enc, p, end); \
 } while (0)
 
 #define PPEEK        (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
diff --git a/string.c b/string.c
index 67f1e33b17..a80f60555f 100644
--- a/string.c
+++ b/string.c
@@ -256,7 +256,7 @@ rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc
 static inline void
 str_enc_copy(VALUE str1, VALUE str2)
 {
-    rb_enc_internal_set_index(str1, ENCODING_GET(str2));
+    rb_enc_set_index(str1, ENCODING_GET(str2));
 }
 
 static void
author	matz <matz@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>	2008-05-19 08:25:03 +0000
committer	matz <matz@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>	2008-05-19 08:25:03 +0000
commit	44cd8e457b808173147c499408ffc5e908f236dc (patch)
tree	357c92a92120e9e79054dbc58171b956e3d37e7f
parent	48a42a0387a0924955c8cf31f880bdecbf6023a8 (diff)
download	ruby-44cd8e457b808173147c499408ffc5e908f236dc.tar.gz