From 0ad460dde8b91ebda10df52ba54b512b98429a4e Mon Sep 17 00:00:00 2001 From: naruse Date: Mon, 7 Jul 2008 17:29:44 +0000 Subject: * dir.c: preserve encoding of strings in glob and fnmatch. * include/ruby/ruby.h: related changes. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@17950 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- dir.c | 245 +++++++++++++++++++++++++----------------------------------------- 1 file changed, 93 insertions(+), 152 deletions(-) (limited to 'dir.c') diff --git a/dir.c b/dir.c index 213d6cdb87..284a1ce352 100644 --- a/dir.c +++ b/dir.c @@ -80,96 +80,35 @@ char *strchr(char*,char); #define FNM_NOMATCH 1 #define FNM_ERROR 2 -#define downcase(c) (nocase && ISUPPER(c) ? TOLOWER(c) : (c)) -#define compare(c1, c2) (((unsigned char)(c1)) - ((unsigned char)(c2))) +# define Next(p, enc) (rb_enc_right_char_head(p, p+1, enc)) +# define Inc(p, enc) ((p) = Next(p, enc)) -/* caution: in case *p == '\0' - Next(p) == p + 1 in single byte environment - Next(p) == p in multi byte environment -*/ -#if defined(CharNext) -# define Next(p) CharNext(p) -#elif defined(DJGPP) -# define Next(p) ((p) + mblen(p, RUBY_MBCHAR_MAXSIZE)) -#elif defined(__EMX__) -# define Next(p) ((p) + emx_mblen(p)) -static inline int -emx_mblen(const char *p) -{ - int n = mblen(p, RUBY_MBCHAR_MAXSIZE); - return (n < 0) ? 1 : n; -} -#endif - -#ifndef Next /* single byte environment */ -# define Next(p) ((p) + 1) -# define Inc(p) (++(p)) -# define Compare(p1, p2) (compare(downcase(*(p1)), downcase(*(p2)))) -#else /* multi byte environment */ -# define Inc(p) ((p) = Next(p)) -# define Compare(p1, p2) (CompareImpl(p1, p2, nocase)) static int -CompareImpl(const char *p1, const char *p2, int nocase) +char_casecmp(const char *p1, const char *p2, rb_encoding *enc, const int nocase) { - const int len1 = Next(p1) - p1; - const int len2 = Next(p2) - p2; -#ifdef _WIN32 - char buf1[10], buf2[10]; /* large enough? */ -#endif - - if (len1 < 0 || len2 < 0) { - rb_fatal("CompareImpl: negative len"); - } - - if (len1 == 0) return len2; - if (len2 == 0) return -len1; - -#ifdef _WIN32 - if (nocase && rb_w32_iswinnt()) { - if (len1 > 1) { - if (len1 >= sizeof(buf1)) { - rb_fatal("CompareImpl: too large len"); - } - memcpy(buf1, p1, len1); - buf1[len1] = '\0'; - CharLower(buf1); - p1 = buf1; /* trick */ - } - if (len2 > 1) { - if (len2 >= sizeof(buf2)) { - rb_fatal("CompareImpl: too large len"); - } - memcpy(buf2, p2, len2); - buf2[len2] = '\0'; - CharLower(buf2); - p2 = buf2; /* trick */ - } + const char *p1end, *p2end; + + if (!*p1) return *p1; + if (!*p2) return -*p2; + p1end = p1 + strlen(p1); + p2end = p2 + strlen(p2); + int c1 = rb_enc_codepoint(p1, p1end, enc); + int c2 = rb_enc_codepoint(p2, p2end, enc); + + if (c1 == c2) return 0; + if (nocase) { + c1 = rb_enc_toupper(c1, enc); + c2 = rb_enc_toupper(c2, enc); } -#endif - if (len1 == 1) - if (len2 == 1) - return compare(downcase(*p1), downcase(*p2)); - else { - const int ret = compare(downcase(*p1), *p2); - return ret ? ret : -1; - } - else - if (len2 == 1) { - const int ret = compare(*p1, downcase(*p2)); - return ret ? ret : 1; - } - else { - const int ret = memcmp(p1, p2, len1 < len2 ? len1 : len2); - return ret ? ret : len1 - len2; - } + return c1 - c2; } -#endif /* environment */ static char * bracket( const char *p, /* pattern (next to '[') */ const char *s, /* string */ - int flags) + int flags, + rb_encoding *enc) { const int nocase = flags & FNM_CASEFOLD; const int escape = !(flags & FNM_NOESCAPE); @@ -187,19 +126,19 @@ bracket( t1++; if (!*t1) return NULL; - p = Next(t1); + p = Next(t1, enc); if (p[0] == '-' && p[1] != ']') { const char *t2 = p + 1; if (escape && *t2 == '\\') t2++; if (!*t2) return NULL; - p = Next(t2); - if (!ok && Compare(t1, s) <= 0 && Compare(s, t2) <= 0) + p = Next(t2, enc); + if (!ok && char_casecmp(t1, s, enc, nocase) <= 0 && char_casecmp(s, t2, enc, nocase) <= 0) ok = 1; } else - if (!ok && Compare(t1, s) == 0) + if (!ok && char_casecmp(t1, s, enc, nocase) == 0) ok = 1; } @@ -219,7 +158,8 @@ static int fnmatch_helper( const char **pcur, /* pattern */ const char **scur, /* string */ - int flags) + int flags, + rb_encoding *enc) { const int period = !(flags & FNM_DOTMATCH); const int pathname = flags & FNM_PATHNAME; @@ -253,16 +193,16 @@ fnmatch_helper( if (ISEND(s)) RETURN(FNM_NOMATCH); p++; - Inc(s); + Inc(s, enc); continue; case '[': { const char *t; if (ISEND(s)) RETURN(FNM_NOMATCH); - if ((t = bracket(p + 1, s, flags)) != 0) { + if ((t = bracket(p + 1, s, flags, enc)) != 0) { p = t; - Inc(s); + Inc(s, enc); continue; } goto failed; @@ -275,16 +215,16 @@ fnmatch_helper( RETURN(ISEND(p) ? 0 : FNM_NOMATCH); if (ISEND(p)) goto failed; - if (Compare(p, s) != 0) + if (char_casecmp(p, s, enc, nocase) != 0) goto failed; - Inc(p); - Inc(s); + Inc(p, enc); + Inc(s, enc); continue; failed: /* try next '*' position */ if (ptmp && stmp) { p = ptmp; - Inc(stmp); /* !ISEND(*stmp) */ + Inc(stmp, enc); /* !ISEND(*stmp) */ s = stmp; continue; } @@ -294,12 +234,15 @@ fnmatch_helper( static int fnmatch( - const char *p, /* pattern */ - const char *s, /* string */ + VALUE pattern, + VALUE string, int flags) { + const char *p = RSTRING_PTR(pattern); /* pattern */ + const char *s = RSTRING_PTR(string); /* string */ const int period = !(flags & FNM_DOTMATCH); const int pathname = flags & FNM_PATHNAME; + rb_encoding *enc = rb_enc_get(pattern); const char *ptmp = 0; const char *stmp = 0; @@ -311,8 +254,8 @@ fnmatch( ptmp = p; stmp = s; } - if (fnmatch_helper(&p, &s, flags) == 0) { - while (*s && *s != '/') Inc(s); + if (fnmatch_helper(&p, &s, flags, enc) == 0) { + while (*s && *s != '/') Inc(s, enc); if (*p && *s) { p++; s++; @@ -323,7 +266,7 @@ fnmatch( } /* failed : try next recursion */ if (ptmp && stmp && !(period && *stmp == '.')) { - while (*stmp && *stmp != '/') Inc(stmp); + while (*stmp && *stmp != '/') Inc(stmp, enc); if (*stmp) { p = ptmp; stmp++; @@ -335,7 +278,7 @@ fnmatch( } } else - return fnmatch_helper(&p, &s, flags); + return fnmatch_helper(&p, &s, flags, enc); } VALUE rb_cDir; @@ -1030,7 +973,7 @@ do_opendir(const char *path, int flags) /* Return nonzero if S has any special globbing chars in it. */ static int -has_magic(const char *s, int flags) +has_magic(const char *s, int flags, rb_encoding *enc) { const int escape = !(flags & FNM_NOESCAPE); const int nocase = flags & FNM_CASEFOLD; @@ -1055,7 +998,7 @@ has_magic(const char *s, int flags) return 1; } - p = Next(p-1); + p = Next(p-1, enc); } return 0; @@ -1063,7 +1006,7 @@ has_magic(const char *s, int flags) /* Find separator in globbing pattern. */ static char * -find_dirsep(const char *s, int flags) +find_dirsep(const char *s, int flags, rb_encoding *enc) { const int escape = !(flags & FNM_NOESCAPE); @@ -1091,7 +1034,7 @@ find_dirsep(const char *s, int flags) continue; } - p = Next(p-1); + p = Next(p-1, enc); } return (char *)p-1; @@ -1099,7 +1042,7 @@ find_dirsep(const char *s, int flags) /* Remove escaping backslashes */ static void -remove_backslashes(char *p) +remove_backslashes(char *p, rb_encoding *enc) { char *t = p; char *s = p; @@ -1112,7 +1055,7 @@ remove_backslashes(char *p) s = ++p; if (!*p) break; } - Inc(p); + Inc(p, enc); } while (*p++); @@ -1133,7 +1076,7 @@ struct glob_pattern { static void glob_free_pattern(struct glob_pattern *list); static struct glob_pattern * -glob_make_pattern(const char *p, int flags) +glob_make_pattern(const char *p, int flags, rb_encoding *enc) { struct glob_pattern *list, *tmp, **tail = &list; int dirsep = 0; /* pattern is terminated with '/' */ @@ -1149,7 +1092,7 @@ glob_make_pattern(const char *p, int flags) dirsep = 1; } else { - const char *m = find_dirsep(p, flags); + const char *m = find_dirsep(p, flags, enc); char *buf = GLOB_ALLOC_N(char, m-p+1); if (!buf) { GLOB_FREE(tmp); @@ -1157,7 +1100,7 @@ glob_make_pattern(const char *p, int flags) } memcpy(buf, p, m-p); buf[m-p] = '\0'; - tmp->type = has_magic(buf, flags) ? MAGICAL : PLAIN; + tmp->type = has_magic(buf, flags, enc) ? MAGICAL : PLAIN; tmp->str = buf; if (*m) { dirsep = 1; @@ -1230,8 +1173,8 @@ enum answer { YES, NO, UNKNOWN }; #endif struct glob_args { - void (*func)(const char *, VALUE); - const char *path; + void (*func)(VALUE, VALUE); + VALUE path; VALUE value; }; @@ -1248,7 +1191,7 @@ glob_func_caller(VALUE val) static int glob_helper( - const char *path, + VALUE vpath, int dirsep, /* '/' should be placed before appending child entry's name to 'path'. */ enum answer exist, /* Does 'path' indicate an existing entry? */ enum answer isdir, /* Does 'path' indicate a directory or a symlink to a directory? */ @@ -1258,6 +1201,7 @@ glob_helper( ruby_glob_func *func, VALUE arg) { + const char *path = RSTRING_PTR(vpath); struct stat st; int status = 0; struct glob_pattern **cur, **new_beg, **new_end; @@ -1310,13 +1254,13 @@ glob_helper( } } if (match_all && exist == YES) { - status = glob_call_func(func, path, arg); + status = glob_call_func(func, rb_tainted_str_new2(path), arg); if (status) return status; } if (match_dir && isdir == YES) { char *tmp = join_path(path, dirsep, ""); if (!tmp) return -1; - status = glob_call_func(func, tmp, arg); + status = glob_call_func(func, rb_tainted_str_new2(tmp), arg); GLOB_FREE(tmp); if (status) return status; } @@ -1338,7 +1282,7 @@ glob_helper( break; } if (recursive && strcmp(dp->d_name, ".") != 0 && strcmp(dp->d_name, "..") != 0 - && fnmatch("*", dp->d_name, flags) == 0) { + && fnmatch(rb_usascii_str_new2("*"), rb_str_new2(dp->d_name), flags) == 0) { #ifndef _WIN32 if (do_lstat(buf, &st, flags) == 0) new_isdir = S_ISDIR(st.st_mode) ? YES : S_ISLNK(st.st_mode) ? UNKNOWN : NO; @@ -1364,12 +1308,13 @@ glob_helper( p = p->next; /* 0 times recursion */ } if (p->type == PLAIN || p->type == MAGICAL) { - if (fnmatch(p->str, dp->d_name, flags) == 0) + if (fnmatch(rb_str_new2(p->str), rb_str_new2(dp->d_name), flags) == 0) *new_end++ = p->next; } } - status = glob_helper(buf, 1, YES, new_isdir, new_beg, new_end, flags, func, arg); + status = glob_helper(rb_enc_str_new(buf, strlen(buf), rb_enc_get(vpath)), 1, + YES, new_isdir, new_beg, new_end, flags, func, arg); GLOB_FREE(buf); GLOB_FREE(new_beg); if (status) break; @@ -1395,7 +1340,7 @@ glob_helper( break; } strcpy(name, (*cur)->str); - if (escape) remove_backslashes(name); + if (escape) remove_backslashes(name, rb_enc_get(vpath)); new_beg = new_end = GLOB_ALLOC_N(struct glob_pattern *, end - beg); if (!new_beg) { @@ -1405,7 +1350,7 @@ glob_helper( } *new_end++ = (*cur)->next; for (cur2 = cur + 1; cur2 < copy_end; ++cur2) { - if (*cur2 && fnmatch((*cur2)->str, name, flags) == 0) { + if (*cur2 && fnmatch(rb_str_new2((*cur2)->str), rb_str_new2(name), flags) == 0) { *new_end++ = (*cur2)->next; *cur2 = 0; } @@ -1418,7 +1363,8 @@ glob_helper( status = -1; break; } - status = glob_helper(buf, 1, UNKNOWN, UNKNOWN, new_beg, new_end, flags, func, arg); + status = glob_helper(rb_enc_str_new(buf, strlen(buf), rb_enc_get(vpath)), 1, + UNKNOWN, UNKNOWN, new_beg, new_end, flags, func, arg); GLOB_FREE(buf); GLOB_FREE(new_beg); if (status) break; @@ -1432,15 +1378,14 @@ glob_helper( } static int -ruby_glob0(const char *path, int flags, ruby_glob_func *func, VALUE arg) +ruby_glob0(VALUE path, int flags, ruby_glob_func *func, VALUE arg) { struct glob_pattern *list; const char *root, *start; - char *buf; - int n; int status; + VALUE buf; - start = root = path; + start = root = RSTRING_PTR(path); flags |= FNM_SYSCASE; #if defined DOSISH root = rb_path_skip_prefix(root); @@ -1448,20 +1393,14 @@ ruby_glob0(const char *path, int flags, ruby_glob_func *func, VALUE arg) if (root && *root == '/') root++; - n = root - start; - buf = GLOB_ALLOC_N(char, n + 1); - if (!buf) return -1; - MEMCPY(buf, start, char, n); - buf[n] = '\0'; + buf = rb_enc_str_new(start, root - start, rb_enc_get(path)); - list = glob_make_pattern(root, flags); + list = glob_make_pattern(root, flags, rb_enc_get(path)); if (!list) { - GLOB_FREE(buf); return -1; } status = glob_helper(buf, 0, UNKNOWN, UNKNOWN, &list, &list + 1, flags, func, arg); glob_free_pattern(list); - GLOB_FREE(buf); return status; } @@ -1469,11 +1408,11 @@ ruby_glob0(const char *path, int flags, ruby_glob_func *func, VALUE arg) int ruby_glob(const char *path, int flags, ruby_glob_func *func, VALUE arg) { - return ruby_glob0(path, flags & ~GLOB_VERBOSE, func, arg); + return ruby_glob0(rb_str_new2(path), flags & ~GLOB_VERBOSE, func, arg); } static int -rb_glob_caller(const char *path, VALUE a) +rb_glob_caller(VALUE path, VALUE a) { int status; struct glob_args *args = (struct glob_args *)a; @@ -1484,7 +1423,7 @@ rb_glob_caller(const char *path, VALUE a) } static int -rb_glob2(const char *path, int flags, void (*func)(const char *, VALUE), VALUE arg) +rb_glob2(VALUE path, int flags, void (*func)(VALUE, VALUE), VALUE arg) { struct glob_args args; @@ -1499,20 +1438,21 @@ rb_glob2(const char *path, int flags, void (*func)(const char *, VALUE), VALUE a } void -rb_glob(const char *path, void (*func)(const char *, VALUE), VALUE arg) +rb_glob(const char *path, void (*func)(VALUE, VALUE), VALUE arg) { - int status = rb_glob2(path, 0, func, arg); + int status = rb_glob2(rb_str_new2(path), 0, func, arg); if (status) GLOB_JUMP_TAG(status); } static void -push_pattern(const char *path, VALUE ary) +push_pattern(VALUE path, VALUE ary) { - rb_ary_push(ary, rb_tainted_str_new2(path)); + OBJ_TAINT(path); + rb_ary_push(ary, path); } -int -ruby_brace_expand(const char *str, int flags, ruby_glob_func *func, VALUE arg) +static int +ruby_brace_expand(const char *str, int flags, ruby_glob_func *func, VALUE arg, rb_encoding *enc) { const int escape = !(flags & FNM_NOESCAPE); const char *p = str; @@ -1531,7 +1471,7 @@ ruby_brace_expand(const char *str, int flags, ruby_glob_func *func, VALUE arg) if (*p == '\\' && escape) { if (!*++p) break; } - Inc(p); + Inc(p, enc); } if (lbrace && rbrace) { @@ -1551,17 +1491,17 @@ ruby_brace_expand(const char *str, int flags, ruby_glob_func *func, VALUE arg) if (*p == '\\' && escape) { if (++p == rbrace) break; } - Inc(p); + Inc(p, enc); } memcpy(buf+shift, t, p-t); strcpy(buf+shift+(p-t), rbrace+1); - status = ruby_brace_expand(buf, flags, func, arg); + status = ruby_brace_expand(buf, flags, func, arg, enc); if (status) break; } GLOB_FREE(buf); } else if (!lbrace && !rbrace) { - status = (*func)(s, arg); + status = (*func)(rb_enc_str_new(s, strlen(s), enc), arg); } return status; @@ -1574,7 +1514,7 @@ struct brace_args { }; static int -glob_brace(const char *path, VALUE val) +glob_brace(VALUE path, VALUE val) { struct brace_args *arg = (struct brace_args *)val; @@ -1582,24 +1522,24 @@ glob_brace(const char *path, VALUE val) } static int -ruby_brace_glob0(const char *str, int flags, ruby_glob_func *func, VALUE arg) +ruby_brace_glob0(VALUE str, int flags, ruby_glob_func *func, VALUE arg) { struct brace_args args; args.func = func; args.value = arg; args.flags = flags; - return ruby_brace_expand(str, flags, glob_brace, (VALUE)&args); + return ruby_brace_expand(RSTRING_PTR(str), flags, glob_brace, (VALUE)&args, rb_enc_get(str)); } int ruby_brace_glob(const char *str, int flags, ruby_glob_func *func, VALUE arg) { - return ruby_brace_glob0(str, flags & ~GLOB_VERBOSE, func, arg); + return ruby_brace_glob0(rb_str_new2(str), flags & ~GLOB_VERBOSE, func, arg); } static int -push_glob(VALUE ary, const char *str, int flags) +push_glob(VALUE ary, VALUE str, int flags) { struct glob_args args; @@ -1618,11 +1558,12 @@ rb_push_glob(VALUE str, int flags) /* '\0' is delimiter */ ary = rb_ary_new(); while (offset < RSTRING_LEN(str)) { - int status = push_glob(ary, RSTRING_PTR(str) + offset, flags); char *p, *pend; + p = RSTRING_PTR(str) + offset; + int status = push_glob(ary, + rb_enc_str_new(p, strlen(p), rb_enc_get(str)), flags); if (status) GLOB_JUMP_TAG(status); if (offset >= RSTRING_LEN(str)) break; - p = RSTRING_PTR(str) + offset; p += strlen(p) + 1; pend = RSTRING_PTR(str) + RSTRING_LEN(str); while (p < pend && !*p) @@ -1643,7 +1584,7 @@ dir_globs(long argc, VALUE *argv, int flags) int status; VALUE str = argv[i]; StringValue(str); - status = push_glob(ary, RSTRING_PTR(str), flags); + status = push_glob(ary, str, flags); if (status) GLOB_JUMP_TAG(status); } @@ -1914,7 +1855,7 @@ file_s_fnmatch(int argc, VALUE *argv, VALUE obj) StringValue(pattern); FilePathStringValue(path); - if (fnmatch(RSTRING_PTR(pattern), RSTRING_PTR(path), flags) == 0) + if (fnmatch(pattern, path, flags) == 0) return Qtrue; return Qfalse; -- cgit v1.2.3