diff options
author | akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2007-12-08 02:50:43 +0000 |
---|---|---|
committer | akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2007-12-08 02:50:43 +0000 |
commit | f1b7e60cb90a7e1a392d4ffccd07dd06eeff5345 (patch) | |
tree | 8135b8dc1f1ef8a6bcd08a86c4106c83941780fa /re.c | |
parent | 990bec97020bfabd09ebfd92581f505b4f09a78a (diff) | |
download | ruby-f1b7e60cb90a7e1a392d4ffccd07dd06eeff5345.tar.gz |
* encoding.c (rb_enc_mbclen): make it never fail.
(rb_enc_nth): don't check the return value of rb_enc_mbclen.
(rb_enc_strlen): ditto.
(rb_enc_precise_mbclen): return needmore(1) if e <= p.
(rb_enc_get_ascii): new function for extracting ASCII character.
* include/ruby/encoding.h (rb_enc_get_ascii): declared.
* include/ruby/regex.h (ismbchar): removed.
* re.c (rb_reg_expr_str): use rb_enc_get_ascii.
(unescape_escaped_nonascii): use rb_enc_precise_mbclen to determine
the termination of escaped non-ASCII character.
(unescape_nonascii): use rb_enc_precise_mbclen.
(rb_reg_quote): use rb_enc_get_ascii.
(rb_reg_regsub): use rb_enc_get_ascii.
* string.c (rb_str_reverse): don't check the return value of
rb_enc_mbclen.
(rb_str_split_m): don't call rb_enc_mbclen with e <= p.
* parse.y (is_identchar): use ISASCII.
(parser_ismbchar): removed.
(parser_precise_mbclen): new macro.
(parser_isascii): new macro.
(parser_tokadd_mbchar): use parser_precise_mbclen to check invalid
character precisely.
(parser_tokadd_string): use parser_isascii.
(parser_yylex): ditto.
(is_special_global_name): don't call is_identchar with e <= p.
(rb_enc_symname_p): ditto.
[ruby-dev:32455]
* ext/tk/sample/tkextlib/vu/canvSticker2.rb: remove coding cookie
because the encoding is not UTF-8. [ruby-dev:32475]
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14131 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 're.c')
-rw-r--r-- | re.c | 54 |
1 files changed, 30 insertions, 24 deletions
@@ -218,10 +218,12 @@ rb_reg_expr_str(VALUE str, const char *s, long len) rb_encoding *enc = rb_enc_get(str); const char *p, *pend; int need_escape = 0; + int c; p = s; pend = p + len; while (p<pend) { - if (*p == '/' || (!rb_enc_isprint(*p, enc) && !ismbchar(p, pend, enc))) { + c = rb_enc_get_ascii(p, pend, enc); + if (c == '/' || (c != -1 && !rb_enc_isprint(c, enc))) { need_escape = 1; break; } @@ -233,29 +235,31 @@ rb_reg_expr_str(VALUE str, const char *s, long len) else { p = s; while (p<pend) { - if (*p == '\\') { + c = rb_enc_get_ascii(p, pend, enc); + if (c == '\\') { int n = mbclen(p+1, pend, enc) + 1; rb_str_buf_cat(str, p, n); p += n; continue; } - else if (*p == '/') { + else if (c == '/') { char c = '\\'; rb_str_buf_cat(str, &c, 1); rb_str_buf_cat(str, p, 1); } - else if (ismbchar(p, pend, enc)) { - rb_str_buf_cat(str, p, mbclen(p, pend, enc)); - p += mbclen(p, pend, enc); + else if (c == -1) { + int l = mbclen(p, pend, enc); + rb_str_buf_cat(str, p, l); + p += l; continue; } - else if (rb_enc_isprint(*p, enc)) { + else if (rb_enc_isprint(c, enc)) { rb_str_buf_cat(str, p, 1); } - else if (!rb_enc_isspace(*p, enc)) { + else if (!rb_enc_isspace(c, enc)) { char b[8]; - sprintf(b, "\\%03o", *p & 0377); + sprintf(b, "\\%03o", c); rb_str_buf_cat(str, b, 4); } else { @@ -1377,6 +1381,7 @@ unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc, char *chbuf = ALLOCA_N(char, chmaxlen); int chlen = 0; int byte; + int l; memset(chbuf, 0, chmaxlen); @@ -1386,7 +1391,8 @@ unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc, } chbuf[chlen++] = byte; - while (chlen < chmaxlen && chlen != mbclen(chbuf, chbuf+chlen, enc)) { + while (chlen < chmaxlen && + MBCLEN_NEEDMORE(rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc))) { byte = read_escaped_byte(&p, end, err); if (byte == -1) { return -1; @@ -1394,11 +1400,11 @@ unescape_escaped_nonascii(const char **pp, const char *end, rb_encoding *enc, chbuf[chlen++] = byte; } - if (chlen 
!= mbclen(chbuf, chbuf+chlen, enc)) { + l = rb_enc_precise_mbclen(chbuf, chbuf+chlen, enc); + if (MBCLEN_INVALID(l)) { strcpy(err, "invalid multibyte escape"); return -1; } - if (1 < chlen || (chbuf[0] & 0x80)) { rb_str_buf_cat(buf, chbuf, chlen); @@ -1515,13 +1521,12 @@ unescape_nonascii(const char *p, const char *end, rb_encoding *enc, char smallbuf[2]; while (p < end) { - int chlen = mbclen(p, end, enc); + int chlen = rb_enc_precise_mbclen(p, end, enc); + if (!MBCLEN_CHARFOUND(chlen)) { + strcpy(err, "invalid multibyte character"); + return -1; + } if (1 < chlen || (*p & 0x80)) { - if (end < p + chlen) { - strcpy(err, "too short multibyte character"); - return -1; - } - /* xxx: validate the non-ascii character */ rb_str_buf_cat(buf, p, chlen); p += chlen; if (*encp == 0) @@ -2093,8 +2098,8 @@ rb_reg_quote(VALUE str) s = RSTRING_PTR(str); send = s + RSTRING_LEN(str); for (; s < send; s++) { - c = *s; - if (ismbchar(s, send, enc)) { + c = rb_enc_get_ascii(s, send, enc); + if (c == -1) { int n = mbclen(s, send, enc); while (n-- && s < send) @@ -2129,8 +2134,8 @@ rb_reg_quote(VALUE str) t += s - RSTRING_PTR(str); for (; s < send; s++) { - c = *s; - if (ismbchar(s, send, enc)) { + c = rb_enc_get_ascii(s, send, enc); + if (c == -1) { int n = mbclen(s, send, enc); while (n-- && s < send) @@ -2397,13 +2402,14 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) e = s + RSTRING_LEN(str); while (s < e) { + int c = rb_enc_get_ascii(s, e, enc); char *ss = s++; - if (ismbchar(ss, e, enc)) { + if (c == -1) { s += mbclen(ss, e, enc) - 1; continue; } - if (*ss != '\\' || s == e) continue; + if (c != '\\' || s == e) continue; if (!val) { val = rb_str_buf_new(ss-p); |