about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 7
-rw-r--r-- io.c 232
-rw-r--r-- test/ruby/test_io_m17n.rb 71
3 files changed, 209 insertions, 101 deletions
diff --git a/ChangeLog b/ChangeLog
index 7a2ec2a774..e19f398466 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+Mon Aug 18 12:12:29 2008 Tanaka Akira <akr@fsij.org>
+
+ * io.c (io_shift_crbuf): add strp argument to append into existing
+ string.
+ (read_all): use econv if enc2 is set.
+ (io_getc): follow the io_shift_crbuf change.
+
Mon Aug 18 10:35:25 2008 Tanaka Akira <akr@fsij.org>
* io.c (io_enc_str_converted): new function.
diff --git a/io.c b/io.c
index 34b86439d6..45cd8049a1 100644
--- a/io.c
+++ b/io.c
@@ -1406,14 +1406,136 @@ io_enc_str_converted(VALUE str, rb_io_t *fptr)
return str;
}
+static void
+make_readconv(rb_io_t *fptr)
+{
+ if (!fptr->readconv) {
+ fptr->readconv = rb_econv_open(fptr->enc2->name, fptr->enc->name, 0);
+ if (!fptr->readconv)
+ rb_raise(rb_eIOError, "code converter open failed (%s to %s)", fptr->enc->name, fptr->enc2->name);
+ fptr->crbuf_off = 0;
+ fptr->crbuf_len = 0;
+ fptr->crbuf_capa = 1024;
+ fptr->crbuf = ALLOC_N(char, fptr->crbuf_capa);
+ }
+}
+
+static int
+more_char(rb_io_t *fptr)
+{
+ const unsigned char *ss, *sp, *se;
+ unsigned char *ds, *dp, *de;
+ rb_econv_result_t res;
+ int putbackable;
+ int crbuf_len0;
+
+ if (fptr->crbuf_len == fptr->crbuf_capa)
+ return 0; /* crbuf full */
+ if (fptr->crbuf_len == 0)
+ fptr->crbuf_off = 0;
+ else if (fptr->crbuf_off + fptr->crbuf_len == fptr->crbuf_capa) {
+ memmove(fptr->crbuf, fptr->crbuf+fptr->crbuf_off, fptr->crbuf_len);
+ fptr->crbuf_off = 0;
+ }
+
+ crbuf_len0 = fptr->crbuf_len;
+
+ while (1) {
+ ss = sp = (const unsigned char *)fptr->rbuf + fptr->rbuf_off;
+ se = sp + fptr->rbuf_len;
+ ds = dp = (unsigned char *)fptr->crbuf + fptr->crbuf_off + fptr->crbuf_len;
+ de = (unsigned char *)fptr->crbuf + fptr->crbuf_capa;
+ res = rb_econv_convert(fptr->readconv, &sp, se, &dp, de, ECONV_PARTIAL_INPUT|ECONV_OUTPUT_FOLLOWED_BY_INPUT);
+ fptr->rbuf_off += sp - ss;
+ fptr->rbuf_len -= sp - ss;
+ fptr->crbuf_len += dp - ds;
+
+ putbackable = rb_econv_putbackable(fptr->readconv);
+ if (putbackable) {
+ rb_econv_putback(fptr->readconv, (unsigned char *)fptr->rbuf + fptr->rbuf_off - putbackable, putbackable);
+ fptr->rbuf_off -= putbackable;
+ fptr->rbuf_len += putbackable;
+ }
+
+ rb_econv_check_error(fptr->readconv);
+
+ if (crbuf_len0 != fptr->crbuf_len)
+ return 0;
+
+ if (res == econv_finished)
+ return -1;
+
+ if (res == econv_source_buffer_empty) {
+ if (fptr->rbuf_len == 0) {
+ rb_thread_wait_fd(fptr->fd);
+ rb_io_check_closed(fptr);
+ if (io_fillbuf(fptr) == -1) {
+ ds = dp = (unsigned char *)fptr->crbuf + fptr->crbuf_off + fptr->crbuf_len;
+ de = (unsigned char *)fptr->crbuf + fptr->crbuf_capa;
+ res = rb_econv_convert(fptr->readconv, NULL, NULL, &dp, de, 0);
+ fptr->crbuf_len += dp - ds;
+ rb_econv_check_error(fptr->readconv);
+ }
+ }
+ }
+ }
+}
+
+static VALUE
+io_shift_crbuf(rb_io_t *fptr, int len, VALUE *strp)
+{
+ VALUE str;
+ if (NIL_P(*strp)) {
+ *strp = str = rb_str_new(fptr->crbuf+fptr->crbuf_off, len);
+ }
+ else {
+ size_t slen;
+ str = *strp;
+ slen = RSTRING_LEN(str);
+ rb_str_resize(str, RSTRING_LEN(str) + len);
+ memcpy(RSTRING_PTR(str)+slen, fptr->crbuf+fptr->crbuf_off, len);
+ }
+ fptr->crbuf_off += len;
+ fptr->crbuf_len -= len;
+ OBJ_TAINT(str);
+ rb_enc_associate(str, fptr->enc);
+ /* xxx: set coderange */
+ if (fptr->crbuf_len == 0)
+ fptr->crbuf_off = 0;
+ if (fptr->crbuf_off < fptr->crbuf_capa/2) {
+ memmove(fptr->crbuf, fptr->crbuf+fptr->crbuf_off, fptr->crbuf_len);
+ fptr->crbuf_off = 0;
+ }
+ return str;
+}
+
static VALUE
read_all(rb_io_t *fptr, long siz, VALUE str)
{
- long bytes = 0;
+ long bytes;
long n;
- long pos = 0;
- rb_encoding *enc = io_read_encoding(fptr);
- int cr = fptr->enc2 ? ENC_CODERANGE_BROKEN : 0;
+ long pos;
+ rb_encoding *enc;
+ int cr;
+
+ if (fptr->enc2) {
+ VALUE str = rb_str_new(NULL, 0);
+ make_readconv(fptr);
+ while (1) {
+ if (fptr->crbuf_len) {
+ io_shift_crbuf(fptr, fptr->crbuf_len, &str);
+ }
+ if (more_char(fptr) == -1) {
+ return io_enc_str_converted(str, fptr);
+ }
+ }
+ }
+
+ bytes = 0;
+ pos = 0;
+
+ enc = io_read_encoding(fptr);
+ cr = fptr->enc2 ? ENC_CODERANGE_BROKEN : 0;
if (siz == 0) siz = BUFSIZ;
if (NIL_P(str)) {
@@ -1744,81 +1866,6 @@ rscheck(const char *rsptr, long rslen, VALUE rs)
rb_raise(rb_eRuntimeError, "rs modified");
}
-static void
-make_readconv(rb_io_t *fptr)
-{
- if (!fptr->readconv) {
- fptr->readconv = rb_econv_open(fptr->enc2->name, fptr->enc->name, 0);
- if (!fptr->readconv)
- rb_raise(rb_eIOError, "code converter open failed (%s to %s)", fptr->enc->name, fptr->enc2->name);
- fptr->crbuf_off = 0;
- fptr->crbuf_len = 0;
- fptr->crbuf_capa = 1024;
- fptr->crbuf = ALLOC_N(char, fptr->crbuf_capa);
- }
-}
-
-static int
-more_char(rb_io_t *fptr)
-{
- const unsigned char *ss, *sp, *se;
- unsigned char *ds, *dp, *de;
- rb_econv_result_t res;
- int putbackable;
- int crbuf_len0;
-
- if (fptr->crbuf_len == fptr->crbuf_capa)
- return 0; /* crbuf full */
- if (fptr->crbuf_len == 0)
- fptr->crbuf_off = 0;
- else if (fptr->crbuf_off + fptr->crbuf_len == fptr->crbuf_capa) {
- memmove(fptr->crbuf, fptr->crbuf+fptr->crbuf_off, fptr->crbuf_len);
- fptr->crbuf_off = 0;
- }
-
- crbuf_len0 = fptr->crbuf_len;
-
- while (1) {
- ss = sp = (const unsigned char *)fptr->rbuf + fptr->rbuf_off;
- se = sp + fptr->rbuf_len;
- ds = dp = (unsigned char *)fptr->crbuf + fptr->crbuf_off + fptr->crbuf_len;
- de = (unsigned char *)fptr->crbuf + fptr->crbuf_capa;
- res = rb_econv_convert(fptr->readconv, &sp, se, &dp, de, ECONV_PARTIAL_INPUT|ECONV_OUTPUT_FOLLOWED_BY_INPUT);
- fptr->rbuf_off += sp - ss;
- fptr->rbuf_len -= sp - ss;
- fptr->crbuf_len += dp - ds;
-
- putbackable = rb_econv_putbackable(fptr->readconv);
- if (putbackable) {
- rb_econv_putback(fptr->readconv, (unsigned char *)fptr->rbuf + fptr->rbuf_off - putbackable, putbackable);
- fptr->rbuf_off -= putbackable;
- fptr->rbuf_len += putbackable;
- }
-
- rb_econv_check_error(fptr->readconv);
-
- if (crbuf_len0 != fptr->crbuf_len)
- return 0;
-
- if (res == econv_finished)
- return -1;
-
- if (res == econv_source_buffer_empty) {
- if (fptr->rbuf_len == 0) {
- rb_thread_wait_fd(fptr->fd);
- rb_io_check_closed(fptr);
- if (io_fillbuf(fptr) == -1) {
- ds = dp = (unsigned char *)fptr->crbuf + fptr->crbuf_off + fptr->crbuf_len;
- de = (unsigned char *)fptr->crbuf + fptr->crbuf_capa;
- res = rb_econv_convert(fptr->readconv, NULL, NULL, &dp, de, 0);
- fptr->crbuf_len += dp - ds;
- rb_econv_check_error(fptr->readconv);
- }
- }
- }
- }
-}
-
static int
appendline(rb_io_t *fptr, int delim, VALUE *strp, long *lp)
{
@@ -2356,31 +2403,14 @@ rb_io_each_byte(VALUE io)
}
static VALUE
-io_shift_crbuf(rb_io_t *fptr, int len)
-{
- VALUE str;
- str = rb_str_new(fptr->crbuf+fptr->crbuf_off, len);
- fptr->crbuf_off += len;
- fptr->crbuf_len -= len;
- OBJ_TAINT(str);
- rb_enc_associate(str, fptr->enc);
- /* xxx: set coderange */
- if (fptr->crbuf_len == 0)
- fptr->crbuf_off = 0;
- if (fptr->crbuf_off < fptr->crbuf_capa/2) {
- memmove(fptr->crbuf, fptr->crbuf+fptr->crbuf_off, fptr->crbuf_len);
- fptr->crbuf_off = 0;
- }
- return str;
-}
-
-static VALUE
io_getc(rb_io_t *fptr, rb_encoding *enc)
{
int r, n, cr = 0;
VALUE str;
if (fptr->enc2) {
+ VALUE str = Qnil;
+
if (!fptr->readconv) {
make_readconv(fptr);
}
@@ -2401,16 +2431,16 @@ io_getc(rb_io_t *fptr, rb_encoding *enc)
if (fptr->crbuf_len == 0)
return Qnil;
/* return an incomplete character just before EOF */
- return io_shift_crbuf(fptr, fptr->crbuf_len);
+ return io_shift_crbuf(fptr, fptr->crbuf_len, &str);
}
}
if (MBCLEN_INVALID_P(r)) {
r = rb_enc_mbclen(fptr->crbuf+fptr->crbuf_off,
fptr->crbuf+fptr->crbuf_off+fptr->crbuf_len,
fptr->enc);
- return io_shift_crbuf(fptr, r);
+ return io_shift_crbuf(fptr, r, &str);
}
- return io_shift_crbuf(fptr, MBCLEN_CHARFOUND_LEN(r));
+ return io_shift_crbuf(fptr, MBCLEN_CHARFOUND_LEN(r), &str);
}
if (io_fillbuf(fptr) < 0) {
diff --git a/test/ruby/test_io_m17n.rb b/test/ruby/test_io_m17n.rb
index e4afd5031b..7bcd051c36 100644
--- a/test/ruby/test_io_m17n.rb
+++ b/test/ruby/test_io_m17n.rb
@@ -473,6 +473,77 @@ EOT
}
end
+ def test_gets_invalid
+ with_pipe("utf-8:euc-jp") {|r, w|
+ before = "\u{3042}\u{3044}"
+ invalid = "\x80".force_encoding("utf-8")
+ after = "\u{3046}\u{3048}"
+ w << before + invalid + after
+ w.close
+ err = assert_raise(Encoding::InvalidByteSequence) { r.gets }
+ assert_equal(invalid.force_encoding("ascii-8bit"), err.error_bytes)
+ assert_equal(after.encode("euc-jp"), r.gets)
+ }
+ end
+
+ def test_getc_invalid
+ with_pipe("utf-8:euc-jp") {|r, w|
+ before1 = "\u{3042}"
+ before2 = "\u{3044}"
+ invalid = "\x80".force_encoding("utf-8")
+ after1 = "\u{3046}"
+ after2 = "\u{3048}"
+ w << before1 + before2 + invalid + after1 + after2
+ w.close
+ assert_equal(before1.encode("euc-jp"), r.getc)
+ assert_equal(before2.encode("euc-jp"), r.getc)
+ err = assert_raise(Encoding::InvalidByteSequence) { r.getc }
+ assert_equal(invalid.force_encoding("ascii-8bit"), err.error_bytes)
+ assert_equal(after1.encode("euc-jp"), r.getc)
+ assert_equal(after2.encode("euc-jp"), r.getc)
+ }
+ end
+
+ def test_getc_invalid2
+ with_pipe("utf-16le:euc-jp") {|r, w|
+ before1 = "\x42\x30".force_encoding("utf-16le")
+ before2 = "\x44\x30".force_encoding("utf-16le")
+ invalid = "\x00\xd8".force_encoding("utf-16le")
+ after1 = "\x46\x30".force_encoding("utf-16le")
+ after2 = "\x48\x30".force_encoding("utf-16le")
+ w << before1 + before2 + invalid + after1 + after2
+ w.close
+ assert_equal(before1.encode("euc-jp"), r.getc)
+ assert_equal(before2.encode("euc-jp"), r.getc)
+ err = assert_raise(Encoding::InvalidByteSequence) { r.getc }
+ assert_equal(invalid.force_encoding("ascii-8bit"), err.error_bytes)
+ assert_equal(after1.encode("euc-jp"), r.getc)
+ assert_equal(after2.encode("euc-jp"), r.getc)
+ }
+ end
+
+ def test_read_all
+ with_pipe("utf-8:euc-jp") {|r, w|
+ str = "\u3042\u3044"
+ w << str
+ w.close
+ assert_equal(str.encode("euc-jp"), r.read)
+ }
+ end
+
+ def test_read_all_invalid
+ with_pipe("utf-8:euc-jp") {|r, w|
+ before = "\u{3042}\u{3044}"
+ invalid = "\x80".force_encoding("utf-8")
+ after = "\u{3046}\u{3048}"
+ w << before + invalid + after
+ w.close
+ err = assert_raise(Encoding::InvalidByteSequence) { r.read }
+ assert_equal(invalid.force_encoding("ascii-8bit"), err.error_bytes)
+ assert_equal(after.encode("euc-jp"), r.read)
+ }
+ end
+
def test_file_foreach
with_tmpdir {
generate_file('tst', 'a' * 8191 + "\xa1\xa1")