From 035d4816c3782eca70464fa553b968dc46adee53 Mon Sep 17 00:00:00 2001 From: akr Date: Mon, 18 Aug 2008 12:06:42 +0000 Subject: * include/ruby/io.h (rb_io_t): new fields: writeconv, writeconv_stateless and writeconv_initialized. (MakeOpenFile): initialize them. * include/ruby/encoding.h (rb_econv_stateless_encoding): declared. (rb_econv_string): declared. * io.c (make_writeconv): new function. (io_fwrite): use econv. (make_readconv): fix error message. (finish_writeconv): new function. (fptr_finalize): call finish_writeconv. (clear_writeconv): new function. (clear_codeconv): new function to call both clear_readconv and clear_writeconv. (rb_io_fptr_finalize): call clear_codeconv instead of clear_readconv. (mode_enc): ditto. (io_set_encoding): ditto. (argf_next_argv): ditto. (io_encoding_set): ditto. * gc.c (gc_mark_children): mark writeconv_stateless in T_FILE. * transcode.c (stateless_encoding_i): new function. (rb_econv_stateless_encoding): ditto. (rb_econv_string): ditto. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18691 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 30 ++++++++++ gc.c | 4 +- include/ruby/encoding.h | 5 ++ include/ruby/io.h | 8 +++ io.c | 150 +++++++++++++++++++++++++++++++++++++++++----- test/ruby/test_io_m17n.rb | 45 ++++++++++++++ transcode.c | 72 ++++++++++++++++++++++ 7 files changed, 297 insertions(+), 17 deletions(-) diff --git a/ChangeLog b/ChangeLog index d2f166aa56..9ead4833be 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,33 @@ +Mon Aug 18 21:02:08 2008 Tanaka Akira + + * include/ruby/io.h (rb_io_t): new fields: writeconv, + writeconv_stateless and writeconv_initialized. + (MakeOpenFile): initialize them. + + * include/ruby/encoding.h (rb_econv_stateless_encoding): declared. + (rb_econv_string): declared. + + * io.c (make_writeconv): new function. + (io_fwrite): use econv. + (make_readconv): fix error message. + (finish_writeconv): new function. + (fptr_finalize): call finish_writeconv. + (clear_writeconv): new function. + (clear_codeconv): new function to call both clear_readconv and + clear_writeconv. + (rb_io_fptr_finalize): call clear_codeconv instead of + clear_readconv. + (mode_enc): ditto. + (io_set_encoding): ditto. + (argf_next_argv): ditto. + (io_encoding_set): ditto. + + * gc.c (gc_mark_children): mark writeconv_stateless in T_FILE. + + * transcode.c (stateless_encoding_i): new function. + (rb_econv_stateless_encoding): ditto. + (rb_econv_string): ditto. + Mon Aug 18 17:23:38 2008 Tanaka Akira * io.c (clear_readconv): extracted from rb_io_fptr_finalize. diff --git a/gc.c b/gc.c index 2bfb27d355..551437ac44 100644 --- a/gc.c +++ b/gc.c @@ -1507,8 +1507,10 @@ gc_mark_children(rb_objspace_t *objspace, VALUE ptr, int lev) break; case T_FILE: - if (obj->as.file.fptr) + if (obj->as.file.fptr) { gc_mark(objspace, obj->as.file.fptr->tied_io_for_writing, lev); + gc_mark(objspace, obj->as.file.fptr->writeconv_stateless, lev); + } break; case T_REGEXP: diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index c859c50559..6c443d6f0d 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -268,6 +268,11 @@ void rb_econv_check_error(rb_econv_t *ec); int rb_econv_putbackable(rb_econv_t *ec); void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n); +/* returns corresponding stateless encoding, or NULL if not stateful. */ +const char *rb_econv_stateless_encoding(const char *stateful_enc); + +VALUE rb_econv_string(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags); + /* flags for rb_econv_open */ #define ECONV_UNIVERSAL_NEWLINE_DECODER 0x100 #define ECONV_CRLF_NEWLINE_ENCODER 0x200 diff --git a/include/ruby/io.h b/include/ruby/io.h index 2a2b991a6d..32830aaaea 100644 --- a/include/ruby/io.h +++ b/include/ruby/io.h @@ -63,6 +63,11 @@ typedef struct rb_io_t { int crbuf_off; int crbuf_len; int crbuf_capa; + + rb_econv_t *writeconv; + VALUE writeconv_stateless; + int writeconv_initialized; + } rb_io_t; #define HAVE_RB_IO_T 1 @@ -110,6 +115,9 @@ typedef struct rb_io_t { fp->crbuf_off = 0;\ fp->crbuf_len = 0;\ fp->crbuf_capa = 0;\ + fp->writeconv = NULL;\ + fp->writeconv_stateless = Qnil;\ + fp->writeconv_initialized = 0;\ fp->tied_io_for_writing = 0;\ fp->enc = 0;\ fp->enc2 = 0;\ diff --git a/io.c b/io.c index 92db7be8e2..aae3e433cb 100644 --- a/io.c +++ b/io.c @@ -689,6 +689,38 @@ rb_io_wait_writable(int f) } } +static void +make_writeconv(rb_io_t *fptr) +{ + if (!fptr->writeconv_initialized) { + const char *senc, *denc; + fptr->writeconv_stateless = Qnil; + if (fptr->enc2) { + senc = fptr->enc->name; + denc = fptr->enc2->name; + } + else { + senc = rb_econv_stateless_encoding(fptr->enc->name); + if (senc) { + denc = fptr->enc->name; + fptr->writeconv_stateless = rb_str_new2(senc); + } + else { + denc = NULL; + } + } + if (senc) { + fptr->writeconv = rb_econv_open(senc, denc, 0); + if (!fptr->writeconv) + rb_raise(rb_eIOError, "code converter open failed (%s to %s)", senc, denc); + } + else { + fptr->writeconv = NULL; + } + fptr->writeconv_initialized = 1; + } +} + /* writing functions */ static long io_fwrite(VALUE str, rb_io_t *fptr) @@ -701,17 +733,18 @@ io_fwrite(VALUE str, rb_io_t *fptr) * We must also transcode if two encodings were specified */ if (fptr->enc) { - /* transcode str before output */ - /* the methods in transcode.c are static, so call indirectly */ - /* Can't use encode! because puts writes a frozen newline */ + make_writeconv(fptr); if (fptr->enc2) { - str = rb_funcall(str, id_encode, 2, - rb_enc_from_encoding(fptr->enc2), - rb_enc_from_encoding(fptr->enc)); + str = rb_econv_string(fptr->writeconv, str, 0, RSTRING_LEN(str), Qnil, ECONV_PARTIAL_INPUT); } else { - str = rb_funcall(str, id_encode, 1, - rb_enc_from_encoding(fptr->enc)); + if (fptr->writeconv) { + str = rb_str_transcode(str, fptr->writeconv_stateless); + str = rb_econv_string(fptr->writeconv, str, 0, RSTRING_LEN(str), Qnil, ECONV_PARTIAL_INPUT); + } + else { + str = rb_str_transcode(str, rb_enc_from_encoding(fptr->enc)); + } } } @@ -1394,7 +1427,7 @@ make_readconv(rb_io_t *fptr) if (!fptr->readconv) { fptr->readconv = rb_econv_open(fptr->enc2->name, fptr->enc->name, 0); if (!fptr->readconv) - rb_raise(rb_eIOError, "code converter open failed (%s to %s)", fptr->enc->name, fptr->enc2->name); + rb_raise(rb_eIOError, "code converter open failed (%s to %s)", fptr->enc2->name, fptr->enc->name); fptr->crbuf_off = 0; fptr->crbuf_len = 0; fptr->crbuf_capa = 1024; @@ -2844,10 +2877,78 @@ rb_io_set_close_on_exec(VALUE io, VALUE arg) #define IS_PREP_STDIO(f) ((f)->mode & FMODE_PREP) #define PREP_STDIO_NAME(f) ((f)->path) +static void +finish_writeconv(rb_io_t *fptr, int noraise) +{ + unsigned char *ds, *dp, *de; + rb_econv_result_t res; + + if (!fptr->wbuf) { + unsigned char buf[1024]; + int r; + + res = econv_destination_buffer_full; + while (res == econv_destination_buffer_full) { + ds = dp = buf; + de = buf + sizeof(buf); + res = rb_econv_convert(fptr->writeconv, NULL, NULL, &dp, de, 0); + while (dp-ds) { +retry: + r = rb_write_internal(fptr->fd, ds, dp-ds); + if (r == dp-ds) + break; + if (0 <= r) { + ds += r; + } + if (rb_io_wait_writable(fptr->fd)) { + if (!noraise) + rb_io_check_closed(fptr); + else if (fptr->fd < 0) + return; + goto retry; + } + return; + } + if (!noraise) { + rb_econv_check_error(fptr->writeconv); + } + if (res == econv_invalid_byte_sequence || + res == econv_undefined_conversion) { + break; + } + } + + return; + } + + res = econv_destination_buffer_full; + while (res == econv_destination_buffer_full) { + if (fptr->wbuf_len == fptr->wbuf_capa) { + io_fflush(fptr); + } + + ds = dp = (unsigned char *)fptr->wbuf + fptr->wbuf_off + fptr->wbuf_len; + de = (unsigned char *)fptr->wbuf + fptr->wbuf_capa; + res = rb_econv_convert(fptr->writeconv, NULL, NULL, &dp, de, 0); + fptr->wbuf_len += dp - ds; + if (!noraise) { + rb_econv_check_error(fptr->writeconv); + } + if (res == econv_invalid_byte_sequence || + res == econv_undefined_conversion) { + break; + } + } + +} + static void fptr_finalize(rb_io_t *fptr, int noraise) { int ebadf = 0; + if (fptr->writeconv) { + finish_writeconv(fptr, noraise); + } if (fptr->wbuf_len) { io_fflush(fptr); } @@ -2907,6 +3008,23 @@ clear_readconv(rb_io_t *fptr) } } +static void +clear_writeconv(rb_io_t *fptr) +{ + if (fptr->writeconv) { + rb_econv_close(fptr->writeconv); + fptr->writeconv = NULL; + } + fptr->writeconv_initialized = 0; +} + +static void +clear_codeconv(rb_io_t *fptr) +{ + clear_readconv(fptr); + clear_writeconv(fptr); +} + int rb_io_fptr_finalize(rb_io_t *fptr) { @@ -2926,7 +3044,7 @@ rb_io_fptr_finalize(rb_io_t *fptr) free(fptr->wbuf); fptr->wbuf = 0; } - clear_readconv(fptr); + clear_codeconv(fptr); free(fptr); return 1; } @@ -3535,7 +3653,7 @@ mode_enc(rb_io_t *fptr, const char *estr) fptr->enc = 0; fptr->enc2 = 0; - clear_readconv(fptr); + clear_codeconv(fptr); p0 = strrchr(estr, ':'); if (!p0) p1 = estr; @@ -4265,7 +4383,7 @@ io_set_encoding(VALUE io, VALUE opt) GetOpenFile(io, fptr); fptr->enc = 0; fptr->enc2 = 0; - clear_readconv(fptr); + clear_codeconv(fptr); if (!NIL_P(encoding)) { rb_warn("Ignoring encoding parameter '%s': external_encoding is used", RSTRING_PTR(encoding)); @@ -5612,7 +5730,7 @@ argf_next_argv(VALUE argf) GetOpenFile(current_file, fptr); fptr->enc = argf_enc; fptr->enc2 = argf_enc2; - clear_readconv(fptr); + clear_codeconv(fptr); } } else { @@ -6340,13 +6458,13 @@ io_encoding_set(rb_io_t *fptr, int argc, VALUE v1, VALUE v2) if (argc == 2) { fptr->enc2 = rb_to_encoding(v1); fptr->enc = rb_to_encoding(v2); - clear_readconv(fptr); + clear_codeconv(fptr); } else if (argc == 1) { if (NIL_P(v1)) { fptr->enc = 0; fptr->enc2 = 0; - clear_readconv(fptr); + clear_codeconv(fptr); } else { VALUE tmp = rb_check_string_type(v1); @@ -6356,7 +6474,7 @@ io_encoding_set(rb_io_t *fptr, int argc, VALUE v1, VALUE v2) else { fptr->enc = rb_to_encoding(v1); fptr->enc2 = 0; - clear_readconv(fptr); + clear_codeconv(fptr); } } } diff --git a/test/ruby/test_io_m17n.rb b/test/ruby/test_io_m17n.rb index e982722cfe..070987ad3c 100644 --- a/test/ruby/test_io_m17n.rb +++ b/test/ruby/test_io_m17n.rb @@ -601,5 +601,50 @@ EOT } end + def test_write_conversion_fixenc + with_pipe {|r, w| + w.set_encoding("iso-2022-jp:utf-8") + t = Thread.new { r.read.force_encoding("ascii-8bit") } + w << "\u3042" + w << "\u3044" + w.close + assert_equal("\e$B$\"$$\e(B".force_encoding("ascii-8bit"), t.value) + } + end + + def test_write_conversion_anyenc_stateful + with_pipe {|r, w| + w.set_encoding("iso-2022-jp") + t = Thread.new { r.read.force_encoding("ascii-8bit") } + w << "\u3042" + w << "\x82\xa2".force_encoding("sjis") + w.close + assert_equal("\e$B$\"$$\e(B".force_encoding("ascii-8bit"), t.value) + } + end + + def test_write_conversion_anyenc_stateless + with_pipe {|r, w| + w.set_encoding("euc-jp") + t = Thread.new { r.read.force_encoding("ascii-8bit") } + w << "\u3042" + w << "\x82\xa2".force_encoding("sjis") + w.close + assert_equal("\xa4\xa2\xa4\xa4".force_encoding("ascii-8bit"), t.value) + } + end + + def test_write_conversion_anyenc_stateful_nosync + with_pipe {|r, w| + w.sync = false + w.set_encoding("iso-2022-jp") + t = Thread.new { r.read.force_encoding("ascii-8bit") } + w << "\u3042" + w << "\x82\xa2".force_encoding("sjis") + w.close + assert_equal("\e$B$\"$$\e(B".force_encoding("ascii-8bit"), t.value) + } + end + end diff --git a/transcode.c b/transcode.c index 33b1c7fc96..6ef4e84040 100644 --- a/transcode.c +++ b/transcode.c @@ -1219,6 +1219,78 @@ rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n) tc->readagain_len -= n; } +struct stateless_encoding_t { + const char *stateless_enc; + const char *stateful_enc; +}; + +static int +stateless_encoding_i(st_data_t key, st_data_t val, st_data_t arg) +{ + struct stateless_encoding_t *data = (struct stateless_encoding_t *)arg; + st_table *table2 = (st_table *)val; + st_data_t v; + + if (st_lookup(table2, (st_data_t)data->stateful_enc, &v)) { + transcoder_entry_t *entry = (transcoder_entry_t *)v; + const rb_transcoder *tr = load_transcoder_entry(entry); + if (tr && tr->stateful_type == stateful_encoder) { + data->stateless_enc = tr->from_encoding; + return ST_STOP; + } + } + return ST_CONTINUE; +} + +const char * +rb_econv_stateless_encoding(const char *stateful_enc) +{ + struct stateless_encoding_t data; + data.stateful_enc = stateful_enc; + data.stateless_enc = NULL; + st_foreach(transcoder_table, stateless_encoding_i, (st_data_t)&data); + if (data.stateless_enc) + return data.stateless_enc; + return NULL; +} + +VALUE +rb_econv_string(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags) +{ + unsigned const char *ss, *sp, *se; + unsigned char *ds, *dp, *de; + rb_econv_result_t res; + + if (NIL_P(dst)) { + dst = rb_str_buf_new(len); + } + + res = econv_destination_buffer_full; + while (res == econv_destination_buffer_full) { + long dlen = RSTRING_LEN(dst); + int max_output = ec->last_tc->transcoder->max_output; + if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) { + unsigned long new_capa = (unsigned long)dlen + len + max_output; + if (LONG_MAX < new_capa) + rb_raise(rb_eArgError, "too long string"); + rb_str_resize(dst, new_capa); + rb_str_set_len(dst, dlen); + } + ss = sp = (const unsigned char *)RSTRING_PTR(src) + off; + se = ss + len; + ds = dp = (unsigned char *)RSTRING_PTR(dst) + dlen; + de = ds + rb_str_capacity(dst); + res = rb_econv_convert(ec, &sp, se, &dp, de, flags); + off += sp - ss; + len -= sp - ss; + rb_str_set_len(dst, dlen + (dp - ds)); + rb_econv_check_error(ec); + } + + return dst; +} + + static VALUE make_econv_exception(rb_econv_t *ec) { -- cgit v1.2.3