diff options
-rw-r--r-- | ChangeLog | 37 | ||||
-rw-r--r-- | gc.c | 2 | ||||
-rw-r--r-- | include/ruby/encoding.h | 6 | ||||
-rw-r--r-- | include/ruby/io.h | 5 | ||||
-rw-r--r-- | io.c | 33 | ||||
-rw-r--r-- | test/ruby/test_econv.rb | 7 | ||||
-rw-r--r-- | test/ruby/test_io_m17n.rb | 10 | ||||
-rw-r--r-- | test/ruby/test_transcode.rb | 8 | ||||
-rw-r--r-- | transcode.c | 107 |
9 files changed, 184 insertions, 31 deletions
@@ -1,3 +1,40 @@ +Thu Sep 4 03:10:05 2008 Tanaka Akira <akr@fsij.org> + + * include/ruby/io.h (rb_io_t): new fields: encs.ecopts and + writeconv_pre_ecopts. + (MakeOpenFile): initialize them. + + * include/ruby/encoding.h (rb_str_transcode): take ecopts argument. + (rb_econv_flags): removed. + (rb_econv_prepare_opts): declared. + (rb_econv_open_opts): declared. + + * io.c (make_writeconv): use rb_econv_open_opts. + (make_readconv): ditto. + (io_fwrite): follow rb_str_transcode change. + (rb_io_extract_modeenc): use rb_econv_prepare_opts. + (rb_file_open_generic): initialize encs.ecopts. + (rb_file_open_internal): ditto. + (rb_io_reopen): ditto. + (argf_ecopts): defined. + (argf_next_argv): set encs.ecopts. + (io_encoding_set): use rb_econv_prepare_opts. + (argf_set_encoding): set argf_ecopts. + + * gc.c (gc_mark_children): mark encs.ecopts and writeconv_pre_ecopts + in T_FILE. + + * transcode.c (transcode_loop): take ecopts argument. use + rb_econv_open_opts. + (rb_econv_flags): removed. + (rb_econv_prepare_opts): defined. + (rb_econv_open_opts): defined. + (str_transcode0): take ecopts. + (str_transcode): use rb_econv_prepare_opts. + (rb_str_transcode): take ecopts. + (econv_init): accept hash argument. + (econv_insert_output): follow rb_str_transcode change. + Thu Sep 4 01:30:26 2008 Tanaka Akira <akr@fsij.org> * include/ruby/encoding.h (rb_econv_set_replacemenet): declared. 
@@ -1511,6 +1511,8 @@ gc_mark_children(rb_objspace_t *objspace, VALUE ptr, int lev) gc_mark(objspace, obj->as.file.fptr->pathv, lev); gc_mark(objspace, obj->as.file.fptr->tied_io_for_writing, lev); gc_mark(objspace, obj->as.file.fptr->writeconv_stateless, lev); + gc_mark(objspace, obj->as.file.fptr->writeconv_pre_ecopts, lev); + gc_mark(objspace, obj->as.file.fptr->encs.ecopts, lev); } break; diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index 7919347199..10a7b95229 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -208,11 +208,13 @@ typedef enum { typedef struct rb_econv_t rb_econv_t; -VALUE rb_str_transcode(VALUE str, VALUE to, int ecflags); +VALUE rb_str_transcode(VALUE str, VALUE to, int ecflags, VALUE ecopts); -int rb_econv_flags(VALUE hash); +int rb_econv_prepare_opts(VALUE opthash, VALUE *ecopts); rb_econv_t *rb_econv_open(const char *source_encoding, const char *destination_encoding, int ecflags); +rb_econv_t *rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts); + rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, diff --git a/include/ruby/io.h b/include/ruby/io.h index cdef496a93..676fd05ae3 100644 --- a/include/ruby/io.h +++ b/include/ruby/io.h @@ -58,6 +58,7 @@ typedef struct rb_io_t { rb_encoding *enc; rb_encoding *enc2; int flags; + VALUE ecopts; } encs; rb_econv_t *readconv; @@ -69,6 +70,7 @@ typedef struct rb_io_t { rb_econv_t *writeconv; VALUE writeconv_stateless; int writeconv_pre_flags; + VALUE writeconv_pre_ecopts; int writeconv_initialized; } rb_io_t; @@ -123,11 +125,14 @@ typedef struct rb_io_t { fp->cbuf_capa = 0;\ fp->writeconv = NULL;\ fp->writeconv_stateless = Qnil;\ + fp->writeconv_pre_flags = 0;\ + fp->writeconv_pre_ecopts = Qnil;\ fp->writeconv_initialized = 0;\ fp->tied_io_for_writing = 
0;\ fp->encs.enc = NULL;\ fp->encs.enc2 = NULL;\ fp->encs.flags = 0;\ + fp->encs.ecopts = Qnil;\ } while (0) FILE *rb_io_stdio_file(rb_io_t *fptr); @@ -691,19 +691,22 @@ make_writeconv(rb_io_t *fptr) const char *senc, *denc; rb_encoding *enc; int ecflags; + VALUE ecopts; fptr->writeconv_initialized = 1; /* ECONV_INVALID_XXX and ECONV_UNDEF_XXX should be set both. * But ECONV_CRLF_NEWLINE_ENCODER should be set only for the first. */ fptr->writeconv_pre_flags = fptr->encs.flags; + fptr->writeconv_pre_ecopts = fptr->encs.ecopts; ecflags = fptr->encs.flags; + ecopts = fptr->encs.ecopts; #ifdef TEXTMODE_NEWLINE_ENCODER if (!fptr->encs.enc) { if (NEED_NEWLINE_ENCODER(fptr)) ecflags |= TEXTMODE_NEWLINE_ENCODER; - fptr->writeconv = rb_econv_open("", "", ecflags); + fptr->writeconv = rb_econv_open_opts("", "", ecflags, ecopts); if (!fptr->writeconv) rb_exc_raise(rb_econv_open_exc("", "", ecflags)); fptr->writeconv_stateless = Qnil; @@ -719,7 +722,7 @@ make_writeconv(rb_io_t *fptr) if (senc) { denc = enc->name; fptr->writeconv_stateless = rb_str_new2(senc); - fptr->writeconv = rb_econv_open(senc, denc, ecflags); + fptr->writeconv = rb_econv_open_opts(senc, denc, ecflags, ecopts); if (!fptr->writeconv) rb_exc_raise(rb_econv_open_exc(senc, denc, ecflags)); } @@ -753,7 +756,8 @@ io_fwrite(VALUE str, rb_io_t *fptr) } if (!NIL_P(common_encoding)) { - str = rb_str_transcode(str, common_encoding, fptr->writeconv_pre_flags); + str = rb_str_transcode(str, common_encoding, + fptr->writeconv_pre_flags, fptr->writeconv_pre_ecopts); } if (fptr->writeconv) { @@ -1438,8 +1442,10 @@ make_readconv(rb_io_t *fptr) { if (!fptr->readconv) { int ecflags; + VALUE ecopts; const char *sname, *dname; ecflags = fptr->encs.flags; + ecopts = fptr->encs.ecopts; if (NEED_NEWLINE_DECODER(fptr)) ecflags |= ECONV_UNIVERSAL_NEWLINE_DECODER; if (fptr->encs.enc2) { @@ -1449,7 +1455,7 @@ make_readconv(rb_io_t *fptr) else { sname = dname = ""; } - fptr->readconv = rb_econv_open(sname, dname, ecflags); + 
fptr->readconv = rb_econv_open_opts(sname, dname, ecflags, ecopts); if (!fptr->readconv) rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags)); fptr->cbuf_off = 0; @@ -3833,6 +3839,7 @@ rb_io_extract_modeenc(VALUE *mode_p, VALUE opthash, int modenum, flags; rb_encoding *enc, *enc2; int ecflags; + VALUE ecopts; int has_enc = 0; VALUE intmode; @@ -3865,6 +3872,7 @@ rb_io_extract_modeenc(VALUE *mode_p, VALUE opthash, if (NIL_P(opthash)) { ecflags = 0; + ecopts = Qnil; } else { VALUE v; @@ -3878,7 +3886,7 @@ rb_io_extract_modeenc(VALUE *mode_p, VALUE opthash, modenum |= O_BINARY; #endif } - ecflags = rb_econv_flags(opthash); + ecflags = rb_econv_prepare_opts(opthash, &ecopts); if (io_extract_encoding_option(opthash, &enc, &enc2)) { if (has_enc) { @@ -3897,6 +3905,7 @@ rb_io_extract_modeenc(VALUE *mode_p, VALUE opthash, convconfig_p->enc = enc; convconfig_p->enc2 = enc2; convconfig_p->flags = ecflags; + convconfig_p->ecopts = ecopts; } struct sysopen_struct { @@ -4005,6 +4014,7 @@ rb_file_open_generic(VALUE io, VALUE filename, int modenum, int flags, convconfi fptr->encs.enc = NULL; fptr->encs.enc2 = NULL; fptr->encs.flags = 0; + fptr->encs.ecopts = Qnil; } fptr->pathv = rb_str_new_frozen(filename); fptr->fd = rb_sysopen(RSTRING_PTR(fptr->pathv), modenum, perm); @@ -4027,6 +4037,7 @@ rb_file_open_internal(VALUE io, VALUE filename, const char *mode) convconfig.enc = NULL; convconfig.enc2 = NULL; convconfig.flags = 0; + convconfig.ecopts = Qnil; } flags = rb_io_mode_flags(mode); @@ -5012,6 +5023,7 @@ rb_io_reopen(int argc, VALUE *argv, VALUE file) fptr->mode = flags; rb_io_mode_enc(fptr, StringValueCStr(nmode)); fptr->encs.flags = 0; + fptr->encs.ecopts = Qnil; } fptr->pathv = rb_str_new_frozen(fname); @@ -5698,7 +5710,8 @@ argf_alloc(VALUE klass) #define argf_binmode ARGF.binmode #define argf_enc ARGF.encs.enc #define argf_enc2 ARGF.encs.enc2 -#define argf_ecflags ARGF.encs.flags +#define argf_ecflags ARGF.encs.flags +#define argf_ecopts ARGF.encs.ecopts #define rb_argv 
ARGF.argv static VALUE @@ -5871,6 +5884,7 @@ argf_next_argv(VALUE argf) fptr->encs.enc = argf_enc; fptr->encs.enc2 = argf_enc2; fptr->encs.flags = argf_ecflags; + fptr->encs.ecopts = argf_ecopts; clear_codeconv(fptr); } } @@ -6595,7 +6609,7 @@ io_encoding_set(rb_io_t *fptr, int argc, VALUE v1, VALUE v2, VALUE opt) if (argc == 2) { fptr->encs.enc2 = rb_to_encoding(v1); fptr->encs.enc = rb_to_encoding(v2); - fptr->encs.flags = rb_econv_flags(opt); + fptr->encs.flags = rb_econv_prepare_opts(opt, &fptr->encs.ecopts); clear_codeconv(fptr); } else if (argc == 1) { @@ -6603,18 +6617,20 @@ io_encoding_set(rb_io_t *fptr, int argc, VALUE v1, VALUE v2, VALUE opt) fptr->encs.enc = NULL; fptr->encs.enc2 = NULL; fptr->encs.flags = 0; + fptr->encs.ecopts = Qnil; clear_codeconv(fptr); } else { VALUE tmp = rb_check_string_type(v1); if (!NIL_P(tmp)) { mode_enc(fptr, StringValueCStr(tmp)); - fptr->encs.flags = rb_econv_flags(opt); + fptr->encs.flags = rb_econv_prepare_opts(opt, &fptr->encs.ecopts); } else { fptr->encs.enc = rb_to_encoding(v1); fptr->encs.enc2 = NULL; fptr->encs.flags = 0; + fptr->encs.ecopts = Qnil; clear_codeconv(fptr); } } @@ -7548,6 +7564,7 @@ argf_set_encoding(int argc, VALUE *argv, VALUE argf) argf_enc = fptr->encs.enc; argf_enc2 = fptr->encs.enc2; argf_ecflags = fptr->encs.flags; + argf_ecopts = fptr->encs.ecopts; return argf; } diff --git a/test/ruby/test_econv.rb b/test/ruby/test_econv.rb index c898efee28..6844067526 100644 --- a/test/ruby/test_econv.rb +++ b/test/ruby/test_econv.rb @@ -654,4 +654,11 @@ class TestEncodingConverter < Test::Unit::TestCase ec.replacement = "<undef>" assert_equal("a <undef> b", ec.convert("a \u3042 b")) end + + def test_econv_new_hash + ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace) + assert_equal("a ? 
b", ec.convert("a \u3042 b")) + ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace, :replace => "X") + assert_equal("a X b", ec.convert("a \u3042 b")) + end end diff --git a/test/ruby/test_io_m17n.rb b/test/ruby/test_io_m17n.rb index 90dd831dd5..b8b532e94c 100644 --- a/test/ruby/test_io_m17n.rb +++ b/test/ruby/test_io_m17n.rb @@ -239,6 +239,16 @@ EOT w.close if w && !w.closed? end + def test_s_pipe_undef_replace_string + r, w = IO.pipe("utf-8:euc-jp", :undef=>:replace, :replace=>"X") + w << "\ufffd" + w.close + assert_equal("X", r.read) + ensure + r.close if r && !r.closed? + w.close if w && !w.closed? + end + def test_dup with_pipe("utf-8:euc-jp") {|r, w| w << "\u3042" diff --git a/test/ruby/test_transcode.rb b/test/ruby/test_transcode.rb index 81c58fd651..2bf8f04de4 100644 --- a/test/ruby/test_transcode.rb +++ b/test/ruby/test_transcode.rb @@ -324,10 +324,18 @@ class TestTranscode < Test::Unit::TestCase "\xA4\xA2\xFF\xFF\xA4\xA4".encode("ISO-2022-JP", "EUC-JP", invalid: :replace)) end + def test_invalid_replace_string + assert_equal("a<x>A", "a\x80A".encode("us-ascii", "euc-jp", :invalid=>:replace, :replace=>"<x>")) + end + def test_undef_replace assert_equal("?", "\u20AC".encode("EUC-JP", :undef=>:replace), "[ruby-dev:35709]") end + def test_undef_replace_string + assert_equal("a<x>A", "a\u3042A".encode("us-ascii", :undef=>:replace, :replace=>"<x>")) + end + def test_shift_jis check_both_ways("\u3000", "\x81\x40", 'shift_jis') # full-width space check_both_ways("\u00D7", "\x81\x7E", 'shift_jis') # × diff --git a/transcode.c b/transcode.c index cc2d7793fa..273a913d31 100644 --- a/transcode.c +++ b/transcode.c @@ -1880,7 +1880,8 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, unsigned char *(*resize_destination)(VALUE, int, int), const char *from_encoding, const char *to_encoding, - int ecflags) + int ecflags, + VALUE ecopts) { rb_econv_t *ec; rb_transcoding *last_tc; @@ -1889,7 +1890,7 @@ transcode_loop(const unsigned 
char **in_pos, unsigned char **out_pos,
     int max_output;
     VALUE exc;
 
-    ec = rb_econv_open(from_encoding, to_encoding, ecflags);
+    ec = rb_econv_open_opts(from_encoding, to_encoding, ecflags, ecopts);
     if (!ec)
         rb_exc_raise(rb_econv_open_exc(from_encoding, to_encoding, ecflags));
 
@@ -1924,7 +1925,8 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
                unsigned char *(*resize_destination)(VALUE, int, int),
                const char *from_encoding,
                const char *to_encoding,
-               int ecflags)
+               int ecflags,
+               VALUE ecopts)
 {
     rb_econv_t *ec;
     rb_transcoding *last_tc;
@@ -1934,7 +1936,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
     int max_output;
     VALUE exc;
 
-    ec = rb_econv_open(from_encoding, to_encoding, ecflags);
+    ec = rb_econv_open_opts(from_encoding, to_encoding, ecflags, ecopts);
     if (!ec)
         rb_exc_raise(rb_econv_open_exc(from_encoding, to_encoding, ecflags));
 
@@ -2033,12 +2035,82 @@ econv_opts(VALUE opt)
 }
 
+/*
+ * Turn a conversion option hash into converter flags plus an option hash
+ * suitable for rb_econv_open_opts().
+ *
+ * opthash: the caller's option hash, or nil when no options were given.
+ * opts:    out parameter. Always written on return: nil, or a new frozen
+ *          hash whose only entry maps sym_replace to the replacement
+ *          string (passed through rb_str_new_frozen()).
+ *
+ * Returns the flags computed by econv_opts(), or 0 when opthash is nil.
+ */
 int
-rb_econv_flags(VALUE hash)
+rb_econv_prepare_opts(VALUE opthash, VALUE *opts)
 {
-    if (NIL_P(hash))
-        return 0;
-    else
-        return econv_opts(hash);
+    int ecflags;
+    VALUE newhash = Qnil;
+    if (NIL_P(opthash)) {
+        /* No options: report "no flags" (not the VALUE Qnil) and still
+         * give the caller a defined *opts. */
+        *opts = Qnil;
+        return 0;
+    }
+    ecflags = econv_opts(opthash);
+
+    /* A replacement string is only looked up when invalid or undefined
+     * sequences are to be replaced. */
+    if ((ecflags & ECONV_INVALID_MASK) == ECONV_INVALID_REPLACE ||
+        (ecflags & ECONV_UNDEF_MASK) == ECONV_UNDEF_REPLACE) {
+        VALUE v = rb_hash_aref(opthash, sym_replace);
+        if (!NIL_P(v)) {
+            StringValue(v);
+            v = rb_str_new_frozen(v);
+            newhash = rb_hash_new();
+            rb_hash_aset(newhash, sym_replace, v);
+        }
+    }
+    /* rb_econv_open_opts() treats a non-frozen option hash as a bug. */
+    if (!NIL_P(newhash))
+        rb_hash_freeze(newhash);
+    *opts = newhash;
+
+    return ecflags;
+}
+
+rb_econv_t *
+rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
+{
+    rb_econv_t *ec;
+    VALUE replacement;
+
+    if (NIL_P(opthash)) {
+        replacement = Qnil;
+    }
+    else {
+        if (TYPE(opthash) != T_HASH || !OBJ_FROZEN(opthash))
+            rb_bug("rb_econv_open_opts called with invalid opthash");
+        replacement = rb_hash_aref(opthash,
sym_replace); + } + + ec = rb_econv_open(source_encoding, destination_encoding, ecflags); + if (!ec) + return ec; + + if (!NIL_P(replacement)) { + int ret; + rb_encoding *enc = rb_enc_get(replacement); + + ret = rb_econv_set_replacemenet(ec, + (const unsigned char *)RSTRING_PTR(replacement), + RSTRING_LEN(replacement), + enc->name); + if (ret == -1) { + rb_econv_close(ec); + return NULL; + } + } + return ec; } static int @@ -2082,7 +2136,7 @@ str_transcode_enc_args(VALUE str, VALUE arg1, VALUE arg2, } static int -str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags) +str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts) { VALUE dest; VALUE str = *self; @@ -2127,7 +2181,7 @@ str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags) dest = rb_str_tmp_new(blen); bp = (unsigned char *)RSTRING_PTR(dest); - transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, from_e, to_e, ecflags); + transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, from_e, to_e, ecflags, ecopts); if (fromp != sp+slen) { rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp); } @@ -2149,15 +2203,16 @@ str_transcode(int argc, VALUE *argv, VALUE *self) { VALUE opt; int ecflags = 0; + VALUE ecopts = Qnil; if (0 < argc) { opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash"); if (!NIL_P(opt)) { argc--; - ecflags = rb_econv_flags(opt); + ecflags = rb_econv_prepare_opts(opt, &ecopts); } } - return str_transcode0(argc, argv, self, ecflags); + return str_transcode0(argc, argv, self, ecflags, ecopts); } static inline VALUE @@ -2228,12 +2283,12 @@ str_encode(int argc, VALUE *argv, VALUE str) } VALUE -rb_str_transcode(VALUE str, VALUE to, int ecflags) +rb_str_transcode(VALUE str, VALUE to, int ecflags, VALUE ecopts) { int argc = 1; VALUE *argv = &to; VALUE newstr = str; - int encidx = str_transcode0(argc, argv, &newstr, ecflags); + int encidx = str_transcode0(argc, argv, 
&newstr, ecflags, ecopts); if (encidx < 0) return rb_str_dup(str); RBASIC(newstr)->klass = rb_obj_class(str); @@ -2295,7 +2350,7 @@ make_dummy_encoding(const char *name) static VALUE econv_init(int argc, VALUE *argv, VALUE self) { - VALUE source_encoding, destination_encoding, flags_v; + VALUE source_encoding, destination_encoding, flags_v, opt, ecopts; int sidx, didx; const char *sname, *dname; rb_encoding *senc, *denc; @@ -2304,10 +2359,20 @@ econv_init(int argc, VALUE *argv, VALUE self) rb_scan_args(argc, argv, "21", &source_encoding, &destination_encoding, &flags_v); - if (flags_v == Qnil) + if (flags_v == Qnil) { ecflags = 0; - else - ecflags = NUM2INT(flags_v); + ecopts = Qnil; + } + else { + opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash"); + if (!NIL_P(opt)) { + ecflags = rb_econv_prepare_opts(opt, &ecopts); + } + else { + ecflags = NUM2INT(flags_v); + ecopts = Qnil; + } + } senc = NULL; sidx = rb_to_encoding_index(source_encoding); @@ -2334,7 +2399,7 @@ econv_init(int argc, VALUE *argv, VALUE self) rb_raise(rb_eTypeError, "already initialized"); } - ec = rb_econv_open(sname, dname, ecflags); + ec = rb_econv_open_opts(sname, dname, ecflags, ecopts); if (!ec) { rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags)); } @@ -2892,7 +2957,7 @@ econv_insert_output(VALUE self, VALUE string) StringValue(string); insert_enc = rb_econv_encoding_to_insert_output(ec); - string = rb_str_transcode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0); + string = rb_str_transcode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil); ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc); if (ret == -1) { |