diff options
-rw-r--r-- | ChangeLog | 18 | ||||
-rw-r--r-- | include/ruby/encoding.h | 2 | ||||
-rw-r--r-- | io.c | 33 | ||||
-rw-r--r-- | test/ruby/test_io_m17n.rb | 63 | ||||
-rw-r--r-- | transcode.c | 141 |
5 files changed, 198 insertions, 59 deletions
@@ -1,3 +1,21 @@ +Sun Aug 24 15:21:28 2008 Tanaka Akira <akr@fsij.org> + + * include/ruby/encoding.h (rb_str_transcode): add ecflags argument. + + * transcode.c (econv_opts): extracted from str_transcode. + (str_transcode_enc_args): extracted from str_transcode. + (str_transcode0): extracted from str_transcode. + (str_transcode): use econv_opts, str_transcode_enc_args, + str_transcode0. + (rb_str_transcode): call str_transcode0. + (econv_primitive_insert_output): give the additional argument for + rb_str_transcode. + + * io.c (make_writeconv): use invalid/undef flags. + (io_fwrite): ditto. + (rb_scan_open_args): give the additional argument for + rb_str_transcode. + Sun Aug 24 13:27:42 2008 Tanaka Akira <akr@fsij.org> * transcode.c (str_transcode): check last hash only if 0 < argc. diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index 9336e6d346..3701b2d12f 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -194,7 +194,7 @@ rb_enc_dummy_p(rb_encoding *enc) return ENC_DUMMY_P(enc) != 0; } -VALUE rb_str_transcode(VALUE str, VALUE to); +VALUE rb_str_transcode(VALUE str, VALUE to, int ecflags); /* econv stuff */ @@ -699,6 +699,12 @@ make_writeconv(rb_io_t *fptr) fptr->writeconv_initialized = 1; ecflags = 0; + + if (fptr->mode & FMODE_INVALID_MASK) + ecflags |= (fptr->mode / (FMODE_INVALID_MASK/ECONV_INVALID_MASK)) & ECONV_INVALID_MASK; + if (fptr->mode & FMODE_UNDEF_MASK) + ecflags |= (fptr->mode / (FMODE_UNDEF_MASK/ECONV_UNDEF_MASK)) & ECONV_UNDEF_MASK; + #ifdef TEXTMODE_NEWLINE_ENCODER if (NEED_NEWLINE_ENCODER(fptr)) ecflags |= TEXTMODE_NEWLINE_ENCODER; @@ -740,18 +746,31 @@ io_fwrite(VALUE str, rb_io_t *fptr) long len, n, r, l, offset = 0; if (NEED_WRITECONV(fptr)) { + VALUE common_encoding = Qnil; make_writeconv(fptr); + if (fptr->writeconv) { - if (!NIL_P(fptr->writeconv_stateless)) { - str = rb_str_transcode(str, fptr->writeconv_stateless); - } - str = rb_econv_str_convert(fptr->writeconv, str, ECONV_PARTIAL_INPUT); + if (!NIL_P(fptr->writeconv_stateless)) + common_encoding = fptr->writeconv_stateless; } else { if (fptr->enc2) - str = rb_str_transcode(str, rb_enc_from_encoding(fptr->enc2)); + common_encoding = rb_enc_from_encoding(fptr->enc2); else - str = rb_str_transcode(str, rb_enc_from_encoding(fptr->enc)); + common_encoding = rb_enc_from_encoding(fptr->enc); + } + + if (!NIL_P(common_encoding)) { + int ecflags = 0; + if (fptr->mode & FMODE_INVALID_MASK) + ecflags |= (fptr->mode / (FMODE_INVALID_MASK/ECONV_INVALID_MASK)) & ECONV_INVALID_MASK; + if (fptr->mode & FMODE_UNDEF_MASK) + ecflags |= (fptr->mode / (FMODE_UNDEF_MASK/ECONV_UNDEF_MASK)) & ECONV_UNDEF_MASK; + str = rb_str_transcode(str, common_encoding, ecflags); + } + + if (fptr->writeconv) { + str = rb_econv_str_convert(fptr->writeconv, str, ECONV_PARTIAL_INPUT); } } @@ -4622,7 +4641,7 @@ rb_scan_open_args(int argc, VALUE *argv, static VALUE fs_enc; if (!fs_enc) fs_enc = rb_enc_from_encoding(fs_encoding); - fname = rb_str_transcode(fname, fs_enc); + fname = rb_str_transcode(fname, fs_enc, 0); } } #endif diff --git a/test/ruby/test_io_m17n.rb b/test/ruby/test_io_m17n.rb index 94be8b311f..93b6de0341 100644 --- a/test/ruby/test_io_m17n.rb +++ b/test/ruby/test_io_m17n.rb @@ -1202,5 +1202,68 @@ EOT } } end + + def test_invalid_w + with_tmpdir { + invalid_utf8 = "a\x80b".force_encoding("utf-8") + open("t.txt", "w:euc-jp", :invalid => :replace) {|f| + assert_nothing_raised { f.write invalid_utf8 } + } + assert_equal("a?b", File.read("t.txt")) + + open("t.txt", "w:euc-jp", :invalid => :ignore) {|f| + assert_nothing_raised { f.write invalid_utf8 } + } + assert_equal("ab", File.read("t.txt")) + + open("t.txt", "w:euc-jp", :undef => :replace) {|f| + assert_raise(Encoding::InvalidByteSequence) { f.write invalid_utf8 } + } + open("t.txt", "w:euc-jp", :undef => :ignore) {|f| + assert_raise(Encoding::InvalidByteSequence) { f.write invalid_utf8 } + } + } + end + + def test_undef_w_stateless + with_tmpdir { + generate_file("t.txt", "a\uFFFDb") + open("t.txt", "w:euc-jp:utf-8", :undef => :replace) {|f| + assert_nothing_raised { f.write "a\uFFFDb" } + } + assert_equal("a?b", File.read("t.txt")) + open("t.txt", "w:euc-jp:utf-8", :undef => :ignore) {|f| + assert_nothing_raised { f.write "a\uFFFDb" } + } + assert_equal("ab", File.read("t.txt")) + open("t.txt", "w:euc-jp:utf-8", :invalid => :replace) {|f| + assert_raise(Encoding::ConversionUndefined) { f.write "a\uFFFDb" } + } + open("t.txt", "w:euc-jp:utf-8", :invalid => :ignore) {|f| + assert_raise(Encoding::ConversionUndefined) { f.write "a\uFFFDb" } + } + } + end + + def test_undef_w_stateful + with_tmpdir { + generate_file("t.txt", "a\uFFFDb") + open("t.txt", "w:iso-2022-jp:utf-8", :undef => :replace) {|f| + assert_nothing_raised { f.write "a\uFFFDb" } + } + assert_equal("a?b", File.read("t.txt")) + open("t.txt", "w:iso-2022-jp:utf-8", :undef => :ignore) {|f| + assert_nothing_raised { f.write "a\uFFFDb" } + } + assert_equal("ab", File.read("t.txt")) + open("t.txt", "w:iso-2022-jp:utf-8", :invalid => :replace) {|f| + assert_raise(Encoding::ConversionUndefined) { f.write "a\uFFFDb" } + } + open("t.txt", "w:iso-2022-jp:utf-8", :invalid => :ignore) {|f| + assert_raise(Encoding::ConversionUndefined) { f.write "a\uFFFDb" } + } + } + end + end diff --git a/transcode.c b/transcode.c index c087716b43..9c4b9644e3 100644 --- a/transcode.c +++ b/transcode.c @@ -1673,58 +1673,49 @@ str_transcoding_resize(VALUE destination, int len, int new_len) } static int -str_transcode(int argc, VALUE *argv, VALUE *self) +econv_opts(VALUE opt) +{ + VALUE v; + int options = 0; + v = rb_hash_aref(opt, sym_invalid); + if (NIL_P(v)) { + } + else if (v==sym_ignore) { + options |= ECONV_INVALID_IGNORE; + } + else if (v==sym_replace) { + options |= ECONV_INVALID_REPLACE; + v = rb_hash_aref(opt, sym_replace); + } + else { + rb_raise(rb_eArgError, "unknown value for invalid character option"); + } + v = rb_hash_aref(opt, sym_undef); + if (NIL_P(v)) { + } + else if (v==sym_ignore) { + options |= ECONV_UNDEF_IGNORE; + } + else if (v==sym_replace) { + options |= ECONV_UNDEF_REPLACE; + } + else { + rb_raise(rb_eArgError, "unknown value for undefined character option"); + } + return options; +} + +static int +str_transcode_enc_args(VALUE str, VALUE arg1, VALUE arg2, + const char **sname, rb_encoding **senc, + const char **dname, rb_encoding **denc) { - VALUE dest; - VALUE str = *self; - long blen, slen; - unsigned char *buf, *bp, *sp; - const unsigned char *fromp; rb_encoding *from_enc, *to_enc; const char *from_e, *to_e; int from_encidx, to_encidx; VALUE from_encval, to_encval; - VALUE opt; - int options = 0; - if (0 < argc) - opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash"); - else - opt = Qnil; - if (!NIL_P(opt)) { - VALUE v; - - argc--; - v = rb_hash_aref(opt, sym_invalid); - if (NIL_P(v)) { - } - else if (v==sym_ignore) { - options |= ECONV_INVALID_IGNORE; - } - else if (v==sym_replace) { - options |= ECONV_INVALID_REPLACE; - v = rb_hash_aref(opt, sym_replace); - } - else { - rb_raise(rb_eArgError, "unknown value for invalid character option"); - } - v = rb_hash_aref(opt, sym_undef); - if (NIL_P(v)) { - } - else if (v==sym_ignore) { - options |= ECONV_UNDEF_IGNORE; - } - else if (v==sym_replace) { - options |= ECONV_UNDEF_REPLACE; - } - else { - rb_raise(rb_eArgError, "unknown value for undefined character option"); - } - } - if (argc < 1 || argc > 2) { - rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); - } - if ((to_encidx = rb_to_encoding_index(to_encval = argv[0])) < 0) { + if ((to_encidx = rb_to_encoding_index(to_encval = arg1)) < 0) { to_enc = 0; to_encidx = 0; to_e = StringValueCStr(to_encval); @@ -1733,12 +1724,12 @@ str_transcode(int argc, VALUE *argv, VALUE *self) to_enc = rb_enc_from_index(to_encidx); to_e = rb_enc_name(to_enc); } - if (argc==1) { + if (NIL_P(arg2)) { from_encidx = rb_enc_get_index(str); from_enc = rb_enc_from_index(from_encidx); from_e = rb_enc_name(from_enc); } - else if ((from_encidx = rb_to_encoding_index(from_encval = argv[1])) < 0) { + else if ((from_encidx = rb_to_encoding_index(from_encval = arg2)) < 0) { from_enc = 0; from_e = StringValueCStr(from_encval); } @@ -1747,6 +1738,31 @@ str_transcode(int argc, VALUE *argv, VALUE *self) from_e = rb_enc_name(from_enc); } + *sname = from_e; + *senc = from_enc; + *dname = to_e; + *denc = to_enc; + return to_encidx; +} + +static int +str_transcode0(int argc, VALUE *argv, VALUE *self, int options) +{ + VALUE dest; + VALUE str = *self; + long blen, slen; + unsigned char *buf, *bp, *sp; + const unsigned char *fromp; + rb_encoding *from_enc, *to_enc; + const char *from_e, *to_e; + int to_encidx; + + if (argc < 1 || argc > 2) { + rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc); + } + + to_encidx = str_transcode_enc_args(str, argv[0], argc==1 ? Qnil : argv[1], &from_e, &from_enc, &to_e, &to_enc); + if (from_enc && from_enc == to_enc) { return -1; } @@ -1782,6 +1798,22 @@ str_transcode(int argc, VALUE *argv, VALUE *self) return to_encidx; } +static int +str_transcode(int argc, VALUE *argv, VALUE *self) +{ + VALUE opt; + int options = 0; + + if (0 < argc) { + opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash"); + if (!NIL_P(opt)) { + argc--; + options = econv_opts(opt); + } + } + return str_transcode0(argc, argv, self, options); +} + static inline VALUE str_encode_associate(VALUE str, int encidx) { @@ -1850,9 +1882,16 @@ str_encode(int argc, VALUE *argv, VALUE str) } VALUE -rb_str_transcode(VALUE str, VALUE to) +rb_str_transcode(VALUE str, VALUE to, int flags) { - return str_encode(1, &to, str); + int argc = 1; + VALUE *argv = &to; + VALUE newstr = str; + int encidx = str_transcode0(argc, argv, &newstr, flags); + + if (encidx < 0) return rb_str_dup(str); + RBASIC(newstr)->klass = rb_obj_class(str); + return str_encode_associate(newstr, encidx); } static void @@ -2305,7 +2344,7 @@ econv_primitive_insert_output(VALUE self, VALUE string) StringValue(string); insert_enc = rb_econv_encoding_to_insert_output(ec); - string = rb_str_transcode(string, rb_enc_from_encoding(rb_enc_find(insert_enc))); + string = rb_str_transcode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0); ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc); if (ret == -1) |