From c871aee96b0d5112ecc040cd56e84a2d7fe2e3c5 Mon Sep 17 00:00:00 2001 From: naruse Date: Tue, 13 Apr 2010 01:26:46 +0000 Subject: * transcode.c (transcode_loop): insert output the value when fallback hash has a related key. [ruby-dev:40540] [ruby-dev:40829] #3036 * transcode.c (rb_econv_prepare_opts): pass to newhash a value with the key :fallback. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@27326 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 9 +++++++++ test/ruby/test_transcode.rb | 12 ++++++++++-- transcode.c | 39 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 57 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index 4f2c6da71d..71d342e187 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +Tue Apr 13 09:32:12 2010 NARUSE, Yui + + * transcode.c (transcode_loop): insert output the value when + fallback hash has a related key. [ruby-dev:40540] + [ruby-dev:40829] #3036 + + * transcode.c (rb_econv_prepare_opts): pass to newhash + a value with the key :fallback. 
+ Tue Apr 13 00:12:04 2010 Tanaka Akira * random.c (rand_init): use the absolute value of seed to diff --git a/test/ruby/test_transcode.rb b/test/ruby/test_transcode.rb index 7f73d31797..d16a4d534b 100644 --- a/test/ruby/test_transcode.rb +++ b/test/ruby/test_transcode.rb @@ -1892,8 +1892,7 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u795E\u6797\u7FA9\u535A", "\xAF\xAB\xAA\x4C\xB8\x71\xB3\xD5", 'Big5-HKSCS') # 神林義博 end - def - test_Big5_UAO + def test_Big5_UAO check_both_ways("\u4e17", "\x81\x40", 'Big5-UAO') # 丗 end @@ -1903,4 +1902,13 @@ class TestTranscode < Test::Unit::TestCase assert_equal(Encoding::US_ASCII, a.encoding) assert_equal(Encoding::Shift_JIS, b.encoding) end + + def test_fallback + assert_equal("\u3042".encode("EUC-JP"), "\u{20000}".encode("EUC-JP", + fallback: {"\u{20000}" => "\u3042".encode("EUC-JP")})) + assert_equal("\u3042".encode("EUC-JP"), "\u{20000}".encode("EUC-JP", + fallback: {"\u{20000}" => "\u3042"})) + assert_equal("[ISU]", "\u{1F4BA}".encode("SJIS-KDDI", + fallback: {"\u{1F4BA}" => "[ISU]"})) + end end diff --git a/transcode.c b/transcode.c index dba26a2394..7683d4c9be 100644 --- a/transcode.c +++ b/transcode.c @@ -21,7 +21,7 @@ VALUE rb_eConverterNotFoundError; VALUE rb_cEncodingConverter; -static VALUE sym_invalid, sym_undef, sym_replace; +static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback; static VALUE sym_xml, sym_text, sym_attr; static VALUE sym_universal_newline; static VALUE sym_crlf_newline; @@ -2256,17 +2256,37 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, unsigned char *out_start = *out_pos; int max_output; VALUE exc; + VALUE fallback = Qnil; ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts); if (!ec) rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags)); + if (!NIL_P(ecopts) && TYPE(ecopts) == T_HASH) + fallback = rb_hash_aref(ecopts, sym_fallback); last_tc = ec->last_tc; max_output = last_tc ? 
last_tc->transcoder->max_output : 1; resume: ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0); + if (!NIL_P(fallback) && ret == econv_undefined_conversion) { + VALUE rep = rb_enc_str_new( + (const char *)ec->last_error.error_bytes_start, + ec->last_error.error_bytes_len, + rb_enc_find(ec->last_error.source_encoding)); + rep = rb_hash_lookup2(fallback, rep, Qundef); + if (rep != Qundef) { + StringValue(rep); + ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep), + RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep))); + if (ret == -1) { + rb_raise(rb_eArgError, "too big fallback string"); + } + goto resume; + } + } + if (ret == econv_invalid_byte_sequence || ret == econv_incomplete_input || ret == econv_undefined_conversion) { @@ -2442,6 +2462,7 @@ rb_econv_prepare_opts(VALUE opthash, VALUE *opts) return 0; } ecflags = econv_opts(opthash); + v = rb_hash_aref(opthash, sym_replace); if (!NIL_P(v)) { StringValue(v); @@ -2456,6 +2477,16 @@ rb_econv_prepare_opts(VALUE opthash, VALUE *opts) rb_hash_aset(newhash, sym_replace, v); } + v = rb_hash_aref(opthash, sym_fallback); + if (!NIL_P(v)) { + v = rb_convert_type(v, T_HASH, "Hash", "to_hash"); + if (!NIL_P(v)) { + if (NIL_P(newhash)) + newhash = rb_hash_new(); + rb_hash_aset(newhash, sym_fallback, v); + } + } + if (!NIL_P(newhash)) rb_hash_freeze(newhash); *opts = newhash; @@ -2728,6 +2759,11 @@ str_encode_bang(int argc, VALUE *argv, VALUE str) * :replace :: * Sets the replacement string to the value. The default replacement * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise. + * :fallback :: + * Sets the replacement string for an undefined character by hash lookup. + * Each key is an undefined character encoded in the source encoding of + * the current transcoder. Each value may be in any encoding, as long as + * it can be converted into the destination encoding of the transcoder. * :xml :: * The value must be :text or :attr. 
* If the value is :text #encode replaces @@ -4193,6 +4229,7 @@ Init_transcode(void) sym_invalid = ID2SYM(rb_intern("invalid")); sym_undef = ID2SYM(rb_intern("undef")); sym_replace = ID2SYM(rb_intern("replace")); + sym_fallback = ID2SYM(rb_intern("fallback")); sym_xml = ID2SYM(rb_intern("xml")); sym_text = ID2SYM(rb_intern("text")); sym_attr = ID2SYM(rb_intern("attr")); -- cgit v1.2.3