diff options
author | akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2008-08-23 06:02:58 +0000 |
---|---|---|
committer | akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2008-08-23 06:02:58 +0000 |
commit | d4fabf0e36946a2ac8423a0817e282efa1ee1459 (patch) | |
tree | 08aed479413d135f5601abf40ffbc9ee92ffd7fb | |
parent | 860c2331039537c4d0c5c8c92ae1e89395f0edcc (diff) | |
download | ruby-d4fabf0e36946a2ac8423a0817e282efa1ee1459.tar.gz |
* include/ruby/encoding.h (ECONV_INVALID_MASK): defined.
(ECONV_INVALID_IGNORE): defined.
(ECONV_INVALID_REPLACE): defined.
(ECONV_UNDEF_MASK): defined.
(ECONV_UNDEF_IGNORE): defined.
(ECONV_UNDEF_REPLACE): defined.
* transcode.c (INVALID_IGNORE): removed.
(INVALID_REPLACE): removed.
(UNDEF_IGNORE): removed.
(UNDEF_REPLACE): removed.
(rb_econv_convert0): renamed from rb_econv_convert.
(rb_econv_convert): defined to call rb_econv_convert0 with
replace/ignore behavior moved from transcode_loop.
(transcode_loop): replace/ignore behavior removed.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18787 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- | ChangeLog | 18 |
-rw-r--r-- | include/ruby/encoding.h | 14 |
-rw-r--r-- | test/ruby/test_econv.rb | 33 |
-rw-r--r-- | transcode.c | 107 |
4 files changed, 119 insertions, 53 deletions
@@ -1,3 +1,21 @@ +Sat Aug 23 14:59:32 2008 Tanaka Akira <akr@fsij.org> + + * include/ruby/encoding.h (ECONV_INVALID_MASK): defined. + (ECONV_INVALID_IGNORE): defined. + (ECONV_INVALID_REPLACE): defined. + (ECONV_UNDEF_MASK): defined. + (ECONV_UNDEF_IGNORE): defined. + (ECONV_UNDEF_REPLACE): defined. + + * transcode.c (INVALID_IGNORE): removed. + (INVALID_REPLACE): removed. + (UNDEF_IGNORE): removed. + (UNDEF_REPLACE): removed. + (rb_econv_convert0): renamed from rb_econv_convert. + (rb_econv_convert): defined to call rb_econv_convert0 with + replace/ignore behavior moved from transcode_loop. + (transcode_loop): replace/ignore behavior removed. + Sat Aug 23 11:23:05 2008 Tanaka Akira <akr@fsij.org> * io.c (rb_io_extract_modeenc): check :textmode and :binmode in option diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index 03aac871fe..1f0feae98e 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -277,9 +277,17 @@ VALUE rb_econv_string(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, void rb_econv_binmode(rb_econv_t *ec); /* flags for rb_econv_open */ -#define ECONV_UNIVERSAL_NEWLINE_DECODER 0x100 -#define ECONV_CRLF_NEWLINE_ENCODER 0x200 -#define ECONV_CR_NEWLINE_ENCODER 0x400 +#define ECONV_INVALID_MASK 0x000f +#define ECONV_INVALID_IGNORE 0x0001 +#define ECONV_INVALID_REPLACE 0x0002 + +#define ECONV_UNDEF_MASK 0x00f0 +#define ECONV_UNDEF_IGNORE 0x0010 +#define ECONV_UNDEF_REPLACE 0x0020 + +#define ECONV_UNIVERSAL_NEWLINE_DECODER 0x0100 +#define ECONV_CRLF_NEWLINE_ENCODER 0x0200 +#define ECONV_CR_NEWLINE_ENCODER 0x0400 /* flags for rb_econv_convert */ #define ECONV_PARTIAL_INPUT 0x10000 diff --git a/test/ruby/test_econv.rb b/test/ruby/test_econv.rb index b8d9df7639..025887f1c6 100644 --- a/test/ruby/test_econv.rb +++ b/test/ruby/test_econv.rb @@ -448,4 +448,37 @@ class TestEncodingConverter < Test::Unit::TestCase assert_equal(["abcdef", ""], [dst, src]) end + def test_invalid_replace + ec = Encoding::Converter.new("UTF-8", 
"EUC-JP", Encoding::Converter::INVALID_REPLACE) + ret = ec.primitive_convert(src="abc\x80def", dst="", nil, 100) + assert_equal(:finished, ret) + assert_equal("", src) + assert_equal("abc?def", dst) + end + + def test_invalid_ignore + ec = Encoding::Converter.new("UTF-8", "EUC-JP", Encoding::Converter::INVALID_IGNORE) + ret = ec.primitive_convert(src="abc\x80def", dst="", nil, 100) + assert_equal(:finished, ret) + assert_equal("", src) + assert_equal("abcdef", dst) + end + + def test_undef_replace + ec = Encoding::Converter.new("UTF-8", "EUC-JP", Encoding::Converter::UNDEF_REPLACE) + ret = ec.primitive_convert(src="abc\u{fffd}def", dst="", nil, 100) + assert_equal(:finished, ret) + assert_equal("", src) + assert_equal("abc?def", dst) + end + + def test_undef_ignore + ec = Encoding::Converter.new("UTF-8", "EUC-JP", Encoding::Converter::UNDEF_IGNORE) + ret = ec.primitive_convert(src="abc\u{fffd}def", dst="", nil, 100) + assert_equal(:finished, ret) + assert_equal("", src) + assert_equal("abcdef", dst) + end + + end diff --git a/transcode.c b/transcode.c index 55f3281559..61843cfe9e 100644 --- a/transcode.c +++ b/transcode.c @@ -21,10 +21,6 @@ VALUE rb_eInvalidByteSequence; VALUE rb_cEncodingConverter; static VALUE sym_invalid, sym_undef, sym_ignore, sym_replace; -#define INVALID_IGNORE 0x1 -#define INVALID_REPLACE 0x2 -#define UNDEF_IGNORE 0x10 -#define UNDEF_REPLACE 0x20 /* * Dispatch data and logic @@ -972,8 +968,8 @@ found_needreport: return econv_source_buffer_empty; } -rb_econv_result_t -rb_econv_convert(rb_econv_t *ec, +static rb_econv_result_t +rb_econv_convert0(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags) @@ -1051,6 +1047,47 @@ gotresult: return res; } +static int output_replacement_character(rb_econv_t *ec); + +rb_econv_result_t +rb_econv_convert(rb_econv_t *ec, + const unsigned char **input_ptr, const unsigned char *input_stop, + unsigned char 
**output_ptr, unsigned char *output_stop, + int flags) +{ + rb_econv_result_t ret; + +resume: + ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags); + + if (ret == econv_invalid_byte_sequence) { + /* deal with invalid byte sequence */ + /* todo: add more alternative behaviors */ + if (ec->flags&ECONV_INVALID_IGNORE) { + goto resume; + } + else if (ec->flags&ECONV_INVALID_REPLACE) { + if (output_replacement_character(ec) == 0) + goto resume; + } + } + + if (ret == econv_undefined_conversion) { + /* valid character in source encoding + * but no related character(s) in destination encoding */ + /* todo: add more alternative behaviors */ + if (ec->flags&ECONV_UNDEF_IGNORE) { + goto resume; + } + else if (ec->flags&ECONV_UNDEF_REPLACE) { + if (output_replacement_character(ec) == 0) + goto resume; + } + } + + return ret; +} + const char * rb_econv_encoding_to_insert_output(rb_econv_t *ec) { @@ -1455,7 +1492,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, int max_output; VALUE exc; - ec = rb_econv_open(from_encoding, to_encoding, 0); + ec = rb_econv_open(from_encoding, to_encoding, opt & (ECONV_INVALID_MASK|ECONV_UNDEF_MASK)); if (!ec) rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_encoding, to_encoding); @@ -1464,35 +1501,18 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, resume: ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, opt); + if (ret == econv_invalid_byte_sequence) { - /* deal with invalid byte sequence */ - /* todo: add more alternative behaviors */ - if (opt&INVALID_IGNORE) { - goto resume; - } - else if (opt&INVALID_REPLACE) { - if (output_replacement_character(ec) == 0) - goto resume; - } exc = make_econv_exception(ec); rb_econv_close(ec); rb_exc_raise(exc); } if (ret == econv_undefined_conversion) { - /* valid character in from encoding - * but no related character(s) in to encoding */ - /* todo: add more alternative behaviors */ - if 
(opt&UNDEF_IGNORE) { - goto resume; - } - else if (opt&UNDEF_REPLACE) { - if (output_replacement_character(ec) == 0) - goto resume; - } exc = make_econv_exception(ec); rb_econv_close(ec); rb_exc_raise(exc); } + if (ret == econv_destination_buffer_full) { more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop); goto resume; @@ -1520,7 +1540,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, int max_output; VALUE exc; - ec = rb_econv_open(from_encoding, to_encoding, 0); + ec = rb_econv_open(from_encoding, to_encoding, opt & (ECONV_INVALID_MASK|ECONV_UNDEF_MASK)); if (!ec) rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_encoding, to_encoding); @@ -1549,31 +1569,12 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, ptr += p - &input_byte; switch (ret) { case econv_invalid_byte_sequence: - /* deal with invalid byte sequence */ - /* todo: add more alternative behaviors */ - if (opt&INVALID_IGNORE) { - break; - } - else if (opt&INVALID_REPLACE) { - if (output_replacement_character(ec) == 0) - break; - } exc = make_econv_exception(ec); rb_econv_close(ec); rb_exc_raise(exc); break; case econv_undefined_conversion: - /* valid character in from encoding - * but no related character(s) in to encoding */ - /* todo: add more alternative behaviors */ - if (opt&UNDEF_IGNORE) { - break; - } - else if (opt&UNDEF_REPLACE) { - if (output_replacement_character(ec) == 0) - break; - } exc = make_econv_exception(ec); rb_econv_close(ec); rb_exc_raise(exc); @@ -1632,10 +1633,10 @@ str_transcode(int argc, VALUE *argv, VALUE *self) if (NIL_P(v)) { } else if (v==sym_ignore) { - options |= INVALID_IGNORE; + options |= ECONV_INVALID_IGNORE; } else if (v==sym_replace) { - options |= INVALID_REPLACE; + options |= ECONV_INVALID_REPLACE; v = rb_hash_aref(opt, sym_replace); } else { @@ -1645,10 +1646,10 @@ str_transcode(int argc, VALUE *argv, VALUE *self) if (NIL_P(v)) { } else if 
(v==sym_ignore) { - options |= UNDEF_IGNORE; + options |= ECONV_UNDEF_IGNORE; } else if (v==sym_replace) { - options |= UNDEF_REPLACE; + options |= ECONV_UNDEF_REPLACE; } else { rb_raise(rb_eArgError, "unknown value for undefined character option"); @@ -2331,6 +2332,12 @@ Init_transcode(void) rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0); rb_define_method(rb_cEncodingConverter, "primitive_insert_output", econv_primitive_insert_output, 1); rb_define_method(rb_cEncodingConverter, "primitive_putback", econv_primitive_putback, 1); + rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK)); + rb_define_const(rb_cEncodingConverter, "INVALID_IGNORE", INT2FIX(ECONV_INVALID_IGNORE)); + rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE)); + rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK)); + rb_define_const(rb_cEncodingConverter, "UNDEF_IGNORE", INT2FIX(ECONV_UNDEF_IGNORE)); + rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE)); rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT)); rb_define_const(rb_cEncodingConverter, "OUTPUT_FOLLOWED_BY_INPUT", INT2FIX(ECONV_OUTPUT_FOLLOWED_BY_INPUT)); rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECODER", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECODER)); |