From db6ec3105ea5c3b7e30e15ec0a0b4d1a5dbe0ac3 Mon Sep 17 00:00:00 2001 From: akr Date: Tue, 26 Aug 2008 16:09:29 +0000 Subject: * include/ruby/encoding.h (rb_econv_result_t): enumeration constant: econv_incomplete_input. * io.c (finish_writeconv): check econv_incomplete_input. * transcode.c (transcode_restartable0): return econv_incomplete_input for unexpected end of source buffer. (trans_sweep): check econv_incomplete_input. (rb_trans_conv): ditto. (rb_econv_convert0): ditto. (rb_econv_convert): ditto. (transcode_loop): ditto. (make_econv_exception): change message for econv_incomplete_input. (econv_result_to_symbol): return :incomplete_input for econv_incomplete_input. (ecerr_incomplete_input): new method. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18875 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 19 +++++++++++++++++ include/ruby/encoding.h | 1 + io.c | 12 ++++++----- test/ruby/test_econv.rb | 14 ++++++++++++- transcode.c | 54 ++++++++++++++++++++++++++++++++++++++----------- 5 files changed, 82 insertions(+), 18 deletions(-) diff --git a/ChangeLog b/ChangeLog index c7747df911..ff399a4052 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,22 @@ +Wed Aug 27 01:03:23 2008 Tanaka Akira + + * include/ruby/encoding.h (rb_econv_result_t): enumeration constant: + econv_incomplete_input. + + * io.c (finish_writeconv): check econv_incomplete_input. + + * transcode.c (transcode_restartable0): return econv_incomplete_input + for unexpected end of source buffer. + (trans_sweep): check econv_incomplete_input. + (rb_trans_conv): ditto. + (rb_econv_convert0): ditto. + (rb_econv_convert): ditto. + (transcode_loop): ditto. + (make_econv_exception): change message for econv_incomplete_input. + (econv_result_to_symbol): return :incomplete_input for + econv_incomplete_input. + (ecerr_incomplete_input): new method. + Wed Aug 27 00:05:55 2008 Tanaka Akira * include/ruby/io.h (rb_io_t): rename crbuf to cbuf. 
diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index 3c0fcd2572..23b61463c3 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -203,6 +203,7 @@ typedef enum { econv_source_buffer_empty, econv_finished, econv_output_followed_by_input, + econv_incomplete_input, } rb_econv_result_t; typedef struct { diff --git a/io.c b/io.c index 1eba36d7d6..b3ea394c61 100644 --- a/io.c +++ b/io.c @@ -2967,7 +2967,7 @@ finish_writeconv(rb_io_t *fptr, int noraise) de = buf + sizeof(buf); res = rb_econv_convert(fptr->writeconv, NULL, NULL, &dp, de, 0); while (dp-ds) { -retry: + retry: r = rb_write_internal(fptr->fd, ds, dp-ds); if (r == dp-ds) break; @@ -2987,6 +2987,7 @@ retry: rb_econv_check_error(fptr->writeconv); } if (res == econv_invalid_byte_sequence || + res == econv_incomplete_input || res == econv_undefined_conversion) { break; } @@ -3009,6 +3010,7 @@ retry: rb_econv_check_error(fptr->writeconv); } if (res == econv_invalid_byte_sequence || + res == econv_incomplete_input || res == econv_undefined_conversion) { break; } @@ -3603,7 +3605,7 @@ rb_io_mode_flags(const char *mode) } } -finished: + finished: if ((flags & FMODE_BINMODE) && (flags & FMODE_TEXTMODE)) goto error; @@ -7051,7 +7053,7 @@ copy_stream_sendfile(struct copy_stream_struct *stp) } } -retry_sendfile: + retry_sendfile: if (use_pread) { ss = simple_sendfile(stp->dst_fd, stp->src_fd, &src_offset, copy_length); } @@ -7095,7 +7097,7 @@ static ssize_t copy_stream_read(struct copy_stream_struct *stp, char *buf, int len, off_t offset) { ssize_t ss; -retry_read: + retry_read: if (offset == (off_t)-1) ss = read(stp->src_fd, buf, len); else { @@ -7231,7 +7233,7 @@ copy_stream_func(void *arg) copy_stream_read_write(stp); #ifdef USE_SENDFILE -finish: + finish: #endif return Qnil; } diff --git a/test/ruby/test_econv.rb b/test/ruby/test_econv.rb index a9724d5d62..5324e6dfe2 100644 --- a/test/ruby/test_econv.rb +++ b/test/ruby/test_econv.rb @@ -364,7 +364,7 @@ class TestEncodingConverter < 
Test::Unit::TestCase def test_errinfo_invalid_partial_character ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") ec.primitive_convert(src="\xa4", dst="", nil, 10) - assert_errinfo(:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xA4", "", nil, ec) + assert_errinfo(:incomplete_input, "EUC-JP", "UTF-8", "\xA4", "", nil, ec) end def test_errinfo_valid_partial_character @@ -426,6 +426,18 @@ class TestEncodingConverter < Test::Unit::TestCase assert_equal("UTF-8", err.destination_encoding) assert_equal("\xA4".force_encoding("ASCII-8BIT"), err.error_bytes) assert_equal("d", err.readagain_bytes) + assert_equal(false, err.incomplete_input?) + end + + def test_exc_incomplete + err = assert_raise(Encoding::InvalidByteSequence) { + "abc\xa4".encode("ISO-8859-1", "EUC-JP") + } + assert_equal("EUC-JP", err.source_encoding) + assert_equal("UTF-8", err.destination_encoding) + assert_equal("\xA4".force_encoding("ASCII-8BIT"), err.error_bytes) + assert_equal(nil, err.readagain_bytes) + assert_equal(true, err.incomplete_input?) 
end def test_exc_undef diff --git a/transcode.c b/transcode.c index 78efb93c22..38548bc856 100644 --- a/transcode.c +++ b/transcode.c @@ -466,6 +466,7 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, case 24: goto resume_label24; case 25: goto resume_label25; case 26: goto resume_label26; + case 27: goto resume_label27; } while (1) { @@ -500,7 +501,7 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, SUSPEND_OUTPUT_FOLLOWED_BY_INPUT(25); while (in_p >= in_stop) { if (!(opt & ECONV_PARTIAL_INPUT)) - goto invalid; + goto incomplete; SUSPEND(econv_source_buffer_empty, 5); } next_byte = (unsigned char)*in_p++; @@ -602,6 +603,10 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, SUSPEND(econv_invalid_byte_sequence, 1); continue; + incomplete: + SUSPEND(econv_incomplete_input, 27); + continue; + undef: SUSPEND(econv_undefined_conversion, 2); continue; @@ -949,6 +954,7 @@ trans_sweep(rb_econv_t *ec, switch (res) { case econv_invalid_byte_sequence: + case econv_incomplete_input: case econv_undefined_conversion: case econv_output_followed_by_input: return i; @@ -997,6 +1003,7 @@ rb_trans_conv(rb_econv_t *ec, for (i = ec->num_trans-1; 0 <= i; i--) { switch (ec->elems[i].last_result) { case econv_invalid_byte_sequence: + case econv_incomplete_input: case econv_undefined_conversion: case econv_output_followed_by_input: case econv_finished: @@ -1030,7 +1037,7 @@ rb_trans_conv(rb_econv_t *ec, sweep_start = 0; -found_needreport: + found_needreport: do { needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start); @@ -1041,6 +1048,7 @@ found_needreport: if (ec->elems[i].last_result != econv_source_buffer_empty) { rb_econv_result_t res = ec->elems[i].last_result; if (res == econv_invalid_byte_sequence || + res == econv_incomplete_input || res == econv_undefined_conversion || res == econv_output_followed_by_input) { ec->elems[i].last_result = 
econv_source_buffer_empty; @@ -1160,10 +1168,11 @@ rb_econv_convert0(rb_econv_t *ec, } while (res == econv_output_followed_by_input); } -gotresult: + gotresult: ec->last_error.result = res; ec->last_error.partial_input = flags & ECONV_PARTIAL_INPUT; if (res == econv_invalid_byte_sequence || + res == econv_incomplete_input || res == econv_undefined_conversion) { rb_transcoding *error_tc = ec->elems[result_position].tc; ec->last_error.error_tc = error_tc; @@ -1200,10 +1209,11 @@ rb_econv_convert(rb_econv_t *ec, output_stop = empty_ptr; } -resume: + resume: ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags); - if (ret == econv_invalid_byte_sequence) { + if (ret == econv_invalid_byte_sequence || + ret == econv_incomplete_input) { /* deal with invalid byte sequence */ /* todo: add more alternative behaviors */ if (ec->opts.flags&ECONV_INVALID_IGNORE) { @@ -1398,7 +1408,7 @@ rb_econv_insert_output(rb_econv_t *ec, xfree((void*)insert_str); return 0; -fail: + fail: if (insert_str != str) xfree((void*)insert_str); return -1; @@ -1620,7 +1630,8 @@ static VALUE make_econv_exception(rb_econv_t *ec) { VALUE mesg, exc; - if (ec->last_error.result == econv_invalid_byte_sequence) { + if (ec->last_error.result == econv_invalid_byte_sequence || + ec->last_error.result == econv_incomplete_input) { const char *err = (const char *)ec->last_error.error_bytes_start; size_t error_len = ec->last_error.error_bytes_len; VALUE bytes = rb_str_new(err, error_len); @@ -1628,7 +1639,12 @@ make_econv_exception(rb_econv_t *ec) size_t readagain_len = ec->last_error.readagain_len; VALUE bytes2 = Qnil; VALUE dumped2; - if (readagain_len) { + if (ec->last_error.result == econv_incomplete_input) { + mesg = rb_sprintf("incomplete input: %s on %s", + StringValueCStr(dumped), + ec->last_error.source_encoding); + } + else if (readagain_len) { bytes2 = rb_str_new(err+error_len, readagain_len); dumped2 = rb_str_dump(bytes2); mesg = rb_sprintf("invalid byte sequence: %s followed 
by %s on %s", @@ -1647,6 +1663,7 @@ make_econv_exception(rb_econv_t *ec) rb_ivar_set(exc, rb_intern("destination_encoding"), rb_str_new2(ec->last_error.destination_encoding)); rb_ivar_set(exc, rb_intern("error_bytes"), bytes); rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2); + rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse); return exc; } if (ec->last_error.result == econv_undefined_conversion) { @@ -1742,10 +1759,11 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, last_tc = ec->last_tc; max_output = last_tc ? last_tc->transcoder->max_output : 1; -resume: + resume: ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0); - if (ret == econv_invalid_byte_sequence) { + if (ret == econv_invalid_byte_sequence || + ret == econv_incomplete_input) { exc = make_econv_exception(ec); rb_econv_close(ec); rb_exc_raise(exc); @@ -1812,6 +1830,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, ptr += p - &input_byte; switch (ret) { case econv_invalid_byte_sequence: + case econv_incomplete_input: exc = make_econv_exception(ec); rb_econv_close(ec); rb_exc_raise(exc); @@ -2291,6 +2310,7 @@ econv_result_to_symbol(rb_econv_result_t res) { switch (res) { case econv_invalid_byte_sequence: return ID2SYM(rb_intern("invalid_byte_sequence")); + case econv_incomplete_input: return ID2SYM(rb_intern("incomplete_input")); case econv_undefined_conversion: return ID2SYM(rb_intern("undefined_conversion")); case econv_destination_buffer_full: return ID2SYM(rb_intern("destination_buffer_full")); case econv_source_buffer_empty: return ID2SYM(rb_intern("source_buffer_empty")); @@ -2311,6 +2331,7 @@ econv_result_to_symbol(rb_econv_result_t res) * * possible results: * :invalid_byte_sequence + * :incomplete_input * :undefined_conversion * :output_followed_by_input * :destination_buffer_full @@ -2342,6 +2363,8 @@ econv_result_to_symbol(rb_econv_result_t res) * * 
primitive_convert stops conversion when one of following condition met. * - invalid byte sequence found in source buffer (:invalid_byte_sequence) + * - unexpected end of source buffer (:incomplete_input) + * this occurs only when PARTIAL_INPUT is not specified. * - character not representable in output encoding (:undefined_conversion) * - after some output is generated, before input is done (:output_followed_by_input) * this occur only when OUTPUT_FOLLOWED_BY_INPUT is specified. @@ -2451,7 +2474,7 @@ econv_primitive_convert(int argc, VALUE *argv, VALUE self) * for primitive_convert. * * Other elements are only meaningful when result is - * :invalid_byte_sequence or :undefined_conversion. + * :invalid_byte_sequence, :incomplete_input or :undefined_conversion. * * enc1 and enc2 indicats a conversion step as pair of strings. * For example, EUC-JP to ISO-8859-1 is @@ -2482,7 +2505,7 @@ econv_primitive_convert(int argc, VALUE *argv, VALUE self) * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1") * ec.primitive_convert(src="\xa4", dst="", nil, 10) * p ec.primitive_errinfo - * #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xA4", "", nil] + * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", "", nil] * * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by * # partial characters. 
@@ -2625,6 +2648,12 @@ ecerr_readagain_bytes(VALUE self) return rb_attr_get(self, rb_intern("readagain_bytes")); } +static VALUE +ecerr_incomplete_input(VALUE self) +{ + return rb_attr_get(self, rb_intern("incomplete_input")); +} + extern void Init_newline(void); void @@ -2674,6 +2703,7 @@ Init_transcode(void) rb_define_method(rb_eInvalidByteSequence, "destination_encoding", ecerr_destination_encoding, 0); rb_define_method(rb_eInvalidByteSequence, "error_bytes", ecerr_error_bytes, 0); rb_define_method(rb_eInvalidByteSequence, "readagain_bytes", ecerr_readagain_bytes, 0); + rb_define_method(rb_eInvalidByteSequence, "incomplete_input?", ecerr_incomplete_input, 0); Init_newline(); } -- cgit v1.2.3