From f2b0dba1cfff30d94d21d224a2c240752fda8edb Mon Sep 17 00:00:00 2001 From: matz Date: Sat, 15 Dec 2007 05:42:25 +0000 Subject: * transcode.c (str_transcode, transcode_dispatch): added two-step * trancode.c: some minor formatting fixes * transcode_data.h, transcode_data_iso_8859.c: Shortened extremely frequently used macros to shorten file length. * test/ruby/test_transcode.rb: Fixed name of test class; added setup method to ensure all necessary encodings exist; split tests into more test methods; added tests; fixed ordering of arguments in assert_equal to have expected result first. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14236 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- transcode.c | 86 +++++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 31 deletions(-) (limited to 'transcode.c') diff --git a/transcode.c b/transcode.c index 3580622bd6..30901e6dfd 100644 --- a/transcode.c +++ b/transcode.c @@ -81,8 +81,8 @@ register_transcoder(const char *from_e, const char *to_e, { static int n = 0; if (n >= MAX_TRANSCODERS) { - /* we are initializing, is it okay to use rb_raise here? */ - rb_raise(rb_eRuntimeError /*change exception*/, "not enough transcoder slots"); + /* we are initializing, is it okay to use rb_raise here? */ + rb_raise(rb_eRuntimeError /*change exception*/, "not enough transcoder slots"); } transcoder_table[n].from_encoding = from_e; transcoder_table[n].to_encoding = to_e; @@ -127,25 +127,37 @@ init_transcoder_table(void) register_transcoder(NULL, NULL, NULL, 0, 0); } +static int +encoding_equal(const char* encoding1, const char* encoding2) +{ + return 0==strcasecmp(encoding1, encoding2); +} static transcoder* transcode_dispatch(const char* from_encoding, const char* to_encoding) { transcoder *candidate = transcoder_table; - for (candidate = transcoder_table; candidate->from_encoding; candidate++) - if (0==strcasecmp(from_encoding, candidate->from_encoding) - && 0==strcasecmp(to_encoding, candidate->to_encoding)) - break; - /* in the future, add multistep transcoding logic here */ - return candidate->from_encoding ? candidate : NULL; + for (candidate = transcoder_table; candidate->from_encoding; candidate++) { + if (encoding_equal(from_encoding, candidate->from_encoding) + && encoding_equal(to_encoding, candidate->to_encoding)) { + return candidate; + } + } + /* multistep logic, via UTF-8 */ + if (!encoding_equal(from_encoding, "UTF-8") + && !encoding_equal(to_encoding, "UTF-8") + && transcode_dispatch("UTF-8", to_encoding)) { /* check that we have a second step */ + return transcode_dispatch(from_encoding, "UTF-8"); /* return first step */ + } + return NULL; } /* dynamic structure, one per conversion (similar to iconv_t) */ /* may carry conversion state (e.g. for iso-2022-jp) */ typedef struct transcoding { VALUE ruby_string_dest; /* the String used as the conversion destination, - or NULL if something else is being converted */ + or NULL if something else is being converted */ char *(*flush_func)(struct transcoding*, int, int); } transcoding; @@ -201,7 +213,7 @@ transcode_loop(char **in_pos, char **out_pos, } next_table = next_table->info[next_offset]; goto follow_byte; - /* maybe rewrite the following cases to use fallthrough???? */ + /* maybe rewrite the following cases to use fallthrough???? */ case ZERObt: /* drop input */ continue; case ONEbt: @@ -262,6 +274,7 @@ str_transcode(int argc, VALUE *argv, VALUE str) VALUE from_encval, to_encval; transcoder *my_transcoder; transcoding my_transcoding; + int final_encoding = 0; if (argc<1 || argc>2) { rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc); @@ -275,7 +288,7 @@ str_transcode(int argc, VALUE *argv, VALUE str) to_e = rb_enc_name(to_enc); } if (argc==1) { - from_encidx = rb_enc_get_index(str); + from_encidx = rb_enc_get_index(str); from_enc = rb_enc_from_index(from_encidx); from_e = rb_enc_name(from_enc); } @@ -298,33 +311,44 @@ str_transcode(int argc, VALUE *argv, VALUE str) if (strcasecmp(from_e, to_e) == 0) { return Qnil; } - if (!(my_transcoder = transcode_dispatch(from_e, to_e))) { - rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_e, to_e); - } - fromp = sp = RSTRING_PTR(str); - slen = RSTRING_LEN(str); - blen = slen + 30; /* len + margin */ - dest = rb_str_tmp_new(blen); - bp = RSTRING_PTR(dest); - my_transcoding.ruby_string_dest = dest; - my_transcoding.flush_func = str_transcoding_resize; - - /* for simple testing: */ - transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), my_transcoder, &my_transcoding); - if (fromp != sp+slen) { - rb_raise(rb_eArgError, "not fully converted, %d bytes left", sp+slen-fromp); - } - buf = RSTRING_PTR(dest); - *bp = '\0'; - rb_str_set_len(dest, bp - buf); + while (!final_encoding) /* loop for multistep transcoding */ + { /* later, maybe use smaller intermediate strings for very long strings */ + if (!(my_transcoder = transcode_dispatch(from_e, to_e))) { + rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_e, to_e); + } + fromp = sp = RSTRING_PTR(str); + slen = RSTRING_LEN(str); + blen = slen + 30; /* len + margin */ + dest = rb_str_tmp_new(blen); + bp = RSTRING_PTR(dest); + my_transcoding.ruby_string_dest = dest; + my_transcoding.flush_func = str_transcoding_resize; + + transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), my_transcoder, &my_transcoding); + if (fromp != sp+slen) { + rb_raise(rb_eArgError, "not fully converted, %d bytes left", sp+slen-fromp); + } + buf = RSTRING_PTR(dest); + *bp = '\0'; + rb_str_set_len(dest, bp - buf); + + rb_enc_associate(dest, to_enc); + + if (encoding_equal(my_transcoder->to_encoding, to_e)) { + final_encoding = 1; + } + else { + from_e = my_transcoder->to_encoding; + str = dest; + } + } /* set encoding */ if (!to_enc) { to_encidx = rb_enc_replicate(to_e, rb_default_encoding()); to_enc = rb_enc_from_index(to_encidx); } - rb_enc_associate(dest, to_enc); return dest; } -- cgit v1.2.3