From b7db9036beece0e278aae5fd48f689ebca86e9ca Mon Sep 17 00:00:00 2001 From: nobu Date: Tue, 25 Dec 2007 05:57:04 +0000 Subject: * common.mk (COMMONOBJS): transcode_data_*.c moved under enc/trans. * transcode_data.h (rb_transcoding, rb_transcoder): prefixed. * transcode.c (rb_register_transcoder, rb_declare_transcoder): split declaration and registration. [ruby-dev:32704] * transcode.c (transcode_dispatch): autoload pre-declared transcoder. * transcode.c (str_transcode): use rb_define_dummy_encoding(). * transcode.c (Init_transcode): initialize transcoder tables. * enc/trans/single_byte.c, enc/trans/japanese.c: moved from top. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14666 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- transcode.c | 260 +++++++++++++++++++++++++----------------------------------- 1 file changed, 109 insertions(+), 151 deletions(-) (limited to 'transcode.c') diff --git a/transcode.c b/transcode.c index c915cc299d..e8c6545163 100644 --- a/transcode.c +++ b/transcode.c @@ -12,9 +12,9 @@ #include "ruby/ruby.h" #include "ruby/encoding.h" - +#define PType (int) #include "transcode_data.h" - +#include VALUE rb_str_tmp_new(long); VALUE rb_str_shared_replace(VALUE, VALUE); @@ -23,168 +23,122 @@ VALUE rb_str_shared_replace(VALUE, VALUE); * Dispatch data and logic */ -/* extern declarations, should use some include file here */ -extern const BYTE_LOOKUP rb_from_ISO_8859_1; -extern const BYTE_LOOKUP rb_from_ISO_8859_2; -extern const BYTE_LOOKUP rb_from_ISO_8859_3; -extern const BYTE_LOOKUP rb_from_ISO_8859_4; -extern const BYTE_LOOKUP rb_from_ISO_8859_5; -extern const BYTE_LOOKUP rb_from_ISO_8859_6; -extern const BYTE_LOOKUP rb_from_ISO_8859_7; -extern const BYTE_LOOKUP rb_from_ISO_8859_8; -extern const BYTE_LOOKUP rb_from_ISO_8859_9; -extern const BYTE_LOOKUP rb_from_ISO_8859_10; -extern const BYTE_LOOKUP rb_from_ISO_8859_11; -extern const BYTE_LOOKUP rb_from_ISO_8859_13; -extern const BYTE_LOOKUP rb_from_ISO_8859_14; -extern const BYTE_LOOKUP rb_from_ISO_8859_15; - -extern const BYTE_LOOKUP rb_to_ISO_8859_1; -extern const BYTE_LOOKUP rb_to_ISO_8859_2; -extern const BYTE_LOOKUP rb_to_ISO_8859_3; -extern const BYTE_LOOKUP rb_to_ISO_8859_4; -extern const BYTE_LOOKUP rb_to_ISO_8859_5; -extern const BYTE_LOOKUP rb_to_ISO_8859_6; -extern const BYTE_LOOKUP rb_to_ISO_8859_7; -extern const BYTE_LOOKUP rb_to_ISO_8859_8; -extern const BYTE_LOOKUP rb_to_ISO_8859_9; -extern const BYTE_LOOKUP rb_to_ISO_8859_10; -extern const BYTE_LOOKUP rb_to_ISO_8859_11; -extern const BYTE_LOOKUP rb_to_ISO_8859_13; -extern const BYTE_LOOKUP rb_to_ISO_8859_14; -extern const BYTE_LOOKUP rb_to_ISO_8859_15; - -extern const BYTE_LOOKUP rb_from_SHIFT_JIS; -extern const BYTE_LOOKUP rb_from_EUC_JP; - -extern const BYTE_LOOKUP rb_to_SHIFT_JIS; -extern const BYTE_LOOKUP rb_to_EUC_JP; - -extern void from_iso_2022_jp_transcoder_preprocessor(char**, char**, char*, char*, - struct transcoder_st *transcoder, struct transcoding*); -extern void to_iso_2022_jp_transcoder_postprocessor(char**, char**, char*, char*, - struct transcoder_st *transcoder, struct transcoding*); - -/* declarations probably need to go into separate header file, e.g. transcode.h */ - -/* todo: dynamic structure, one per conversion (stream) */ - -/* in the future, add some mechanism for dynamically adding stuff here */ -#define MAX_TRANSCODERS 35 /* todo: fix: this number has to be adjusted by hand */ -static transcoder transcoder_table[MAX_TRANSCODERS]; -/* variable to work across register_transcoder and register_functional_transcoder */ -static int next_transcoder_position = 0; - -/* not sure why it's not possible to do relocatable initializations */ -/* maybe the code here can be removed (changed to simple initialization) */ -/* if we move this to another file???? */ -static void -register_transcoder(const char *from_e, const char *to_e, - const BYTE_LOOKUP *tree_start, int max_output, int from_utf8) +static st_table *transcoder_table, *transcoder_lib_table; + +#define TRANSCODER_INTERNAL_SEPARATOR '\t' + +static char * +transcoder_key(const char *from_e, const char *to_e) { - if (next_transcoder_position >= MAX_TRANSCODERS) { - /* we are initializing, is it okay to use rb_raise here? */ - rb_raise(rb_eRuntimeError /*change exception*/, "not enough transcoder slots"); - } - transcoder_table[next_transcoder_position].from_encoding = from_e; - transcoder_table[next_transcoder_position].to_encoding = to_e; - transcoder_table[next_transcoder_position].conv_tree_start = tree_start; - transcoder_table[next_transcoder_position].max_output = max_output; - transcoder_table[next_transcoder_position].from_utf8 = from_utf8; + int to_len = strlen(to_e); + int from_len = strlen(from_e); + char *const key = xmalloc(to_len + from_len + 2); + + memcpy(key, to_e, to_len); + memcpy(key + to_len + 1, from_e, from_len + 1); + key[to_len] = TRANSCODER_INTERNAL_SEPARATOR; + return key; +} - next_transcoder_position++; +void +rb_register_transcoder(const rb_transcoder *tr) +{ + st_data_t k, val = 0; + const char *const from_e = tr->from_encoding; + const char *const to_e = tr->to_encoding; + char *const key = transcoder_key(from_e, to_e); + + if (st_lookup(transcoder_table, (st_data_t)key, &val)) { + xfree(key); + rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered", + from_e, to_e); + } + k = (st_data_t)key; + if (st_delete(transcoder_lib_table, &k, &val)) { + xfree((char *)k); + } + st_insert(transcoder_table, (st_data_t)key, (st_data_t)tr); } static void -register_functional_transcoder(const char *from_e, const char *to_e, - const BYTE_LOOKUP *tree_start, int max_output, int from_utf8, - void (*preprocessor)(char**, char**, char*, char*, transcoder*, transcoding*), - void (*postprocessor)(char**, char**, char*, char*, transcoder*, transcoding*)) +declare_transcoder(const char *to, const char *from, const char *lib) { - if (next_transcoder_position >= MAX_TRANSCODERS) { - /* we are initializing, is it okay to use rb_raise here? */ - rb_raise(rb_eRuntimeError /*change exception*/, "not enough transcoder slots"); + const char *const key = transcoder_key(to, from); + st_data_t k = (st_data_t)key, val; + + if (st_delete(transcoder_lib_table, &k, &val)) { + xfree((char *)k); } - transcoder_table[next_transcoder_position].from_encoding = from_e; - transcoder_table[next_transcoder_position].to_encoding = to_e; - transcoder_table[next_transcoder_position].conv_tree_start = tree_start; - transcoder_table[next_transcoder_position].max_output = max_output; - transcoder_table[next_transcoder_position].from_utf8 = from_utf8; - transcoder_table[next_transcoder_position].conv_tree_start = tree_start; - transcoder_table[next_transcoder_position].preprocessor = preprocessor; - transcoder_table[next_transcoder_position].postprocessor = postprocessor; - - next_transcoder_position++; + st_insert(transcoder_lib_table, (st_data_t)key, (st_data_t)lib); } -static void -init_transcoder_table(void) +#define MAX_TRANSCODER_LIBNAME_LEN 64 +static const char transcoder_lib_prefix[] = "enc/trans/"; + +void +rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib) { - register_transcoder("ISO-8859-1", "UTF-8", &rb_from_ISO_8859_1, 2, 0); - register_transcoder("ISO-8859-2", "UTF-8", &rb_from_ISO_8859_2, 2, 0); - register_transcoder("ISO-8859-3", "UTF-8", &rb_from_ISO_8859_3, 2, 0); - register_transcoder("ISO-8859-4", "UTF-8", &rb_from_ISO_8859_4, 2, 0); - register_transcoder("ISO-8859-5", "UTF-8", &rb_from_ISO_8859_5, 3, 0); - register_transcoder("ISO-8859-6", "UTF-8", &rb_from_ISO_8859_6, 2, 0); - register_transcoder("ISO-8859-7", "UTF-8", &rb_from_ISO_8859_7, 3, 0); - register_transcoder("ISO-8859-8", "UTF-8", &rb_from_ISO_8859_8, 3, 0); - register_transcoder("ISO-8859-9", "UTF-8", &rb_from_ISO_8859_9, 2, 0); - register_transcoder("ISO-8859-10", "UTF-8", &rb_from_ISO_8859_10, 3, 0); - register_transcoder("ISO-8859-11", "UTF-8", &rb_from_ISO_8859_11, 3, 0); - register_transcoder("ISO-8859-13", "UTF-8", &rb_from_ISO_8859_13, 3, 0); - register_transcoder("ISO-8859-14", "UTF-8", &rb_from_ISO_8859_14, 3, 0); - register_transcoder("ISO-8859-15", "UTF-8", &rb_from_ISO_8859_15, 3, 0); - register_transcoder("UTF-8", "ISO-8859-1", &rb_to_ISO_8859_1, 1, 1); - register_transcoder("UTF-8", "ISO-8859-2", &rb_to_ISO_8859_2, 1, 1); - register_transcoder("UTF-8", "ISO-8859-3", &rb_to_ISO_8859_3, 1, 1); - register_transcoder("UTF-8", "ISO-8859-4", &rb_to_ISO_8859_4, 1, 1); - register_transcoder("UTF-8", "ISO-8859-5", &rb_to_ISO_8859_5, 1, 1); - register_transcoder("UTF-8", "ISO-8859-6", &rb_to_ISO_8859_6, 1, 1); - register_transcoder("UTF-8", "ISO-8859-7", &rb_to_ISO_8859_7, 1, 1); - register_transcoder("UTF-8", "ISO-8859-8", &rb_to_ISO_8859_8, 1, 1); - register_transcoder("UTF-8", "ISO-8859-9", &rb_to_ISO_8859_9, 1, 1); - register_transcoder("UTF-8", "ISO-8859-10", &rb_to_ISO_8859_10, 1, 1); - register_transcoder("UTF-8", "ISO-8859-11", &rb_to_ISO_8859_11, 1, 1); - register_transcoder("UTF-8", "ISO-8859-13", &rb_to_ISO_8859_13, 1, 1); - register_transcoder("UTF-8", "ISO-8859-14", &rb_to_ISO_8859_14, 1, 1); - register_transcoder("UTF-8", "ISO-8859-15", &rb_to_ISO_8859_15, 1, 1); - - register_transcoder("SHIFT_JIS", "UTF-8", &rb_from_SHIFT_JIS, 3, 0); - register_transcoder("EUC-JP", "UTF-8", &rb_from_EUC_JP, 3, 0); - register_transcoder("UTF-8", "SHIFT_JIS", &rb_to_SHIFT_JIS, 2, 1); - register_transcoder("UTF-8", "EUC-JP", &rb_to_EUC_JP, 2, 1); - register_functional_transcoder("ISO-2022-JP", "UTF-8", &rb_from_EUC_JP, - 8, 0, &from_iso_2022_jp_transcoder_preprocessor, NULL); - register_functional_transcoder("UTF-8", "ISO-2022-JP", &rb_to_EUC_JP, - 8, 1, NULL, &to_iso_2022_jp_transcoder_postprocessor); - - register_transcoder(NULL, NULL, NULL, 0, 0); + if (!lib || strlen(lib) > MAX_TRANSCODER_LIBNAME_LEN) { + rb_raise(rb_eArgError, "invalid library name - %s", + lib ? lib : "(null)"); + } + declare_transcoder(enc1, enc2, lib); + declare_transcoder(enc2, enc1, lib); } -static int -encoding_equal(const char* encoding1, const char* encoding2) +static void +init_transcoder_table(void) { - return 0==strcasecmp(encoding1, encoding2); + rb_declare_transcoder("ISO-8859-1", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-2", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-3", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-4", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-5", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-6", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-7", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-8", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-9", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-10", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-11", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-13", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-14", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-15", "UTF-8", "single_byte"); + rb_declare_transcoder("SHIFT_JIS", "UTF-8", "japanese"); + rb_declare_transcoder("EUC-JP", "UTF-8", "japanese"); + rb_declare_transcoder("ISO-2022-JP", "UTF-8", "japanese"); } -static transcoder* +#define encoding_equal(enc1, enc2) (strcasecmp(enc1, enc2) == 0) + +static rb_transcoder * transcode_dispatch(const char* from_encoding, const char* to_encoding) { - transcoder *candidate = transcoder_table; - - for (candidate = transcoder_table; candidate->from_encoding; candidate++) { - if (encoding_equal(from_encoding, candidate->from_encoding) - && encoding_equal(to_encoding, candidate->to_encoding)) { - return candidate; + char *const key = transcoder_key(from_encoding, to_encoding); + st_data_t k, val = 0; + + k = (st_data_t)key; + if (!st_lookup(transcoder_table, k, &val) && + st_delete(transcoder_lib_table, &k, &val)) { + const char *const lib = (const char *)val; + int len = strlen(lib); + char path[sizeof(transcoder_lib_prefix) + MAX_TRANSCODER_LIBNAME_LEN]; + + xfree((char *)k); + if (len > MAX_TRANSCODER_LIBNAME_LEN) return NULL; + memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1); + memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1); + if (!rb_require(path)) return NULL; + if (!st_lookup(transcoder_table, (st_data_t)key, &val)) { + /* multistep logic, via UTF-8 */ + if (!encoding_equal(from_encoding, "UTF-8") && + !encoding_equal(to_encoding, "UTF-8") && + transcode_dispatch("UTF-8", to_encoding)) { /* check that we have a second step */ + return transcode_dispatch(from_encoding, "UTF-8"); /* return first step */ + } + return NULL; } } - /* multistep logic, via UTF-8 */ - if (!encoding_equal(from_encoding, "UTF-8") - && !encoding_equal(to_encoding, "UTF-8") - && transcode_dispatch("UTF-8", to_encoding)) { /* check that we have a second step */ - return transcode_dispatch(from_encoding, "UTF-8"); /* return first step */ - } - return NULL; + return (rb_transcoder *)val; } @@ -194,8 +148,8 @@ transcode_dispatch(const char* from_encoding, const char* to_encoding) static void transcode_loop(char **in_pos, char **out_pos, char *in_stop, char *out_stop, - transcoder *my_transcoder, - transcoding *my_transcoding) + const rb_transcoder *my_transcoder, + rb_transcoding *my_transcoding) { char *in_p = *in_pos, *out_p = *out_pos; const BYTE_LOOKUP *conv_tree_start = my_transcoder->conv_tree_start; @@ -280,7 +234,7 @@ transcode_loop(char **in_pos, char **out_pos, */ static char * -str_transcoding_resize(transcoding *my_transcoding, int len, int new_len) +str_transcoding_resize(rb_transcoding *my_transcoding, int len, int new_len) { VALUE dest_string = my_transcoding->ruby_string_dest; rb_str_resize(dest_string, new_len); @@ -298,8 +252,8 @@ str_transcode(int argc, VALUE *argv, VALUE *self) const char *from_e, *to_e; int from_encidx, to_encidx; VALUE from_encval, to_encval; - transcoder *my_transcoder; - transcoding my_transcoding; + rb_transcoder *my_transcoder; + rb_transcoding my_transcoding; int final_encoding = 0; if (argc<1 || argc>2) { @@ -307,6 +261,7 @@ str_transcode(int argc, VALUE *argv, VALUE *self) } if ((to_encidx = rb_to_encoding_index(to_encval = argv[0])) < 0) { to_enc = 0; + to_encidx = 0; to_e = StringValueCStr(to_encval); } else { @@ -405,7 +360,7 @@ str_transcode(int argc, VALUE *argv, VALUE *self) } /* set encoding */ if (!to_enc) { - to_encidx = rb_enc_replicate(to_e, rb_ascii8bit_encoding()); + to_encidx = rb_define_dummy_encoding(to_e); } *self = dest; @@ -467,7 +422,10 @@ rb_str_transcode(int argc, VALUE *argv, VALUE str) void Init_transcode(void) { + transcoder_table = st_init_strcasetable(); + transcoder_lib_table = st_init_strcasetable(); init_transcoder_table(); + rb_define_method(rb_cString, "encode", rb_str_transcode, -1); rb_define_method(rb_cString, "encode!", rb_str_transcode_bang, -1); } -- cgit v1.2.3