diff options
author | naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2006-09-15 11:26:07 +0000 |
---|---|---|
committer | naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2006-09-15 11:26:07 +0000 |
commit | 5300eecfb324f3a29d891b3e229baef631dc6aeb (patch) | |
tree | d5082f25c6b239bcd018156e6fa54dc5f2c798ab /ext/nkf | |
parent | 976b4e5f8bedcd8285578c6da5117b6883ef1c02 (diff) | |
download | ruby-5300eecfb324f3a29d891b3e229baef631dc6aeb.tar.gz |
* ext/nkf/nkf-utf8/nkf.c: imported nkf 2.0.8 rev.110.
* Fix: check_bom cuts \xfe\xff\xXX\xXX of UTF-32.
* Add support --ic=UTF-32.
* Fix: can't guess UTF-16 and UTF-32.
* Fix: can't decode beyond BMP of UTF-16LE.
* ext/nkf/nkf.c (guess): Support UTF-32.
* ext/nkf/lib/kconv.rb (kconv): Support UTF-32.
* ext/nkf/lib/kconv.rb (Kconv.toutf32): new method.
* ext/nkf/lib/kconv.rb (String#toutf32): new method.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@10938 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'ext/nkf')
-rw-r--r-- | ext/nkf/lib/kconv.rb | 29 | ||||
-rw-r--r-- | ext/nkf/nkf-utf8/nkf.c | 85 | ||||
-rw-r--r-- | ext/nkf/nkf.c | 34 |
3 files changed, 117 insertions, 31 deletions
diff --git a/ext/nkf/lib/kconv.rb b/ext/nkf/lib/kconv.rb index 4ffe8d984e..91553228fe 100644 --- a/ext/nkf/lib/kconv.rb +++ b/ext/nkf/lib/kconv.rb @@ -105,6 +105,8 @@ module Kconv opt << 'W' when ::NKF::UTF16 opt << 'W16' + when ::NKF::UTF32 + opt << 'W32' end case out_code @@ -118,6 +120,8 @@ module Kconv opt << 'w' when ::NKF::UTF16 opt << 'w16' + when ::NKF::UTF32 + opt << 'w32' when ::NKF::NOCONV return str end @@ -202,6 +206,20 @@ module Kconv end module_function :toutf16 + # call-seq: + # Kconv.toutf32(str) -> string + # + # Convert <code>str</code> to UTF-32 + # + # *Note* + # This method decode MIME encoded string and + # convert halfwidth katakana to fullwidth katakana. + # If you don't want it, use NKF.nkf('-w32xm0', str). + def toutf32(str) + ::NKF::nkf('-w32m', str) + end + module_function :toutf32 + # # guess # @@ -337,6 +355,17 @@ class String # If you don't want it, use NKF.nkf('-w16xm0', str). def toutf16; Kconv.toutf16(self) end + # call-seq: + # String#toutf32 -> string + # + # Convert <code>self</code> to UTF-32 + # + # *Note* + # This method decode MIME encoded string and + # convert halfwidth katakana to fullwidth katakana. + # If you don't want it, use NKF.nkf('-w32xm0', str). 
+ def toutf32; Kconv.toutf32(self) end + # # is Encoding # diff --git a/ext/nkf/nkf-utf8/nkf.c b/ext/nkf/nkf-utf8/nkf.c index 2f3da8b373..bd2e90c77c 100644 --- a/ext/nkf/nkf-utf8/nkf.c +++ b/ext/nkf/nkf-utf8/nkf.c @@ -581,6 +581,8 @@ struct input_code input_code_list[] = { {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0}, #ifdef UTF8_INPUT_ENABLE {"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0}, + {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0}, + {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0}, #endif {0} }; @@ -1293,6 +1295,15 @@ void options(unsigned char *cp) strcmp(codeset, "UTF-16LE-BOM") == 0){ input_f = UTF16_INPUT; input_endian = ENDIAN_LITTLE; + }else if(strcmp(codeset, "UTF-32") == 0 || + strcmp(codeset, "UTF-32BE") == 0 || + strcmp(codeset, "UTF-32BE-BOM") == 0){ + input_f = UTF32_INPUT; + input_endian = ENDIAN_BIG; + }else if(strcmp(codeset, "UTF-32LE") == 0 || + strcmp(codeset, "UTF-32LE-BOM") == 0){ + input_f = UTF32_INPUT; + input_endian = ENDIAN_LITTLE; #endif } continue; @@ -1901,12 +1912,7 @@ void options(unsigned char *cp) } } -#ifdef ANSI_C_PROTOTYPE struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0)) -#else -struct input_code * find_inputcode_byfunc(iconv_func) - nkf_char (*iconv_func)(); -#endif { if (iconv_func){ struct input_code *p = input_code_list; @@ -2227,6 +2233,12 @@ void code_status(nkf_char c) struct input_code *result = 0; struct input_code *p = input_code_list; while (p->name){ + if (!p->status_func) { + ++p; + continue; + } + if (!p->status_func) + continue; (p->status_func)(p, c); if (p->stat > 0){ action_flag = 0; @@ -2407,8 +2419,11 @@ void check_bom(FILE *f) if(!input_f){ set_iconv(TRUE, w_iconv32); } - input_endian = ENDIAN_BIG; - return; + if (iconv == w_iconv32) { + input_endian = ENDIAN_BIG; + return; + } + (*i_ungetc)(0xFF,f); }else (*i_ungetc)(c2,f); (*i_ungetc)(0xFE,f); }else if(c2 == 0xFF){ @@ -2416,8 +2431,11 @@ void check_bom(FILE *f) if(!input_f){ 
set_iconv(TRUE, w_iconv32); } - input_endian = ENDIAN_2143; - return; + if (iconv == w_iconv32) { + input_endian = ENDIAN_2143; + return; + } + (*i_ungetc)(0xFF,f); }else (*i_ungetc)(c2,f); (*i_ungetc)(0xFF,f); }else (*i_ungetc)(c2,f); @@ -2431,7 +2449,10 @@ void check_bom(FILE *f) if(!input_f){ set_iconv(TRUE, w_iconv); } - return; + if (iconv == w_iconv) { + return; + } + (*i_ungetc)(0xBF,f); }else (*i_ungetc)(c2,f); (*i_ungetc)(0xBB,f); }else (*i_ungetc)(c2,f); @@ -2444,16 +2465,22 @@ void check_bom(FILE *f) if(!input_f){ set_iconv(TRUE, w_iconv32); } - input_endian = ENDIAN_3412; - return; + if (iconv == w_iconv32) { + input_endian = ENDIAN_3412; + return; + } + (*i_ungetc)(0x00,f); }else (*i_ungetc)(c2,f); (*i_ungetc)(0x00,f); }else (*i_ungetc)(c2,f); if(!input_f){ set_iconv(TRUE, w_iconv16); } - input_endian = ENDIAN_BIG; - return; + if (iconv == w_iconv16) { + input_endian = ENDIAN_BIG; + return; + } + (*i_ungetc)(0xFF,f); }else (*i_ungetc)(c2,f); (*i_ungetc)(0xFE,f); break; @@ -2464,16 +2491,22 @@ void check_bom(FILE *f) if(!input_f){ set_iconv(TRUE, w_iconv32); } - input_endian = ENDIAN_LITTLE; - return; + if (iconv == w_iconv32) { + input_endian = ENDIAN_LITTLE; + return; + } + (*i_ungetc)(0x00,f); }else (*i_ungetc)(c2,f); (*i_ungetc)(0x00,f); }else (*i_ungetc)(c2,f); if(!input_f){ set_iconv(TRUE, w_iconv16); } - input_endian = ENDIAN_LITTLE; - return; + if (iconv == w_iconv16) { + input_endian = ENDIAN_LITTLE; + return; + } + (*i_ungetc)(0xFE,f); }else (*i_ungetc)(c2,f); (*i_ungetc)(0xFF,f); break; @@ -2557,21 +2590,21 @@ nkf_char kanji_convert(FILE *f) c0 <<= 8; if ((c3 = (*i_getc)(f)) != EOF) { c0 |= c3; - } else c1 = EOF; - } else c1 = EOF; + } else c2 = EOF; + } else c2 = EOF; } - } + } else c2 = EOF; } else { if ((c2 = (*i_getc)(f)) != EOF) { if (0xD8 <= c2 && c2 <= 0xDB) { if ((c3 = (*i_getc)(f)) != EOF) { - c3 <<= 8; if ((c0 = (*i_getc)(f)) != EOF) { + c0 <<= 8; c0 |= c3; - } else c1 = EOF; - } else c1 = EOF; + } else c2 = EOF; + } else c2 = EOF; 
} - } else c1 = EOF; + } else c2 = EOF; } SEND; } else if(iconv == w_iconv32){ @@ -2595,7 +2628,7 @@ nkf_char kanji_convert(FILE *f) } c2 = 0; }else{ - c1 = EOF; + c2 = EOF; } SEND; } else diff --git a/ext/nkf/nkf.c b/ext/nkf/nkf.c index 8a4bcfce64..e12df16960 100644 --- a/ext/nkf/nkf.c +++ b/ext/nkf/nkf.c @@ -306,6 +306,8 @@ rb_nkf_guess1(VALUE obj, VALUE src) * "UTF-8" * when NKF::UTF16 * "UTF-16" + * when NKF::UTF32 + * "UTF-32" * when NKF::UNKNOWN * "UNKNOWN" * when NKF::BINARY @@ -345,6 +347,8 @@ rb_nkf_guess2(VALUE obj, VALUE src) code = _UTF8; } else if (strcmp(input_codename, "UTF-16") == 0) { code = _UTF16; + } else if (strcmp(input_codename, "UTF-32") == 0) { + code = _UTF32; } else if (strlen(input_codename) > 0) { code = _UNKNOWN; } @@ -382,16 +386,16 @@ rb_nkf_guess2(VALUE obj, VALUE src) * * Output is buffered (DEFAULT), Output is unbuffered. * - * === -j -s -e -w -w16 + * === -j -s -e -w -w16 -w32 * * Output code is ISO-2022-JP (7bit JIS), Shift_JIS, EUC-JP, - * UTF-8N, UTF-16BE. + * UTF-8N, UTF-16BE, UTF-32BE. * Without this option and compile option, ISO-2022-JP is assumed. * - * === -J -S -E -W -W16 + * === -J -S -E -W -W16 -W32 * * Input assumption is JIS 7 bit, Shift_JIS, EUC-JP, - * UTF-8, UTF-16LE. + * UTF-8, UTF-16, UTF-32. * * ==== -J * @@ -574,6 +578,16 @@ rb_nkf_guess2(VALUE obj, VALUE src) * * [UTF-16LE-BOM] UTF-16 Little Endian with BOM * + * [UTF-32] same as UTF-32BE + * + * [UTF-32BE] UTF-32 Big Endian without BOM + * + * [UTF-32BE-BOM] UTF-32 Big Endian with BOM + * + * [UTF-32LE] UTF-32 Little Endian without BOM + * + * [UTF-32LE-BOM] UTF-32 Little Endian with BOM + * * [UTF8-MAC] NKDed UTF-8, a.k.a. UTF8-NFD (input only) * * === --fb-{skip, html, xml, perl, java, subchar} @@ -587,10 +601,20 @@ rb_nkf_guess2(VALUE obj, VALUE src) * nkf adds a specified escape character to specified 2nd byte of Shift_JIS characters. * 1st byte of argument is the escape character and following bytes are target characters. 
* - * === --disable-cp932ext + * === --no-cp932ext * * Handle the characters extended in CP932 as unassigned characters. * + * === --no-best-fit-chars + * + * When Unicode to Encoded byte conversion, + * don't convert characters which are not round-trip safe. + * When Unicode to Unicode conversion, + * with this and -x option, nkf can be used as UTF converter. + * (In other words, without this and -x option, nkf doesn't save some characters) + * + * When nkf converts a string related to a path, you should use this option. + * * === --cap-input * * Decode hex encoded characters. |