diff options
Diffstat (limited to 'ext/nkf/nkf-utf8/nkf.c')
-rw-r--r-- | ext/nkf/nkf-utf8/nkf.c | 90 |
1 files changed, 71 insertions, 19 deletions
diff --git a/ext/nkf/nkf-utf8/nkf.c b/ext/nkf/nkf-utf8/nkf.c index aa7c459b83..458fbbc0e8 100644 --- a/ext/nkf/nkf-utf8/nkf.c +++ b/ext/nkf/nkf-utf8/nkf.c @@ -46,7 +46,7 @@ static char *CopyRight = static char *Version = "2.0"; static char *Patchlevel = - "4/0401/Shinji Kono"; + "4/0410/Shinji Kono"; /* ** @@ -198,7 +198,7 @@ static char *Patchlevel = #define UTF8 12 #define UTF8_INPUT 13 -#define UTF16_INPUT 14 +#define UTF16LE_INPUT 14 #define UTF16BE_INPUT 15 #define WISH_TRUE 15 @@ -376,8 +376,9 @@ static int x0201_f = NO_X0201; /* Assume NO JISX0201 */ #endif static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */ #ifdef UTF8_OUTPUT_ENABLE -static int w_oconv16_begin_f= 0; /* utf-16 header */ +static int unicode_bom_f= 0; /* Output Unicode BOM */ static int w_oconv16_LE = 0; /* utf-16 little endian */ +static int ms_ucs_map_f = FALSE; /* Microsoft UCS Mapping Compatible */ #endif @@ -443,7 +444,7 @@ STATIC void s_status PROTO((struct input_code *, int)); #ifdef UTF8_INPUT_ENABLE STATIC void w_status PROTO((struct input_code *, int)); STATIC void w16_status PROTO((struct input_code *, int)); -static int utf16_mode = UTF16_INPUT; +static int utf16_mode = UTF16LE_INPUT; #endif struct input_code input_code_list[] = { @@ -892,6 +893,7 @@ struct { #ifdef UTF8_OUTPUT_ENABLE {"utf8", "w"}, {"utf16", "w16"}, + {"ms-ucs-map", ""}, #endif #ifdef UTF8_INPUT_ENABLE {"utf8-input", "W"}, @@ -1008,6 +1010,12 @@ options(cp) return; } #endif +#ifdef UTF8_OUTPUT_ENABLE + if (strcmp(long_option[i].name, "ms-ucs-map") == 0){ + ms_ucs_map_f = TRUE; + continue; + } +#endif if (strcmp(long_option[i].name, "prefix=") == 0){ if (*p == '=' && ' ' < p[1] && p[1] < 128){ for (i = 2; ' ' < p[i] && p[i] < 128; i++){ @@ -1082,17 +1090,23 @@ options(cp) if ('1'== cp[0] && '6'==cp[1]) { output_conv = w_oconv16; cp+=2; if (cp[0]=='L') { - w_oconv16_begin_f=2; cp++; + unicode_bom_f=2; cp++; w_oconv16_LE = 1; if (cp[0] == '0'){ - w_oconv16_begin_f=1; cp++; + unicode_bom_f=1; cp++; } } else if (cp[0] == 'B') { - w_oconv16_begin_f=2; cp++; + unicode_bom_f=2; cp++; if (cp[0] == '0'){ - w_oconv16_begin_f=1; cp++; + unicode_bom_f=1; cp++; } - } + } + } else if (cp[0] == '8') { + output_conv = w_oconv; cp++; + unicode_bom_f=2; + if (cp[0] == '0'){ + unicode_bom_f=1; cp++; + } } else output_conv = w_oconv; continue; @@ -1100,7 +1114,16 @@ options(cp) #ifdef UTF8_INPUT_ENABLE case 'W': /* UTF-8 input */ if ('1'== cp[0] && '6'==cp[1]) { - input_f = UTF16_INPUT; + input_f = UTF16LE_INPUT; + if (cp[0]=='L') { + cp++; + } else if (cp[0] == 'B') { + cp++; + input_f = UTF16BE_INPUT; + } + } else if (cp[0] == '8') { + cp++; + input_f = UTF8_INPUT; } else input_f = UTF8_INPUT; continue; @@ -1760,7 +1783,7 @@ module_connection() #ifdef UTF8_INPUT_ENABLE } else if (input_f == UTF8_INPUT) { set_iconv(-TRUE, w_iconv); - } else if (input_f == UTF16_INPUT) { + } else if (input_f == UTF16LE_INPUT) { set_iconv(-TRUE, w_iconv16); #endif } else { @@ -2364,7 +2387,7 @@ w_iconv16(c2, c1, c0) int ret; if (c2==0376 && c1==0377){ - utf16_mode = UTF16_INPUT; + utf16_mode = UTF16LE_INPUT; return 0; } else if (c2==0377 && c1==0376){ utf16_mode = UTF16BE_INPUT; @@ -2424,6 +2447,7 @@ e2w_conv(c2, c1) { extern unsigned short euc_to_utf8_1byte[]; extern unsigned short * euc_to_utf8_2bytes[]; + extern unsigned short * euc_to_utf8_2bytes_ms[]; unsigned short *p; if (c2 == X0201) { @@ -2432,7 +2456,7 @@ e2w_conv(c2, c1) c2 &= 0x7f; c2 = (c2&0x7f) - 0x21; if (0<=c2 && c2<sizeof_euc_to_utf8_2bytes) - p = euc_to_utf8_2bytes[c2]; + p = ms_ucs_map_f ? euc_to_utf8_2bytes_ms[c2] : euc_to_utf8_2bytes[c2]; else return 0; } @@ -2462,7 +2486,16 @@ w_oconv(c2, c1) if (c2 == EOF) { (*o_putc)(EOF); return; - } else if (c2 == 0) { + } + + if (unicode_bom_f==2) { + (*o_putc)('\357'); + (*o_putc)('\273'); + (*o_putc)('\277'); + unicode_bom_f=1; + } + + if (c2 == 0) { output_mode = ASCII; (*o_putc)(c1); } else if (c2 == ISO8859_1) { @@ -2489,7 +2522,7 @@ w_oconv16(c2, c1) return; } - if (w_oconv16_begin_f==2) { + if (unicode_bom_f==2) { if (w_oconv16_LE){ (*o_putc)((unsigned char)'\377'); (*o_putc)('\376'); @@ -2497,7 +2530,7 @@ w_oconv16(c2, c1) (*o_putc)('\376'); (*o_putc)((unsigned char)'\377'); } - w_oconv16_begin_f=1; + unicode_bom_f=1; } if (c2 == ISO8859_1) { @@ -3930,8 +3963,8 @@ reinit() } } #ifdef UTF8_OUTPUT_ENABLE - if (w_oconv16_begin_f) { - w_oconv16_begin_f = 2; + if (unicode_bom_f) { + unicode_bom_f = 2; } #endif f_line = 0; @@ -3990,7 +4023,13 @@ usage() #ifdef DEFAULT_CODE_UTF8 fprintf(stderr,"j,s,e,w Outout code is JIS 7 bit, Shift JIS, AT&T JIS (EUC), UTF-8 (DEFAULT)\n"); #endif +#ifdef UTF8_OUTPUT_ENABLE + fprintf(stderr," After 'w' you can add more options. (80?|16((B|L)0?)?) \n"); +#endif fprintf(stderr,"J,S,E,W Input assumption is JIS 7 bit , Shift JIS, AT&T JIS (EUC), UTF-8\n"); +#ifdef UTF8_INPUT_ENABLE + fprintf(stderr," After 'W' you can add more options. (8|16(B|L)?) \n"); +#endif fprintf(stderr,"t no conversion\n"); fprintf(stderr,"i_/o_ Output sequence to designate JIS-kanji/ASCII (DEFAULT B)\n"); fprintf(stderr,"r {de/en}crypt ROT13/47\n"); @@ -4012,8 +4051,21 @@ usage() fprintf(stderr,"I Convert non ISO-2022-JP charactor to GETA\n"); fprintf(stderr,"-L[uwm] line mode u:LF w:CRLF m:CR (DEFAULT noconversion)\n"); fprintf(stderr,"long name options\n"); - fprintf(stderr," --fj,--unix,--mac,--windows convert for the system\n"); + fprintf(stderr," --fj,--unix,--mac,--windows convert for the system\n"); fprintf(stderr," --jis,--euc,--sjis,--utf8,--utf16,--mime,--base64 convert for the code\n"); + fprintf(stderr," --hiragana, --katakana Hiragana/Katakana Conversion\n"); +#ifdef INPUT_OPTION + fprintf(stderr," --cap-input, --url-input Convert hex after ':' or '%'\n"); +#endif +#ifdef NUMCHAR_OPTION + fprintf(stderr," --numchar-input Convert Unicode Character Reference\n"); +#endif +#ifdef SHIFTJIS_CP932 + fprintf(stderr," --no-cp932 Don't convert Shift_JIS FAxx-FCxx to equivalnet CP932\n"); +#endif +#ifdef UTF8_OUTPUT_ENABLE + fprintf(stderr," --ms-ucs-map Microsoft UCS Mapping Compatible\n"); +#endif #ifdef OVERWRITE fprintf(stderr," --overwrite Overwrite original listed files by filtered result\n"); #endif |