From 0a64817fb80016030c03518fb9459f63c11605ea Mon Sep 17 00:00:00 2001 From: matz Date: Fri, 13 Aug 1999 05:37:52 +0000 Subject: remove marshal/gtk/kconv git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@518 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ext/nkf/MANIFEST | 7 + ext/nkf/depend | 1 + ext/nkf/extconf.rb | 2 + ext/nkf/lib/kconv.rb | 58 ++ ext/nkf/nkf.c | 206 ++++++ ext/nkf/nkf1.7/nkf.c | 1899 ++++++++++++++++++++++++++++++++++++++++++++++++++ ext/nkf/test.rb | 318 +++++++++ 7 files changed, 2491 insertions(+) create mode 100644 ext/nkf/MANIFEST create mode 100644 ext/nkf/depend create mode 100644 ext/nkf/extconf.rb create mode 100644 ext/nkf/lib/kconv.rb create mode 100644 ext/nkf/nkf.c create mode 100644 ext/nkf/nkf1.7/nkf.c create mode 100644 ext/nkf/test.rb (limited to 'ext/nkf') diff --git a/ext/nkf/MANIFEST b/ext/nkf/MANIFEST new file mode 100644 index 0000000000..5114a3762a --- /dev/null +++ b/ext/nkf/MANIFEST @@ -0,0 +1,7 @@ +MANIFEST +depend +extconf.rb +lib/kconv.rb +nkf.c +nkf1.7/nkf.c +test.rb diff --git a/ext/nkf/depend b/ext/nkf/depend new file mode 100644 index 0000000000..645bc869c8 --- /dev/null +++ b/ext/nkf/depend @@ -0,0 +1 @@ +nkf.o : nkf.c $(hdrdir)/ruby.h $(topdir)/config.h $(hdrdir)/defines.h nkf1.7/nkf.c diff --git a/ext/nkf/extconf.rb b/ext/nkf/extconf.rb new file mode 100644 index 0000000000..710662f19c --- /dev/null +++ b/ext/nkf/extconf.rb @@ -0,0 +1,2 @@ +require 'mkmf' +create_makefile('nkf') diff --git a/ext/nkf/lib/kconv.rb b/ext/nkf/lib/kconv.rb new file mode 100644 index 0000000000..bfd276330d --- /dev/null +++ b/ext/nkf/lib/kconv.rb @@ -0,0 +1,58 @@ +require 'nkf' + +module Kconv + AUTO = NKF::AUTO + JIS = NKF::JIS + EUC = NKF::EUC + SJIS = NKF::SJIS + BINARY = NKF::BINARY + NOCONV = NKF::NOCONV + UNKNOWN = NKF::UNKNOWN + def kconv(str, out_code, in_code = AUTO) + opt = '-' + case in_code + when NKF::JIS + opt << 'J' + when NKF::EUC + opt << 'E' + when NKF::SJIS + opt << 'S' + end + + case out_code + when NKF::JIS + opt << 
'j' + when NKF::EUC + opt << 'e' + when NKF::SJIS + opt << 's' + when NKF::NOCONV + return str + end + + opt = '' if opt == '-' + + NKF::nkf(opt, str) + end + module_function :kconv + + def tojis(str) + NKF::nkf('-j', str) + end + module_function :tojis + + def toeuc(str) + NKF::nkf('-e', str) + end + module_function :toeuc + + def tosjis(str) + NKF::nkf('-s', str) + end + module_function :tosjis + + def guess(str) + NKF::guess(str) + end + module_function :guess +end diff --git a/ext/nkf/nkf.c b/ext/nkf/nkf.c new file mode 100644 index 0000000000..35d9295a74 --- /dev/null +++ b/ext/nkf/nkf.c @@ -0,0 +1,206 @@ +#include "ruby.h" + +#define _AUTO 0 +#define _JIS 1 +#define _EUC 2 +#define _SJIS 3 +#define _BINARY 4 +#define _NOCONV 4 +#define _UNKNOWN _AUTO + +#undef getc +#undef ungetc +#define getc(f) (input_ctr>i_len?-1:input[input_ctr++]) +#define ungetc(c,f) input_ctr-- + +#undef putchar +#define putchar(c) rb_nkf_putchar(c) + +#define INCSIZE 32 +static int incsize; + +static unsigned char *input, *output; +static int input_ctr, i_len; +static int output_ctr, o_len; + +static VALUE dst; + +static int +rb_nkf_putchar(c) + unsigned int c; +{ + if (output_ctr >= o_len) { + o_len += incsize; + rb_str_cat(dst, "", incsize); + incsize *= 2; + } + + output[output_ctr++] = c; +/* +printf("[[%c][%c][%d]]\n", c, output[output_ctr - 1], output_ctr); +*/ + return c; +} + +#define PERL_XS 1 +#include "nkf1.7/nkf.c" + +static VALUE +rb_nkf_kconv(obj, opt, src) + VALUE obj, opt, src; +{ + int i; + char *opt_ptr, *opt_end; + + reinit(); + opt_ptr = str2cstr(opt, &i); + opt_end = opt_ptr + i; + for (; opt_ptr < opt_end; opt_ptr++) { + if (*opt_ptr != '-') { + continue; + } + arguments(opt_ptr); + } + + incsize = INCSIZE; + + input_ctr = 0; + input = str2cstr(src, &i_len); + dst = rb_str_new(0, i_len*3 + 10); /* large enough? 
*/ + + output_ctr = 0; + output = RSTRING(dst)->ptr; + o_len = RSTRING(dst)->len; + *output = '\0'; + + if(iso8859_f && (oconv != j_oconv || !x0201_f )) { + iso8859_f = FALSE; + } + + kanji_convert(NULL); + if (output_ctr > 0) output_ctr--; + if (output[output_ctr] == '\0') { +/* +printf("([%c][%d])\n", output[output_ctr], output_ctr); +*/ + RSTRING(dst)->len = output_ctr; + } else { +/* +printf("<[%c][%d]>\n", output[output_ctr], output_ctr); +*/ + RSTRING(dst)->len = output_ctr + 1; + } + + return dst; +} + +/* + * Character code detection - Algorithm described in: + * Ken Lunde. `Understanding Japanese Information Processing' + * Sebastopol, CA: O'Reilly & Associates. + */ + +static VALUE +rb_nkf_guess(obj, src) + VALUE obj, src; +{ + unsigned char *p; + unsigned char *pend; + int plen; + int sequence_counter = 0; + + Check_Type(src, T_STRING); + + p = str2cstr(src, &plen); + pend = p + plen; + +#define INCR do {\ + p++;\ + if (p==pend) return INT2FIX(_UNKNOWN);\ + sequence_counter++;\ + if (sequence_counter % 2 == 1 && *p != 0xa4)\ + sequence_counter = 0;\ + if (6 <= sequence_counter) {\ + sequence_counter = 0;\ + return INT2FIX(_EUC);\ + }\ +} while (0) + + if (*p == 0xa4) + sequence_counter = 1; + + while (p= 0x40) { + if (*p >= 0x81) { + if (*p <= 0x8d || (0x8f <= *p && *p <= 0x9f)) { + return INT2FIX(_SJIS); + } + else if (0xfd <= *p && *p <= 0xfe) { + return INT2FIX(_EUC); + } + } + INCR; + } + } + else if (*p <= 0x9f) { + return INT2FIX(_SJIS); + } + } + else if (0xf0 <= *p && *p <= 0xfe) { + return INT2FIX(_EUC); + } + else if (0xe0 <= *p && *p <= 0xef) { + INCR; + if ((0x40 <= *p && *p <= 0x7e) || + (0x80 <= *p && *p <= 0xa0)) { + return INT2FIX(_SJIS); + } + if (0xfd <= *p && *p <= 0xfe) { + return INT2FIX(_EUC); + } + } + INCR; + } + return INT2FIX(_UNKNOWN); +} + +void +Init_nkf() +{ + VALUE mKconv = rb_define_module("NKF"); + + rb_define_module_function(mKconv, "nkf", rb_nkf_kconv, 2); + rb_define_module_function(mKconv, "guess", rb_nkf_guess, 1); + 
+ rb_define_const(mKconv, "AUTO", INT2FIX(_AUTO)); + rb_define_const(mKconv, "JIS", INT2FIX(_JIS)); + rb_define_const(mKconv, "EUC", INT2FIX(_EUC)); + rb_define_const(mKconv, "SJIS", INT2FIX(_SJIS)); + rb_define_const(mKconv, "BINARY", INT2FIX(_BINARY)); + rb_define_const(mKconv, "NOCONV", INT2FIX(_NOCONV)); + rb_define_const(mKconv, "UNKNOWN", INT2FIX(_UNKNOWN)); +} diff --git a/ext/nkf/nkf1.7/nkf.c b/ext/nkf/nkf1.7/nkf.c new file mode 100644 index 0000000000..26ef657021 --- /dev/null +++ b/ext/nkf/nkf1.7/nkf.c @@ -0,0 +1,1899 @@ +/** Network Kanji Filter. (PDS Version) +************************************************************************ +** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA) +** $BO"Mm@h!'(B $B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j(B +** $B!J(BE-Mail Address: ichikawa@flab.fujitsu.co.jp$B!K(B +** Copyright (C) 1996,1998 +** $BO"Mm@h!'(B $BN05eBg3X>pJs9)3X2J(B $B2OLn(B $B??<#(B mine/X0208 support +** $B!J(BE-Mail Address: kono@ie.u-ryukyu.ac.jp$B!K(B +** $BO"Mm@h!'(B COW for DOS & Win16 & Win32 & OS/2 +** $B!J(BE-Mail Address: GHG00637@niftyserve.or.p$B!K(B +** $B$3$N%=!<%9$N$$$+$J$kJ#Z$b$7$J$$!"0-$7$+$i$:!#(B +** Everyone is permitted to do anything on this program +** including copying, modifying, improving. +** as long as you don't try to pretend that you wrote it. +** i.e., the above copyright notice has to appear in all copies. +** You don't have to ask before copying or publishing. +** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE. +***********************************************************************/ + +static char *CopyRight = + "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),1998 S. 
Kono, COW"; +static char *Version = + "1.7"; +static char *Patchlevel = + "0/9711/Shinji Kono"; + +/* +** +** +** +** USAGE: nkf [flags] [file] +** +** Flags: +** b Output is bufferred (DEFAULT) +** u Output is unbufferred +** +** t no operation +** +** j Outout code is JIS 7 bit (DEFAULT SELECT) +** s Output code is MS Kanji (DEFAULT SELECT) +** e Output code is AT&T JIS (DEFAULT SELECT) +** l Output code is JIS 7bit and ISO8859-1 Latin-1 +** +** m MIME conversion for ISO-2022-JP +** i_ Output sequence to designate JIS-kanji (DEFAULT_J) +** o_ Output sequence to designate single-byte roman characters (DEFAULT_R) +** +** r {de/en}crypt ROT13/47 +** +** v display Version +** +** T Text mode output (for MS-DOS) +** +** x Do not convert X0201 kana into X0208 +** Z Convert X0208 alphabet to ASCII +** +** f60 fold option +** +** m MIME decode +** B try to fix broken JIS, missing Escape +** B[1-9] broken level +** +** O Output to 'nkf.out' file +** d Delete \r in line feed +** c Add \r in line feed +**/ +/******************************/ +/* $B%G%U%)%k%H$N=PNO%3!<%IA*Br(B */ +/* Select DEFAULT_CODE */ +#define DEFAULT_CODE_JIS +/* #define DEFAULT_CODE_SJIS */ +/* #define DEFAULT_CODE_EUC */ +/******************************/ + +#if (defined(__TURBOC__) || defined(LSI_C)) && !defined(MSDOS) +#define MSDOS +#endif + +#ifndef PERL_XS +#include +#endif + +#if defined(MSDOS) || defined(__OS2__) +#include +#include +#include +#endif + +#ifdef MSDOS +#ifdef LSI_C +#define setbinmode(fp) fsetbin(fp) +#else /* Microsoft C, Turbo C */ +#define setbinmode(fp) setmode(fileno(fp), O_BINARY) +#endif +#else /* UNIX,OS/2 */ +#define setbinmode(fp) +#endif + +#ifdef _IOFBF /* SysV and MSDOS */ +#define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size) +#else /* BSD */ +#define setvbuffer(fp, buf, size) setbuffer(fp, buf, size) +#endif + +/*Borland C++ 4.5 EasyWin*/ +#if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */ +#define EASYWIN +#include +#endif + 
+#define FALSE 0 +#define TRUE 1 + +/* values held by output_mode and input_mode; JIS_INPUT, SJIS_INPUT and LATIN1_INPUT are also the values assigned to input_f by arguments() */ + +#define ASCII 0 +#define X0208 1 +#define X0201 2 +#define NO_X0201 3 +#define JIS_INPUT 4 +#define SJIS_INPUT 5 +#define LATIN1_INPUT 6 +#define FIXED_MIME 7 +#define DOUBLE_SPACE -2 + +#define NL 0x0a +#define ESC 0x1b +#define SPACE 0x20 +#define AT 0x40 +#define SSP 0xa0 +#define DEL 0x7f +#define SI 0x0f +#define SO 0x0e +#define SSO 0x8e + +#define HOLD_SIZE 32 +#define IOBUF_SIZE 16384 + +#define DEFAULT_J 'B' /* initial value of kanji_intro */ +#define DEFAULT_R 'B' /* initial value of ascii_intro */ + +#define SJ0162 0x00e1 /* 01 - 62 ku offset */ +#define SJ6394 0x0161 /* 63 - 94 ku offset */ + + +/* MIME preprocessor: GETC/UNGETC use plain getc/ungetc while mime_mode is zero and go through mime_getc/mime_ungetc once it is set */ + +#undef STRICT_MIME /* do stupid strict mime integrity check */ +#define GETC(p) ((!mime_mode)?getc(p):mime_getc(p)) +#define UNGETC(c,p) ((!mime_mode)?ungetc(c,p):mime_ungetc(c)) + + +#ifdef EASYWIN /*Easy Win */ +extern POINT _BufferSize; +#endif + +/* function prototypes; _(args) keeps the parameter list when __STDC__ is defined and expands to an empty () otherwise */ + +#ifndef _ +# ifdef __STDC__ +# define _(args) args +# else +# define _(args) () +# endif +#endif + +#ifndef PERL_XS +static void noconvert _((FILE *f)); +static int mime_integrity _((FILE *f,unsigned char *p)); +static int usage _((void)); +static char stdibuf[IOBUF_SIZE]; +static char stdobuf[IOBUF_SIZE]; +static unsigned int mime_input = 0; /* undecoded */ +static int end_check; +#endif + +static void kanji_convert _((FILE *f)); +static void h_conv _((FILE *f,int c2,int c1)); +static int push_hold_buf _((int c2,int c1)); +static void s_iconv _((int c2,int c1)); +static void e_oconv _((int c2,int c1)); +static void s_oconv _((int c2,int c1)); +static void j_oconv _((int c2,int c1)); +static int line_fold _((int c2,int c1)); +static int pre_convert _((int c1,int c2)); +static int mime_begin _((FILE *f)); +static int mime_getc _((FILE *f)); +static int mime_ungetc _((unsigned int c)); +static int base64decode _((int c)); +static void arguments _((char *c)); +static void reinit _((void)); + +/* buffers: hold_buf is filled two bytes (c2, c1) at a time by push_hold_buf, hence HOLD_SIZE*2 */ + +static unsigned char hold_buf[HOLD_SIZE*2]; 
+static int hold_count; + +/* MIME preprocessor fifo */ + +#define MIME_BUF_SIZE (1024) /* 2^n ring buffer */ +#define MIME_BUF_MASK (MIME_BUF_SIZE-1) +#define Fifo(n) mime_buf[(n)&MIME_BUF_MASK] +static unsigned char mime_buf[MIME_BUF_SIZE]; +static unsigned int mime_top = 0; +static unsigned int mime_last = 0; /* decoded */ + +/* flags */ +static int unbuf_f = FALSE; +static int estab_f = FALSE; +static int nop_f = FALSE; +static int binmode_f = TRUE; /* binary mode */ +static int rot_f = FALSE; /* rot14/43 mode */ +static int input_f = FALSE; /* non fixed input code */ +static int alpha_f = FALSE; /* convert JIx0208 alphbet to ASCII */ +static int mime_f = TRUE; /* convert MIME B base64 or Q */ +static int mimebuf_f = FALSE; /* MIME buffered input */ +static int broken_f = FALSE; /* convert ESC-less broken JIS */ +static int iso8859_f = FALSE; /* ISO8859 through */ +#if defined(MSDOS) || defined(__OS2__) +static int x0201_f = TRUE; /* Assume JISX0201 kana */ +#else +static int x0201_f = NO_X0201; /* Assume NO JISX0201 */ +#endif + +/* X0208 -> ASCII converter */ + +static int c1_return; + +/* fold parameter */ +static int line = 0; /* chars in line */ +static int prev = 0; +static int fold_f = FALSE; +static int fold_len = 0; + +/* options */ +static char kanji_intro = DEFAULT_J, + ascii_intro = DEFAULT_R; + +/* Folding */ + +int line_fold(); +#define FOLD_MARGIN 10 +#define DEFAULT_FOLD 60 + +/* converters */ + +#ifdef DEFAULT_CODE_JIS +# define DEFAULT_CONV j_oconv +#endif +#ifdef DEFAULT_CODE_SJIS +# define DEFAULT_CONV s_oconv +#endif +#ifdef DEFAULT_CODE_EUC +# define DEFAULT_CONV e_oconv +#endif + +static void (*iconv) _((int c2,int c1)); + /* s_iconv or oconv */ +static void (*oconv) _((int c2,int c1)) = DEFAULT_CONV; + /* [ejs]_oconv */ + +/* Global states */ +static int output_mode = ASCII, /* output kanji mode */ + input_mode = ASCII, /* input kanji mode */ + shift_mode = FALSE; /* TRUE shift out, or X0201 */ +static int mime_mode = FALSE; /* MIME mode 
B base64, Q hex */ + +/* X0201 / X0208 conversion tables */ + +/* X0201 kana conversion table */ +/* 90-9F A0-DF */ +unsigned char cv[]= { +0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57, +0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21, +0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29, +0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43, +0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26, +0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d, +0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35, +0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d, +0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46, +0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c, +0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52, +0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e, +0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62, +0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69, +0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d, +0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c, +0x00,0x00}; + + +/* X0201 kana conversion table for daguten */ +/* 90-9F A0-DF */ +unsigned char dv[]= { +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e, +0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36, +0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e, +0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47, +0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53, +0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00}; + +/* X0201 kana conversion table for han-daguten */ +/* 90-9F A0-DF */ +unsigned char ev[]= { +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 
+0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54, +0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x00,0x00}; + + +/* X0208 kigou conversion table */ +/* 0x8140 - 0x819e */ +unsigned char fv[] = { + +0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a, +0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00, +0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f, +0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27, +0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d, +0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00, +0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40, +0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +} ; + + +static int file_out = FALSE; +static int add_cr = FALSE; +static int del_cr = FALSE; + +#ifndef PERL_XS +int +main(argc, argv) + int argc; + char **argv; +{ + FILE *fin; + char *cp; + +#ifdef EASYWIN /*Easy Win */ + _BufferSize.y = 400;/*Set Scroll Buffer Size*/ +#endif + + for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) { + cp = *argv; + arguments(cp); + } + + if(iso8859_f && (oconv != j_oconv || !x0201_f )) { + fprintf(stderr,"Mixed ISO8859/JISX0201/SJIS/EUC output is not allowed.\n"); + exit(1); + } + + if(binmode_f == TRUE) +#ifdef __OS2__ + if(freopen("","wb",stdout) == NULL) + return (-1); +#else + setbinmode(stdout); +#endif + + if(unbuf_f) + setbuf(stdout, (char *) NULL); + else + setvbuffer(stdout, stdobuf, IOBUF_SIZE); + + if(argc == 0) { + if(binmode_f == TRUE) +#ifdef __OS2__ + if(freopen("","rb",stdin) == NULL) return (-1); +#else + setbinmode(stdin); +#endif + setvbuffer(stdin, stdibuf, IOBUF_SIZE); + if(nop_f) + noconvert(stdin); + else + kanji_convert(stdin); 
+ } else { + while (argc--) { + if((fin = fopen(*argv++, "r")) == NULL) { + perror(*--argv); + return(-1); + } else { +/* reopen file for stdout */ + if(file_out == TRUE){ + if(argc == 1 ) { + if(freopen(*argv++, "w", stdout) == NULL) { + perror(*--argv); + return (-1); + } + argc--; + } else { + if(freopen("nkf.out", "w", stdout) == NULL) { + perror(*--argv); + return (-1); + } + } + if(binmode_f == TRUE) { +#ifdef __OS2__ + if(freopen("","wb",stdout) == NULL) + return (-1); +#else + setbinmode(stdout); +#endif + } + } + if(binmode_f == TRUE) +#ifdef __OS2__ + if(freopen("","rb",fin) == NULL) + return (-1); +#else + setbinmode(fin); +#endif + setvbuffer(fin, stdibuf, IOBUF_SIZE); + if(nop_f) + noconvert(fin); + else + kanji_convert(fin); + fclose(fin); + } + } + } +#ifdef EASYWIN /*Easy Win */ + if(file_out == FALSE) + scanf("%d",&end_check); + else + fclose(stdout); +#else /* for Other OS */ + if(file_out == TRUE) + fclose(stdout); +#endif + return (0); +} +#endif + +static void +arguments(cp) + char *cp; +{ + while (*cp) { + switch (*cp++) { + case 'b': /* buffered mode */ + unbuf_f = FALSE; + continue; + case 'u': /* non bufferd mode */ + unbuf_f = TRUE; + continue; + case 't': /* transparent mode */ + nop_f = TRUE; + continue; + case 'j': /* JIS output */ + case 'n': + oconv = j_oconv; + continue; + case 'e': /* AT&T EUC output */ + oconv = e_oconv; + continue; + case 's': /* SJIS output */ + oconv = s_oconv; + continue; + case 'l': /* ISO8859 Latin-1 support, no conversion */ + iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */ + input_f = LATIN1_INPUT; + continue; + case 'i': /* Kanji IN ESC-$-@/B */ + if(*cp=='@'||*cp=='B') + kanji_intro = *cp++; + continue; + case 'o': /* ASCII IN ESC-(-J/B */ + if(*cp=='J'||*cp=='B'||*cp=='H') + ascii_intro = *cp++; + continue; + case 'r': + rot_f = TRUE; + continue; +#if defined(MSDOS) || defined(__OS2__) + case 'T': + binmode_f = FALSE; + continue; +#endif +#ifndef PERL_XS + case 'v': + usage(); + exit(1); + 
break; +#endif + /* Input code assumption */ + case 'J': /* JIS input */ + case 'E': /* AT&T EUC input */ + input_f = JIS_INPUT; + continue; + case 'S': /* MS Kanji input */ + input_f = SJIS_INPUT; + if(x0201_f==NO_X0201) x0201_f=TRUE; + continue; + case 'Z': /* Convert X0208 alphabet to asii */ + /* bit:0 Convert X0208 + bit:1 Convert Kankaku to one space + bit:2 Convert Kankaku to two spaces + */ + if('9'>= *cp && *cp>='0') + alpha_f |= 1<<(*cp++ -'0'); + else + alpha_f |= TRUE; + continue; + case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */ + x0201_f = FALSE; /* No X0201->X0208 conversion */ + /* accept X0201 + ESC-(-I in JIS, EUC, MS Kanji + SI/SO in JIS, EUC, MS Kanji + SSO in EUC, JIS, not in MS Kanji + MS Kanji (0xa0-0xdf) + output X0201 + ESC-(-I in JIS (0x20-0x5f) + SSO in EUC (0xa0-0xdf) + 0xa0-0xd in MS Kanji (0xa0-0xdf) + */ + continue; + case 'X': /* Assume X0201 kana */ + /* Default value is NO_X0201 for EUC/MS-Kanji mix */ + x0201_f = TRUE; + continue; + case 'f': /* folding -f60 or -f */ + fold_f = TRUE; + fold_len = atoi(cp); + if(!(0= *cp && *cp>='0') + broken_f |= 1<<(*cp++ -'0'); + else + broken_f |= TRUE; + continue; +#ifndef PERL_XS + case 'O':/* for Output file */ + file_out = TRUE; + continue; +#endif + case 'c':/* add cr code */ + add_cr = TRUE; + continue; + case 'd':/* delete cr code */ + del_cr = TRUE; + continue; + default: + /* bogus option but ignored */ + continue; + } + } +} + +#ifndef PERL_XS +static void +noconvert(f) + FILE *f; +{ + int c; + + while ((c = getc(f)) != EOF) + putchar(c); +} +#endif + + +static void +kanji_convert(f) + FILE *f; +{ + int c1, c2; + + c2 = 0; + + if(input_f == JIS_INPUT || input_f == LATIN1_INPUT) { + estab_f = TRUE; iconv = oconv; + } else if(input_f == SJIS_INPUT) { + estab_f = TRUE; iconv = s_iconv; + } else { + estab_f = FALSE; iconv = oconv; + } + input_mode = ASCII; + output_mode = ASCII; + shift_mode = FALSE; + +#define NEXT continue /* no output, get next */ +#define SEND ; /* 
output c1 and c2, get next */ +#define LAST break /* end of loop, go closing */ + + while ((c1 = GETC(f)) != EOF) { + if(c2) { + /* second byte */ + if(c2 > DEL) { + /* in case of 8th bit is on */ + if(!estab_f) { + /* in case of not established yet */ + if(c1 > SSP) { + /* It is still ambiguious */ + h_conv(f, c2, c1); + c2 = 0; + NEXT; + } else if(c1 < AT) { + /* ignore bogus code */ + c2 = 0; + NEXT; + } else { + /* established */ + /* it seems to be MS Kanji */ + estab_f = TRUE; + iconv = s_iconv; + SEND; + } + } else + /* in case of already established */ + if(c1 < AT) { + /* ignore bogus code */ + c2 = 0; + NEXT; + } else + SEND; + } else + /* 7 bit code */ + /* it might be kanji shitfted */ + if((c1 == DEL) || (c1 <= SPACE)) { + /* ignore bogus first code */ + c2 = 0; + NEXT; + } else + SEND; + } else { + /* first byte */ + if(c1 > DEL) { + /* 8 bit code */ + if(!estab_f && !iso8859_f) { + /* not established yet */ + if(c1 < SSP) { + /* it seems to be MS Kanji */ + estab_f = TRUE; + iconv = s_iconv; + } else if(c1 < 0xe0) { + /* it seems to be EUC */ + estab_f = TRUE; + iconv = oconv; + } else { + /* still ambiguious */ + } + c2 = c1; + NEXT; + } else { /* estab_f==TRUE */ + if(iso8859_f) { + SEND; + } else if(SSP<=c1 && c1<0xe0 && iconv == s_iconv) { + /* SJIS X0201 Case... */ + /* This is too arrogant, but ... 
*/ + if(x0201_f==NO_X0201) { + iconv = oconv; + c2 = c1; + NEXT; + } else + if(x0201_f) { + if(dv[(c1-SSP)*2]||ev[(c1-SSP)*2]) { + /* look ahead for X0201/X0208conversion */ + if((c2 = GETC(f)) == EOF) { + (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); + LAST; + } else if(c2==(0xde)) { /* $BByE@(B */ + (*oconv)(dv[(c1-SSP)*2],dv[(c1-SSP)*2+1]); + c2=0; + NEXT; + } else if(c2==(0xdf)&&ev[(c1-SSP)*2]) { + /* $BH>ByE@(B */ + (*oconv)(ev[(c1-SSP)*2],ev[(c1-SSP)*2+1]); + c2=0; + NEXT; + } + UNGETC(c2,f); c2 = 0; + } + (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); + NEXT; + } else + SEND; + } else if(c1==SSO && iconv != s_iconv) { + /* EUC X0201 Case */ + /* This is too arrogant + if(x0201_f == NO_X0201) { + estab_f = FALSE; + c2 = 0; + NEXT; + } */ + c1 = GETC(f); /* skip SSO */ + euc_1byte_check: + if(x0201_f && SSP<=c1 && c1<0xe0) { + if(dv[(c1-SSP)*2]||ev[(c1-SSP)*2]) { + if((c2 = GETC(f)) == EOF) { + (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); + LAST; + } + /* forward lookup $BByE@(B/$BH>ByE@(B */ + if(c2 != SSO) { + UNGETC(c2,f); c2 = 0; + (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); + NEXT; + } else if((c2 = GETC(f)) == EOF) { + (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); + (*oconv)(0,SSO); + LAST; + } else if(c2==(0xde)) { /* $BByE@(B */ + (*oconv)(dv[(c1-SSP)*2],dv[(c1-SSP)*2+1]); + c2=0; + NEXT; + } else if(c2==(0xdf)&&ev[(c1-SSP)*2]) { + /* $BH>ByE@(B */ + (*oconv)(ev[(c1-SSP)*2],ev[(c1-SSP)*2+1]); + c2=0; + NEXT; + } else { + (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); + /* we have to check this c2 */ + /* and no way to push back SSO */ + c1 = c2; c2 = 0; + goto euc_1byte_check; + } + } + (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); + NEXT; + } else + SEND; + } else if(c1 < SSP && iconv != s_iconv) { + /* strange code in EUC */ + iconv = s_iconv; /* try SJIS */ + c2 = c1; + NEXT; + } else { + /* already established */ + c2 = c1; + NEXT; + } + } + } else if((c1 > SPACE) && (c1 != DEL)) { + /* in case of Roman characters */ + if(shift_mode) { + c1 |= 0x80; + /* output 1 
shifted byte */ + if(x0201_f && (!iso8859_f||input_mode==X0201) && + SSP<=c1 && c1<0xe0 ) { + if(dv[(c1-SSP)*2]||ev[(c1-SSP)*2]) { + if((c2 = GETC(f)) == EOF) { + (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); + LAST; + } else if(c2==(0xde&0x7f)) { /* $BByE@(B */ + (*oconv)(dv[(c1-SSP)*2],dv[(c1-SSP)*2+1]); + c2=0; + NEXT; + } else if(c2==(0xdf&0x7f)&&ev[(c1-SSP)*2]) { + /* $BH>ByE@(B */ + (*oconv)(ev[(c1-SSP)*2],ev[(c1-SSP)*2+1]); + c2=0; + NEXT; + } + UNGETC(c2,f); c2 = 0; + } + (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); + NEXT; + } else + SEND; + } else if(c1 == '(' && broken_f && input_mode == X0208 + && !mime_mode ) { + /* Try to recover missing escape */ + if((c1 = GETC(f)) == EOF) { + (*oconv)(0, '('); + LAST; + } else { + if(c1 == 'B' || c1 == 'J' || c1 == 'H') { + input_mode = ASCII; shift_mode = FALSE; + NEXT; + } else { + (*oconv)(0, '('); + /* do not modify various input_mode */ + /* It can be vt100 sequence */ + SEND; + } + } + } else if(input_mode == X0208) { + /* in case of Kanji shifted */ + c2 = c1; + NEXT; + /* goto next_byte */ + } else if(c1 == '=' && mime_f && !mime_mode ) { + if((c1 = getc(f)) == EOF) { + (*oconv)(0, '='); + LAST; + } else if(c1 == '?') { + /* =? 
is mime conversiooon start sequence */ + if(mime_begin(f) == EOF) /* check in detail */ + LAST; + else + NEXT; + } else { + (*oconv)(0, '='); + ungetc(c1,f); + NEXT; + } + } else if(c1 == '$' && broken_f && !mime_mode) { + /* try to recover missing escape */ + if((c1 = GETC(f)) == EOF) { + (*oconv)(0, '$'); + LAST; + } else if(c1 == '@'|| c1 == 'B') { + /* in case of Kanji in ESC sequence */ + input_mode = X0208; + shift_mode = FALSE; + NEXT; + } else { + /* sorry */ + (*oconv)(0, '$'); + (*oconv)(0, c1); + NEXT; + } + } else + SEND; + } else if(c1 == SI) { + shift_mode = FALSE; + NEXT; + } else if(c1 == SO) { + shift_mode = TRUE; + NEXT; + } else if(c1 == ESC ) { + if((c1 = GETC(f)) == EOF) { + (*oconv)(0, ESC); + LAST; + } else if(c1 == '$') { + if((c1 = GETC(f)) == EOF) { + (*oconv)(0, ESC); + (*oconv)(0, '$'); + LAST; + } else if(c1 == '@'|| c1 == 'B') { + /* This is kanji introduction */ + input_mode = X0208; + shift_mode = FALSE; + NEXT; + } else if(c1 == '(') { + if((c1 = GETC(f)) == EOF) { + (*oconv)(0, ESC); + (*oconv)(0, '$'); + (*oconv)(0, '('); + LAST; + } else if(c1 == '@'|| c1 == 'B') { + /* This is kanji introduction */ + input_mode = X0208; + shift_mode = FALSE; + NEXT; + } else { + (*oconv)(0, ESC); + (*oconv)(0, '$'); + (*oconv)(0, '('); + (*oconv)(0, c1); + NEXT; + } + } else if(broken_f&0x2) { + input_mode = X0208; + shift_mode = FALSE; + NEXT; + } else { + (*oconv)(0, ESC); + (*oconv)(0, '$'); + (*oconv)(0, c1); + NEXT; + } + } else if(c1 == '(') { + if((c1 = GETC(f)) == EOF) { + (*oconv)(0, ESC); + (*oconv)(0, '('); + LAST; + } else { + if(c1 == 'I') { + /* This is X0201 kana introduction */ + input_mode = X0201; shift_mode = X0201; + NEXT; + } else if(c1 == 'B' || c1 == 'J' || c1 == 'H') { + /* This is X0208 kanji introduction */ + input_mode = ASCII; shift_mode = FALSE; + NEXT; + } else if(broken_f&0x2) { + input_mode = ASCII; shift_mode = FALSE; + NEXT; + } else { + (*oconv)(0, ESC); + (*oconv)(0, '('); + /* maintain various input_mode here 
*/ + SEND; + } + } + } else { + /* lonely ESC */ + (*oconv)(0, ESC); + SEND; + } + } else if(c1 == NL && broken_f&4) { + input_mode = ASCII; + SEND; + } else + SEND; + } + /* send: */ + if(input_mode == X0208) + (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */ + else + (*iconv)(c2, c1); /* can be EUC/SJIS */ + c2 = 0; + continue; + /* goto next_word */ + } + + /* epilogue */ + (*iconv)(EOF, 0); +} + + + + +static void +h_conv(f, c2, c1) + FILE *f; + int c1, c2; +{ + int wc; + + + /** it must NOT be in the kanji shifte sequence */ + /** it must NOT be written in JIS7 */ + /** and it must be after 2 byte 8bit code */ + + hold_count = 0; + push_hold_buf(c2, c1); + c2 = 0; + + while ((c1 = GETC(f)) != EOF) { + if(c2) { + /* second byte */ + if(!estab_f) { + /* not established */ + if(c1 > SSP) { + /* it is still ambiguious yet */ + SEND; + } else if(c1 < AT) { + /* ignore bogus first byte */ + c2 = 0; + SEND; + } else { + /* now established */ + /* it seems to be MS Kanji */ + estab_f = TRUE; + iconv = s_iconv; + SEND; + } + } else + SEND; + } else { + /* First byte */ + if(c1 > DEL) { + /* 8th bit is on */ + if(c1 < SSP) { + /* it seems to be MS Kanji */ + estab_f = TRUE; + iconv = s_iconv; + } else if(c1 < 0xe0) { + /* it seems to be EUC */ + estab_f = TRUE; + iconv = oconv; + } else { + /* still ambiguious */ + } + c2 = c1; + NEXT; + } else + /* 7 bit code , then send without any process */ + SEND; + } + /* send: */ + if((push_hold_buf(c2, c1) == EOF) || estab_f) + break; + c2 = 0; + continue; + } + + /** now, + ** 1) EOF is detected, or + ** 2) Code is established, or + ** 3) Buffer is FULL (but last word is pushed) + ** + ** in 1) and 3) cases, we continue to use + ** Kanji codes by oconv and leave estab_f unchanged. 
+ **/ + + for (wc = 0; wc < hold_count; wc += 2) { + c2 = hold_buf[wc]; + c1 = hold_buf[wc+1]; + (*iconv)(c2, c1); + } + return; +} + + + +static int +push_hold_buf(c2, c1) + int c2, c1; +{ + if(hold_count >= HOLD_SIZE*2) + return (EOF); + hold_buf[hold_count++] = c2; + hold_buf[hold_count++] = c1; + return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count); +} + + +static void +s_iconv(c2, c1) + int c2, + c1; +{ + if((c2 == EOF) || (c2 == 0)) { + /* NOP */ + } else { + c2 = c2 + c2 - ((c2 <= 0x9f) ? SJ0162 : SJ6394); + if(c1 < 0x9f) + c1 = c1 - ((c1 > DEL) ? SPACE : 0x1f); + else { + c1 = c1 - 0x7e; + c2++; + } + } + (*oconv)(c2, c1); +} + + +static void +e_oconv(c2, c1) + int c2, c1; +{ + c2 = pre_convert(c1,c2); c1 = c1_return; + if(fold_f) { + switch(line_fold(c2,c1)) { + case '\n': + if(add_cr == TRUE) { + putchar('\r'); + c1 = '\n'; + } + putchar('\n'); + break; + case 0: return; + case '\r': + c1 = '\n'; c2 = 0; + break; + case '\t': + case ' ': + c1 = ' '; c2 = 0; + break; + } + } + if(c2==DOUBLE_SPACE) { + putchar(' '); putchar(' '); + return; + } + if(c2 == EOF) + return; + else if(c2 == 0 && (c1&0x80)) { + putchar(SSO); putchar(c1); + } else if(c2 == 0) { + if(c1 == '\n' && add_cr == TRUE) + putchar('\r'); + if(c1 != '\r') + putchar(c1); + else if(del_cr == FALSE) + putchar(c1); + } else { + if((c1<0x20 || 0x7e> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1))); + putchar((c1 + ((c2 & 1) ? ((c1 < 0x60) ? 
0x1f : 0x20) : 0x7e))); + } + return; +} + + +static void +j_oconv(c2, c1) + int c2, c1; +{ + c2 = pre_convert(c1,c2); c1 = c1_return; + if(fold_f) { + switch(line_fold(c2,c1)) { + case '\n': + if(output_mode) { + putchar(ESC); + putchar('('); + putchar(ascii_intro); + } + if(add_cr == TRUE) { + putchar('\r'); + c1 = '\n'; + } + putchar('\n'); + output_mode = ASCII; + break; + case '\r': + c1 = '\n'; c2 = 0; + break; + case '\t': + case ' ': + c1 = ' '; c2 = 0; + break; + case 0: return; + } + } + if(c2 == EOF) { + if(output_mode) { + putchar(ESC); + putchar('('); + putchar(ascii_intro); + } + } else if(c2 == 0 && (c1 & 0x80)) { + if(input_mode==X0201 || !iso8859_f) { + if(output_mode!=X0201) { + putchar(ESC); + putchar('('); + putchar('I'); + output_mode = X0201; + } + c1 &= 0x7f; + } else { + /* iso8859 introduction, or 8th bit on */ + /* Can we convert in 7bit form using ESC-'-'-A ? + Is this popular? */ + } + putchar(c1); + } else if(c2 == 0) { + if(output_mode) { + putchar(ESC); + putchar('('); + putchar(ascii_intro); + output_mode = ASCII; + } + if(c1 == '\n' && add_cr == TRUE) + putchar('\r'); + if(c1 != '\r') + putchar(c1); + else if(del_cr == FALSE) + putchar(c1); + } else if(c2 == DOUBLE_SPACE) { + if(output_mode) { + putchar(ESC); + putchar('('); + putchar(ascii_intro); + output_mode = ASCII; + } + putchar(' '); + if(c1 == '\n' && add_cr == TRUE) + putchar('\r'); + if(c1 != '\r') + putchar(c1); + else if(del_cr == FALSE) + putchar(c1); + } else { + if(output_mode != X0208) { + putchar(ESC); + putchar('$'); + putchar(kanji_intro); + output_mode = X0208; + } + if(c1<0x20 || 0x7e0x80 Japanese (X0208/X0201) + <0x80 ASCII + \n new line + ' ' space + + This fold algorthm does not preserve heading space in a line. + This is the main difference from fmt. 
+*/ + +static int +line_fold(c2,c1) +int c2,c1; +{ + int prev0; + if(c1=='\r') + return 0; /* ignore cr */ + if(c1== 8) { + if(line>0) line--; + return 1; + } + if(c2==EOF && line != 0) /* close open last line */ + return '\n'; + /* new line */ + if(c1=='\n') { + if(prev == c1) { /* duplicate newline */ + if(line) { + line = 0; + return '\n'; /* output two newline */ + } else { + line = 0; + return 1; + } + } else { + if(prev&0x80) { /* Japanese? */ + prev = c1; + return 0; /* ignore given single newline */ + } else if(prev==' ') { + return 0; + } else { + prev = c1; + if(++line<=fold_len) + return ' '; + else { + line = 0; + return '\r'; /* fold and output nothing */ + } + } + } + } + if(c1=='\f') { + prev = '\n'; + if(line==0) + return 1; + line = 0; + return '\n'; /* output newline and clear */ + } + /* X0208 kankaku or ascii space */ + if( (c2==0&&c1==' ')|| + (c2==0&&c1=='\t')|| + (c2==DOUBLE_SPACE)|| + (c2=='!'&& c1=='!')) { + if(prev == ' ') { + return 0; /* remove duplicate spaces */ + } + prev = ' '; + if(++line<=fold_len) + return ' '; /* output ASCII space only */ + else { + prev = ' '; line = 0; + return '\r'; /* fold and output nothing */ + } + } + prev0 = prev; /* we still need this one... 
, but almost done */ + prev = c1; + if(c2 || (SSP<=c1 && c1<=0xdf)) + prev |= 0x80; /* this is Japanese */ + line += (c2==0)?1:2; + if(line<=fold_len) { /* normal case */ + return 1; + } + if(line>=fold_len+FOLD_MARGIN) { /* too many kinsou suspension */ + line = (c2==0)?1:2; + return '\n'; /* We can't wait, do fold now */ + } + /* simple kinsoku rules return 1 means no folding */ + if(c2==0) { + if(c1==0xde) return 1; /* $B!+(B*/ + if(c1==0xdf) return 1; /* $B!,(B*/ + if(c1==0xa4) return 1; /* $B!#(B*/ + if(c1==0xa3) return 1; /* $B!$(B*/ + if(c1==0xa1) return 1; /* $B!W(B*/ + if(c1==0xb0) return 1; /* - */ + if(SSP<=c1 && c1<=0xdf) { /* X0201 */ + line = 1; + return '\n';/* add one new line before this character */ + } + /* fold point in ASCII { [ ( */ + if(( c1!=')'&& + c1!=']'&& + c1!='}'&& + c1!='.'&& + c1!=','&& + c1!='!'&& + c1!='?'&& + c1!='/'&& + c1!=':'&& + c1!=';')&& + ((prev0=='\n')|| (prev0==' ')|| /* ignored new line */ + (prev0&0x80)) /* X0208 - ASCII */ + ) { + line = 1; + return '\n';/* add one new line before this character */ + } + return 1; /* default no fold in ASCII */ + } else { + if(c2=='!') { + if(c1=='"') return 1; /* $B!"(B */ + if(c1=='#') return 1; /* $B!#(B */ + if(c1=='$') return 1; /* $B!$(B */ + if(c1=='%') return 1; /* $B!%(B */ + if(c1=='\'') return 1; /* $B!\(B */ + if(c1=='(') return 1; /* $B!((B */ + if(c1==')') return 1; /* $B!)(B */ + if(c1=='*') return 1; /* $B!*(B */ + if(c1=='+') return 1; /* $B!+(B */ + if(c1==',') return 1; /* $B!,(B */ + } + line = 2; + return '\n'; /* add one new line before this character */ + } +} + +static int +pre_convert(c1,c2) + int c1,c2; +{ + if(c2) c1 &= 0x7f; + c1_return = c1; + if(c2==EOF) return c2; + c2 &= 0x7f; + if(rot_f) { + if(c2) { + c1 = rot47(c1); + c2 = rot47(c2); + } else { + if(!(c1 & 0x80)) + c1 = rot13(c1); + } + c1_return = c1; + } + /* JISX0208 Alphabet */ + if(alpha_f && c2 == 0x23 ) return 0; + /* JISX0208 Kigou */ + if(alpha_f && c2 == 0x21 ) { + if(0x21==c1) { + 
if(alpha_f&0x2) { + c1_return = ' '; + return 0; + } else if(alpha_f&0x4) { + c1_return = ' '; + return DOUBLE_SPACE; + } else { + return c2; + } + } else if(0x20' ';i++) { /* start at =? */ + if( ((((r[i] = c1 = getc(f))==EOF) || nkf_toupper(c1) != p[i] ) { + /* pattern fails, try next one */ + q = p; + while (p = mime_pattern[++j]) { + for(k=2;k i */ + if(p[k]!=q[k]) break; + if(k==i && nkf_toupper(c1)==p[k]) break; + } + if(p) continue; /* found next one, continue */ + /* all fails, output from recovery buffer */ + ungetc(c1,f); + for(j=0;j> 4) & 0x03); + if(c2 != '=') { + Fifo(mime_last++) = cc; + cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f); + if(c3 != '=') { + Fifo(mime_last++) = cc; + cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f); + if(c4 != '=') + Fifo(mime_last++) = cc; + } + } else { + return c1; + } + return Fifo(mime_top++); +} + +static int +mime_ungetc(c) + unsigned int c; +{ + Fifo(mime_last++) = c; + return c; +} + +#ifdef STRICT_MIME +int +mime_integrity(f,p) + FILE *f; + unsigned char *p; +{ + int c,d; + unsigned int q; + /* In buffered mode, read until =? or NL or buffer fffull + */ + mime_input = mime_top; + mime_last = mime_top; + while(*p) Fifo(mime_input++) = *p++; + d = 0; + q = mime_input; + while((c=getc(f))!=EOF) { + if(((mime_input-mime_top)&MIME_BUF_MASK)==0) break; + if(c=='=' && d=='?') { + /* checked. skip header, start decode */ + Fifo(mime_input++) = c; + mime_input = q; + return 1; + } + if(!( (c=='+'||c=='/'|| c=='=' || c=='?' || + ('a'<=c && c<='z')||('A'<= c && c<='Z')||('0'<=c && c<='9')))) + break; + /* Should we check length mod 4? 
*/ + Fifo(mime_input++) = c; + d=c; + } + /* In case of Incomplete MIME, no MIME decode */ + Fifo(mime_input++) = c; + mime_last = mime_input; /* point undecoded buffer */ + mime_mode = 1; /* no decode on Fifo last in mime_getc */ + return 1; +} +#endif + +static int +base64decode(c) + int c; +{ + int i; + if(c > '@') + if(c < '[') + i = c - 'A'; /* A..Z 0-25 */ + else + i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */ + else if(c > '/') + i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */ + else if(c == '+') + i = '>' /* 62 */ ; /* + 62 */ + else + i = '?' /* 63 */ ; /* / 63 */ + return (i); +} + +static void +reinit() +{ + unbuf_f = FALSE; + estab_f = FALSE; + nop_f = FALSE; + binmode_f = TRUE; + rot_f = FALSE; + input_f = FALSE; + alpha_f = FALSE; + mime_f = TRUE; + mimebuf_f = FALSE; + broken_f = FALSE; + iso8859_f = FALSE; + x0201_f = TRUE; + x0201_f = NO_X0201; + fold_f = FALSE; + kanji_intro = DEFAULT_J; + ascii_intro = DEFAULT_R; + oconv = DEFAULT_CONV; + output_mode = ASCII; + input_mode = ASCII; + shift_mode = FALSE; + mime_mode = FALSE; + file_out = FALSE; + add_cr = FALSE; + del_cr = FALSE; +} + +#ifndef PERL_XS +int +usage() +{ + fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. 
[out file for -O flag]\n"); + fprintf(stderr,"Flags:\n"); + fprintf(stderr,"b,u Output is bufferred (DEFAULT),Output is unbufferred\n"); +#ifdef DEFAULT_CODE_SJIS + fprintf(stderr,"j,s,e Outout code is JIS 7 bit, Shift JIS (DEFAULT), AT&T JIS (EUC)\n"); +#endif +#ifdef DEFAULT_CODE_JIS + fprintf(stderr,"j,s,e Outout code is JIS 7 bit (DEFAULT), Shift JIS, AT&T JIS (EUC)\n"); +#endif +#ifdef DEFAULT_CODE_EUC + fprintf(stderr,"j,s,e Outout code is JIS 7 bit, Shift JIS, AT&T JIS (EUC) (DEFAULT)\n"); +#endif + fprintf(stderr,"J,S,E Input assumption is JIS 7 bit , Shift JIS, AT&T JIS (EUC)\n"); + fprintf(stderr,"t no conversion\n"); + fprintf(stderr,"i_ Output sequence to designate JIS-kanji (DEFAULT B)\n"); + fprintf(stderr,"o_ Output sequence to designate ASCII (DEFAULT B)\n"); + fprintf(stderr,"r {de/en}crypt ROT13/47\n"); + fprintf(stderr,"v Show this usage\n"); + fprintf(stderr,"m[BQ0] MIME decode [B:base64,Q:quoted,0:no decode]\n"); + fprintf(stderr,"l ISO8859-1 (Latin-1) support\n"); + fprintf(stderr,"f Folding: -f60 or -f\n"); + fprintf(stderr,"Z[0-2] Convert X0208 alphabet to ASCII 1: Kankaku to space,2: 2 spaces\n"); + fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"); + fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n"); +#ifdef MSDOS + fprintf(stderr,"T Text mode output\n"); +#endif + fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n"); + fprintf(stderr,"d,c Delete \\r in line feed, Add \\r in line feed\n"); + fprintf(stderr,"Network Kanji Filter Version %s (%s) " +#if defined(MSDOS) && !defined(_Windows) + "for DOS" +#endif +#if !defined(__WIN32__) && defined(_Windows) + "for Win16" +#endif +#if defined(__WIN32__) && defined(_Windows) + "for Win32" +#endif +#ifdef __OS2__ + "for OS/2" +#endif + ,Version,Patchlevel); + fprintf(stderr,"\n%s\n",CopyRight); + return 0; +} +#endif + +/** + ** $B%Q%C%A@):n + ** ohta@src.ricoh.co.jp (Junn Ohta) + ** inouet@strl.nhk.or.jp (Tomoyuki Inoue) + ** 
kiri@pulser.win.or.jp (Tetsuaki Kiriyama) + ** Kimihiko Sato + ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe) + ** kono@ie.u-ryukyu.ac.jp (Shinji Kono) + ** GHG00637@nifty-serve.or.jp (COW) + ** + ** $B:G=*99?7F|(B + ** 1998.11.7 + **/ + +/* end */ diff --git a/ext/nkf/test.rb b/ext/nkf/test.rb new file mode 100644 index 0000000000..4519f8ba7e --- /dev/null +++ b/ext/nkf/test.rb @@ -0,0 +1,318 @@ +$counter = 0 +def result(result, message = nil) + $counter += 1 + printf("%s %d%s\n", + result ? 'ok' : 'no', + $counter, + message ? ' ... ' + message : '') +end + +begin + require 'nkf' + include NKF +rescue LoadError + result(false) +end +result(true) + +if nkf('-me', '1') + result(true); +else + result(false); +end + +output = nkf('-e', "\033\$@#1#3#2%B") +if output + # print output, "\n" + result(true, output) +else + result(false) +end + +output = nkf('-Zj', "\033\$@#1#3#2%B") +if output + # print output, "\n" + result(true, output) +else + result(false) +end + +output = "\244\306 " * 1024 +old = output.length +output = nkf("-j", output) +if output + # print output, "\n" + result(true, "#{old} #{output.length}") +else + result(false) +end + + +$detail = false +def test(opt, input, expect) + print "\nINPUT:\n", input if $detail + print "\nEXPECT:\n", expect if $detail + result = nkf(opt, input) + print "\nGOT:\n", result if $detail + + print result == expect ? 
"Ok\n" : "Fail\n" + return result +end + +# Basic Conversion +print "\nBasic Conversion test\n\n" + +example = {} +example['jis'] = <<'eofeof'.unpack('u')[0] +M1FERED"6GIAR(%-E8V]N9"!3=&%G92"8I9=Y($AI +M#28./ +>@Y*#DR!+:6=O=2"!18&'@D^"8(._@]:$081@A+X* +eofeof +#' + +example['euc'] = <<'eofeof'.unpack('u')[0] +M1FERI?*E\R!+:6=O=2"AIJ'GH["CP:;!IMBGHJ?!J,`* +eofeof +#' + +example['amb'] = <<'eofeof'.unpack('u')[0] +MI<*PL:7"L+&EPK"QI<*PL:7"L+&EPK"QI<*PL:7"L+&EPK"QI<*PL:7"L+&E +MPK"QI<*PL:7"L+&EPK"QI<(*I<*PL:7"L+&EPK"QI<*PL:7"L+&EPK"QI<*P +ML:7"L+&EPK"QI<*PL:7"L+&EPK"QI<*PL:7"L+&EPK"QI<(*I<*PL:7"L+&E +MPK"QI<*PL:7"L+&EPK"QI<*PL:7"L+&EPK"QI<*PL:7"L+&EPK"QI<*PL:7" +ML+&EPK"QI<(*I<*PL:7"L+&EPK"QI<*PL:7"L+&EPK"QI<*PL:7"L+&EPK"Q +MI<*PL:7"L+&EPK"QI<*PL:7"L+&EPK"QI<(*I<*PL:7"L+&EPK"QI<*PL:7" +ML+&EPK"QI<*PL:7"L+&EPK"QI<*PL:7"L+&EPK"QI<*PL:7"L+&EPK"QI<(* +eofeof + +example['amb.euc'] = <<'eofeof'.unpack('u')[0] +M&R1")4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(P,25" +M,#$E0C`Q)4(P,25",#$E0C`Q)4(;*$(*&R1")4(P,25",#$E0C`Q)4(P,25" +M,#$E0C`Q)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(; +M*$(*&R1")4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(P +M,25",#$E0C`Q)4(P,25",#$E0C`Q)4(;*$(*&R1")4(P,25",#$E0C`Q)4(P +M,25",#$E0C`Q)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q +M)4(;*$(*&R1")4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q +>)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(;*$(* +eofeof + +example['amb.sjis'] = <<'eofeof'.unpack('u')[0] +M&RA))4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(P,25" +M,#$E0C`Q)4(P,25",#$E0C`Q)4(;*$(*&RA))4(P,25",#$E0C`Q)4(P,25" +M,#$E0C`Q)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(; +M*$(*&RA))4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(P +M,25",#$E0C`Q)4(P,25",#$E0C`Q)4(;*$(*&RA))4(P,25",#$E0C`Q)4(P +M,25",#$E0C`Q)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q +M)4(;*$(*&RA))4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q +>)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(;*$(* +eofeof + +example['x0201.sjis'] = 
<<'eofeof'.unpack('u')[0] +MD5.*<(-*@TR#3H-0@U*#2X--@T^#48-3"I%3B7""8()A@F*"8X)D@F6"9H*! +M@H*"@X*$@H6"AH*'"I%3BTR-AH%)@9>!E(&0@9.!3X&5@9:!:8%J@7R!>X&! +M@6V!;H%O@7"!CPJ4O(IPMK>X/;FZMMZWWKC>N=ZZWH+&"I2\BG#*W\O?S-_- +MW\[?M]^QW@K*W\O?S`IH86YK86MU(,K?R]_,I`K*W\O?S-VA"I2\BG""S(SC +!"@!" +eofeof +#' + +example['x0201.euc'] = <<'eofeof'.unpack('u')[0] +MP;2ST:6KI:VEKZ6QI;.EK*6NI;"ELJ6T"L&TL=&CP:/"H\.CQ*/%H\:CQZ/A +MH^*CXZ/DH^6CYJ/G"L&TM:VYYJ&JH?>A]*'PH?.AL*'UH?:ARJ'+H=VAW*'A +MHCK>.WHZXCMZ.N8[>CKJ. +MWJ3("LB^L]&.RH[?CLN.WX[,CM^.S8[?CLZ.WXZWCM^.L8[>"H[*CM^.RX[? +MCLP*:&%N:V%K=2".RH[?CLN.WX[,CJ0*CLJ.WX[+CM^.S([=CJ$*R+ZST:3. +#N.4* +eofeof +#' + +example['x0201.jis'] = <<'eofeof'.unpack('u')[0] +M&R1"030S424K)2TE+R4Q)3,E+"4N)3`E,B4T&RA""ALD0D$T,5$C02-"(T,C +M1"-%(T8C1R-A(V(C8R-D(V4C9B-G&RA""ALD0D$T-2TY9B$J(7.5XZ7ALD0B1(&RA""ALD0D@^,U$;*$E*7TM?3%]-7TY? +M-U\Q7ALH0@H;*$E*7TM?3!LH0@IH86YK86MU(!LH24I?2U],)!LH0@H;*$E* +97TM?3%TA&RA""ALD0D@^,U$D3CAE&RA""@`` +eofeof +#` + +example['x0201.sosi'] = <<'eofeof'.unpack('u')[0] +M&R1"030S424K)2TE+R4Q)3,E+"4N)3`E,B4T&RA*"ALD0D$T,5$C02-"(T,C +M1"-%(T8C1R-A(V(C8R-D(V4C9B-G&RA*"ALD0D$T-2TY9B$J(7.5XZ7@\;)$(D2!LH2@H;)$)(/C-1&RA*#DI?2U],7TU? +M3E\W7S%>#PH.2E]+7TP/&RA*"FAA;FMA:W4@#DI?2U],)`\;*$H*#DI?2U], +672$/&RA*"ALD0D@^,U$D3CAE&RA""@`` +eofeof +#" + +example['x0201.x0208'] = <<'eofeof'.unpack('u')[0] +M&R1"030S424K)2TE+R4Q)3,E+"4N)3`E,B4T&RA""ALD0D$T,5$;*$)!0D-$ +M149'86)C9&5F9PH;)$)!-#4M.68;*$(A0",D)5XF*B@I+2L]6UU[?1LD0B%O +M&RA""ALD0D@^,U$E*R4M)2\;*$(]&R1")3$E,R4L)2XE,"4R)30D2!LH0@H; +M)$)(/C-1)5$E5"57)5HE724M(2PE(B$K&RA""ALD0B51)50E51LH0@IH86YK +M86MU(!LD0B51)50E52$B&RA""ALD0B51)50E525S(2,;*$(*&R1"2#XS421. 
+&.&4;*$(* +eofeof +#` + +example['mime.iso2022'] = <<'eofeof'.unpack('u')[0] +M/3])4T\M,C`R,BU*4#]"/T=Y4D%.144W96E23TI566Q/4U9)1WEH2S\]"CT_ +M:7-O+3(P,C(M2E`_0C]'>5)!3D5%-V5I4D]*55EL3U-624=Y:$L_/0H]/VES +M;RTR,#(R+4I0/U$_/3%")$(D1B11/3%"*$)?96YD/ST*&R1`)#TD)B0K)$H; +M*$H@/3])4T\M,C`R,BU*4#]"/T=Y4D%.144W96E23U!Y:S=D:'-O4V<]/3\] +M(&5N9"!O9B!L:6YE"CT_25-/+3(P,C(M2E`_0C]'>5)!3D5%-V5I4D]0>6LW +M9&AS;U-G/3T_/2`]/TE33RTR,#(R+4I0/T(_1WE204Y%13=E:5)/4'EK-V1H +M5)!3D5%-V5I +44D]*55EL3QM;2U-624=Y:$L_/0H_ +eofeof +#' + +example['mime.ans.strict'] = <<'eofeof'.unpack('u')[0] +M&R1"-$$[>B1.)48E.25(&RA""ALD0C1!.WHD3B5&)3DE2!LH0@H;)$(D1B11 +M&RA"(&5N9`H;)$(D/20F)"LD2ALH0B`;)$(T03MZ)$X_*3MV&RA"96YD(&]F +M(&QI;F4*&R1"-$$[>B1./RD[=C1!.WHD3C\I.W8;*$(*0G)O:V5N(&-A5)!3D5%-V5I4D]*55EL3QM;2U-624=Y:$L_/0H_ +eofeof +#' + +example['mime.unbuf.strict'] = <<'eofeof'.unpack('u')[0] +M&R1"-$$[>B1.)48E.25(&RA""ALD0C1!.WHD3B5&)3DE2!LH0@H;)$(D1B11 +M&RA"(&5N9`H;)$(D/20F)"LD2ALH0B`;)$(T03MZ)$X_*3MV&RA"96YD(&]F +M(&QI;F4*&R1"-$$[>B1./RD[=C1!.WHD3C\I.W8;*$(*0G)O:V5N(&-AB1./RD;*$)H5)! 
+M3D5%-V5I4D]0>6LW9&AS;U-G/3T_/0H;)$(T03MZ)$XE1ALH0EM+4U9)1WEH +$2S\]"F5I +eofeof + +example['mime.ans'] = <<'eofeof'.unpack('u')[0] +M&R1"-$$[>B1.)48E.25(&RA""ALD0C1!.WHD3B5&)3DE2!LH0@H;)$(D1B11 +M&RA"(&5N9`H;)$(D/20F)"LD2ALH0B`;)$(T03MZ)$X_*3MV&RA"96YD(&]F +M(&QI;F4*&R1"-$$[>B1./RD[=C1!.WHD3C\I.W8;*$(*0G)O:V5N(&-AB1./RD;*$)HB1./RD[=ALH0@H;)$(T +603MZ)$XE1ALH0EM+4U9)1WEH2S\]"@`* +eofeof +#" + +example['mime.unbuf'] = <<'eofeof'.unpack('u')[0] +M&R1"-$$[>B1.)48E.25(&RA""ALD0C1!.WHD3B5&)3DE2!LH0@H;)$(D1B11 +M&RA"(&5N9`H;)$(D/20F)"LD2ALH0B`;)$(T03MZ)$X_*3MV&RA"96YD(&]F +M(&QI;F4*&R1"-$$[>B1./RD[=C1!.WHD3C\I.W8;*$(*0G)O:V5N(&-AB1./RD;*$)HB1./RD[=ALH0@H;)$(T +603MZ)$XE1ALH0EM+4U9)1WEH2S\]"@`* +eofeof +#" + +example['mime.base64'] = <<'eofeof'.unpack('u')[0] +M9W-M5"])3&YG$I+-&=Q=4,S24LS9W%Q0E%:3TUI-39,,S0Q-&=S5T)1 +M43!+9VUA1%9O3T@*9S)+1%1O3'=K8C)1;$E+;V=Q2T-X24MG9W5M0W%*3EEG +<$E+9V=U;4,X64Q&9W)70S592VMG<6U""F=Q +eofeof +#" + +example['mime.base64.ans'] = <<'eofeof'.unpack('u')[0] +M&R1")$M&?B1I)#LD1D0Z)"TD7B0Y)"PA(D5L-7XV83E9)$2P@1$5.34%22R`@7"`B36EN(&OF<&AEX0208 conversion +# X0208 aphabet -> ASCII +# X0201 相互変換 + +print "\nX0201 test\n\n" + +# -X is necessary to allow X0201 in SJIS +# -Z convert X0208 alphabet to ASCII +print 'X0201 conversion: SJIS ' +test('-XZ', example['x0201.sjis'], example['x0201.x0208']) +print 'X0201 conversion: JIS ' +test('-Z', example['x0201.jis'], example['x0201.x0208']) +print 'X0201 conversion:SI/SO ' +test('-Z', example['x0201.sosi'], example['x0201.x0208']) +print 'X0201 conversion: EUC ' +test('-Z', example['x0201.euc'], example['x0201.x0208']) +# -x means X0201 output +print 'X0201 output: SJIS ' +test('-xs', example['x0201.euc'], example['x0201.sjis']) +print 'X0201 output: JIS ' +test('-xj', example['x0201.sjis'], example['x0201.jis']) +print 'X0201 output: EUC ' +test('-xe', example['x0201.jis'], example['x0201.euc']) + +# MIME decode + +print "\nMIME test\n\n" + +# MIME ISO-2022-JP + +print "Next test is expeced to Fail.\n" + 
+print 'MIME decode (strict) ' +tmp = test('-m', example['mime.iso2022'], example['mime.ans.strict']) +print 'MIME decode (nonstrict)' +tmp = test('-m', example['mime.iso2022'], example['mime.ans']) +# open(OUT,'>tmp1');print OUT pack('u',$tmp);close(OUT); +# unbuf mode implies more pessimistic decode +print 'MIME decode (unbuf) ' +test('-mu', example['mime.iso2022'], example['mime.unbuf']) +print 'MIME decode (base64) ' +t = test('-mB', example['mime.base64'], example['mime.base64.ans']) + +# MIME ISO-8859-1 + +# Without -l, ISO-8859-1 was handled as X0201. + +print 'MIME ISO-8859-1 (Q) ' +test('-ml', example['mime.is8859'], example['mime.is8859.ans']) -- cgit v1.2.3