diff options
author | naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2004-10-29 06:51:33 +0000 |
---|---|---|
committer | naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2004-10-29 06:51:33 +0000 |
commit | 35b917f3ee63fc2251e3d4fc063a48f2d91bb96c (patch) | |
tree | eda11bf5a0f952cede89cfb1cd3fd38d42999431 /ext/nkf/nkf.c | |
parent | 640fad68cc5061b1fef782a842f1e4f038107883 (diff) | |
download | ruby-35b917f3ee63fc2251e3d4fc063a48f2d91bb96c.tar.gz |
follow to nkf 2.0.4
:new constants
NKF::VERSION
NKF::ASCII
NKF::UTF8
NKF::UTF16
NKF::UTF32
:new methods
NFK.guess1 (guess)
NKF.guess2 (from nkf2)
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@7132 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'ext/nkf/nkf.c')
-rw-r--r-- | ext/nkf/nkf.c | 191 |
1 files changed, 141 insertions, 50 deletions
diff --git a/ext/nkf/nkf.c b/ext/nkf/nkf.c index ca6de73e10..6517b3aba1 100644 --- a/ext/nkf/nkf.c +++ b/ext/nkf/nkf.c @@ -1,51 +1,82 @@ +/* + * NKF Module for Ruby base on nkf 2.x + * + * original nkf2.0 is maintained at http://sourceforge.jp/projects/nkf/ + * + */ + +static char *RVersion = "2.0.4.1r1"; + #include "ruby.h" +/* Encoding Constants */ #define _AUTO 0 #define _JIS 1 #define _EUC 2 #define _SJIS 3 #define _BINARY 4 #define _NOCONV 4 +#define _ASCII 5 +/* 0b011x is reserved for UTF-8 Family */ +#define _UTF8 6 +/* 0b10xx is reserved for UTF-16 Family */ +#define _UTF16 8 +/* 0b11xx is reserved for UTF-32 Family */ +#define _UTF32 12 +#define _OTHER 16 #define _UNKNOWN _AUTO +/* Replace nkf's getchar/putchar for variable modification */ +/* we never use getc, ungetc */ + #undef getc #undef ungetc -#define getc(f) (input_ctr<i_len?input[input_ctr++]:-1) -#define ungetc(c,f) input_ctr-- +#define getc(f) (input_ctr>=i_len?-1:input[input_ctr++]) +#define ungetc(c,f) input_ctr-- +#define INCSIZE 32 #undef putchar -#define putchar(c) rb_nkf_putchar(c) +#undef TRUE +#undef FALSE +#define putchar(c) rb_nkf_putchar(c) -#define INCSIZE 32 -static int incsize; +/* Input/Output pointers */ -static unsigned char *input, *output; -static int input_ctr, i_len; -static int output_ctr, o_len; +static unsigned char *output; +static unsigned char *input; +static int input_ctr; +static int i_len; +static int output_ctr; +static int o_len; +static int incsize; -static VALUE dst; +static VALUE result; static int rb_nkf_putchar(c) - unsigned int c; + unsigned int c; { if (output_ctr >= o_len) { o_len += incsize; - rb_str_resize(dst, o_len); - output = RSTRING(dst)->ptr; + rb_str_resize(result, o_len); incsize *= 2; + output = RSTRING(result)->ptr; } output[output_ctr++] = c; return c; } +/* Include kanji filter main part */ +/* getchar and putchar will be replaced during inclusion */ + #define PERL_XS 1 -#include "nkf1.7/nkf.c" +#include "nkf-utf8/utf8tbl.c" +#include "nkf-utf8/nkf.c" static VALUE rb_nkf_kconv(obj, opt, src) - VALUE obj, opt, src; + VALUE obj, opt, src; { char *opt_ptr, *opt_end; volatile VALUE v; @@ -58,44 +89,46 @@ rb_nkf_kconv(obj, opt, src) if (*opt_ptr != '-') { continue; } - arguments(opt_ptr); + options(opt_ptr); } incsize = INCSIZE; - input_ctr = 0; + input_ctr = 0; StringValue(src); input = RSTRING(src)->ptr; i_len = RSTRING(src)->len; - dst = rb_str_new(0, i_len*3 + 10); - v = dst; + result = rb_str_new(0, i_len*3 + 10); + v = result; output_ctr = 0; - output = RSTRING(dst)->ptr; - o_len = RSTRING(dst)->len; + output = RSTRING(result)->ptr; + o_len = RSTRING(result)->len; *output = '\0'; - if(iso8859_f && (oconv != j_oconv || !x0201_f )) { - iso8859_f = FALSE; - } + if(x0201_f == WISH_TRUE) + x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201); kanji_convert(NULL); - RSTRING(dst)->ptr[output_ctr] = '\0'; - RSTRING(dst)->len = output_ctr; - OBJ_INFECT(dst, src); + RSTRING(result)->ptr[output_ctr] = '\0'; + RSTRING(result)->len = output_ctr; + OBJ_INFECT(result, src); - return dst; + return result; } + /* + * NKF.guess1 + * * Character code detection - Algorithm described in: * Ken Lunde. `Understanding Japanese Information Processing' * Sebastopol, CA: O'Reilly & Associates. */ static VALUE -rb_nkf_guess(obj, src) - VALUE obj, src; +rb_nkf_guess1(obj, src) + VALUE obj, src; { unsigned char *p; unsigned char *pend; @@ -107,16 +140,16 @@ rb_nkf_guess(obj, src) if (p == pend) return INT2FIX(_UNKNOWN); #define INCR do {\ - p++;\ - if (p==pend) return INT2FIX(_UNKNOWN);\ - sequence_counter++;\ - if (sequence_counter % 2 == 1 && *p != 0xa4)\ + p++;\ + if (p==pend) return INT2FIX(_UNKNOWN);\ + sequence_counter++;\ + if (sequence_counter % 2 == 1 && *p != 0xa4)\ sequence_counter = 0;\ - if (6 <= sequence_counter) {\ - sequence_counter = 0;\ - return INT2FIX(_EUC);\ - }\ -} while (0) + if (6 <= sequence_counter) {\ + sequence_counter = 0;\ + return INT2FIX(_EUC);\ + }\ + } while (0) if (*p == 0xa4) sequence_counter = 1; @@ -180,19 +213,77 @@ rb_nkf_guess(obj, src) return INT2FIX(_UNKNOWN); } + +/* + * NKF.guess2 + * + * Guess Encoding By NKF2.0 Routine + */ + +static VALUE +rb_nkf_guess2(obj, src) + VALUE obj, src; +{ + int code = _BINARY; + + reinit(); + + input_ctr = 0; + StringValue(src); + input = RSTRING(src)->ptr; + i_len = RSTRING(src)->len; + + if(x0201_f == WISH_TRUE) + x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201); + + guess_f = TRUE; + kanji_convert( NULL ); + guess_f = FALSE; + + if (!is_inputcode_mixed) { + if (strcmp(input_codename, "") == 0) { + code = _ASCII; + } else if (strcmp(input_codename, "ISO-2022-JP") == 0) { + code = _JIS; + } else if (strcmp(input_codename, "EUC-JP") == 0) { + code = _EUC; + } else if (strcmp(input_codename, "Shift_JIS") == 0) { + code = _SJIS; + } else if (strcmp(input_codename, "UTF-8") == 0) { + code = _UTF8; + } else if (strcmp(input_codename, "UTF-16") == 0) { + code = _UTF16; + } else if (strlen(input_codename) > 0) { + code = _UNKNOWN; + } + } + + return INT2FIX( code ); +} + + +/* Initialize NKF Module */ + void Init_nkf() { - VALUE mKconv = rb_define_module("NKF"); - - rb_define_module_function(mKconv, "nkf", rb_nkf_kconv, 2); - rb_define_module_function(mKconv, "guess", rb_nkf_guess, 1); - - rb_define_const(mKconv, "AUTO", INT2FIX(_AUTO)); - rb_define_const(mKconv, "JIS", INT2FIX(_JIS)); - rb_define_const(mKconv, "EUC", INT2FIX(_EUC)); - rb_define_const(mKconv, "SJIS", INT2FIX(_SJIS)); - rb_define_const(mKconv, "BINARY", INT2FIX(_BINARY)); - rb_define_const(mKconv, "NOCONV", INT2FIX(_NOCONV)); - rb_define_const(mKconv, "UNKNOWN", INT2FIX(_UNKNOWN)); + VALUE mKconv = rb_define_module("NKF"); + + rb_define_module_function(mKconv, "nkf", rb_nkf_kconv, 2); + rb_define_module_function(mKconv, "guess", rb_nkf_guess1, 1); + rb_define_module_function(mKconv, "guess1", rb_nkf_guess1, 1); + rb_define_module_function(mKconv, "guess2", rb_nkf_guess2, 1); + + rb_define_const(mKconv, "AUTO", INT2FIX(_AUTO)); + rb_define_const(mKconv, "JIS", INT2FIX(_JIS)); + rb_define_const(mKconv, "EUC", INT2FIX(_EUC)); + rb_define_const(mKconv, "SJIS", INT2FIX(_SJIS)); + rb_define_const(mKconv, "BINARY", INT2FIX(_BINARY)); + rb_define_const(mKconv, "NOCONV", INT2FIX(_NOCONV)); + rb_define_const(mKconv, "ASCII", INT2FIX(_ASCII)); + rb_define_const(mKconv, "UTF8", INT2FIX(_UTF8)); + rb_define_const(mKconv, "UTF16", INT2FIX(_UTF16)); + rb_define_const(mKconv, "UTF32", INT2FIX(_UTF32)); + rb_define_const(mKconv, "UNKNOWN", INT2FIX(_UNKNOWN)); + rb_define_const(mKconv, "VERSION", rb_str_new2(RVersion)); } |