aboutsummaryrefslogtreecommitdiffstats
path: root/ext/nkf/nkf.c
diff options
context:
space:
mode:
authornaruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2007-12-19 04:29:22 +0000
committernaruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2007-12-19 04:29:22 +0000
commit0cf4378f14318890b8694665e4a37b511d0ac566 (patch)
treecdad8752565e9245ba497b34d8d9d1aa8ccab324 /ext/nkf/nkf.c
parentb9ca01063ef999af00f17e993d74cf6d0218b4ab (diff)
downloadruby-0cf4378f14318890b8694665e4a37b511d0ac566.tar.gz
* ext/nkf/nkf.c (NKF::_ENCODING): removed.
* ext/nkf/nkf.c (rb_nkf_kconv): renamed to rb_nkf_convert. * ext/nkf/nkf.c (rb_nkf_convert): set encoding. * ext/nkf/nkf.c (rb_nkf_guess1): removed. * ext/nkf/nkf.c (rb_nkf_guess2): renamed to rb_nkf_guess. * ext/nkf/nkf.c (rb_nkf_guess): guess method now returns encoding object. * ext/nkf/nkf-utf8/nkf.c: Update to nkf 2.0.8 2007-12-19. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14315 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'ext/nkf/nkf.c')
-rw-r--r--ext/nkf/nkf.c322
1 files changed, 71 insertions, 251 deletions
diff --git a/ext/nkf/nkf.c b/ext/nkf/nkf.c
index 460b536bdf..48402e07e6 100644
--- a/ext/nkf/nkf.c
+++ b/ext/nkf/nkf.c
@@ -10,24 +10,8 @@
#define RUBY_NKF_REVISION "$Revision$"
#define RUBY_NKF_VERSION NKF_VERSION " (" NKF_RELEASE_DATE ")"
-#include "ruby.h"
-
-/* Encoding Constants */
-#define _AUTO 0
-#define _JIS 1
-#define _EUC 2
-#define _SJIS 3
-#define _BINARY 4
-#define _NOCONV 4
-#define _ASCII 5
-/* 0b011x is reserved for UTF-8 Family */
-#define _UTF8 6
-/* 0b10xx is reserved for UTF-16 Family */
-#define _UTF16 8
-/* 0b11xx is reserved for UTF-32 Family */
-#define _UTF32 12
-#define _OTHER 16
-#define _UNKNOWN _AUTO
+#include "ruby/ruby.h"
+#include "ruby/encoding.h"
/* Replace nkf's getchar/putchar for variable modification */
/* we never use getc, ungetc */
@@ -140,221 +124,81 @@ int nkf_split_options(const char *arg)
*/
static VALUE
-rb_nkf_kconv(VALUE obj, VALUE opt, VALUE src)
+rb_nkf_convert(VALUE obj, VALUE opt, VALUE src)
{
- char *opt_ptr, *opt_end;
- volatile VALUE v;
-
- reinit();
- StringValue(opt);
- opt_ptr = RSTRING_PTR(opt);
- opt_end = opt_ptr + RSTRING_LEN(opt);
- nkf_split_options(opt_ptr);
-
- incsize = INCSIZE;
-
- input_ctr = 0;
- StringValue(src);
- input = (unsigned char *)RSTRING_PTR(src);
- i_len = RSTRING_LEN(src);
- result = rb_str_new(0, i_len*3 + 10);
- v = result;
-
- output_ctr = 0;
- output = (unsigned char *)RSTRING_PTR(result);
- o_len = RSTRING_LEN(result);
- *output = '\0';
-
- if(x0201_f == WISH_TRUE)
- x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
-
- kanji_convert(NULL);
- rb_str_set_len(result, output_ctr);
- OBJ_INFECT(result, src);
-
- return result;
+ char *opt_ptr, *opt_end;
+ volatile VALUE v;
+ char *encname;
+ int idx;
+
+ reinit();
+ StringValue(opt);
+ opt_ptr = RSTRING_PTR(opt);
+ opt_end = opt_ptr + RSTRING_LEN(opt);
+ nkf_split_options(opt_ptr);
+
+ incsize = INCSIZE;
+
+ input_ctr = 0;
+ StringValue(src);
+ input = (unsigned char *)RSTRING_PTR(src);
+ i_len = RSTRING_LEN(src);
+ result = rb_str_new(0, i_len*3 + 10);
+ v = result;
+
+ output_ctr = 0;
+ output = (unsigned char *)RSTRING_PTR(result);
+ o_len = RSTRING_LEN(result);
+ *output = '\0';
+
+ kanji_convert(NULL);
+ rb_str_set_len(result, output_ctr);
+ OBJ_INFECT(result, src);
+ encname = nkf_enc_name(output_encoding);
+ fprintf(stderr, "%s\n", encname);
+ idx = rb_enc_find_index(encname);
+ fprintf(stderr, "%d\n", idx);
+ if (idx <= 0) {
+ idx = rb_enc_replicate(encname, rb_enc_find(rb_enc_name(ONIG_ENCODING_ASCII)));
+ fprintf(stderr, "%d\n", idx);
+ }
+ rb_enc_associate_index(result, idx);
+ return result;
}
/*
* call-seq:
- * NKF.guess1(str) -> integer
- *
- * Returns guessed encoding of _str_ as integer.
- *
- * Algorithm described in:
- * Ken Lunde. `Understanding Japanese Information Processing'
- * Sebastopol, CA: O'Reilly & Associates.
- *
- * case NKF.guess1(input)
- * when NKF::JIS
- * "ISO-2022-JP"
- * when NKF::SJIS
- * "Shift_JIS"
- * when NKF::EUC
- * "EUC-JP"
- * when NKF::UNKNOWN
- * "UNKNOWN(ASCII)"
- * when NKF::BINARY
- * "BINARY"
- * end
+ * NKF.guess(str) -> encoding
+ *
+ * Returns guessed encoding of _str_ by nkf routine.
+ *
*/
static VALUE
-rb_nkf_guess1(VALUE obj, VALUE src)
+rb_nkf_guess(VALUE obj, VALUE src)
{
- unsigned char *p;
- unsigned char *pend;
- int sequence_counter = 0;
-
- StringValue(src);
- p = (unsigned char *)RSTRING_PTR(src);
- pend = p + RSTRING_LEN(src);
- if (p == pend) return INT2FIX(_UNKNOWN);
-
-#define INCR do {\
- p++;\
- if (p==pend) return INT2FIX(_UNKNOWN);\
- sequence_counter++;\
- if (sequence_counter % 2 == 1 && *p != 0xa4)\
- sequence_counter = 0;\
- if (6 <= sequence_counter) {\
- sequence_counter = 0;\
- return INT2FIX(_EUC);\
- }\
- } while (0)
-
- if (*p == 0xa4)
- sequence_counter = 1;
-
- while (p<pend) {
- if (*p == '\033') {
- return INT2FIX(_JIS);
- }
- if (*p < '\006' || *p == 0x7f || *p == 0xff) {
- return INT2FIX(_BINARY);
- }
- if (0x81 <= *p && *p <= 0x8d) {
- return INT2FIX(_SJIS);
- }
- if (0x8f <= *p && *p <= 0x9f) {
- return INT2FIX(_SJIS);
- }
- if (*p == 0x8e) { /* SS2 */
- INCR;
- if ((0x40 <= *p && *p <= 0x7e) ||
- (0x80 <= *p && *p <= 0xa0) ||
- (0xe0 <= *p && *p <= 0xfc))
- return INT2FIX(_SJIS);
- }
- else if (0xa1 <= *p && *p <= 0xdf) {
- INCR;
- if (0xf0 <= *p && *p <= 0xfe)
- return INT2FIX(_EUC);
- if (0xe0 <= *p && *p <= 0xef) {
- while (p < pend && *p >= 0x40) {
- if (*p >= 0x81) {
- if (*p <= 0x8d || (0x8f <= *p && *p <= 0x9f)) {
- return INT2FIX(_SJIS);
- }
- else if (0xfd <= *p && *p <= 0xfe) {
- return INT2FIX(_EUC);
- }
- }
- INCR;
- }
- }
- else if (*p <= 0x9f) {
- return INT2FIX(_SJIS);
- }
- }
- else if (0xf0 <= *p && *p <= 0xfe) {
- return INT2FIX(_EUC);
- }
- else if (0xe0 <= *p && *p <= 0xef) {
- INCR;
- if ((0x40 <= *p && *p <= 0x7e) ||
- (0x80 <= *p && *p <= 0xa0)) {
- return INT2FIX(_SJIS);
- }
- if (0xfd <= *p && *p <= 0xfe) {
- return INT2FIX(_EUC);
- }
- }
- INCR;
- }
- return INT2FIX(_UNKNOWN);
-}
+ char* codename;
+ rb_encoding* enc;
+ reinit();
-/*
- * call-seq:
- * NKF.guess2(str) -> integer
- *
- * Returns guessed encoding of _str_ as integer by nkf routine.
- *
- * case NKF.guess(input)
- * when NKF::ASCII
- * "ASCII"
- * when NKF::JIS
- * "ISO-2022-JP"
- * when NKF::SJIS
- * "Shift_JIS"
- * when NKF::EUC
- * "EUC-JP"
- * when NKF::UTF8
- * "UTF-8"
- * when NKF::UTF16
- * "UTF-16"
- * when NKF::UTF32
- * "UTF-32"
- * when NKF::UNKNOWN
- * "UNKNOWN"
- * when NKF::BINARY
- * "BINARY"
- * end
- */
+ input_ctr = 0;
+ StringValue(src);
+ input = (unsigned char *)RSTRING_PTR(src);
+ i_len = RSTRING_LEN(src);
-static VALUE
-rb_nkf_guess2(VALUE obj, VALUE src)
-{
- int code = _BINARY;
-
- reinit();
-
- input_ctr = 0;
- StringValue(src);
- input = (unsigned char *)RSTRING_PTR(src);
- i_len = RSTRING_LEN(src);
-
- if(x0201_f == WISH_TRUE)
- x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201);
-
- guess_f = TRUE;
- kanji_convert( NULL );
- guess_f = FALSE;
-
- if (!is_inputcode_mixed) {
- if (strcmp(input_codename, "") == 0) {
- code = _ASCII;
- } else if (strcmp(input_codename, "ISO-2022-JP") == 0) {
- code = _JIS;
- } else if (strcmp(input_codename, "EUC-JP") == 0) {
- code = _EUC;
- } else if (strcmp(input_codename, "Shift_JIS") == 0) {
- code = _SJIS;
- } else if (strcmp(input_codename, "UTF-8") == 0) {
- code = _UTF8;
- } else if (strcmp(input_codename, "UTF-16") == 0) {
- code = _UTF16;
- } else if (strcmp(input_codename, "UTF-32") == 0) {
- code = _UTF32;
- } else if (strlen(input_codename) > 0) {
- code = _UNKNOWN;
- }
- }
+ guess_f = TRUE;
+ kanji_convert( NULL );
+ guess_f = FALSE;
- return INT2FIX( code );
+ codename = get_guessed_code();
+ enc = rb_enc_find(codename);
+ if (enc <= 0) {
+ int idx = rb_enc_replicate(codename, rb_enc_find(rb_enc_name(ONIG_ENCODING_ASCII)));
+ enc = rb_enc_from_index(idx);
+ }
+ return rb_enc_from_encoding(enc);
}
@@ -632,41 +476,17 @@ void
Init_nkf()
{
/* hoge */
- VALUE mKconv = rb_define_module("NKF");
+ VALUE mNKF = rb_define_module("NKF");
/* hoge */
- rb_define_module_function(mKconv, "nkf", rb_nkf_kconv, 2);
- rb_define_module_function(mKconv, "guess1", rb_nkf_guess1, 1);
- rb_define_module_function(mKconv, "guess2", rb_nkf_guess2, 1);
- rb_define_alias(mKconv, "guess", "guess2");
- rb_define_alias(rb_singleton_class(mKconv), "guess", "guess2");
-
- /* Auto-Detect */
- rb_define_const(mKconv, "AUTO", INT2FIX(_AUTO));
- /* ISO-2022-JP */
- rb_define_const(mKconv, "JIS", INT2FIX(_JIS));
- /* EUC-JP */
- rb_define_const(mKconv, "EUC", INT2FIX(_EUC));
- /* Shift_JIS */
- rb_define_const(mKconv, "SJIS", INT2FIX(_SJIS));
- /* BINARY */
- rb_define_const(mKconv, "BINARY", INT2FIX(_BINARY));
- /* No conversion */
- rb_define_const(mKconv, "NOCONV", INT2FIX(_NOCONV));
- /* ASCII */
- rb_define_const(mKconv, "ASCII", INT2FIX(_ASCII));
- /* UTF-8 */
- rb_define_const(mKconv, "UTF8", INT2FIX(_UTF8));
- /* UTF-16 */
- rb_define_const(mKconv, "UTF16", INT2FIX(_UTF16));
- /* UTF-32 */
- rb_define_const(mKconv, "UTF32", INT2FIX(_UTF32));
- /* UNKNOWN */
- rb_define_const(mKconv, "UNKNOWN", INT2FIX(_UNKNOWN));
+ rb_define_module_function(mNKF, "nkf", rb_nkf_convert, 2);
+ rb_define_module_function(mNKF, "guess", rb_nkf_guess, 1);
+ rb_define_alias(rb_singleton_class(mNKF), "guess", "guess");
+
/* Full version string of nkf */
- rb_define_const(mKconv, "VERSION", rb_str_new2(RUBY_NKF_VERSION));
+ rb_define_const(mNKF, "VERSION", rb_str_new2(RUBY_NKF_VERSION));
/* Version of nkf */
- rb_define_const(mKconv, "NKF_VERSION", rb_str_new2(NKF_VERSION));
+ rb_define_const(mNKF, "NKF_VERSION", rb_str_new2(NKF_VERSION));
/* Release date of nkf */
- rb_define_const(mKconv, "NKF_RELEASE_DATE", rb_str_new2(NKF_RELEASE_DATE));
+ rb_define_const(mNKF, "NKF_RELEASE_DATE", rb_str_new2(NKF_RELEASE_DATE));
}