* ext/nkf/nkf-utf8/nkf.c: imported nkf 2.0.8 rev.110.

* Fix: check_bom cuts \xfe\xff\xXX\xXX of UTF-32. * Add support --ic=UTF-32. * Fix: can't guess UTF-16 and UTF-32. * Fix: can't decode beyond BMP of UTF-16LE. * ext/nkf/nkf.c (guess): Support UTF-32. * ext/nkf/lib/kconv.rb (kconv): Support UTF-32. * ext/nkf/lib/kconv.rb (Kconv.toutf32): new method. * ext/nkf/lib/kconv.rb (String#toutf32): new method. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@10938 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
author: naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2006-09-15 11:26:07 +0000
committer: naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2006-09-15 11:26:07 +0000
commit: 5300eecfb324f3a29d891b3e229baef631dc6aeb (patch)
tree: d5082f25c6b239bcd018156e6fa54dc5f2c798ab /ext/nkf
parent: 976b4e5f8bedcd8285578c6da5117b6883ef1c02 (diff)
download: ruby-5300eecfb324f3a29d891b3e229baef631dc6aeb.tar.gz
3 files changed, 117 insertions, 31 deletions
diff --git a/ext/nkf/lib/kconv.rb b/ext/nkf/lib/kconv.rb
index 4ffe8d984e..91553228fe 100644
--- a/ext/nkf/lib/kconv.rb
+++ b/ext/nkf/lib/kconv.rb
@@ -105,6 +105,8 @@ module Kconv
       opt << 'W'
     when ::NKF::UTF16
       opt << 'W16'
+    when ::NKF::UTF32
+      opt << 'W32'
     end
 
     case out_code
@@ -118,6 +120,8 @@ module Kconv
       opt << 'w'
     when ::NKF::UTF16
       opt << 'w16'
+    when ::NKF::UTF32
+      opt << 'w32'
     when ::NKF::NOCONV
       return str
     end
@@ -202,6 +206,20 @@ module Kconv
   end
   module_function :toutf16
 
+  # call-seq:
+  #    Kconv.toutf32(str)   -> string
+  #
+  # Convert <code>str</code> to UTF-32
+  #
+  # *Note*
+  # This method decodes MIME encoded strings and
+  # converts halfwidth katakana to fullwidth katakana.
+  # If you don't want it, use NKF.nkf('-w32xm0', str).
+  def toutf32(str)
+    ::NKF::nkf('-w32m', str)
+  end
+  module_function :toutf32
+
   #
   # guess
   #
@@ -337,6 +355,17 @@ class String
   # If you don't want it, use NKF.nkf('-w16xm0', str).
   def toutf16; Kconv.toutf16(self) end
 
+  # call-seq:
+  #    String#toutf32   -> string
+  #
+  # Convert <code>self</code> to UTF-32
+  #
+  # *Note*
+  # This method decodes MIME encoded strings and
+  # converts halfwidth katakana to fullwidth katakana.
+  # If you don't want it, use NKF.nkf('-w32xm0', str).
+  def toutf32; Kconv.toutf32(self) end
+
   #
   # is Encoding
   #
diff --git a/ext/nkf/nkf-utf8/nkf.c b/ext/nkf/nkf-utf8/nkf.c
index 2f3da8b373..bd2e90c77c 100644
--- a/ext/nkf/nkf-utf8/nkf.c
+++ b/ext/nkf/nkf-utf8/nkf.c
@@ -581,6 +581,8 @@ struct input_code input_code_list[] = {
     {"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
 #ifdef UTF8_INPUT_ENABLE
     {"UTF-8",     0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
+    {"UTF-16",    0, 0, 0, {0, 0, 0},     NULL, w_iconv16, 0},
+    {"UTF-32",    0, 0, 0, {0, 0, 0},     NULL, w_iconv32, 0},
 #endif
     {0}
 };
@@ -1293,6 +1295,15 @@ void options(unsigned char *cp)
 			     strcmp(codeset, "UTF-16LE-BOM") == 0){
 			input_f = UTF16_INPUT;
 			input_endian = ENDIAN_LITTLE;
+		    }else if(strcmp(codeset, "UTF-32") == 0 ||
+			     strcmp(codeset, "UTF-32BE") == 0 ||
+			     strcmp(codeset, "UTF-32BE-BOM") == 0){
+			input_f = UTF32_INPUT;
+			input_endian = ENDIAN_BIG;
+		    }else if(strcmp(codeset, "UTF-32LE") == 0 ||
+			     strcmp(codeset, "UTF-32LE-BOM") == 0){
+			input_f = UTF32_INPUT;
+			input_endian = ENDIAN_LITTLE;
 #endif
 		    }
                     continue;
@@ -1901,12 +1912,7 @@ void options(unsigned char *cp)
     }
 }
 
-#ifdef ANSI_C_PROTOTYPE
 struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
-#else
-struct input_code * find_inputcode_byfunc(iconv_func)
-     nkf_char (*iconv_func)();
-#endif
 {
     if (iconv_func){
         struct input_code *p = input_code_list;
@@ -2227,6 +2233,12 @@ void code_status(nkf_char c)
     struct input_code *result = 0;
     struct input_code *p = input_code_list;
     while (p->name){
+        if (!p->status_func) {
+	    ++p;
+	    continue;
+	}
+        if (!p->status_func)
+	    continue;
         (p->status_func)(p, c);
         if (p->stat > 0){
             action_flag = 0;
@@ -2407,8 +2419,11 @@ void check_bom(FILE *f)
 		    if(!input_f){
 			set_iconv(TRUE, w_iconv32);
 		    }
-		    input_endian = ENDIAN_BIG;
-		    return;
+		    if (iconv == w_iconv32) {
+			input_endian = ENDIAN_BIG;
+			return;
+		    }
+		    (*i_ungetc)(0xFF,f);
 		}else (*i_ungetc)(c2,f);
 		(*i_ungetc)(0xFE,f);
 	    }else if(c2 == 0xFF){
@@ -2416,8 +2431,11 @@ void check_bom(FILE *f)
 		    if(!input_f){
 			set_iconv(TRUE, w_iconv32);
 		    }
-		    input_endian = ENDIAN_2143;
-		    return;
+		    if (iconv == w_iconv32) {
+			input_endian = ENDIAN_2143;
+			return;
+		    }
+		    (*i_ungetc)(0xFF,f);
 		}else (*i_ungetc)(c2,f);
 		(*i_ungetc)(0xFF,f);
 	    }else (*i_ungetc)(c2,f);
@@ -2431,7 +2449,10 @@ void check_bom(FILE *f)
 		if(!input_f){
 		    set_iconv(TRUE, w_iconv);
 		}
-		return;
+		if (iconv == w_iconv) {
+		    return;
+		}
+		(*i_ungetc)(0xBF,f);
 	    }else (*i_ungetc)(c2,f);
 	    (*i_ungetc)(0xBB,f);
 	}else (*i_ungetc)(c2,f);
@@ -2444,16 +2465,22 @@ void check_bom(FILE *f)
 		    if(!input_f){
 			set_iconv(TRUE, w_iconv32);
 		    }
-		    input_endian = ENDIAN_3412;
-		    return;
+		    if (iconv == w_iconv32) {
+			input_endian = ENDIAN_3412;
+			return;
+		    }
+		    (*i_ungetc)(0x00,f);
 		}else (*i_ungetc)(c2,f);
 		(*i_ungetc)(0x00,f);
 	    }else (*i_ungetc)(c2,f);
 	    if(!input_f){
 		set_iconv(TRUE, w_iconv16);
 	    }
-	    input_endian = ENDIAN_BIG;
-	    return;
+	    if (iconv == w_iconv16) {
+		input_endian = ENDIAN_BIG;
+		return;
+	    }
+	    (*i_ungetc)(0xFF,f);
 	}else (*i_ungetc)(c2,f);
 	(*i_ungetc)(0xFE,f);
 	break;
@@ -2464,16 +2491,22 @@ void check_bom(FILE *f)
 		    if(!input_f){
 			set_iconv(TRUE, w_iconv32);
 		    }
-		    input_endian = ENDIAN_LITTLE;
-		    return;
+		    if (iconv == w_iconv32) {
+			input_endian = ENDIAN_LITTLE;
+			return;
+		    }
+		    (*i_ungetc)(0x00,f);
 		}else (*i_ungetc)(c2,f);
 		(*i_ungetc)(0x00,f);
 	    }else (*i_ungetc)(c2,f);
 	    if(!input_f){
 		set_iconv(TRUE, w_iconv16);
 	    }
-	    input_endian = ENDIAN_LITTLE;
-	    return;
+	    if (iconv == w_iconv16) {
+		input_endian = ENDIAN_LITTLE;
+		return;
+	    }
+	    (*i_ungetc)(0xFE,f);
 	}else (*i_ungetc)(c2,f);
 	(*i_ungetc)(0xFF,f);
 	break;
@@ -2557,21 +2590,21 @@ nkf_char kanji_convert(FILE *f)
 				c0 <<= 8;
 				if ((c3 = (*i_getc)(f)) != EOF) {
 				    c0 |= c3;
-				} else c1 = EOF;
-			    } else c1 = EOF;
+				} else c2 = EOF;
+			    } else c2 = EOF;
 			}
-		    }
+		    } else c2 = EOF;
 		} else {
 		    if ((c2 = (*i_getc)(f)) != EOF) {
 			if (0xD8 <= c2 && c2 <= 0xDB) {
 			    if ((c3 = (*i_getc)(f)) != EOF) {
-				c3 <<= 8;
 				if ((c0 = (*i_getc)(f)) != EOF) {
+				    c0 <<= 8;
 				    c0 |= c3;
-				} else c1 = EOF;
-			    } else c1 = EOF;
+				} else c2 = EOF;
+			    } else c2 = EOF;
 			}
-		    } else c1 = EOF;
+		    } else c2 = EOF;
 		}
 		SEND;
             } else if(iconv == w_iconv32){
@@ -2595,7 +2628,7 @@ nkf_char kanji_convert(FILE *f)
 		    }
 		    c2 = 0;
 		}else{
-		    c1 = EOF;
+		    c2 = EOF;
 		}
 		SEND;
             } else
diff --git a/ext/nkf/nkf.c b/ext/nkf/nkf.c
index 8a4bcfce64..e12df16960 100644
--- a/ext/nkf/nkf.c
+++ b/ext/nkf/nkf.c
@@ -306,6 +306,8 @@ rb_nkf_guess1(VALUE obj, VALUE src)
  *       "UTF-8"
  *     when NKF::UTF16
  *       "UTF-16"
+ *     when NKF::UTF32
+ *       "UTF-32"
  *     when NKF::UNKNOWN
  *       "UNKNOWN"
  *     when NKF::BINARY
@@ -345,6 +347,8 @@ rb_nkf_guess2(VALUE obj, VALUE src)
       code = _UTF8;
     } else if (strcmp(input_codename, "UTF-16") == 0) {
       code = _UTF16;
+    } else if (strcmp(input_codename, "UTF-32") == 0) {
+      code = _UTF32;
     } else if (strlen(input_codename) > 0) {
       code = _UNKNOWN;
     }
@@ -382,16 +386,16 @@ rb_nkf_guess2(VALUE obj, VALUE src)
  *
  *  Output is buffered (DEFAULT), Output is unbuffered.
  *
- *  === -j -s -e -w -w16
+ *  === -j -s -e -w -w16 -w32
  *
  *  Output code is ISO-2022-JP (7bit JIS), Shift_JIS, EUC-JP,
- *  UTF-8N, UTF-16BE.
+ *  UTF-8N, UTF-16BE, UTF-32BE.
  *  Without this option and compile option, ISO-2022-JP is assumed.
  *
- *  === -J -S -E -W -W16
+ *  === -J -S -E -W -W16 -W32
  *
  *  Input assumption is JIS 7 bit, Shift_JIS, EUC-JP,
- *  UTF-8, UTF-16LE.
+ *  UTF-8, UTF-16, UTF-32.
  *
  *  ==== -J
  *
@@ -574,6 +578,16 @@ rb_nkf_guess2(VALUE obj, VALUE src)
  *
  *  [UTF-16LE-BOM] UTF-16 Little Endian with BOM
  *
+ *  [UTF-32] same as UTF-32BE
+ *
+ *  [UTF-32BE] UTF-32 Big Endian without BOM
+ *
+ *  [UTF-32BE-BOM] UTF-32 Big Endian with BOM
+ *
+ *  [UTF-32LE] UTF-32 Little Endian without BOM
+ *
+ *  [UTF-32LE-BOM] UTF-32 Little Endian with BOM
+ *
  *  [UTF8-MAC] NKDed UTF-8, a.k.a. UTF8-NFD (input only)
  *
  *  === --fb-{skip, html, xml, perl, java, subchar}
@@ -587,10 +601,20 @@ rb_nkf_guess2(VALUE obj, VALUE src)
  *  nkf adds a specified escape character to specified 2nd byte of Shift_JIS characters.
  *  1st byte of argument is the escape character and following bytes are target characters.
  *
- *  === --disable-cp932ext
+ *  === --no-cp932ext
  *
  *  Handle the characters extended in CP932 as unassigned characters.
  *
+ *  === --no-best-fit-chars
+ *
+ *  When converting from Unicode to an encoded byte sequence,
+ *  don't convert characters which are not round-trip safe.
+ *  When converting from Unicode to Unicode,
+ *  with this and the -x option, nkf can be used as a UTF converter.
+ *  (In other words, without this and the -x option, nkf doesn't preserve some characters.)
+ *
+ *  When nkf converts a string related to a path, you should use this option.
+ *
  *  === --cap-input
  *
  *  Decode hex encoded characters.
author	naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>	2006-09-15 11:26:07 +0000
committer	naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>	2006-09-15 11:26:07 +0000
commit	5300eecfb324f3a29d891b3e229baef631dc6aeb (patch)
tree	d5082f25c6b239bcd018156e6fa54dc5f2c798ab /ext/nkf
parent	976b4e5f8bedcd8285578c6da5117b6883ef1c02 (diff)
download	ruby-5300eecfb324f3a29d891b3e229baef631dc6aeb.tar.gz