diff options
-rw-r--r-- | lib/prism/prism.gemspec | 4 | ||||
-rw-r--r-- | prism/enc/pm_encoding.h | 3 | ||||
-rw-r--r-- | prism/enc/pm_mac_japanese.c | 57 | ||||
-rw-r--r-- | prism/enc/pm_shift_jis.c | 52 | ||||
-rw-r--r-- | prism/enc/pm_windows_31j.c | 57 | ||||
-rw-r--r-- | prism/prism.c | 3 | ||||
-rw-r--r-- | test/prism/encoding_test.rb | 141 |
7 files changed, 130 insertions, 187 deletions
diff --git a/lib/prism/prism.gemspec b/lib/prism/prism.gemspec index 381ecfea5b..cd90b025b2 100644 --- a/lib/prism/prism.gemspec +++ b/lib/prism/prism.gemspec @@ -86,16 +86,14 @@ Gem::Specification.new do |spec| "lib/prism/visitor.rb", "src/diagnostic.c", "src/enc/pm_big5.c", - "src/enc/pm_cp51932.c", "src/enc/pm_cp949.c", "src/enc/pm_cp950.c", + "src/enc/pm_cp51932.c", "src/enc/pm_euc_jp.c", "src/enc/pm_gbk.c", - "src/enc/pm_mac_japanese.c", "src/enc/pm_shift_jis.c", "src/enc/pm_tables.c", "src/enc/pm_unicode.c", - "src/enc/pm_windows_31j.c", "src/node.c", "src/pack.c", "src/prettyprint.c", diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h index 1985f00f26..7dfc8cd982 100644 --- a/prism/enc/pm_encoding.h +++ b/prism/enc/pm_encoding.h @@ -213,6 +213,9 @@ extern pm_encoding_t pm_encoding_mac_thai; extern pm_encoding_t pm_encoding_mac_turkish; extern pm_encoding_t pm_encoding_mac_ukraine; extern pm_encoding_t pm_encoding_shift_jis; +extern pm_encoding_t pm_encoding_sjis_docomo; +extern pm_encoding_t pm_encoding_sjis_kddi; +extern pm_encoding_t pm_encoding_sjis_softbank; extern pm_encoding_t pm_encoding_tis_620; extern pm_encoding_t pm_encoding_utf_8; extern pm_encoding_t pm_encoding_utf8_mac; diff --git a/prism/enc/pm_mac_japanese.c b/prism/enc/pm_mac_japanese.c deleted file mode 100644 index a5185f0e55..0000000000 --- a/prism/enc/pm_mac_japanese.c +++ /dev/null @@ -1,57 +0,0 @@ -#include "prism/enc/pm_encoding.h" - -static size_t -pm_encoding_mac_japanese_char_width(const uint8_t *b, ptrdiff_t n) { - // These are the single byte characters. - if (*b < 0x80 || (*b >= 0xA1 && *b <= 0xDF)) { - return 1; - } - - // These are the double byte characters. - if ( - (n > 1) && - ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && - (b[1] >= 0x40 && b[1] <= 0xFC) - ) { - return 2; - } - - return 0; -} - -static size_t -pm_encoding_mac_japanese_alpha_char(const uint8_t *b, ptrdiff_t n) { - if (pm_encoding_mac_japanese_char_width(b, n) == 1) { - return pm_encoding_ascii_alpha_char(b, n); - } else { - return 0; - } -} - -static size_t -pm_encoding_mac_japanese_alnum_char(const uint8_t *b, ptrdiff_t n) { - if (pm_encoding_mac_japanese_char_width(b, n) == 1) { - return pm_encoding_ascii_alnum_char(b, n); - } else { - return 0; - } -} - -static bool -pm_encoding_mac_japanese_isupper_char(const uint8_t *b, ptrdiff_t n) { - if (pm_encoding_mac_japanese_char_width(b, n) == 1) { - return pm_encoding_ascii_isupper_char(b, n); - } else { - return 0; - } -} - -/** MacJapanese encoding */ -pm_encoding_t pm_encoding_mac_japanese = { - .name = "MacJapanese", - .char_width = pm_encoding_mac_japanese_char_width, - .alnum_char = pm_encoding_mac_japanese_alnum_char, - .alpha_char = pm_encoding_mac_japanese_alpha_char, - .isupper_char = pm_encoding_mac_japanese_isupper_char, - .multibyte = true -}; diff --git a/prism/enc/pm_shift_jis.c b/prism/enc/pm_shift_jis.c index f92956e08b..7833c6653b 100644 --- a/prism/enc/pm_shift_jis.c +++ b/prism/enc/pm_shift_jis.c @@ -48,7 +48,57 @@ pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) { /** Shift_JIS encoding */ pm_encoding_t pm_encoding_shift_jis = { - .name = "shift_jis", + .name = "Shift_JIS", + .char_width = pm_encoding_shift_jis_char_width, + .alnum_char = pm_encoding_shift_jis_alnum_char, + .alpha_char = pm_encoding_shift_jis_alpha_char, + .isupper_char = pm_encoding_shift_jis_isupper_char, + .multibyte = true +}; + +/** SJIS-DoCoMo encoding */ +pm_encoding_t pm_encoding_sjis_docomo = { + .name = "SJIS-DoCoMo", + .char_width = pm_encoding_shift_jis_char_width, + .alnum_char = pm_encoding_shift_jis_alnum_char, + .alpha_char = pm_encoding_shift_jis_alpha_char, + .isupper_char = pm_encoding_shift_jis_isupper_char, + .multibyte = true +}; + +/** SJIS-KDDI encoding */ +pm_encoding_t pm_encoding_sjis_kddi = { + .name = "SJIS-KDDI", + .char_width = pm_encoding_shift_jis_char_width, + .alnum_char = pm_encoding_shift_jis_alnum_char, + .alpha_char = pm_encoding_shift_jis_alpha_char, + .isupper_char = pm_encoding_shift_jis_isupper_char, + .multibyte = true +}; + +/** SJIS-SoftBank encoding */ +pm_encoding_t pm_encoding_sjis_softbank = { + .name = "SJIS-SoftBank", + .char_width = pm_encoding_shift_jis_char_width, + .alnum_char = pm_encoding_shift_jis_alnum_char, + .alpha_char = pm_encoding_shift_jis_alpha_char, + .isupper_char = pm_encoding_shift_jis_isupper_char, + .multibyte = true +}; + +/** MacJapanese encoding */ +pm_encoding_t pm_encoding_mac_japanese = { + .name = "MacJapanese", + .char_width = pm_encoding_shift_jis_char_width, + .alnum_char = pm_encoding_shift_jis_alnum_char, + .alpha_char = pm_encoding_shift_jis_alpha_char, + .isupper_char = pm_encoding_shift_jis_isupper_char, + .multibyte = true +}; + +/** Windows-31J */ +pm_encoding_t pm_encoding_windows_31j = { + .name = "Windows-31J", .char_width = pm_encoding_shift_jis_char_width, .alnum_char = pm_encoding_shift_jis_alnum_char, .alpha_char = pm_encoding_shift_jis_alpha_char, diff --git a/prism/enc/pm_windows_31j.c b/prism/enc/pm_windows_31j.c deleted file mode 100644 index 848a9efd36..0000000000 --- a/prism/enc/pm_windows_31j.c +++ /dev/null @@ -1,57 +0,0 @@ -#include "prism/enc/pm_encoding.h" - -static size_t -pm_encoding_windows_31j_char_width(const uint8_t *b, ptrdiff_t n) { - // These are the single byte characters. - if (*b < 0x80 || (*b >= 0xA1 && *b <= 0xDF)) { - return 1; - } - - // These are the double byte characters. - if ( - (n > 1) && - ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && - (b[1] >= 0x40 && b[1] <= 0xFC) - ) { - return 2; - } - - return 0; -} - -static size_t -pm_encoding_windows_31j_alpha_char(const uint8_t *b, ptrdiff_t n) { - if (pm_encoding_windows_31j_char_width(b, n) == 1) { - return pm_encoding_ascii_alpha_char(b, n); - } else { - return 0; - } -} - -static size_t -pm_encoding_windows_31j_alnum_char(const uint8_t *b, ptrdiff_t n) { - if (pm_encoding_windows_31j_char_width(b, n) == 1) { - return pm_encoding_ascii_alnum_char(b, n); - } else { - return 0; - } -} - -static bool -pm_encoding_windows_31j_isupper_char(const uint8_t *b, ptrdiff_t n) { - if (pm_encoding_windows_31j_char_width(b, n) == 1) { - return pm_encoding_ascii_isupper_char(b, n); - } else { - return false; - } -} - -/** Windows-31J */ -pm_encoding_t pm_encoding_windows_31j = { - .name = "windows-31j", - .char_width = pm_encoding_windows_31j_char_width, - .alnum_char = pm_encoding_windows_31j_alnum_char, - .alpha_char = pm_encoding_windows_31j_alpha_char, - .isupper_char = pm_encoding_windows_31j_isupper_char, - .multibyte = true -}; diff --git a/prism/prism.c b/prism/prism.c index a0c0e728b6..0cabae6232 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -6317,6 +6317,9 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star case 'S': case 's': ENCODING1("Shift_JIS", pm_encoding_shift_jis); ENCODING1("SJIS", pm_encoding_windows_31j); + ENCODING1("SJIS-DoCoMo", pm_encoding_sjis_docomo); + ENCODING1("SJIS-KDDI", pm_encoding_sjis_kddi); + ENCODING1("SJIS-SoftBank", pm_encoding_sjis_softbank); break; case 'T': case 't': ENCODING1("TIS-620", pm_encoding_tis_620); diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb index 13c622e40a..8ab6f323e5 100644 --- a/test/prism/encoding_test.rb +++ b/test/prism/encoding_test.rb @@ -7,75 +7,78 @@ require_relative "test_helper" module Prism class EncodingTest < TestCase encodings = { - Encoding::ASCII => 0x00...0x100, - Encoding::ASCII_8BIT => 0x00...0x100, - Encoding::CP850 => 0x00...0x100, - Encoding::CP852 => 0x00...0x100, - Encoding::CP855 => 0x00...0x100, - Encoding::GB1988 => 0x00...0x100, - Encoding::IBM437 => 0x00...0x100, - Encoding::IBM720 => 0x00...0x100, - Encoding::IBM737 => 0x00...0x100, - Encoding::IBM775 => 0x00...0x100, - Encoding::IBM852 => 0x00...0x100, - Encoding::IBM855 => 0x00...0x100, - Encoding::IBM857 => 0x00...0x100, - Encoding::IBM860 => 0x00...0x100, - Encoding::IBM861 => 0x00...0x100, - Encoding::IBM862 => 0x00...0x100, - Encoding::IBM863 => 0x00...0x100, - Encoding::IBM864 => 0x00...0x100, - Encoding::IBM865 => 0x00...0x100, - Encoding::IBM866 => 0x00...0x100, - Encoding::IBM869 => 0x00...0x100, - Encoding::ISO_8859_1 => 0x00...0x100, - Encoding::ISO_8859_2 => 0x00...0x100, - Encoding::ISO_8859_3 => 0x00...0x100, - Encoding::ISO_8859_4 => 0x00...0x100, - Encoding::ISO_8859_5 => 0x00...0x100, - Encoding::ISO_8859_6 => 0x00...0x100, - Encoding::ISO_8859_7 => 0x00...0x100, - Encoding::ISO_8859_8 => 0x00...0x100, - Encoding::ISO_8859_9 => 0x00...0x100, - Encoding::ISO_8859_10 => 0x00...0x100, - Encoding::ISO_8859_11 => 0x00...0x100, - Encoding::ISO_8859_13 => 0x00...0x100, - Encoding::ISO_8859_14 => 0x00...0x100, - Encoding::ISO_8859_15 => 0x00...0x100, - Encoding::ISO_8859_16 => 0x00...0x100, - Encoding::KOI8_R => 0x00...0x100, - Encoding::KOI8_U => 0x00...0x100, - Encoding::MACCENTEURO => 0x00...0x100, - Encoding::MACCROATIAN => 0x00...0x100, - Encoding::MACCYRILLIC => 0x00...0x100, - Encoding::MACGREEK => 0x00...0x100, - Encoding::MACICELAND => 0x00...0x100, - Encoding::MACROMAN => 0x00...0x100, - Encoding::MACROMANIA => 0x00...0x100, - Encoding::MACTHAI => 0x00...0x100, - Encoding::MACTURKISH => 0x00...0x100, - Encoding::MACUKRAINE => 0x00...0x100, - Encoding::TIS_620 => 0x00...0x100, - Encoding::Windows_1250 => 0x00...0x100, - Encoding::Windows_1251 => 0x00...0x100, - Encoding::Windows_1252 => 0x00...0x100, - Encoding::Windows_1253 => 0x00...0x100, - Encoding::Windows_1254 => 0x00...0x100, - Encoding::Windows_1255 => 0x00...0x100, - Encoding::Windows_1256 => 0x00...0x100, - Encoding::Windows_1257 => 0x00...0x100, - Encoding::Windows_1258 => 0x00...0x100, - Encoding::Windows_874 => 0x00...0x100, - Encoding::Big5 => 0x00...0x10000, - Encoding::Big5_HKSCS => 0x00...0x10000, - Encoding::Big5_UAO => 0x00...0x10000, - Encoding::CP949 => 0x00...0x10000, - Encoding::CP950 => 0x00...0x10000, - Encoding::CP51932 => 0x00...0x10000, - Encoding::GBK => 0x00...0x10000, - Encoding::MACJAPANESE => 0x00...0x10000, - Encoding::Shift_JIS => 0x00...0x10000, - Encoding::Windows_31J => 0x00...0x10000 + Encoding::ASCII => 0x00...0x100, + Encoding::ASCII_8BIT => 0x00...0x100, + Encoding::CP850 => 0x00...0x100, + Encoding::CP852 => 0x00...0x100, + Encoding::CP855 => 0x00...0x100, + Encoding::GB1988 => 0x00...0x100, + Encoding::IBM437 => 0x00...0x100, + Encoding::IBM720 => 0x00...0x100, + Encoding::IBM737 => 0x00...0x100, + Encoding::IBM775 => 0x00...0x100, + Encoding::IBM852 => 0x00...0x100, + Encoding::IBM855 => 0x00...0x100, + Encoding::IBM857 => 0x00...0x100, + Encoding::IBM860 => 0x00...0x100, + Encoding::IBM861 => 0x00...0x100, + Encoding::IBM862 => 0x00...0x100, + Encoding::IBM863 => 0x00...0x100, + Encoding::IBM864 => 0x00...0x100, + Encoding::IBM865 => 0x00...0x100, + Encoding::IBM866 => 0x00...0x100, + Encoding::IBM869 => 0x00...0x100, + Encoding::ISO_8859_1 => 0x00...0x100, + Encoding::ISO_8859_2 => 0x00...0x100, + Encoding::ISO_8859_3 => 0x00...0x100, + Encoding::ISO_8859_4 => 0x00...0x100, + Encoding::ISO_8859_5 => 0x00...0x100, + Encoding::ISO_8859_6 => 0x00...0x100, + Encoding::ISO_8859_7 => 0x00...0x100, + Encoding::ISO_8859_8 => 0x00...0x100, + Encoding::ISO_8859_9 => 0x00...0x100, + Encoding::ISO_8859_10 => 0x00...0x100, + Encoding::ISO_8859_11 => 0x00...0x100, + Encoding::ISO_8859_13 => 0x00...0x100, + Encoding::ISO_8859_14 => 0x00...0x100, + Encoding::ISO_8859_15 => 0x00...0x100, + Encoding::ISO_8859_16 => 0x00...0x100, + Encoding::KOI8_R => 0x00...0x100, + Encoding::KOI8_U => 0x00...0x100, + Encoding::MACCENTEURO => 0x00...0x100, + Encoding::MACCROATIAN => 0x00...0x100, + Encoding::MACCYRILLIC => 0x00...0x100, + Encoding::MACGREEK => 0x00...0x100, + Encoding::MACICELAND => 0x00...0x100, + Encoding::MACROMAN => 0x00...0x100, + Encoding::MACROMANIA => 0x00...0x100, + Encoding::MACTHAI => 0x00...0x100, + Encoding::MACTURKISH => 0x00...0x100, + Encoding::MACUKRAINE => 0x00...0x100, + Encoding::TIS_620 => 0x00...0x100, + Encoding::Windows_1250 => 0x00...0x100, + Encoding::Windows_1251 => 0x00...0x100, + Encoding::Windows_1252 => 0x00...0x100, + Encoding::Windows_1253 => 0x00...0x100, + Encoding::Windows_1254 => 0x00...0x100, + Encoding::Windows_1255 => 0x00...0x100, + Encoding::Windows_1256 => 0x00...0x100, + Encoding::Windows_1257 => 0x00...0x100, + Encoding::Windows_1258 => 0x00...0x100, + Encoding::Windows_874 => 0x00...0x100, + Encoding::Big5 => 0x00...0x10000, + Encoding::Big5_HKSCS => 0x00...0x10000, + Encoding::Big5_UAO => 0x00...0x10000, + Encoding::CP949 => 0x00...0x10000, + Encoding::CP950 => 0x00...0x10000, + Encoding::CP51932 => 0x00...0x10000, + Encoding::GBK => 0x00...0x10000, + Encoding::MACJAPANESE => 0x00...0x10000, + Encoding::Shift_JIS => 0x00...0x10000, + Encoding::SJIS_DoCoMo => 0x00...0x10000, + Encoding::SJIS_KDDI => 0x00...0x10000, + Encoding::SJIS_SoftBank => 0x00...0x10000, + Encoding::Windows_31J => 0x00...0x10000 } # By default we don't test every codepoint in these encodings because they |