diff options
Diffstat (limited to 'prism')
-rw-r--r-- | prism/enc/pm_big5.c | 45 | ||||
-rw-r--r-- | prism/enc/pm_encoding.h | 1 | ||||
-rw-r--r-- | prism/prism.c | 1 |
3 files changed, 47 insertions, 0 deletions
diff --git a/prism/enc/pm_big5.c b/prism/enc/pm_big5.c
index 948cfc4b11..e39ae63629 100644
--- a/prism/enc/pm_big5.c
+++ b/prism/enc/pm_big5.c
@@ -167,3 +167,48 @@ pm_encoding_t pm_encoding_stateless_iso_2022_jp_kddi = {
     .isupper_char = pm_encoding_emacs_mule_isupper_char,
     .multibyte = true
 };
+
+static size_t
+pm_encoding_gb18030_char_width(const uint8_t *b, ptrdiff_t n) {
+    // These are the 1 byte characters: a lead byte below 0x80. The byte is read before n is consulted, so b must point at a readable byte.
+    if (*b < 0x80) {
+        return 1;
+    }
+
+    // These are the 2 byte characters: a lead byte in 0x81-0xFE followed by a trail byte in 0x40-0xFE other than 0x7F.
+    if ((n > 1) && (b[0] >= 0x81 && b[0] <= 0xFE) && (b[1] >= 0x40 && b[1] <= 0xFE && b[1] != 0x7F)) {
+        return 2;
+    }
+
+    // These are the 4 byte characters: bytes 1 and 3 in 0x81-0xFE, bytes 2 and 4 in 0x30-0x39.
+    if ((n > 3) && ((b[0] >= 0x81 && b[0] <= 0xFE) && (b[1] >= 0x30 && b[1] <= 0x39) && (b[2] >= 0x81 && b[2] <= 0xFE) && (b[3] >= 0x30 && b[3] <= 0x39))) {
+        return 4;
+    }
+
+    return 0; // None of the byte patterns above matched within the n available bytes.
+}
+
+static size_t
+pm_encoding_gb18030_alpha_char(const uint8_t *b, ptrdiff_t n) {
+    return (pm_encoding_gb18030_char_width(b, n) == 1) ? pm_encoding_ascii_alpha_char(b, n) : 0;
+}
+
+static size_t
+pm_encoding_gb18030_alnum_char(const uint8_t *b, ptrdiff_t n) {
+    return (pm_encoding_gb18030_char_width(b, n) == 1) ? pm_encoding_ascii_alnum_char(b, n) : 0;
+}
+
+static bool
+pm_encoding_gb18030_isupper_char(const uint8_t *b, ptrdiff_t n) {
+    return (pm_encoding_gb18030_char_width(b, n) == 1) && pm_encoding_ascii_isupper_char(b, n);
+}
+
+/** GB18030 encoding: characters are 1, 2, or 4 bytes wide; alnum/alpha/isupper defer to the ASCII helpers for 1 byte characters and yield 0/false for anything else. */
+pm_encoding_t pm_encoding_gb18030 = {
+    .name = "GB18030",
+    .char_width = pm_encoding_gb18030_char_width,
+    .alnum_char = pm_encoding_gb18030_alnum_char,
+    .alpha_char = pm_encoding_gb18030_alpha_char,
+    .isupper_char = pm_encoding_gb18030_isupper_char,
+    .multibyte = true
+};
diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h
index e14d4f6f2c..49dace45c7 100644
--- a/prism/enc/pm_encoding.h
+++ b/prism/enc/pm_encoding.h
@@ -173,6 +173,7 @@ extern pm_encoding_t pm_encoding_euc_jp_ms;
 extern pm_encoding_t pm_encoding_euc_jis_2004;
 extern pm_encoding_t pm_encoding_euc_kr;
 extern pm_encoding_t pm_encoding_gb12345;
+extern pm_encoding_t pm_encoding_gb18030;
 extern pm_encoding_t pm_encoding_gb1988;
 extern pm_encoding_t pm_encoding_gb2312;
 extern pm_encoding_t pm_encoding_gbk;
diff --git a/prism/prism.c b/prism/prism.c
index e32b444619..7ede34bea0 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -6262,6 +6262,7 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
             case 'G': case 'g':
                 ENCODING1("GBK", pm_encoding_gbk);
                 ENCODING1("GB12345", pm_encoding_gb12345);
+                ENCODING1("GB18030", pm_encoding_gb18030);
                 ENCODING1("GB1988", pm_encoding_gb1988);
                 ENCODING1("GB2312", pm_encoding_gb2312);
                 break; |