aboutsummaryrefslogtreecommitdiffstats
path: root/prism
diff options
context:
space:
mode:
author Kevin Newton <kddnewton@gmail.com> 2023-11-29 21:45:25 -0500
committer Kevin Newton <kddnewton@gmail.com> 2023-11-30 21:37:56 -0500
commit 32249c2cf141158b02f51a41e0dfb36b78c424f4 (patch)
tree c12bdcd02bfeea4de3586712c27959c661f86318 /prism
parent a9162a44c59d85a56930e78cf1801558984db4a7 (diff)
download ruby-32249c2cf141158b02f51a41e0dfb36b78c424f4.tar.gz
[ruby/prism] GB18030 encoding
https://github.com/ruby/prism/commit/ca3ab7ec89
Diffstat (limited to 'prism')
-rw-r--r-- prism/enc/pm_big5.c 45
-rw-r--r-- prism/enc/pm_encoding.h 1
-rw-r--r-- prism/prism.c 1
3 files changed, 47 insertions, 0 deletions
diff --git a/prism/enc/pm_big5.c b/prism/enc/pm_big5.c
index 948cfc4b11..e39ae63629 100644
--- a/prism/enc/pm_big5.c
+++ b/prism/enc/pm_big5.c
@@ -167,3 +167,48 @@ pm_encoding_t pm_encoding_stateless_iso_2022_jp_kddi = {
.isupper_char = pm_encoding_emacs_mule_isupper_char,
.multibyte = true
};
+/**
+ * Returns the width in bytes (1, 2, or 4) of the GB18030 character that
+ * starts at b, or 0 when the bytes match none of the three patterns tested
+ * below.
+ *
+ * n is used as the bound on how many bytes may be inspected: b[1] is only
+ * read when n > 1, and b[2] / b[3] only when n > 3. A multi-byte pattern that
+ * would need more bytes than that therefore yields 0 rather than its width.
+ *
+ * NOTE(review): the 1-byte test reads b[0] without consulting n, so this
+ * presumably relies on callers passing n >= 1 -- confirm against callers.
+ */
+static size_t
+pm_encoding_gb18030_char_width(const uint8_t *b, ptrdiff_t n) {
+ // 1-byte characters: any byte below 0x80.
+ if (*b < 0x80) {
+ return 1;
+ }
+
+ // 2-byte characters: first byte 0x81-0xFE, second byte 0x40-0xFE except 0x7F.
+ if ((n > 1) && (b[0] >= 0x81 && b[0] <= 0xFE) && (b[1] >= 0x40 && b[1] <= 0xFE && b[1] != 0x7F)) {
+ return 2;
+ }
+
+ // 4-byte characters: bytes 1 and 3 in 0x81-0xFE, bytes 2 and 4 in 0x30-0x39.
+ if ((n > 3) && ((b[0] >= 0x81 && b[0] <= 0xFE) && (b[1] >= 0x30 && b[1] <= 0x39) && (b[2] >= 0x81 && b[2] <= 0xFE) && (b[3] >= 0x30 && b[3] <= 0x39))) {
+ return 4;
+ }
+
+ return 0; // no pattern matched within the n bytes allowed
+}
+/**
+ * The alpha_char callback for GB18030. When
+ * pm_encoding_gb18030_char_width() measures the character at b as exactly 1
+ * byte, returns whatever pm_encoding_ascii_alpha_char(b, n) returns. For any
+ * other width (0, 2, or 4) it returns 0 without calling the ASCII helper.
+ */
+static size_t
+pm_encoding_gb18030_alpha_char(const uint8_t *b, ptrdiff_t n) {
+ return (pm_encoding_gb18030_char_width(b, n) == 1) ? pm_encoding_ascii_alpha_char(b, n) : 0;
+}
+/**
+ * The alnum_char callback for GB18030. When
+ * pm_encoding_gb18030_char_width() measures the character at b as exactly 1
+ * byte, returns whatever pm_encoding_ascii_alnum_char(b, n) returns. For any
+ * other width (0, 2, or 4) it returns 0 without calling the ASCII helper.
+ */
+static size_t
+pm_encoding_gb18030_alnum_char(const uint8_t *b, ptrdiff_t n) {
+ return (pm_encoding_gb18030_char_width(b, n) == 1) ? pm_encoding_ascii_alnum_char(b, n) : 0;
+}
+/**
+ * The isupper_char callback for GB18030. Returns true only when
+ * pm_encoding_gb18030_char_width() measures the character at b as exactly 1
+ * byte and pm_encoding_ascii_isupper_char(b, n) also returns true. The &&
+ * short-circuits, so the ASCII helper is not called for any other width.
+ */
+static bool
+pm_encoding_gb18030_isupper_char(const uint8_t *b, ptrdiff_t n) {
+ return (pm_encoding_gb18030_char_width(b, n) == 1) && pm_encoding_ascii_isupper_char(b, n);
+}
+
+/**
+ * GB18030 encoding.
+ *
+ * Registers the name "GB18030" together with the four
+ * pm_encoding_gb18030_* callbacks (char_width, alnum_char, alpha_char,
+ * isupper_char) and sets the multibyte flag to true.
+ */
+pm_encoding_t pm_encoding_gb18030 = {
+ .name = "GB18030",
+ .char_width = pm_encoding_gb18030_char_width,
+ .alnum_char = pm_encoding_gb18030_alnum_char,
+ .alpha_char = pm_encoding_gb18030_alpha_char,
+ .isupper_char = pm_encoding_gb18030_isupper_char,
+ .multibyte = true
+};
diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h
index e14d4f6f2c..49dace45c7 100644
--- a/prism/enc/pm_encoding.h
+++ b/prism/enc/pm_encoding.h
@@ -173,6 +173,7 @@ extern pm_encoding_t pm_encoding_euc_jp_ms;
extern pm_encoding_t pm_encoding_euc_jis_2004;
extern pm_encoding_t pm_encoding_euc_kr;
extern pm_encoding_t pm_encoding_gb12345;
+extern pm_encoding_t pm_encoding_gb18030;
extern pm_encoding_t pm_encoding_gb1988;
extern pm_encoding_t pm_encoding_gb2312;
extern pm_encoding_t pm_encoding_gbk;
diff --git a/prism/prism.c b/prism/prism.c
index e32b444619..7ede34bea0 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -6262,6 +6262,7 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
case 'G': case 'g':
ENCODING1("GBK", pm_encoding_gbk);
ENCODING1("GB12345", pm_encoding_gb12345);
+ ENCODING1("GB18030", pm_encoding_gb18030);
ENCODING1("GB1988", pm_encoding_gb1988);
ENCODING1("GB2312", pm_encoding_gb2312);
break;