about | summary | refs | log | tree | commit | diff | stats
path: root/prism/encoding.c
diff options
context:
space:
mode:
author: Kevin Newton <kddnewton@gmail.com> 2024-02-13 17:45:27 -0500
committer: git <svn-admin@ruby-lang.org> 2024-02-14 01:01:37 +0000
commit: 2fa051f627172674b342da26ebe1e671a5e449ec (patch)
tree: 9936bd9f9e6ce67025fa9806736b2186d915b45b /prism/encoding.c
parent: dc5191d6952246be6717a76c193abe1bc55128b7 (diff)
download: ruby-2fa051f627172674b342da26ebe1e671a5e449ec.tar.gz
[ruby/prism] Validate multibyte characters in strings
Check that multibyte characters are valid using pm_strpbrk. We need to add a couple of codepaths to ensure all encodings are covered. Importantly this doesn't check regular expressions, because apparently you're allowed to have invalid multibyte characters inside regular expression comment groups/extended mode. https://github.com/ruby/prism/commit/2857d3e1b5
Diffstat (limited to 'prism/encoding.c')
-rw-r--r-- prism/encoding.c 20
1 files changed, 14 insertions, 6 deletions
diff --git a/prism/encoding.c b/prism/encoding.c
index 981945caba..1d455c2421 100644
--- a/prism/encoding.c
+++ b/prism/encoding.c
@@ -2253,12 +2253,12 @@ static const uint8_t pm_utf_8_dfa[] = {
static pm_unicode_codepoint_t
pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
assert(n >= 0);
- size_t maximum = (size_t) n;
+ size_t maximum = (n > 4) ? 4 : ((size_t) n);
uint32_t codepoint;
uint32_t state = 0;
- for (size_t index = 0; index < 4 && index < maximum; index++) {
+ for (size_t index = 0; index < maximum; index++) {
uint32_t byte = b[index];
uint32_t type = pm_utf_8_dfa[byte];
@@ -2267,7 +2267,7 @@ pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
(0xffu >> type) & (byte);
state = pm_utf_8_dfa[256 + (state * 16) + type];
- if (!state) {
+ if (state == 0) {
*width = index + 1;
return (pm_unicode_codepoint_t) codepoint;
}
@@ -2282,9 +2282,17 @@ pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
*/
size_t
pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
- size_t width;
- pm_utf_8_codepoint(b, n, &width);
- return width;
+ assert(n >= 0);
+
+ size_t maximum = (n > 4) ? 4 : ((size_t) n);
+ uint32_t state = 0;
+
+ for (size_t index = 0; index < maximum; index++) {
+ state = pm_utf_8_dfa[256 + (state * 16) + pm_utf_8_dfa[b[index]]];
+ if (state == 0) return index + 1;
+ }
+
+ return 0;
}
/**