[ruby/prism] Validate multibyte characters in strings

Check that multibyte characters are valid using pm_strpbrk. We need to add a couple of code paths to ensure all encodings are covered. Importantly, this doesn't check regular expressions, because apparently you're allowed to have invalid multibyte characters inside regular expression comment groups/extended mode. https://github.com/ruby/prism/commit/2857d3e1b5
author: Kevin Newton <kddnewton@gmail.com> 2024-02-13 17:45:27 -0500
committer: git <svn-admin@ruby-lang.org> 2024-02-14 01:01:37 +0000
commit: 2fa051f627172674b342da26ebe1e671a5e449ec (patch)
tree: 9936bd9f9e6ce67025fa9806736b2186d915b45b /prism/encoding.c
parent: dc5191d6952246be6717a76c193abe1bc55128b7 (diff)
download: ruby-2fa051f627172674b342da26ebe1e671a5e449ec.tar.gz
1 file changed, 14 insertions, 6 deletions
diff --git a/prism/encoding.c b/prism/encoding.c
index 981945caba..1d455c2421 100644
--- a/prism/encoding.c
+++ b/prism/encoding.c
@@ -2253,12 +2253,12 @@ static const uint8_t pm_utf_8_dfa[] = {
 static pm_unicode_codepoint_t
 pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
     assert(n >= 0);
-    size_t maximum = (size_t) n;
 
+    size_t maximum = (n > 4) ? 4 : ((size_t) n);
     uint32_t codepoint;
     uint32_t state = 0;
 
-    for (size_t index = 0; index < 4 && index < maximum; index++) {
+    for (size_t index = 0; index < maximum; index++) {
         uint32_t byte = b[index];
         uint32_t type = pm_utf_8_dfa[byte];
 
@@ -2267,7 +2267,7 @@ pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
             (0xffu >> type) & (byte);
 
         state = pm_utf_8_dfa[256 + (state * 16) + type];
-        if (!state) {
+        if (state == 0) {
             *width = index + 1;
             return (pm_unicode_codepoint_t) codepoint;
         }
@@ -2282,9 +2282,17 @@ pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
  */
 size_t
 pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
-    size_t width;
-    pm_utf_8_codepoint(b, n, &width);
-    return width;
+    assert(n >= 0);
+
+    size_t maximum = (n > 4) ? 4 : ((size_t) n);
+    uint32_t state = 0;
+
+    for (size_t index = 0; index < maximum; index++) {
+        state = pm_utf_8_dfa[256 + (state * 16) + pm_utf_8_dfa[b[index]]];
+        if (state == 0) return index + 1;
+    }
+
+    return 0;
 }
 
 /**
author	Kevin Newton <kddnewton@gmail.com>	2024-02-13 17:45:27 -0500
committer	git <svn-admin@ruby-lang.org>	2024-02-14 01:01:37 +0000
commit	2fa051f627172674b342da26ebe1e671a5e449ec (patch)
tree	9936bd9f9e6ce67025fa9806736b2186d915b45b /prism/encoding.c
parent	dc5191d6952246be6717a76c193abe1bc55128b7 (diff)
download	ruby-2fa051f627172674b342da26ebe1e671a5e449ec.tar.gz