diff options
author | Kevin Newton <kddnewton@gmail.com> | 2024-02-13 17:45:27 -0500 |
---|---|---|
committer | git <svn-admin@ruby-lang.org> | 2024-02-14 01:01:37 +0000 |
commit | 2fa051f627172674b342da26ebe1e671a5e449ec (patch) | |
tree | 9936bd9f9e6ce67025fa9806736b2186d915b45b /prism/encoding.c | |
parent | dc5191d6952246be6717a76c193abe1bc55128b7 (diff) | |
download | ruby-2fa051f627172674b342da26ebe1e671a5e449ec.tar.gz |
[ruby/prism] Validate multibyte characters in strings
Check that multibyte characters are valid using pm_strpbrk. We need
to add a couple of codepaths to ensure all encodings are covered.
Importantly, this doesn't check regular expressions, because
apparently you're allowed to have invalid multibyte characters
inside regular expression comment groups or in extended mode.
https://github.com/ruby/prism/commit/2857d3e1b5
Diffstat (limited to 'prism/encoding.c')
-rw-r--r-- | prism/encoding.c | 20 |
1 files changed, 14 insertions, 6 deletions
diff --git a/prism/encoding.c b/prism/encoding.c index 981945caba..1d455c2421 100644 --- a/prism/encoding.c +++ b/prism/encoding.c @@ -2253,12 +2253,12 @@ static const uint8_t pm_utf_8_dfa[] = { static pm_unicode_codepoint_t pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) { assert(n >= 0); - size_t maximum = (size_t) n; + size_t maximum = (n > 4) ? 4 : ((size_t) n); uint32_t codepoint; uint32_t state = 0; - for (size_t index = 0; index < 4 && index < maximum; index++) { + for (size_t index = 0; index < maximum; index++) { uint32_t byte = b[index]; uint32_t type = pm_utf_8_dfa[byte]; @@ -2267,7 +2267,7 @@ pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) { (0xffu >> type) & (byte); state = pm_utf_8_dfa[256 + (state * 16) + type]; - if (!state) { + if (state == 0) { *width = index + 1; return (pm_unicode_codepoint_t) codepoint; } @@ -2282,9 +2282,17 @@ pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) { */ size_t pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) { - size_t width; - pm_utf_8_codepoint(b, n, &width); - return width; + assert(n >= 0); + + size_t maximum = (n > 4) ? 4 : ((size_t) n); + uint32_t state = 0; + + for (size_t index = 0; index < maximum; index++) { + state = pm_utf_8_dfa[256 + (state * 16) + pm_utf_8_dfa[b[index]]]; + if (state == 0) return index + 1; + } + + return 0; } /** |