diff options
author | Haldun Bayhantopcu <haldun@github.com> | 2023-11-27 22:57:46 +0100 |
---|---|---|
committer | git <svn-admin@ruby-lang.org> | 2023-11-28 02:53:31 +0000 |
commit | 32b5f5be7cd0140c5f919d81d6ebf826efd03bb8 (patch) | |
tree | bea1364febf92f9599555ff09c11c3d763f25866 /prism | |
parent | 031e81c8f388abb856d2b63ead5d3603e4e3dfe6 (diff) | |
download | ruby-32b5f5be7cd0140c5f919d81d6ebf826efd03bb8.tar.gz |
[ruby/prism] Introduce char_is_identifier_utf8
https://github.com/ruby/prism/commit/5f43e57b0f
Diffstat (limited to 'prism')
-rw-r--r-- | prism/prism.c | 31 |
1 file changed, 24 insertions, 7 deletions
```diff
diff --git a/prism/prism.c b/prism/prism.c
index 96ed3989e2..1751857e1e 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -5908,6 +5908,19 @@ char_is_identifier_start(pm_parser_t *parser, const uint8_t *b) {
 }
 
 /**
+ * Similar to char_is_identifier but this function assumes that the encoding
+ * has not been changed.
+ */
+static inline size_t
+char_is_identifier_utf8(const uint8_t *b, const uint8_t *end) {
+    if (*b < 0x80) {
+        return (*b == '_') || (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0);
+    } else {
+        return (size_t) (pm_encoding_utf_8_alnum_char(b, end - b) || 1u);
+    }
+}
+
+/**
  * Like the above, this function is also used extremely frequently to lex all of
  * the identifiers in a source file once the first character has been found. So
  * it's important that it be as fast as possible.
@@ -5925,11 +5938,8 @@ char_is_identifier(pm_parser_t *parser, const uint8_t *b) {
         } else {
             return 0;
         }
-    } else if (*b < 0x80) {
-        return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0) || (*b == '_');
-    } else {
-        return (size_t) (pm_encoding_utf_8_alnum_char(b, parser->end - b) || 1u);
     }
+    return char_is_identifier_utf8(b, parser->end);
 }
 
 // Here we're defining a perfect hash for the characters that are allowed in
@@ -7003,9 +7013,16 @@ lex_identifier(pm_parser_t *parser, bool previous_command_start) {
     const uint8_t *end = parser->end;
     const uint8_t *current_start = parser->current.start;
     const uint8_t *current_end = parser->current.end;
+    bool encoding_changed = parser->encoding_changed;
 
-    while (current_end < end && (width = char_is_identifier(parser, current_end)) > 0) {
-        current_end += width;
+    if (encoding_changed) {
+        while (current_end < end && (width = char_is_identifier(parser, current_end)) > 0) {
+            current_end += width;
+        }
+    } else {
+        while (current_end < end && (width = char_is_identifier_utf8(current_end, end)) > 0) {
+            current_end += width;
+        }
     }
     parser->current.end = current_end;
 
@@ -7123,7 +7140,7 @@ lex_identifier(pm_parser_t *parser, bool previous_command_start) {
         }
     }
 
-    if (parser->encoding_changed) {
+    if (encoding_changed) {
         return parser->encoding.isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;
     }
     return pm_encoding_utf_8_isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;
```