[ruby/prism] Introduce char_is_identifier_utf8

https://github.com/ruby/prism/commit/5f43e57b0f
author: Haldun Bayhantopcu <haldun@github.com> 2023-11-27 22:57:46 +0100
committer: git <svn-admin@ruby-lang.org> 2023-11-28 02:53:31 +0000
commit: 32b5f5be7cd0140c5f919d81d6ebf826efd03bb8 (patch)
tree: bea1364febf92f9599555ff09c11c3d763f25866 /prism
parent: 031e81c8f388abb856d2b63ead5d3603e4e3dfe6 (diff)
download: ruby-32b5f5be7cd0140c5f919d81d6ebf826efd03bb8.tar.gz
1 files changed, 24 insertions, 7 deletions
diff --git a/prism/prism.c b/prism/prism.c
index 96ed3989e2..1751857e1e 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -5908,6 +5908,19 @@ char_is_identifier_start(pm_parser_t *parser, const uint8_t *b) {
 }
 
 /**
+ * Like char_is_identifier, but for when the encoding has not been changed: returns
+ * the byte width to consume (0 if *b is not an identifier byte). Reads *b, so b < end.
+ */
+static inline size_t
+char_is_identifier_utf8(const uint8_t *b, const uint8_t *end) {
+    if (*b < 0x80) { // byte below 0x80: decided by '_' check plus table lookup
+        return (*b == '_') || (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0); // 1 or 0
+    } else { // byte >= 0x80
+        return (size_t) (pm_encoding_utf_8_alnum_char(b, end - b) || 1u); // NOTE(review): `x || 1u` is always 1
+    }
+}
+
+/**
  * Like the above, this function is also used extremely frequently to lex all of
  * the identifiers in a source file once the first character has been found. So
  * it's important that it be as fast as possible.
@@ -5925,11 +5938,8 @@ char_is_identifier(pm_parser_t *parser, const uint8_t *b) {
         } else {
             return 0;
         }
-    } else if (*b < 0x80) {
-        return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0) || (*b == '_');
-    } else {
-        return (size_t) (pm_encoding_utf_8_alnum_char(b, parser->end - b) || 1u);
     }
+    return char_is_identifier_utf8(b, parser->end);
 }
 
 // Here we're defining a perfect hash for the characters that are allowed in
@@ -7003,9 +7013,16 @@ lex_identifier(pm_parser_t *parser, bool previous_command_start) {
     const uint8_t *end = parser->end;
     const uint8_t *current_start = parser->current.start;
     const uint8_t *current_end = parser->current.end;
+    bool encoding_changed = parser->encoding_changed;
 
-    while (current_end < end && (width = char_is_identifier(parser, current_end)) > 0) {
-        current_end += width;
+    if (encoding_changed) {
+        while (current_end < end && (width = char_is_identifier(parser, current_end)) > 0) {
+            current_end += width;
+        }
+    } else {
+        while (current_end < end && (width = char_is_identifier_utf8(current_end, end)) > 0) {
+            current_end += width;
+        }
     }
     parser->current.end = current_end;
 
@@ -7123,7 +7140,7 @@ lex_identifier(pm_parser_t *parser, bool previous_command_start) {
         }
     }
 
-    if (parser->encoding_changed) {
+    if (encoding_changed) {
         return parser->encoding.isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;
     }
     return pm_encoding_utf_8_isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;
author	Haldun Bayhantopcu <haldun@github.com>	2023-11-27 22:57:46 +0100
committer	git <svn-admin@ruby-lang.org>	2023-11-28 02:53:31 +0000
commit	32b5f5be7cd0140c5f919d81d6ebf826efd03bb8 (patch)
tree	bea1364febf92f9599555ff09c11c3d763f25866 /prism
parent	031e81c8f388abb856d2b63ead5d3603e4e3dfe6 (diff)
download	ruby-32b5f5be7cd0140c5f919d81d6ebf826efd03bb8.tar.gz