[ruby/prism] Report onigmo errors for depth

https://github.com/ruby/prism/commit/e0e8bba8be
author: Kevin Newton <kddnewton@gmail.com> 2024-06-05 10:50:18 -0400
committer: Kevin Newton <kddnewton@gmail.com> 2024-06-05 14:40:03 -0400
commit: c3747d5a2e029afb9aabc0b2f122ab701e68f71f (patch)
tree: 6b3172106773f9f468249b5d65d505202360b337 /prism
parent: 41f27346440eab0d80b8c50be8741e3344af9ed5 (diff)
download: ruby-c3747d5a2e029afb9aabc0b2f122ab701e68f71f.tar.gz
5 files changed, 114 insertions, 39 deletions
diff --git a/prism/config.yml b/prism/config.yml
index 1fe2e36e67..0400f60e01 100644
--- a/prism/config.yml
+++ b/prism/config.yml
@@ -206,8 +206,8 @@ errors:
   - PARAMETER_SPLAT_MULTI
   - PARAMETER_STAR
   - PARAMETER_UNEXPECTED_FWD
-  - PARAMETER_WILD_LOOSE_COMMA
   - PARAMETER_UNEXPECTED_NO_KW
+  - PARAMETER_WILD_LOOSE_COMMA
   - PATTERN_CAPTURE_DUPLICATE
   - PATTERN_EXPRESSION_AFTER_BRACKET
   - PATTERN_EXPRESSION_AFTER_COMMA
@@ -236,6 +236,7 @@ errors:
   - REGEXP_INCOMPAT_CHAR_ENCODING
   - REGEXP_INVALID_UNICODE_RANGE
   - REGEXP_NON_ESCAPED_MBC
+  - REGEXP_PARSE_ERROR
   - REGEXP_TERM
   - REGEXP_UNKNOWN_OPTIONS
   - REGEXP_UTF8_CHAR_NON_UTF8_REGEXP
diff --git a/prism/prism.c b/prism/prism.c
index e7eb967e15..aaad62d739 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -20018,7 +20018,7 @@ typedef struct {
  * This callback is called when the regular expression parser encounters a named
  * capture group.
  */
-void
+static void
 parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
     parse_regular_expression_named_capture_data_t *callback_data = (parse_regular_expression_named_capture_data_t *) data;
 
@@ -20084,6 +20084,38 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
 }
 
 /**
+ * This struct is used to pass information between the regular expression parser
+ * and the error callback.
+ */
+typedef struct {
+    pm_parser_t *parser;
+    const pm_string_t *content;
+    const pm_call_node_t *call;
+} parse_regular_expression_error_data_t;
+
+/**
+ * This callback is called when the regular expression parser encounters a
+ * syntax error.
+ */
+static void
+parse_regular_expression_error(const uint8_t *start, const uint8_t *end, const char *message, void *data) {
+    parse_regular_expression_error_data_t *callback_data = (parse_regular_expression_error_data_t *) data;
+
+    pm_parser_t *parser = callback_data->parser;
+    const pm_string_t *content = callback_data->content;
+    const pm_call_node_t *call = callback_data->call;
+
+    pm_location_t location;
+    if (content->type == PM_STRING_SHARED) {
+        location = (pm_location_t) { .start = start, .end = end };
+    } else {
+        location = call->receiver->location;
+    }
+
+    PM_PARSER_ERR_FORMAT(parser, location.start, location.end, PM_ERR_REGEXP_PARSE_ERROR, message);
+}
+
+/**
  * Potentially change a =~ with a regular expression with named captures into a
  * match write node.
  */
@@ -20096,7 +20128,13 @@ parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *
         .names = { 0 }
     };
 
-    pm_regexp_parse(parser, pm_string_source(content), pm_string_length(content), parse_regular_expression_named_capture, &callback_data);
+    parse_regular_expression_error_data_t error_data = {
+        .parser = parser,
+        .content = content,
+        .call = call
+    };
+
+    pm_regexp_parse(parser, pm_string_source(content), pm_string_length(content), parse_regular_expression_named_capture, &callback_data, parse_regular_expression_error, &error_data);
     pm_constant_id_list_free(&callback_data.names);
 
     if (callback_data.match != NULL) {
diff --git a/prism/regexp.c b/prism/regexp.c
index 9d3379d522..10f6943546 100644
--- a/prism/regexp.c
+++ b/prism/regexp.c
@@ -1,5 +1,7 @@
 #include "prism/regexp.h"
 
+#define PM_REGEXP_PARSE_DEPTH_MAX 4096
+
 /**
  * This is the parser that is going to handle parsing regular expressions.
  */
@@ -27,10 +29,25 @@ typedef struct {
 
     /** The data to pass to the name callback. */
     void *name_data;
+
+    /** The callback to call when a parse error is found. */
+    pm_regexp_error_callback_t error_callback;
+
+    /** The data to pass to the error callback. */
+    void *error_data;
 } pm_regexp_parser_t;
 
 /**
- * This appends a new string to the list of named captures.
+ * Append an error to the parser.
+ */
+static inline void
+pm_regexp_parse_error(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, const char *message) {
+    parser->error_callback(start, end, message, parser->error_data);
+}
+
+/**
+ * This appends a new string to the list of named captures. This function
+ * assumes the caller has already checked the validity of the name callback.
  */
 static void
 pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
@@ -246,20 +263,20 @@ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
 
 // Forward declaration because character sets can be nested.
 static bool
-pm_regexp_parse_lbracket(pm_regexp_parser_t *parser);
+pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth);
 
 /**
  * match-char-set : '[' '^'? (match-range | match-char)* ']'
  *                ;
  */
 static bool
-pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
+pm_regexp_parse_character_set(pm_regexp_parser_t *parser, uint16_t depth) {
     pm_regexp_char_accept(parser, '^');
 
     while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') {
         switch (*parser->cursor++) {
             case '[':
-                pm_regexp_parse_lbracket(parser);
+                pm_regexp_parse_lbracket(parser, depth + 1);
                 break;
             case '\\':
                 if (!pm_regexp_char_is_eof(parser)) {
@@ -279,7 +296,12 @@ pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
  * A left bracket can either mean a POSIX class or a character set.
  */
 static bool
-pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
+pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth) {
+    if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
+        pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
+        return false;
+    }
+
     const uint8_t *reset = parser->cursor;
 
     if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') {
@@ -289,13 +311,13 @@ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
         parser->cursor = reset;
     }
 
-    return pm_regexp_parse_character_set(parser);
+    return pm_regexp_parse_character_set(parser, depth);
 }
 
 // Forward declaration here since parsing groups needs to go back up the grammar
 // to parse expressions within them.
 static bool
-pm_regexp_parse_expression(pm_regexp_parser_t *parser);
+pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth);
 
 /**
  * These are the states of the options that are configurable on the regular
@@ -409,7 +431,7 @@ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
  * * (?imxdau-imx:subexp)          - turn on and off configuration for an expression
  */
 static bool
-pm_regexp_parse_group(pm_regexp_parser_t *parser) {
+pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
     // First, parse any options for the group.
     if (pm_regexp_char_accept(parser, '?')) {
         if (pm_regexp_char_is_eof(parser)) {
@@ -476,7 +498,11 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
                         if (!pm_regexp_char_find(parser, '>')) {
                             return false;
                         }
-                        pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
+
+                        if (parser->name_callback != NULL) {
+                            pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
+                        }
+
                         break;
                     }
                 }
@@ -487,7 +513,10 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
                     return false;
                 }
 
-                pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
+                if (parser->name_callback != NULL) {
+                    pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
+                }
+
                 break;
             }
             case '(': // conditional expression
@@ -532,7 +561,7 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
 
     // Now, parse the expressions within this group.
     while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
-        if (!pm_regexp_parse_expression(parser)) {
+        if (!pm_regexp_parse_expression(parser, depth + 1)) {
             return false;
         }
         pm_regexp_char_accept(parser, '|');
@@ -555,7 +584,7 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
  *      ;
  */
 static bool
-pm_regexp_parse_item(pm_regexp_parser_t *parser) {
+pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) {
     switch (*parser->cursor) {
         case '^':
         case '$':
@@ -569,10 +598,10 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser) {
             return pm_regexp_parse_quantifier(parser);
         case '(':
             parser->cursor++;
-            return pm_regexp_parse_group(parser) && pm_regexp_parse_quantifier(parser);
+            return pm_regexp_parse_group(parser, depth) && pm_regexp_parse_quantifier(parser);
         case '[':
             parser->cursor++;
-            return pm_regexp_parse_lbracket(parser) && pm_regexp_parse_quantifier(parser);
+            return pm_regexp_parse_lbracket(parser, depth) && pm_regexp_parse_quantifier(parser);
         default: {
             size_t width;
             if (!parser->encoding_changed) {
@@ -594,13 +623,18 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser) {
  *            ;
  */
 static bool
-pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
-    if (!pm_regexp_parse_item(parser)) {
+pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth) {
+    if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
+        pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
+        return false;
+    }
+
+    if (!pm_regexp_parse_item(parser, depth)) {
         return false;
     }
 
     while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') {
-        if (!pm_regexp_parse_item(parser)) {
+        if (!pm_regexp_parse_item(parser, depth)) {
             return false;
         }
     }
@@ -616,20 +650,12 @@ pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
  */
 static bool
 pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
-    return (
-        (
-            // Exit early if the pattern is empty.
-            pm_regexp_char_is_eof(parser) ||
-            // Parse the first expression in the pattern.
-            pm_regexp_parse_expression(parser)
-        ) &&
-        (
-            // Return now if we've parsed the entire pattern.
-            pm_regexp_char_is_eof(parser) ||
-            // Otherwise, we should have a pipe character.
-            (pm_regexp_char_expect(parser, '|') && pm_regexp_parse_pattern(parser))
-        )
-    );
+    do {
+        if (pm_regexp_char_is_eof(parser)) return true;
+        if (!pm_regexp_parse_expression(parser, 0)) return false;
+    } while (pm_regexp_char_accept(parser, '|'));
+
+    return pm_regexp_char_is_eof(parser);
 }
 
 /**
@@ -637,7 +663,7 @@ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
  * groups.
  */
 PRISM_EXPORTED_FUNCTION void
-pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_regexp_name_callback_t name_callback, void *name_data) {
+pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) {
     pm_regexp_parse_pattern(&(pm_regexp_parser_t) {
         .parser = parser,
         .start = source,
@@ -646,6 +672,8 @@ pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_rege
         .encoding_changed = parser->encoding_changed,
         .encoding = parser->encoding,
         .name_callback = name_callback,
-        .name_data = name_data
+        .name_data = name_data,
+        .error_callback = error_callback,
+        .error_data = error_data
     });
 }
diff --git a/prism/regexp.h b/prism/regexp.h
index f92952d54a..42bc504107 100644
--- a/prism/regexp.h
+++ b/prism/regexp.h
@@ -22,14 +22,21 @@
 typedef void (*pm_regexp_name_callback_t)(const pm_string_t *name, void *data);
 
 /**
+ * This callback is called when a parse error is found.
+ */
+typedef void (*pm_regexp_error_callback_t)(const uint8_t *start, const uint8_t *end, const char *message, void *data);
+
+/**
  * Parse a regular expression.
  *
  * @param parser The parser that is currently being used.
  * @param source The source code to parse.
  * @param size The size of the source code.
- * @param name_callback The callback to call when a named capture group is found.
- * @param name_data The data to pass to the name callback.
+ * @param name_callback The optional callback to call when a named capture group is found.
+ * @param name_data The optional data to pass to the name callback.
+ * @param error_callback The callback to call when a parse error is found.
+ * @param error_data The data to pass to the error callback.
  */
-PRISM_EXPORTED_FUNCTION void pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_regexp_name_callback_t name_callback, void *name_data);
+PRISM_EXPORTED_FUNCTION void pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data);
 
 #endif
diff --git a/prism/templates/src/diagnostic.c.erb b/prism/templates/src/diagnostic.c.erb
index f734b66afb..e972d6aace 100644
--- a/prism/templates/src/diagnostic.c.erb
+++ b/prism/templates/src/diagnostic.c.erb
@@ -318,6 +318,7 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = {
     [PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING]      = { "incompatible character encoding: /%.*s/", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_REGEXP_NON_ESCAPED_MBC]             = { "/.../n has a non escaped non ASCII character in non ASCII-8BIT script: /%.*s/", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_REGEXP_INVALID_UNICODE_RANGE]       = { "invalid Unicode range: /%.*s/", PM_ERROR_LEVEL_SYNTAX },
+    [PM_ERR_REGEXP_PARSE_ERROR]                 = { "%s", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_REGEXP_UNKNOWN_OPTIONS]             = { "unknown regexp %s: %.*s", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_REGEXP_TERM]                        = { "unterminated regexp meets end of file; expected a closing delimiter", PM_ERROR_LEVEL_SYNTAX },
     [PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP]   = { "UTF-8 character in non UTF-8 regexp: /%s/", PM_ERROR_LEVEL_SYNTAX },
author	Kevin Newton <kddnewton@gmail.com>	2024-06-05 10:50:18 -0400
committer	Kevin Newton <kddnewton@gmail.com>	2024-06-05 14:40:03 -0400
commit	c3747d5a2e029afb9aabc0b2f122ab701e68f71f (patch)
tree	6b3172106773f9f468249b5d65d505202360b337 /prism
parent	41f27346440eab0d80b8c50be8741e3344af9ed5 (diff)
download	ruby-c3747d5a2e029afb9aabc0b2f122ab701e68f71f.tar.gz