about | summary | refs | log | tree | commit | diff | stats
path: root/prism
diff options
context:
space:
mode:
author Kevin Newton <kddnewton@gmail.com> 2024-06-05 10:50:18 -0400
committer Kevin Newton <kddnewton@gmail.com> 2024-06-05 14:40:03 -0400
commit c3747d5a2e029afb9aabc0b2f122ab701e68f71f (patch)
tree 6b3172106773f9f468249b5d65d505202360b337 /prism
parent 41f27346440eab0d80b8c50be8741e3344af9ed5 (diff)
download ruby-c3747d5a2e029afb9aabc0b2f122ab701e68f71f.tar.gz
[ruby/prism] Report onigmo errors for depth
https://github.com/ruby/prism/commit/e0e8bba8be
Diffstat (limited to 'prism')
-rw-r--r-- prism/config.yml 3
-rw-r--r-- prism/prism.c 42
-rw-r--r-- prism/regexp.c 94
-rw-r--r-- prism/regexp.h 13
-rw-r--r-- prism/templates/src/diagnostic.c.erb 1
5 files changed, 114 insertions, 39 deletions
diff --git a/prism/config.yml b/prism/config.yml
index 1fe2e36e67..0400f60e01 100644
--- a/prism/config.yml
+++ b/prism/config.yml
@@ -206,8 +206,8 @@ errors:
- PARAMETER_SPLAT_MULTI
- PARAMETER_STAR
- PARAMETER_UNEXPECTED_FWD
- - PARAMETER_WILD_LOOSE_COMMA
- PARAMETER_UNEXPECTED_NO_KW
+ - PARAMETER_WILD_LOOSE_COMMA
- PATTERN_CAPTURE_DUPLICATE
- PATTERN_EXPRESSION_AFTER_BRACKET
- PATTERN_EXPRESSION_AFTER_COMMA
@@ -236,6 +236,7 @@ errors:
- REGEXP_INCOMPAT_CHAR_ENCODING
- REGEXP_INVALID_UNICODE_RANGE
- REGEXP_NON_ESCAPED_MBC
+ - REGEXP_PARSE_ERROR
- REGEXP_TERM
- REGEXP_UNKNOWN_OPTIONS
- REGEXP_UTF8_CHAR_NON_UTF8_REGEXP
diff --git a/prism/prism.c b/prism/prism.c
index e7eb967e15..aaad62d739 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -20018,7 +20018,7 @@ typedef struct {
* This callback is called when the regular expression parser encounters a named
* capture group.
*/
-void
+static void
parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
parse_regular_expression_named_capture_data_t *callback_data = (parse_regular_expression_named_capture_data_t *) data;
@@ -20084,6 +20084,38 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
}
/**
+ * This struct is used to pass information between the regular expression parser
+ * and the error callback.
+ */
+typedef struct {
+ pm_parser_t *parser;
+ const pm_string_t *content;
+ const pm_call_node_t *call;
+} parse_regular_expression_error_data_t;
+
+/**
+ * This callback is called when the regular expression parser encounters a
+ * syntax error.
+ */
+static void
+parse_regular_expression_error(const uint8_t *start, const uint8_t *end, const char *message, void *data) {
+ parse_regular_expression_error_data_t *callback_data = (parse_regular_expression_error_data_t *) data;
+
+ pm_parser_t *parser = callback_data->parser;
+ const pm_string_t *content = callback_data->content;
+ const pm_call_node_t *call = callback_data->call;
+
+ pm_location_t location;
+ if (content->type == PM_STRING_SHARED) {
+ location = (pm_location_t) { .start = start, .end = end };
+ } else {
+ location = call->receiver->location;
+ }
+
+ PM_PARSER_ERR_FORMAT(parser, location.start, location.end, PM_ERR_REGEXP_PARSE_ERROR, message);
+}
+
+/**
* Potentially change a =~ with a regular expression with named captures into a
* match write node.
*/
@@ -20096,7 +20128,13 @@ parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *
.names = { 0 }
};
- pm_regexp_parse(parser, pm_string_source(content), pm_string_length(content), parse_regular_expression_named_capture, &callback_data);
+ parse_regular_expression_error_data_t error_data = {
+ .parser = parser,
+ .content = content,
+ .call = call
+ };
+
+ pm_regexp_parse(parser, pm_string_source(content), pm_string_length(content), parse_regular_expression_named_capture, &callback_data, parse_regular_expression_error, &error_data);
pm_constant_id_list_free(&callback_data.names);
if (callback_data.match != NULL) {
diff --git a/prism/regexp.c b/prism/regexp.c
index 9d3379d522..10f6943546 100644
--- a/prism/regexp.c
+++ b/prism/regexp.c
@@ -1,5 +1,7 @@
#include "prism/regexp.h"
+#define PM_REGEXP_PARSE_DEPTH_MAX 4096
+
/**
* This is the parser that is going to handle parsing regular expressions.
*/
@@ -27,10 +29,25 @@ typedef struct {
/** The data to pass to the name callback. */
void *name_data;
+
+ /** The callback to call when a parse error is found. */
+ pm_regexp_error_callback_t error_callback;
+
+ /** The data to pass to the error callback. */
+ void *error_data;
} pm_regexp_parser_t;
/**
- * This appends a new string to the list of named captures.
+ * Append an error to the parser.
+ */
+static inline void
+pm_regexp_parse_error(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, const char *message) {
+ parser->error_callback(start, end, message, parser->error_data);
+}
+
+/**
+ * This appends a new string to the list of named captures. This function
+ * assumes the caller has already checked the validity of the name callback.
*/
static void
pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
@@ -246,20 +263,20 @@ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
// Forward declaration because character sets can be nested.
static bool
-pm_regexp_parse_lbracket(pm_regexp_parser_t *parser);
+pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth);
/**
* match-char-set : '[' '^'? (match-range | match-char)* ']'
* ;
*/
static bool
-pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
+pm_regexp_parse_character_set(pm_regexp_parser_t *parser, uint16_t depth) {
pm_regexp_char_accept(parser, '^');
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') {
switch (*parser->cursor++) {
case '[':
- pm_regexp_parse_lbracket(parser);
+ pm_regexp_parse_lbracket(parser, depth + 1);
break;
case '\\':
if (!pm_regexp_char_is_eof(parser)) {
@@ -279,7 +296,12 @@ pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
* A left bracket can either mean a POSIX class or a character set.
*/
static bool
-pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
+pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth) {
+ if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
+ pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
+ return false;
+ }
+
const uint8_t *reset = parser->cursor;
if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') {
@@ -289,13 +311,13 @@ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
parser->cursor = reset;
}
- return pm_regexp_parse_character_set(parser);
+ return pm_regexp_parse_character_set(parser, depth);
}
// Forward declaration here since parsing groups needs to go back up the grammar
// to parse expressions within them.
static bool
-pm_regexp_parse_expression(pm_regexp_parser_t *parser);
+pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth);
/**
* These are the states of the options that are configurable on the regular
@@ -409,7 +431,7 @@ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
* * (?imxdau-imx:subexp) - turn on and off configuration for an expression
*/
static bool
-pm_regexp_parse_group(pm_regexp_parser_t *parser) {
+pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
// First, parse any options for the group.
if (pm_regexp_char_accept(parser, '?')) {
if (pm_regexp_char_is_eof(parser)) {
@@ -476,7 +498,11 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
if (!pm_regexp_char_find(parser, '>')) {
return false;
}
- pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
+
+ if (parser->name_callback != NULL) {
+ pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
+ }
+
break;
}
}
@@ -487,7 +513,10 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
return false;
}
- pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
+ if (parser->name_callback != NULL) {
+ pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
+ }
+
break;
}
case '(': // conditional expression
@@ -532,7 +561,7 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
// Now, parse the expressions within this group.
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
- if (!pm_regexp_parse_expression(parser)) {
+ if (!pm_regexp_parse_expression(parser, depth + 1)) {
return false;
}
pm_regexp_char_accept(parser, '|');
@@ -555,7 +584,7 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
* ;
*/
static bool
-pm_regexp_parse_item(pm_regexp_parser_t *parser) {
+pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) {
switch (*parser->cursor) {
case '^':
case '$':
@@ -569,10 +598,10 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser) {
return pm_regexp_parse_quantifier(parser);
case '(':
parser->cursor++;
- return pm_regexp_parse_group(parser) && pm_regexp_parse_quantifier(parser);
+ return pm_regexp_parse_group(parser, depth) && pm_regexp_parse_quantifier(parser);
case '[':
parser->cursor++;
- return pm_regexp_parse_lbracket(parser) && pm_regexp_parse_quantifier(parser);
+ return pm_regexp_parse_lbracket(parser, depth) && pm_regexp_parse_quantifier(parser);
default: {
size_t width;
if (!parser->encoding_changed) {
@@ -594,13 +623,18 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser) {
* ;
*/
static bool
-pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
- if (!pm_regexp_parse_item(parser)) {
+pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth) {
+ if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
+ pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
+ return false;
+ }
+
+ if (!pm_regexp_parse_item(parser, depth)) {
return false;
}
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') {
- if (!pm_regexp_parse_item(parser)) {
+ if (!pm_regexp_parse_item(parser, depth)) {
return false;
}
}
@@ -616,20 +650,12 @@ pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
*/
static bool
pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
- return (
- (
- // Exit early if the pattern is empty.
- pm_regexp_char_is_eof(parser) ||
- // Parse the first expression in the pattern.
- pm_regexp_parse_expression(parser)
- ) &&
- (
- // Return now if we've parsed the entire pattern.
- pm_regexp_char_is_eof(parser) ||
- // Otherwise, we should have a pipe character.
- (pm_regexp_char_expect(parser, '|') && pm_regexp_parse_pattern(parser))
- )
- );
+ do {
+ if (pm_regexp_char_is_eof(parser)) return true;
+ if (!pm_regexp_parse_expression(parser, 0)) return false;
+ } while (pm_regexp_char_accept(parser, '|'));
+
+ return pm_regexp_char_is_eof(parser);
}
/**
@@ -637,7 +663,7 @@ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
* groups.
*/
PRISM_EXPORTED_FUNCTION void
-pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_regexp_name_callback_t name_callback, void *name_data) {
+pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) {
pm_regexp_parse_pattern(&(pm_regexp_parser_t) {
.parser = parser,
.start = source,
@@ -646,6 +672,8 @@ pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_rege
.encoding_changed = parser->encoding_changed,
.encoding = parser->encoding,
.name_callback = name_callback,
- .name_data = name_data
+ .name_data = name_data,
+ .error_callback = error_callback,
+ .error_data = error_data
});
}
diff --git a/prism/regexp.h b/prism/regexp.h
index f92952d54a..42bc504107 100644
--- a/prism/regexp.h
+++ b/prism/regexp.h
@@ -22,14 +22,21 @@
typedef void (*pm_regexp_name_callback_t)(const pm_string_t *name, void *data);
/**
+ * This callback is called when a parse error is found.
+ */
+typedef void (*pm_regexp_error_callback_t)(const uint8_t *start, const uint8_t *end, const char *message, void *data);
+
+/**
* Parse a regular expression.
*
* @param parser The parser that is currently being used.
* @param source The source code to parse.
* @param size The size of the source code.
- * @param name_callback The callback to call when a named capture group is found.
- * @param name_data The data to pass to the name callback.
+ * @param name_callback The optional callback to call when a named capture group is found.
+ * @param name_data The optional data to pass to the name callback.
+ * @param error_callback The callback to call when a parse error is found.
+ * @param error_data The data to pass to the error callback.
*/
-PRISM_EXPORTED_FUNCTION void pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_regexp_name_callback_t name_callback, void *name_data);
+PRISM_EXPORTED_FUNCTION void pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data);
#endif
diff --git a/prism/templates/src/diagnostic.c.erb b/prism/templates/src/diagnostic.c.erb
index f734b66afb..e972d6aace 100644
--- a/prism/templates/src/diagnostic.c.erb
+++ b/prism/templates/src/diagnostic.c.erb
@@ -318,6 +318,7 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = {
[PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING] = { "incompatible character encoding: /%.*s/", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_REGEXP_NON_ESCAPED_MBC] = { "/.../n has a non escaped non ASCII character in non ASCII-8BIT script: /%.*s/", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_REGEXP_INVALID_UNICODE_RANGE] = { "invalid Unicode range: /%.*s/", PM_ERROR_LEVEL_SYNTAX },
+ [PM_ERR_REGEXP_PARSE_ERROR] = { "%s", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_REGEXP_UNKNOWN_OPTIONS] = { "unknown regexp %s: %.*s", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_REGEXP_TERM] = { "unterminated regexp meets end of file; expected a closing delimiter", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP] = { "UTF-8 character in non UTF-8 regexp: /%s/", PM_ERROR_LEVEL_SYNTAX },