diff options
author | Kevin Newton <kddnewton@gmail.com> | 2024-06-05 10:50:18 -0400 |
---|---|---|
committer | Kevin Newton <kddnewton@gmail.com> | 2024-06-05 14:40:03 -0400 |
commit | c3747d5a2e029afb9aabc0b2f122ab701e68f71f (patch) | |
tree | 6b3172106773f9f468249b5d65d505202360b337 /prism | |
parent | 41f27346440eab0d80b8c50be8741e3344af9ed5 (diff) | |
download | ruby-c3747d5a2e029afb9aabc0b2f122ab701e68f71f.tar.gz |
[ruby/prism] Report onigmo errors for depth
https://github.com/ruby/prism/commit/e0e8bba8be
Diffstat (limited to 'prism')
-rw-r--r-- | prism/config.yml | 3 |
-rw-r--r-- | prism/prism.c | 42 |
-rw-r--r-- | prism/regexp.c | 94 |
-rw-r--r-- | prism/regexp.h | 13 |
-rw-r--r-- | prism/templates/src/diagnostic.c.erb | 1 |
5 files changed, 114 insertions, 39 deletions
diff --git a/prism/config.yml b/prism/config.yml index 1fe2e36e67..0400f60e01 100644 --- a/prism/config.yml +++ b/prism/config.yml @@ -206,8 +206,8 @@ errors: - PARAMETER_SPLAT_MULTI - PARAMETER_STAR - PARAMETER_UNEXPECTED_FWD - - PARAMETER_WILD_LOOSE_COMMA - PARAMETER_UNEXPECTED_NO_KW + - PARAMETER_WILD_LOOSE_COMMA - PATTERN_CAPTURE_DUPLICATE - PATTERN_EXPRESSION_AFTER_BRACKET - PATTERN_EXPRESSION_AFTER_COMMA @@ -236,6 +236,7 @@ errors: - REGEXP_INCOMPAT_CHAR_ENCODING - REGEXP_INVALID_UNICODE_RANGE - REGEXP_NON_ESCAPED_MBC + - REGEXP_PARSE_ERROR - REGEXP_TERM - REGEXP_UNKNOWN_OPTIONS - REGEXP_UTF8_CHAR_NON_UTF8_REGEXP diff --git a/prism/prism.c b/prism/prism.c index e7eb967e15..aaad62d739 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -20018,7 +20018,7 @@ typedef struct { * This callback is called when the regular expression parser encounters a named * capture group. */ -void +static void parse_regular_expression_named_capture(const pm_string_t *capture, void *data) { parse_regular_expression_named_capture_data_t *callback_data = (parse_regular_expression_named_capture_data_t *) data; @@ -20084,6 +20084,38 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) { } /** + * This struct is used to pass information between the regular expression parser + * and the error callback. + */ +typedef struct { + pm_parser_t *parser; + const pm_string_t *content; + const pm_call_node_t *call; +} parse_regular_expression_error_data_t; + +/** + * This callback is called when the regular expression parser encounters a + * syntax error. 
+ */ +static void +parse_regular_expression_error(const uint8_t *start, const uint8_t *end, const char *message, void *data) { + parse_regular_expression_error_data_t *callback_data = (parse_regular_expression_error_data_t *) data; + + pm_parser_t *parser = callback_data->parser; + const pm_string_t *content = callback_data->content; + const pm_call_node_t *call = callback_data->call; + + pm_location_t location; + if (content->type == PM_STRING_SHARED) { + location = (pm_location_t) { .start = start, .end = end }; + } else { + location = call->receiver->location; + } + + PM_PARSER_ERR_FORMAT(parser, location.start, location.end, PM_ERR_REGEXP_PARSE_ERROR, message); +} + +/** * Potentially change a =~ with a regular expression with named captures into a * match write node. */ @@ -20096,7 +20128,13 @@ parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t * .names = { 0 } }; - pm_regexp_parse(parser, pm_string_source(content), pm_string_length(content), parse_regular_expression_named_capture, &callback_data); + parse_regular_expression_error_data_t error_data = { + .parser = parser, + .content = content, + .call = call + }; + + pm_regexp_parse(parser, pm_string_source(content), pm_string_length(content), parse_regular_expression_named_capture, &callback_data, parse_regular_expression_error, &error_data); pm_constant_id_list_free(&callback_data.names); if (callback_data.match != NULL) { diff --git a/prism/regexp.c b/prism/regexp.c index 9d3379d522..10f6943546 100644 --- a/prism/regexp.c +++ b/prism/regexp.c @@ -1,5 +1,7 @@ #include "prism/regexp.h" +#define PM_REGEXP_PARSE_DEPTH_MAX 4096 + /** * This is the parser that is going to handle parsing regular expressions. */ @@ -27,10 +29,25 @@ typedef struct { /** The data to pass to the name callback. */ void *name_data; + + /** The callback to call when a parse error is found. */ + pm_regexp_error_callback_t error_callback; + + /** The data to pass to the error callback. 
*/ + void *error_data; } pm_regexp_parser_t; /** - * This appends a new string to the list of named captures. + * Append an error to the parser. + */ +static inline void +pm_regexp_parse_error(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, const char *message) { + parser->error_callback(start, end, message, parser->error_data); +} + +/** + * This appends a new string to the list of named captures. This function + * assumes the caller has already checked the validity of the name callback. */ static void pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) { @@ -246,20 +263,20 @@ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) { // Forward declaration because character sets can be nested. static bool -pm_regexp_parse_lbracket(pm_regexp_parser_t *parser); +pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth); /** * match-char-set : '[' '^'? (match-range | match-char)* ']' * ; */ static bool -pm_regexp_parse_character_set(pm_regexp_parser_t *parser) { +pm_regexp_parse_character_set(pm_regexp_parser_t *parser, uint16_t depth) { pm_regexp_char_accept(parser, '^'); while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') { switch (*parser->cursor++) { case '[': - pm_regexp_parse_lbracket(parser); + pm_regexp_parse_lbracket(parser, depth + 1); break; case '\\': if (!pm_regexp_char_is_eof(parser)) { @@ -279,7 +296,12 @@ pm_regexp_parse_character_set(pm_regexp_parser_t *parser) { * A left bracket can either mean a POSIX class or a character set. 
*/ static bool -pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) { +pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth) { + if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) { + pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over"); + return false; + } + const uint8_t *reset = parser->cursor; if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') { @@ -289,13 +311,13 @@ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) { parser->cursor = reset; } - return pm_regexp_parse_character_set(parser); + return pm_regexp_parse_character_set(parser, depth); } // Forward declaration here since parsing groups needs to go back up the grammar // to parse expressions within them. static bool -pm_regexp_parse_expression(pm_regexp_parser_t *parser); +pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth); /** * These are the states of the options that are configurable on the regular @@ -409,7 +431,7 @@ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) { * * (?imxdau-imx:subexp) - turn on and off configuration for an expression */ static bool -pm_regexp_parse_group(pm_regexp_parser_t *parser) { +pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) { // First, parse any options for the group. 
if (pm_regexp_char_accept(parser, '?')) { if (pm_regexp_char_is_eof(parser)) { @@ -476,7 +498,11 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) { if (!pm_regexp_char_find(parser, '>')) { return false; } - pm_regexp_parser_named_capture(parser, start, parser->cursor - 1); + + if (parser->name_callback != NULL) { + pm_regexp_parser_named_capture(parser, start, parser->cursor - 1); + } + break; } } @@ -487,7 +513,10 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) { return false; } - pm_regexp_parser_named_capture(parser, start, parser->cursor - 1); + if (parser->name_callback != NULL) { + pm_regexp_parser_named_capture(parser, start, parser->cursor - 1); + } + break; } case '(': // conditional expression @@ -532,7 +561,7 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) { // Now, parse the expressions within this group. while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') { - if (!pm_regexp_parse_expression(parser)) { + if (!pm_regexp_parse_expression(parser, depth + 1)) { return false; } pm_regexp_char_accept(parser, '|'); @@ -555,7 +584,7 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) { * ; */ static bool -pm_regexp_parse_item(pm_regexp_parser_t *parser) { +pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) { switch (*parser->cursor) { case '^': case '$': @@ -569,10 +598,10 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser) { return pm_regexp_parse_quantifier(parser); case '(': parser->cursor++; - return pm_regexp_parse_group(parser) && pm_regexp_parse_quantifier(parser); + return pm_regexp_parse_group(parser, depth) && pm_regexp_parse_quantifier(parser); case '[': parser->cursor++; - return pm_regexp_parse_lbracket(parser) && pm_regexp_parse_quantifier(parser); + return pm_regexp_parse_lbracket(parser, depth) && pm_regexp_parse_quantifier(parser); default: { size_t width; if (!parser->encoding_changed) { @@ -594,13 +623,18 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser) { * ; */ static bool 
-pm_regexp_parse_expression(pm_regexp_parser_t *parser) { - if (!pm_regexp_parse_item(parser)) { +pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth) { + if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) { + pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over"); + return false; + } + + if (!pm_regexp_parse_item(parser, depth)) { return false; } while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') { - if (!pm_regexp_parse_item(parser)) { + if (!pm_regexp_parse_item(parser, depth)) { return false; } } @@ -616,20 +650,12 @@ pm_regexp_parse_expression(pm_regexp_parser_t *parser) { */ static bool pm_regexp_parse_pattern(pm_regexp_parser_t *parser) { - return ( - ( - // Exit early if the pattern is empty. - pm_regexp_char_is_eof(parser) || - // Parse the first expression in the pattern. - pm_regexp_parse_expression(parser) - ) && - ( - // Return now if we've parsed the entire pattern. - pm_regexp_char_is_eof(parser) || - // Otherwise, we should have a pipe character. - (pm_regexp_char_expect(parser, '|') && pm_regexp_parse_pattern(parser)) - ) - ); + do { + if (pm_regexp_char_is_eof(parser)) return true; + if (!pm_regexp_parse_expression(parser, 0)) return false; + } while (pm_regexp_char_accept(parser, '|')); + + return pm_regexp_char_is_eof(parser); } /** @@ -637,7 +663,7 @@ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) { * groups. 
*/ PRISM_EXPORTED_FUNCTION void -pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_regexp_name_callback_t name_callback, void *name_data) { +pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) { pm_regexp_parse_pattern(&(pm_regexp_parser_t) { .parser = parser, .start = source, @@ -646,6 +672,8 @@ pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_rege .encoding_changed = parser->encoding_changed, .encoding = parser->encoding, .name_callback = name_callback, - .name_data = name_data + .name_data = name_data, + .error_callback = error_callback, + .error_data = error_data }); } diff --git a/prism/regexp.h b/prism/regexp.h index f92952d54a..42bc504107 100644 --- a/prism/regexp.h +++ b/prism/regexp.h @@ -22,14 +22,21 @@ typedef void (*pm_regexp_name_callback_t)(const pm_string_t *name, void *data); /** + * This callback is called when a parse error is found. + */ +typedef void (*pm_regexp_error_callback_t)(const uint8_t *start, const uint8_t *end, const char *message, void *data); + +/** * Parse a regular expression. * * @param parser The parser that is currently being used. * @param source The source code to parse. * @param size The size of the source code. - * @param name_callback The callback to call when a named capture group is found. - * @param name_data The data to pass to the name callback. + * @param name_callback The optional callback to call when a named capture group is found. + * @param name_data The optional data to pass to the name callback. + * @param error_callback The callback to call when a parse error is found. + * @param error_data The data to pass to the error callback. 
*/ -PRISM_EXPORTED_FUNCTION void pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_regexp_name_callback_t name_callback, void *name_data); +PRISM_EXPORTED_FUNCTION void pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data); #endif diff --git a/prism/templates/src/diagnostic.c.erb b/prism/templates/src/diagnostic.c.erb index f734b66afb..e972d6aace 100644 --- a/prism/templates/src/diagnostic.c.erb +++ b/prism/templates/src/diagnostic.c.erb @@ -318,6 +318,7 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = { [PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING] = { "incompatible character encoding: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_REGEXP_NON_ESCAPED_MBC] = { "/.../n has a non escaped non ASCII character in non ASCII-8BIT script: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_REGEXP_INVALID_UNICODE_RANGE] = { "invalid Unicode range: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_REGEXP_PARSE_ERROR] = { "%s", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_REGEXP_UNKNOWN_OPTIONS] = { "unknown regexp %s: %.*s", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_REGEXP_TERM] = { "unterminated regexp meets end of file; expected a closing delimiter", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP] = { "UTF-8 character in non UTF-8 regexp: /%s/", PM_ERROR_LEVEL_SYNTAX }, |