diff options
author | Kevin Newton <kddnewton@gmail.com> | 2023-10-12 08:46:40 -0400 |
---|---|---|
committer | Kevin Newton <kddnewton@gmail.com> | 2023-10-13 15:31:30 -0400 |
commit | fa76cddc5b1eebf77c9c5bbe951f70fd6c115716 (patch) | |
tree | bf98c1898db99a2d35aa759c98dfb259f02055a5 /prism | |
parent | e4f1c06a9bb6012ac155b7a7789d2b5cb4e8abdc (diff) | |
download | ruby-fa76cddc5b1eebf77c9c5bbe951f70fd6c115716.tar.gz |
[ruby/prism] Properly handle unescaping in regexp
https://github.com/ruby/prism/commit/abf9fd6863
Diffstat (limited to 'prism')
-rw-r--r-- | prism/prism.c | 479 |
1 files changed, 254 insertions, 225 deletions
diff --git a/prism/prism.c b/prism/prism.c index 51afe730aa..e2c4a82394 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -4001,9 +4001,10 @@ pm_redo_node_create(pm_parser_t *parser, const pm_token_t *token) { return node; } -// Allocate a new RegularExpressionNode node. +// Allocate a new initialize a new RegularExpressionNode node with the given +// unescaped string. static pm_regular_expression_node_t * -pm_regular_expression_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) { +pm_regular_expression_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, const pm_string_t *unescaped) { pm_regular_expression_node_t *node = PM_ALLOC_NODE(parser, pm_regular_expression_node_t); *node = (pm_regular_expression_node_t) { @@ -4018,12 +4019,18 @@ pm_regular_expression_node_create(pm_parser_t *parser, const pm_token_t *opening .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), .content_loc = PM_LOCATION_TOKEN_VALUE(content), .closing_loc = PM_LOCATION_TOKEN_VALUE(closing), - .unescaped = PM_EMPTY_STRING + .unescaped = *unescaped }; return node; } +// Allocate a new initialize a new RegularExpressionNode node. +static inline pm_regular_expression_node_t * +pm_regular_expression_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) { + return pm_regular_expression_node_create_unescaped(parser, opening, content, closing, &PM_EMPTY_STRING); +} + // Allocate a new RequiredDestructuredParameterNode node. static pm_required_destructured_parameter_node_t * pm_required_destructured_parameter_node_create(pm_parser_t *parser, const pm_token_t *opening) { @@ -4472,12 +4479,20 @@ pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, return node; } -// Allocate a new SymbolNode node. +// Allocate and initialize a new SymbolNode node. static inline pm_symbol_node_t * pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) { return pm_symbol_node_create_unescaped(parser, opening, value, closing, &PM_EMPTY_STRING); } +// Allocate and initialize a new SymbolNode node with the current string. +static pm_symbol_node_t * +pm_symbol_node_create_current_string(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) { + pm_symbol_node_t *node = pm_symbol_node_create_unescaped(parser, opening, value, closing, &parser->current_string); + parser->current_string = PM_EMPTY_STRING; + return node; +} + // Allocate and initialize a new SymbolNode node from a label. static pm_symbol_node_t * pm_symbol_node_label_create(pm_parser_t *parser, const pm_token_t *token) { @@ -6097,6 +6112,7 @@ static const uint8_t PM_ESCAPE_FLAG_NONE = 0x0; static const uint8_t PM_ESCAPE_FLAG_CONTROL = 0x1; static const uint8_t PM_ESCAPE_FLAG_META = 0x2; static const uint8_t PM_ESCAPE_FLAG_SINGLE = 0x4; +static const uint8_t PM_ESCAPE_FLAG_REGEXP = 0x8; // This is a lookup table for whether or not an ASCII character is printable. static const bool ascii_printable_chars[] = { @@ -6168,6 +6184,43 @@ escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t *st } } +// The regular expression engine doesn't support the same escape sequences as +// Ruby does. So first we have to read the escape sequence, and then we have to +// format it like the regular expression engine expects it. For example, in Ruby +// if we have: +// +// /\M-\C-?/ +// +// then the first byte is actually 255, so we have to rewrite this as: +// +// /\xFF/ +// +// Note that in this case there is a literal \ byte in the regular expression +// source so that the regular expression engine will perform its own unescaping. +static inline void +escape_write_byte(pm_buffer_t *buffer, uint8_t flags, uint8_t byte) { + if (flags & PM_ESCAPE_FLAG_REGEXP) { + pm_buffer_append_bytes(buffer, (const uint8_t *) "\\x", 2); + + uint8_t byte1 = (uint8_t) ((byte >> 4) & 0xF); + uint8_t byte2 = (uint8_t) (byte & 0xF); + + if (byte1 >= 0xA) { + pm_buffer_append_u8(buffer, (uint8_t) ((byte1 - 0xA) + 'A')); + } else { + pm_buffer_append_u8(buffer, (uint8_t) (byte1 + '0')); + } + + if (byte2 >= 0xA) { + pm_buffer_append_u8(buffer, (uint8_t) (byte2 - 0xA + 'A')); + } else { + pm_buffer_append_u8(buffer, (uint8_t) (byte2 + '0')); + } + } else { + pm_buffer_append_u8(buffer, byte); + } +} + // Read the value of an escape into the buffer. static void escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { @@ -6245,6 +6298,8 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { return; } case 'x': { + const uint8_t *start = parser->current.end - 1; + parser->current.end++; uint8_t byte = peek(parser); @@ -6258,7 +6313,11 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { parser->current.end++; } - pm_buffer_append_u8(buffer, value); + if (flags & PM_ESCAPE_FLAG_REGEXP) { + pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end - start)); + } else { + pm_buffer_append_u8(buffer, value); + } } else { pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_HEXADECIMAL); } @@ -6266,6 +6325,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { return; } case 'u': { + const uint8_t *start = parser->current.end - 1; parser->current.end++; if ( @@ -6276,7 +6336,13 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { pm_char_is_hexadecimal_digit(parser->current.end[3]) ) { uint32_t value = escape_unicode(parser->current.end, 4); - escape_write_unicode(parser, buffer, parser->current.end, parser->current.end + 4, value); + + if (flags & PM_ESCAPE_FLAG_REGEXP) { + pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end + 4 - start)); + } else { + escape_write_unicode(parser, buffer, start, parser->current.end + 4, value); + } + parser->current.end += 4; } else if (peek(parser) == '{') { const uint8_t *unicode_codepoints_start = parser->current.end - 2; @@ -6307,7 +6373,18 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { } uint32_t value = escape_unicode(unicode_start, hexadecimal_length); - escape_write_unicode(parser, buffer, unicode_start, parser->current.end, value); + + if (flags & PM_ESCAPE_FLAG_REGEXP) { + if (codepoints_count == 1) { + pm_buffer_append_bytes(buffer, (const uint8_t *) "\\u{", 3); + } else { + pm_buffer_append_u8(buffer, ' '); + } + pm_buffer_append_bytes(buffer, unicode_start, hexadecimal_length); + } else { + escape_write_unicode(parser, buffer, unicode_start, parser->current.end, value); + } + parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end); } @@ -6318,6 +6395,10 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { if (peek(parser) == '}') { parser->current.end++; + + if (flags & PM_ESCAPE_FLAG_REGEXP) { + pm_buffer_append_u8(buffer, '}'); + } } else { pm_parser_err(parser, unicode_codepoints_start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM); } @@ -6332,10 +6413,11 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { uint8_t peeked = peek(parser); switch (peeked) { - case '?': + case '?': { parser->current.end++; - pm_buffer_append_u8(buffer, escape_byte(0x7f, flags)); + escape_write_byte(buffer, flags, escape_byte(0x7f, flags)); return; + } case '\\': if (flags & PM_ESCAPE_FLAG_CONTROL) { pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL_REPEAT); @@ -6344,14 +6426,16 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { parser->current.end++; escape_read(parser, buffer, flags | PM_ESCAPE_FLAG_CONTROL); return; - default: + default: { if (!char_is_ascii_printable(peeked)) { pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL); return; } + parser->current.end++; - pm_buffer_append_u8(buffer, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); + escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); return; + } } } case 'C': { @@ -6365,10 +6449,11 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { uint8_t peeked = peek(parser); switch (peeked) { - case '?': + case '?': { parser->current.end++; - pm_buffer_append_u8(buffer, escape_byte(0x7f, flags)); + escape_write_byte(buffer, flags, escape_byte(0x7f, flags)); return; + } case '\\': if (flags & PM_ESCAPE_FLAG_CONTROL) { pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL_REPEAT); @@ -6377,14 +6462,16 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { parser->current.end++; escape_read(parser, buffer, flags | PM_ESCAPE_FLAG_CONTROL); return; - default: + default: { if (!char_is_ascii_printable(peeked)) { pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL); return; } + parser->current.end++; - pm_buffer_append_u8(buffer, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); + escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); return; + } } } case 'M': { @@ -6413,7 +6500,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { } parser->current.end++; - pm_buffer_append_u8(buffer, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META)); + escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META)); return; } case '\r': { @@ -8106,6 +8193,7 @@ parser_lex(pm_parser_t *parser) { // characters. const uint8_t *breakpoints = lex_mode->as.regexp.breakpoints; const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end); + pm_token_buffer_t token_buffer = { 0 }; while (breakpoint != NULL) { // If we hit a null byte, skip directly past it. @@ -8150,11 +8238,12 @@ parser_lex(pm_parser_t *parser) { // first. if (breakpoint > parser->current.start) { parser->current.end = breakpoint; + pm_token_buffer_flush(parser, &token_buffer); LEX(PM_TOKEN_STRING_CONTENT); } - // Since we've hit the terminator of the regular expression, we now - // need to parse the options. + // Since we've hit the terminator of the regular expression, + // we now need to parse the options. parser->current.end = breakpoint + 1; parser->current.end += pm_strspn_regexp_option(parser->current.end, parser->end - parser->current.end); @@ -8167,44 +8256,77 @@ parser_lex(pm_parser_t *parser) { // literally. In this case we'll skip past the next character // and find the next breakpoint. if (*breakpoint == '\\') { - size_t difference = pm_unescape_calculate_difference(parser, breakpoint, PM_UNESCAPE_ALL); - if (difference == 0) { - // we're at the end of the file - breakpoint = NULL; - continue; - } + parser->current.end = breakpoint + 1; + pm_token_buffer_escape(parser, &token_buffer); - // If the result is an escaped newline ... - if (breakpoint[difference - 1] == '\n') { - if (parser->heredoc_end) { - // ... if we are on the same line as a heredoc, flush the heredoc and - // continue parsing after heredoc_end. - parser->current.end = breakpoint + difference; - parser_flush_heredoc_end(parser); - LEX(PM_TOKEN_STRING_CONTENT); - } else { - // ... else track the newline. - pm_newline_list_append(&parser->newline_list, breakpoint + difference - 1); - } + uint8_t peeked = peek(parser); + switch (peeked) { + case '\r': + parser->current.end++; + if (peek(parser) != '\n') { + pm_token_buffer_push(&token_buffer, '\\'); + pm_token_buffer_push(&token_buffer, '\r'); + break; + } + /* fallthrough */ + case '\n': + if (parser->heredoc_end) { + // ... if we are on the same line as a heredoc, + // flush the heredoc and continue parsing after + // heredoc_end. + parser_flush_heredoc_end(parser); + pm_token_buffer_copy(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } else { + // ... else track the newline. + pm_newline_list_append(&parser->newline_list, parser->current.end); + } + + parser->current.end++; + break; + case 'c': + case 'C': + case 'M': + case 'u': + case 'x': + escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_REGEXP); + break; + default: + if (lex_mode->as.regexp.terminator == '/' && peeked == '/') { + pm_token_buffer_push(&token_buffer, peeked); + parser->current.end++; + break; + } + + if (peeked < 0x80) pm_token_buffer_push(&token_buffer, '\\'); + pm_token_buffer_push(&token_buffer, peeked); + parser->current.end++; + break; } - breakpoint = pm_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference)); + token_buffer.cursor = parser->current.end; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end); continue; } // If we hit a #, then we will attempt to lex interpolation. if (*breakpoint == '#') { pm_token_type_t type = lex_interpolation(parser, breakpoint); - if (type != PM_TOKEN_NOT_PROVIDED) { - LEX(type); + + if (type == PM_TOKEN_NOT_PROVIDED) { + // If we haven't returned at this point then we had + // something that looked like an interpolated class or + // instance variable like "#@" but wasn't actually. In + // this case we'll just skip to the next breakpoint. + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end); + continue; } - // If we haven't returned at this point then we had - // something that looked like an interpolated class or - // instance variable like "#@" but wasn't actually. In this - // case we'll just skip to the next breakpoint. - breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end); - continue; + if (type == PM_TOKEN_STRING_CONTENT) { + pm_token_buffer_flush(parser, &token_buffer); + } + + LEX(type); } // If we've hit the incrementor, then we need to skip past it @@ -8713,34 +8835,6 @@ parser_lex(pm_parser_t *parser) { /* Parse functions */ /******************************************************************************/ -// When we are parsing certain content, we need to unescape the content to -// provide to the consumers of the parser. The following functions accept a range -// of characters from the source and unescapes into the provided type. -// -// We have functions for unescaping regular expression nodes, string nodes, -// symbol nodes, and xstring nodes -static pm_regular_expression_node_t * -pm_regular_expression_node_create_and_unescape(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, pm_unescape_type_t unescape_type) { - pm_regular_expression_node_t *node = pm_regular_expression_node_create(parser, opening, content, closing); - - assert((content->end - content->start) >= 0); - pm_string_shared_init(&node->unescaped, content->start, content->end); - - pm_unescape_manipulate_string(parser, &node->unescaped, unescape_type); - return node; -} - -static pm_string_node_t * -pm_string_node_create_and_unescape(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, pm_unescape_type_t unescape_type) { - pm_string_node_t *node = pm_string_node_create(parser, opening, content, closing); - - assert((content->end - content->start) >= 0); - pm_string_shared_init(&node->unescaped, content->start, content->end); - - pm_unescape_manipulate_string(parser, &node->unescaped, unescape_type); - return node; -} - // These are the various precedence rules. Because we are using a Pratt parser, // they are named binding power to represent the manner in which nodes are bound // together in the stack. @@ -10785,25 +10879,12 @@ parse_string_part(pm_parser_t *parser) { // "aaa #{bbb} #@ccc ddd" // ^^^^ ^ ^^^^ case PM_TOKEN_STRING_CONTENT: { - pm_unescape_type_t unescape_type = PM_UNESCAPE_ALL; - - if (parser->lex_modes.current->mode == PM_LEX_HEREDOC) { - if (parser->lex_modes.current->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) { - // If we're in a tilde heredoc, we want to unescape it later - // because we don't want unescaped newlines to disappear - // before we handle them in the dedent. - unescape_type = PM_UNESCAPE_NONE; - } else if (parser->lex_modes.current->as.heredoc.quote == PM_HEREDOC_QUOTE_SINGLE) { - unescape_type = PM_UNESCAPE_MINIMAL; - } - } - - parser_lex(parser); - pm_token_t opening = not_provided(parser); pm_token_t closing = not_provided(parser); + pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); - return (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &parser->previous, &closing, unescape_type); + parser_lex(parser); + return node; } // Here the lexer has returned the beginning of an embedded expression. In // that case we'll parse the inner statements and return that as the part. @@ -10946,15 +11027,7 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s } // Now we can parse the first part of the symbol. - pm_node_t *part; - if (match1(parser, PM_TOKEN_STRING_CONTENT)) { - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); - parser_lex(parser); - } else { - part = parse_string_part(parser); - } + pm_node_t *part = parse_string_part(parser); // If we got a string part, then it's possible that we could transform // what looks like an interpolated symbol into a regular symbol. @@ -10971,16 +11044,7 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s if (part) pm_node_list_append(&node_list, part); while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { - if (match1(parser, PM_TOKEN_STRING_CONTENT)) { - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); - parser_lex(parser); - } else { - part = parse_string_part(parser); - } - - if (part != NULL) { + if ((part = parse_string_part(parser)) != NULL) { pm_node_list_append(&node_list, part); } } @@ -12026,10 +12090,7 @@ parse_strings(pm_parser_t *parser) { pm_node_list_append(&parts, part); while (!match3(parser, PM_TOKEN_STRING_END, PM_TOKEN_LABEL_END, PM_TOKEN_EOF)) { - if (match1(parser, PM_TOKEN_STRING_CONTENT)) { - pm_node_list_append(&parts, (pm_node_t *) pm_string_node_create_current_string(parser, &string_opening, &parser->current, &string_closing)); - parser_lex(parser); - } else if ((part = parse_string_part(parser)) != NULL) { + if ((part = parse_string_part(parser)) != NULL) { pm_node_list_append(&parts, part); } } @@ -12046,15 +12107,10 @@ parse_strings(pm_parser_t *parser) { // string content, in which case we need to parse the string as an // interpolated string. pm_node_list_t parts = PM_EMPTY_NODE_LIST; - pm_token_t string_opening = not_provided(parser); - pm_token_t string_closing = not_provided(parser); - pm_node_t *part = NULL; + pm_node_t *part; while (!match3(parser, PM_TOKEN_STRING_END, PM_TOKEN_LABEL_END, PM_TOKEN_EOF)) { - if (match1(parser, PM_TOKEN_STRING_CONTENT)) { - pm_node_list_append(&parts, (pm_node_t *) pm_string_node_create_current_string(parser, &string_opening, &parser->current, &string_closing)); - parser_lex(parser); - } else if ((part = parse_string_part(parser)) != NULL) { + if ((part = parse_string_part(parser)) != NULL) { pm_node_list_append(&parts, part); } } @@ -12529,103 +12585,92 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) { } node->location.end = opening.end; + } else if ((part = parse_string_part(parser)) == NULL) { + // If we get here, then we tried to find something in the + // heredoc but couldn't actually parse anything, so we'll just + // return a missing node. + node = (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end); + } else if (PM_NODE_TYPE_P(part, PM_STRING_NODE) && match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) { + // If we get here, then the part that we parsed was plain string + // content and we're at the end of the heredoc, so we can return + // just a string node with the heredoc opening and closing as + // its opening and closing. + pm_string_node_t *cast = (pm_string_node_t *) part; + + cast->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening); + cast->closing_loc = PM_LOCATION_TOKEN_VALUE(&parser->current); + cast->base.location = cast->opening_loc; + + if (quote == PM_HEREDOC_QUOTE_BACKTICK) { + assert(sizeof(pm_string_node_t) == sizeof(pm_x_string_node_t)); + cast->base.type = PM_X_STRING_NODE; + } + + size_t common_whitespace = lex_mode->as.heredoc.common_whitespace; + if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) { + parse_heredoc_dedent_string(&cast->unescaped, common_whitespace); + } + + node = (pm_node_t *) cast; + lex_state_set(parser, PM_LEX_STATE_END); + expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM); } else { - if (match1(parser, PM_TOKEN_STRING_CONTENT)) { - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); - parser_lex(parser); - } else { - part = parse_string_part(parser); - } - - if (part == NULL) { - // If we get here, then we tried to find something in the - // heredoc but couldn't actually parse anything, so we'll just - // return a missing node. - node = (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end); - } else if (PM_NODE_TYPE_P(part, PM_STRING_NODE) && match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) { - // If we get here, then the part that we parsed was plain string - // content and we're at the end of the heredoc, so we can return - // just a string node with the heredoc opening and closing as - // its opening and closing. - pm_string_node_t *cast = (pm_string_node_t *) part; - - cast->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening); - cast->closing_loc = PM_LOCATION_TOKEN_VALUE(&parser->current); - cast->base.location = cast->opening_loc; + // If we get here, then we have multiple parts in the heredoc, + // so we'll need to create an interpolated string node to hold + // them all. + pm_node_list_t parts = PM_EMPTY_NODE_LIST; + pm_node_list_append(&parts, part); - if (quote == PM_HEREDOC_QUOTE_BACKTICK) { - assert(sizeof(pm_string_node_t) == sizeof(pm_x_string_node_t)); - cast->base.type = PM_X_STRING_NODE; + while (!match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) { + if (match1(parser, PM_TOKEN_STRING_CONTENT)) { + pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); + parser_lex(parser); + } else { + part = parse_string_part(parser); } - size_t common_whitespace = lex_mode->as.heredoc.common_whitespace; - if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) { - parse_heredoc_dedent_string(&cast->unescaped, common_whitespace); + if (part != NULL) { + pm_node_list_append(&parts, part); } + } + + // Now that we have all of the parts, create the correct type of + // interpolated node. + if (quote == PM_HEREDOC_QUOTE_BACKTICK) { + pm_interpolated_x_string_node_t *cast = pm_interpolated_xstring_node_create(parser, &opening, &opening); + cast->parts = parts; - node = (pm_node_t *) cast; lex_state_set(parser, PM_LEX_STATE_END); expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM); + + pm_interpolated_xstring_node_closing_set(cast, &parser->previous); + cast->base.location = cast->opening_loc; + node = (pm_node_t *) cast; } else { - // If we get here, then we have multiple parts in the heredoc, - // so we'll need to create an interpolated string node to hold - // them all. - pm_node_list_t parts = PM_EMPTY_NODE_LIST; - pm_node_list_append(&parts, part); + pm_interpolated_string_node_t *cast = pm_interpolated_string_node_create(parser, &opening, &parts, &opening); - while (!match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) { - if (match1(parser, PM_TOKEN_STRING_CONTENT)) { - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); - parser_lex(parser); - } else { - part = parse_string_part(parser); - } + lex_state_set(parser, PM_LEX_STATE_END); + expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM); - if (part != NULL) { - pm_node_list_append(&parts, part); - } - } + pm_interpolated_string_node_closing_set(cast, &parser->previous); + cast->base.location = cast->opening_loc; + node = (pm_node_t *) cast; + } - // Now that we have all of the parts, create the correct type of - // interpolated node. + // If this is a heredoc that is indented with a ~, then we need + // to dedent each line by the common leading whitespace. + size_t common_whitespace = lex_mode->as.heredoc.common_whitespace; + if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) { + pm_node_list_t *nodes; if (quote == PM_HEREDOC_QUOTE_BACKTICK) { - pm_interpolated_x_string_node_t *cast = pm_interpolated_xstring_node_create(parser, &opening, &opening); - cast->parts = parts; - - lex_state_set(parser, PM_LEX_STATE_END); - expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM); - - pm_interpolated_xstring_node_closing_set(cast, &parser->previous); - cast->base.location = cast->opening_loc; - node = (pm_node_t *) cast; + nodes = &((pm_interpolated_x_string_node_t *) node)->parts; } else { - pm_interpolated_string_node_t *cast = pm_interpolated_string_node_create(parser, &opening, &parts, &opening); - - lex_state_set(parser, PM_LEX_STATE_END); - expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM); - - pm_interpolated_string_node_closing_set(cast, &parser->previous); - cast->base.location = cast->opening_loc; - node = (pm_node_t *) cast; + nodes = &((pm_interpolated_string_node_t *) node)->parts; } - // If this is a heredoc that is indented with a ~, then we need - // to dedent each line by the common leading whitespace. - size_t common_whitespace = lex_mode->as.heredoc.common_whitespace; - if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) { - pm_node_list_t *nodes; - if (quote == PM_HEREDOC_QUOTE_BACKTICK) { - nodes = &((pm_interpolated_x_string_node_t *) node)->parts; - } else { - nodes = &((pm_interpolated_string_node_t *) node)->parts; - } - - parse_heredoc_dedent(parser, nodes, common_whitespace); - } + parse_heredoc_dedent(parser, nodes, common_whitespace); } } @@ -13586,11 +13631,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) { if (match1(parser, PM_TOKEN_STRING_CONTENT)) { pm_token_t opening = not_provided(parser); pm_token_t closing = not_provided(parser); - - pm_symbol_node_t *symbol = (pm_symbol_node_t *) pm_symbol_node_create(parser, &opening, &parser->current, &closing); - symbol->unescaped = parser->current_string; - - pm_array_node_elements_append(array, (pm_node_t *) symbol); + pm_array_node_elements_append(array, (pm_node_t *) pm_symbol_node_create_current_string(parser, &opening, &parser->current, &closing)); } expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_I_LOWER_ELEMENT); @@ -13633,11 +13674,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) { // If we hit content and the current node is NULL, then this is // the first string content we've seen. In that case we're going // to create a new string node and set that to the current. - pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->current, &closing); - symbol->unescaped = parser->current_string; - + current = (pm_node_t *) pm_symbol_node_create_current_string(parser, &opening, &parser->current, &closing); parser_lex(parser); - current = (pm_node_t *) symbol; } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_SYMBOL_NODE)) { // If we hit string content and the current node is an // interpolated string, then we need to append the string content @@ -13930,6 +13968,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) { // expression at least has something in it. We'll need to check if the // following token is the end (in which case we can return a plain // regular expression) or if it's not then it has interpolation. + pm_string_t unescaped = parser->current_string; pm_token_t content = parser->current; parser_lex(parser); @@ -13937,7 +13976,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) { // without interpolation, which can be represented more succinctly and // more easily compiled. if (accept1(parser, PM_TOKEN_REGEXP_END)) { - return (pm_node_t *) pm_regular_expression_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_ALL); + return (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped); } // If we get here, then we have interpolation so we'll need to create @@ -13946,7 +13985,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) { pm_token_t opening = not_provided(parser); pm_token_t closing = not_provided(parser); - pm_node_t *part = (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &parser->previous, &closing, PM_UNESCAPE_ALL); + pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped); pm_interpolated_regular_expression_node_append(node, part); } else { // If the first part of the body of the regular expression is not a @@ -13957,9 +13996,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) { // Now that we're here and we have interpolation, we'll parse all of the // parts into the list. + pm_node_t *part; while (!match2(parser, PM_TOKEN_REGEXP_END, PM_TOKEN_EOF)) { - pm_node_t *part = parse_string_part(parser); - if (part != NULL) { + if ((part = parse_string_part(parser)) != NULL) { pm_interpolated_regular_expression_node_append(node, part); } } @@ -14023,19 +14062,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) { node = pm_interpolated_xstring_node_create(parser, &opening, &opening); } + pm_node_t *part; while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { - pm_node_t *part = NULL; - - if (match1(parser, PM_TOKEN_STRING_CONTENT)) { - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); - parser_lex(parser); - } else { - part = parse_string_part(parser); - } - - if (part != NULL) { + if ((part = parse_string_part(parser)) != NULL) { pm_interpolated_xstring_node_append(node, part); } } |