about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: Haldun Bayhantopcu <haldun@github.com> 2023-12-01 19:59:50 +0100
committer: git <svn-admin@ruby-lang.org> 2023-12-01 20:10:58 +0000
commit: 562d949e022cb3e7288256af8d0df3f4e17b66cb (patch)
tree: a851552964f82540eb2e4f7ffb37e6df8610b500
parent: 39238888bc784eb5887d899dc09fad30997464ac (diff)
download: ruby-562d949e022cb3e7288256af8d0df3f4e17b66cb.tar.gz
[ruby/prism] Fix parsing heredoc ends
https://github.com/ruby/prism/commit/aa8c702271
-rw-r--r-- prism/prism.c 95
-rw-r--r-- test/prism/fixtures/heredocs_leading_whitespace.txt 19
-rw-r--r-- test/prism/locals_test.rb 7
-rw-r--r-- test/prism/parse_test.rb 5
-rw-r--r-- test/prism/snapshots/heredocs_leading_whitespace.txt 49
5 files changed, 147 insertions, 28 deletions
diff --git a/prism/prism.c b/prism/prism.c
index 21dfd58c67..9021f5b0f8 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -9761,24 +9761,43 @@ parser_lex(pm_parser_t *parser) {
// terminator, then we need to return the ending of the heredoc.
if (current_token_starts_line(parser)) {
const uint8_t *start = parser->current.start;
- size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
-
- if ((start + ident_length <= parser->end) && (memcmp(start, ident_start, ident_length) == 0)) {
- bool matched = true;
+ if (start + ident_length <= parser->end) {
bool at_end = false;
+ const uint8_t *newline = next_newline(start, parser->end - start);
+ const uint8_t *ident_end = newline;
+ const uint8_t *terminator_end = newline;
- size_t eol_length = match_eol_at(parser, start + ident_length);
- if (eol_length) {
- parser->current.end = start + ident_length + eol_length;
- pm_newline_list_append(&parser->newline_list, parser->current.end - 1);
- } else if (parser->end == (start + ident_length)) {
- parser->current.end = start + ident_length;
+ if (newline == NULL) {
+ terminator_end = parser->end;
+ ident_end = parser->end;
at_end = true;
} else {
- matched = false;
+ terminator_end++;
+ if (newline[-1] == '\r') {
+ ident_end--; // Remove \r
+ }
+ }
+
+ const uint8_t *terminator_start = ident_end - ident_length;
+ const uint8_t *cursor = start;
+
+ if (
+ lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_DASH ||
+ lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE
+ ) {
+ while (cursor < terminator_start && pm_char_is_inline_whitespace(*cursor)) {
+ cursor++;
+ }
}
- if (matched) {
+ if (
+ (cursor == terminator_start) &&
+ (memcmp(terminator_start, ident_start, ident_length) == 0)
+ ) {
+ if (newline != NULL) {
+ pm_newline_list_append(&parser->newline_list, newline);
+ }
+ parser->current.end = terminator_end;
if (*lex_mode->as.heredoc.next_start == '\\') {
parser->next_start = NULL;
} else {
@@ -9794,7 +9813,7 @@ parser_lex(pm_parser_t *parser) {
LEX(PM_TOKEN_HEREDOC_END);
}
}
-
+ size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
if (
lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE &&
(lex_mode->as.heredoc.common_whitespace > whitespace) &&
@@ -9838,23 +9857,35 @@ parser_lex(pm_parser_t *parser) {
// If we have a - or ~ heredoc, then we can match after
// some leading whitespace.
const uint8_t *start = breakpoint + 1;
- size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
- // If we have hit a newline that is followed by a valid
- // terminator, then we need to return the content of the
- // heredoc here as string content. Then, the next time a
- // token is lexed, it will match again and return the
- // end of the heredoc.
- if (
- !was_escaped_newline &&
- (start + ident_length <= parser->end) &&
- (memcmp(start, ident_start, ident_length) == 0)
- ) {
- // Heredoc terminators must be followed by a
- // newline, CRLF, or EOF to be valid.
+ if (!was_escaped_newline && (start + ident_length <= parser->end)) {
+ // We want to match the terminator starting from the end of the line in case
+ // there is whitespace in the ident such as <<-' DOC' or <<~' DOC'.
+ const uint8_t *newline = next_newline(start, parser->end - start);
+
+ if (newline == NULL) {
+ newline = parser->end;
+ } else if (newline[-1] == '\r') {
+ newline--; // Remove \r
+ }
+
+ // Start of a possible terminator.
+ const uint8_t *terminator_start = newline - ident_length;
+
+ // Cursor to check for the leading whitespace. We skip the
+ // leading whitespace if we have a - or ~ heredoc.
+ const uint8_t *cursor = start;
+
+ if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_DASH ||
+ lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
+ while (cursor < terminator_start && pm_char_is_inline_whitespace(*cursor)) {
+ cursor++;
+ }
+ }
+
if (
- start + ident_length == parser->end ||
- match_eol_at(parser, start + ident_length)
+ cursor == terminator_start &&
+ (memcmp(terminator_start, ident_start, ident_length) == 0)
) {
parser->current.end = breakpoint + 1;
pm_token_buffer_flush(parser, &token_buffer);
@@ -9862,6 +9893,14 @@ parser_lex(pm_parser_t *parser) {
}
}
+ size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
+
+ // If we have hit a newline that is followed by a valid
+ // terminator, then we need to return the content of the
+ // heredoc here as string content. Then, the next time a
+ // token is lexed, it will match again and return the
+ // end of the heredoc.
+
if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
if ((lex_mode->as.heredoc.common_whitespace > whitespace) && peek_at(parser, start) != '\n') {
lex_mode->as.heredoc.common_whitespace = whitespace;
diff --git a/test/prism/fixtures/heredocs_leading_whitespace.txt b/test/prism/fixtures/heredocs_leading_whitespace.txt
new file mode 100644
index 0000000000..e786f08774
--- /dev/null
+++ b/test/prism/fixtures/heredocs_leading_whitespace.txt
@@ -0,0 +1,19 @@
+<<-' FOO'
+a
+b
+ FOO
+
+<<-' FOO'
+a
+b
+ FOO
+
+<<~' FOO'
+a
+b
+ FOO
+
+<<~' FOO'
+a
+b
+ FOO
diff --git a/test/prism/locals_test.rb b/test/prism/locals_test.rb
index 06324f9d94..df391ca048 100644
--- a/test/prism/locals_test.rb
+++ b/test/prism/locals_test.rb
@@ -68,6 +68,13 @@ module Prism
# HERE
todos << "seattlerb/heredoc_nested.txt"
+ # Ruby < 3.3.0 fails to parse:
+ #
+ # <<-' HERE'
+ # foo
+ # HERE
+ invalid << "heredocs_leading_whitespace.txt" if RUBY_VERSION < "3.3.0"
+
base = File.join(__dir__, "fixtures")
skips = invalid | todos
diff --git a/test/prism/parse_test.rb b/test/prism/parse_test.rb
index 2feb15b48b..e2de55463c 100644
--- a/test/prism/parse_test.rb
+++ b/test/prism/parse_test.rb
@@ -111,6 +111,11 @@ module Prism
# Additionally, Ripper cannot parse the %w[] fixture in this file, so set ripper_should_parse to false.
ripper_should_parse = false if relative == "spanning_heredoc.txt"
+ # Ruby < 3.3.0 cannot parse heredocs where there are leading whitespace characters in the heredoc start.
+ # Example: <<~' EOF' or <<-' EOF'
+ # https://bugs.ruby-lang.org/issues/19539
+ ripper_should_parse = false if relative == "heredocs_leading_whitespace.txt" && RUBY_VERSION < "3.3.0"
+
define_method "test_filepath_#{relative}" do
# First, read the source from the filepath. Use binmode to avoid converting CRLF on Windows,
# and explicitly set the external encoding to UTF-8 to override the binmode default.
diff --git a/test/prism/snapshots/heredocs_leading_whitespace.txt b/test/prism/snapshots/heredocs_leading_whitespace.txt
new file mode 100644
index 0000000000..06116821ca
--- /dev/null
+++ b/test/prism/snapshots/heredocs_leading_whitespace.txt
@@ -0,0 +1,49 @@
+@ ProgramNode (location: (1,0)-(16,10))
+├── locals: []
+└── statements:
+ @ StatementsNode (location: (1,0)-(16,10))
+ └── body: (length: 4)
+ ├── @ StringNode (location: (1,0)-(1,10))
+ │ ├── flags: ∅
+ │ ├── opening_loc: (1,0)-(1,10) = "<<-' FOO'"
+ │ ├── content_loc: (2,0)-(4,0) = "a\nb\n"
+ │ ├── closing_loc: (4,0)-(5,0) = " FOO\n"
+ │ └── unescaped: "a\nb\n"
+ ├── @ StringNode (location: (6,0)-(6,10))
+ │ ├── flags: ∅
+ │ ├── opening_loc: (6,0)-(6,10) = "<<-' FOO'"
+ │ ├── content_loc: (7,0)-(9,0) = "a\nb\n"
+ │ ├── closing_loc: (9,0)-(10,0) = " FOO\n"
+ │ └── unescaped: "a\nb\n"
+ ├── @ InterpolatedStringNode (location: (11,0)-(11,10))
+ │ ├── opening_loc: (11,0)-(11,10) = "<<~' FOO'"
+ │ ├── parts: (length: 2)
+ │ │ ├── @ StringNode (location: (12,0)-(13,0))
+ │ │ │ ├── flags: ∅
+ │ │ │ ├── opening_loc: ∅
+ │ │ │ ├── content_loc: (12,0)-(13,0) = "a\n"
+ │ │ │ ├── closing_loc: ∅
+ │ │ │ └── unescaped: "a\n"
+ │ │ └── @ StringNode (location: (13,0)-(14,0))
+ │ │ ├── flags: ∅
+ │ │ ├── opening_loc: ∅
+ │ │ ├── content_loc: (13,0)-(14,0) = "b\n"
+ │ │ ├── closing_loc: ∅
+ │ │ └── unescaped: "b\n"
+ │ └── closing_loc: (14,0)-(15,0) = " FOO\n"
+ └── @ InterpolatedStringNode (location: (16,0)-(16,10))
+ ├── opening_loc: (16,0)-(16,10) = "<<~' FOO'"
+ ├── parts: (length: 2)
+ │ ├── @ StringNode (location: (17,0)-(18,0))
+ │ │ ├── flags: ∅
+ │ │ ├── opening_loc: ∅
+ │ │ ├── content_loc: (17,0)-(18,0) = "a\n"
+ │ │ ├── closing_loc: ∅
+ │ │ └── unescaped: "a\n"
+ │ └── @ StringNode (location: (18,0)-(19,0))
+ │ ├── flags: ∅
+ │ ├── opening_loc: ∅
+ │ ├── content_loc: (18,0)-(19,0) = "b\n"
+ │ ├── closing_loc: ∅
+ │ └── unescaped: "b\n"
+ └── closing_loc: (19,0)-(20,0) = " FOO\n"