[ruby/prism] Move DATA parsing into its own parse result field

https://github.com/ruby/prism/commit/42b60b6e95
author: Kevin Newton <kddnewton@gmail.com> 2023-11-27 14:17:02 -0500
committer: git <svn-admin@ruby-lang.org> 2023-11-28 13:25:48 +0000
commit: c798943a4a272f213d21295a837da06ed5fa9a51 (patch)
tree: 618a074700e2d501beec4db66320e2e1f5a6f085
parent: 43dc8e9012dd7c390f1299d1b653656c81ae2aa7 (diff)
download: ruby-c798943a4a272f213d21295a837da06ed5fa9a51.tar.gz
9 files changed, 76 insertions, 74 deletions
diff --git a/lib/prism/ffi.rb b/lib/prism/ffi.rb
index 847990ed9a..36f1c398de 100644
--- a/lib/prism/ffi.rb
+++ b/lib/prism/ffi.rb
@@ -254,10 +254,10 @@ module Prism
         loader = Serialize::Loader.new(source, buffer.read)
 
         tokens = loader.load_tokens
-        node, comments, magic_comments, errors, warnings = loader.load_nodes
+        node, comments, magic_comments, data_loc, errors, warnings = loader.load_nodes
         tokens.each { |token,| token.value.force_encoding(loader.encoding) }
 
-        ParseResult.new([node, tokens], comments, magic_comments, errors, warnings, source)
+        ParseResult.new([node, tokens], comments, magic_comments, data_loc, errors, warnings, source)
       end
     end
 
diff --git a/lib/prism/lex_compat.rb b/lib/prism/lex_compat.rb
index b6d12053a0..66be275bcd 100644
--- a/lib/prism/lex_compat.rb
+++ b/lib/prism/lex_compat.rb
@@ -831,7 +831,7 @@ module Prism
       # We sort by location to compare against Ripper's output
       tokens.sort_by!(&:location)
 
-      ParseResult.new(tokens, result.comments, result.magic_comments, result.errors, result.warnings, [])
+      ParseResult.new(tokens, result.comments, result.magic_comments, result.data_loc, result.errors, result.warnings, [])
     end
   end
 
diff --git a/lib/prism/parse_result.rb b/lib/prism/parse_result.rb
index 50c23bce65..753d72f10b 100644
--- a/lib/prism/parse_result.rb
+++ b/lib/prism/parse_result.rb
@@ -238,11 +238,6 @@ module Prism
     def deconstruct_keys(keys)
       { location: location }
     end
-
-    # This can only be true for inline comments.
-    def trailing?
-      false
-    end
   end
 
   # InlineComment objects are the most common. They correspond to comments in
@@ -263,18 +258,14 @@ module Prism
   # EmbDocComment objects correspond to comments that are surrounded by =begin
   # and =end.
   class EmbDocComment < Comment
-    # Returns a string representation of this comment.
-    def inspect
-      "#<Prism::EmbDocComment @location=#{location.inspect}>"
+    # This can only be true for inline comments.
+    def trailing?
+      false
     end
-  end
 
-  # DATAComment objects correspond to comments that are after the __END__
-  # keyword in a source file.
-  class DATAComment < Comment
     # Returns a string representation of this comment.
     def inspect
-      "#<Prism::DATAComment @location=#{location.inspect}>"
+      "#<Prism::EmbDocComment @location=#{location.inspect}>"
     end
   end
 
@@ -378,6 +369,11 @@ module Prism
     # The list of magic comments that were encountered during parsing.
     attr_reader :magic_comments
 
+    # An optional location that represents the location of the content after the
+    # __END__ marker. This content is loaded into the DATA constant when the
+    # file being parsed is the main file being executed.
+    attr_reader :data_loc
+
     # The list of errors that were generated during parsing.
     attr_reader :errors
 
@@ -388,10 +384,11 @@ module Prism
     attr_reader :source
 
     # Create a new parse result object with the given values.
-    def initialize(value, comments, magic_comments, errors, warnings, source)
+    def initialize(value, comments, magic_comments, data_loc, errors, warnings, source)
       @value = value
       @comments = comments
       @magic_comments = magic_comments
+      @data_loc = data_loc
       @errors = errors
       @warnings = warnings
       @source = source
@@ -399,7 +396,7 @@ module Prism
 
     # Implement the hash pattern matching interface for ParseResult.
     def deconstruct_keys(keys)
-      { value: value, comments: comments, magic_comments: magic_comments, errors: errors, warnings: warnings }
+      { value: value, comments: comments, magic_comments: magic_comments, data_loc: data_loc, errors: errors, warnings: warnings }
     end
 
     # Returns true if there were no errors during parsing and false if there
diff --git a/prism/extension.c b/prism/extension.c
index 9ecd1e30da..3637cc1617 100644
--- a/prism/extension.c
+++ b/prism/extension.c
@@ -12,7 +12,6 @@ VALUE rb_cPrismLocation;
 VALUE rb_cPrismComment;
 VALUE rb_cPrismInlineComment;
 VALUE rb_cPrismEmbDocComment;
-VALUE rb_cPrismDATAComment;
 VALUE rb_cPrismMagicComment;
 VALUE rb_cPrismParseError;
 VALUE rb_cPrismParseWarning;
@@ -320,22 +319,7 @@ parser_comments(pm_parser_t *parser, VALUE source) {
             LONG2FIX(comment->end - comment->start)
         };
 
-        VALUE type;
-        switch (comment->type) {
-            case PM_COMMENT_INLINE:
-                type = rb_cPrismInlineComment;
-                break;
-            case PM_COMMENT_EMBDOC:
-                type = rb_cPrismEmbDocComment;
-                break;
-            case PM_COMMENT___END__:
-                type = rb_cPrismDATAComment;
-                break;
-            default:
-                type = rb_cPrismInlineComment;
-                break;
-        }
-
+        VALUE type = (comment->type == PM_COMMENT_EMBDOC) ? rb_cPrismEmbDocComment : rb_cPrismInlineComment;
         VALUE comment_argv[] = { rb_class_new_instance(3, location_argv, rb_cPrismLocation) };
         rb_ary_push(comments, rb_class_new_instance(1, comment_argv, type));
     }
@@ -375,6 +359,25 @@ parser_magic_comments(pm_parser_t *parser, VALUE source) {
 }
 
 /**
+ * Extract out the data location from the parser into a Location instance if one
+ * exists.
+ */
+static VALUE
+parser_data_loc(const pm_parser_t *parser, VALUE source) {
+    if (parser->data_loc.end == NULL) {
+        return Qnil;
+    } else {
+        VALUE argv[] = {
+            source,
+            LONG2FIX(parser->data_loc.start - parser->start),
+            LONG2FIX(parser->data_loc.end - parser->data_loc.start)
+        };
+
+        return rb_class_new_instance(3, argv, rb_cPrismLocation);
+    }
+}
+
+/**
  * Extract the errors out of the parser into an array.
  */
 static VALUE
@@ -531,6 +534,7 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod
         value,
         parser_comments(&parser, source),
         parser_magic_comments(&parser, source),
+        parser_data_loc(&parser, source),
         parser_errors(&parser, parse_lex_data.encoding, source),
         parser_warnings(&parser, parse_lex_data.encoding, source),
         source
@@ -538,7 +542,7 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod
 
     pm_node_destroy(&parser, node);
     pm_parser_free(&parser);
-    return rb_class_new_instance(6, result_argv, rb_cPrismParseResult);
+    return rb_class_new_instance(7, result_argv, rb_cPrismParseResult);
 }
 
 /**
@@ -601,12 +605,13 @@ parse_input(pm_string_t *input, const pm_options_t *options) {
         pm_ast_new(&parser, node, encoding),
         parser_comments(&parser, source),
         parser_magic_comments(&parser, source),
+        parser_data_loc(&parser, source),
         parser_errors(&parser, encoding, source),
         parser_warnings(&parser, encoding, source),
         source
     };
 
-    VALUE result = rb_class_new_instance(6, result_argv, rb_cPrismParseResult);
+    VALUE result = rb_class_new_instance(7, result_argv, rb_cPrismParseResult);
 
     pm_node_destroy(&parser, node);
     pm_parser_free(&parser);
@@ -938,7 +943,6 @@ Init_prism(void) {
     rb_cPrismComment = rb_define_class_under(rb_cPrism, "Comment", rb_cObject);
     rb_cPrismInlineComment = rb_define_class_under(rb_cPrism, "InlineComment", rb_cPrismComment);
     rb_cPrismEmbDocComment = rb_define_class_under(rb_cPrism, "EmbDocComment", rb_cPrismComment);
-    rb_cPrismDATAComment = rb_define_class_under(rb_cPrism, "DATAComment", rb_cPrismComment);
     rb_cPrismMagicComment = rb_define_class_under(rb_cPrism, "MagicComment", rb_cObject);
     rb_cPrismParseError = rb_define_class_under(rb_cPrism, "ParseError", rb_cObject);
     rb_cPrismParseWarning = rb_define_class_under(rb_cPrism, "ParseWarning", rb_cObject);
diff --git a/prism/parser.h b/prism/parser.h
index e3c93b4246..86442d2a22 100644
--- a/prism/parser.h
+++ b/prism/parser.h
@@ -361,8 +361,7 @@ typedef struct pm_context_node {
 /** This is the type of a comment that we've found while parsing. */
 typedef enum {
     PM_COMMENT_INLINE,
-    PM_COMMENT_EMBDOC,
-    PM_COMMENT___END__
+    PM_COMMENT_EMBDOC
 } pm_comment_type_t;
 
 /**
@@ -571,6 +570,9 @@ struct pm_parser {
     /** The list of magic comments that have been found while parsing. */
     pm_list_t magic_comment_list;
 
+    /** The optional location of the __END__ keyword and its contents. */
+    pm_location_t data_loc;
+
     /** The list of warnings that have been found while parsing. */
     pm_list_t warning_list;
 
diff --git a/prism/prism.c b/prism/prism.c
index 1751857e1e..f1c0e07760 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -9248,8 +9248,8 @@ parser_lex(pm_parser_t *parser) {
                         parser->current.type = PM_TOKEN___END__;
                         parser_lex_callback(parser);
 
-                        pm_comment_t *comment = parser_comment(parser, PM_COMMENT___END__);
-                        pm_list_append(&parser->comment_list, (pm_list_node_t *) comment);
+                        parser->data_loc.start = parser->current.start;
+                        parser->data_loc.end = parser->current.end;
 
                         LEX(PM_TOKEN_EOF);
                     }
diff --git a/prism/templates/lib/prism/serialize.rb.erb b/prism/templates/lib/prism/serialize.rb.erb
index 058142682e..681b6117b4 100644
--- a/prism/templates/lib/prism/serialize.rb.erb
+++ b/prism/templates/lib/prism/serialize.rb.erb
@@ -95,9 +95,10 @@ module Prism
       def load_metadata
         comments = load_comments
         magic_comments = load_varint.times.map { MagicComment.new(load_location, load_location) }
+        data_loc = load_optional_location
         errors = load_varint.times.map { ParseError.new(load_embedded_string, load_location) }
         warnings = load_varint.times.map { ParseWarning.new(load_embedded_string, load_location) }
-        [comments, magic_comments, errors, warnings]
+        [comments, magic_comments, data_loc, errors, warnings]
       end
 
       def load_tokens
@@ -117,11 +118,11 @@ module Prism
         tokens = load_tokens
         encoding = load_encoding
         load_start_line
-        comments, magic_comments, errors, warnings = load_metadata
+        comments, magic_comments, data_loc, errors, warnings = load_metadata
         tokens.each { |token,| token.value.force_encoding(encoding) }
 
         raise "Expected to consume all bytes while deserializing" unless @io.eof?
-        Prism::ParseResult.new(tokens, comments, magic_comments, errors, warnings, @source)
+        Prism::ParseResult.new(tokens, comments, magic_comments, data_loc, errors, warnings, @source)
       end
 
       def load_nodes
@@ -129,17 +130,17 @@ module Prism
         load_encoding
         load_start_line
 
-        comments, magic_comments, errors, warnings = load_metadata
+        comments, magic_comments, data_loc, errors, warnings = load_metadata
 
         @constant_pool_offset = io.read(4).unpack1("L")
         @constant_pool = Array.new(load_varint, nil)
 
-        [load_node, comments, magic_comments, errors, warnings]
+        [load_node, comments, magic_comments, data_loc, errors, warnings]
       end
 
       def load_result
-        node, comments, magic_comments, errors, warnings = load_nodes
-        Prism::ParseResult.new(node, comments, magic_comments, errors, warnings, @source)
+        node, comments, magic_comments, data_loc, errors, warnings = load_nodes
+        Prism::ParseResult.new(node, comments, magic_comments, data_loc, errors, warnings, @source)
       end
 
       private
diff --git a/prism/templates/src/serialize.c.erb b/prism/templates/src/serialize.c.erb
index db4c91e0cd..0ea70a3976 100644
--- a/prism/templates/src/serialize.c.erb
+++ b/prism/templates/src/serialize.c.erb
@@ -15,7 +15,7 @@ pm_sizet_to_u32(size_t value) {
 }
 
 static void
-pm_serialize_location(pm_parser_t *parser, pm_location_t *location, pm_buffer_t *buffer) {
+pm_serialize_location(const pm_parser_t *parser, const pm_location_t *location, pm_buffer_t *buffer) {
     assert(location->start);
     assert(location->end);
     assert(location->start <= location->end);
@@ -171,6 +171,16 @@ pm_serialize_magic_comment_list(pm_parser_t *parser, pm_list_t *list, pm_buffer_
 }
 
 static void
+pm_serialize_data_loc(const pm_parser_t *parser, pm_buffer_t *buffer) {
+    if (parser->data_loc.end == NULL) {
+        pm_buffer_append_byte(buffer, 0);
+    } else {
+        pm_buffer_append_byte(buffer, 1);
+        pm_serialize_location(parser, &parser->data_loc, buffer);
+    }
+}
+
+static void
 pm_serialize_diagnostic(pm_parser_t *parser, pm_diagnostic_t *diagnostic, pm_buffer_t *buffer) {
     // serialize message
     size_t message_length = strlen(diagnostic->message);
@@ -214,6 +224,7 @@ pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer)
     pm_serialize_comment_list(parser, &parser->comment_list, buffer);
 <%- end -%>
     pm_serialize_magic_comment_list(parser, &parser->magic_comment_list, buffer);
+    pm_serialize_data_loc(parser, buffer);
     pm_serialize_diagnostic_list(parser, &parser->error_list, buffer);
     pm_serialize_diagnostic_list(parser, &parser->warning_list, buffer);
 
@@ -310,6 +321,7 @@ pm_serialize_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const
     pm_buffer_append_varint(buffer, parser.start_line);
     pm_serialize_comment_list(&parser, &parser.comment_list, buffer);
     pm_serialize_magic_comment_list(&parser, &parser.magic_comment_list, buffer);
+    pm_serialize_data_loc(&parser, buffer);
     pm_serialize_diagnostic_list(&parser, &parser.error_list, buffer);
     pm_serialize_diagnostic_list(&parser, &parser.warning_list, buffer);
 
diff --git a/test/prism/comments_test.rb b/test/prism/comments_test.rb
index d14409458c..b99c00268c 100644
--- a/test/prism/comments_test.rb
+++ b/test/prism/comments_test.rb
@@ -39,37 +39,23 @@ module Prism
       )
     end
 
-    def test_comment___END__
-      source = <<~RUBY
+    def test___END__
+      result = Prism.parse(<<~RUBY)
         __END__
         comment
       RUBY
 
-      assert_comment(
-        source,
-        DATAComment,
-        start_offset: 0,
-        end_offset: 16,
-        start_line: 1,
-        end_line: 3,
-        start_column: 0,
-        end_column: 0
-      )
+      data_loc = result.data_loc
+      assert_equal 0, data_loc.start_offset
+      assert_equal 16, data_loc.end_offset
     end
 
-    def test_comment___END__crlf
-      source = "__END__\r\ncomment\r\n"
+    def test___END__crlf
+      result = Prism.parse("__END__\r\ncomment\r\n")
 
-      assert_comment(
-        source,
-        DATAComment,
-        start_offset: 0,
-        end_offset: 18,
-        start_line: 1,
-        end_line: 3,
-        start_column: 0,
-        end_column: 0
-      )
+      data_loc = result.data_loc
+      assert_equal 0, data_loc.start_offset
+      assert_equal 18, data_loc.end_offset
     end
 
     def test_comment_embedded_document
author	Kevin Newton <kddnewton@gmail.com>	2023-11-27 14:17:02 -0500
committer	git <svn-admin@ruby-lang.org>	2023-11-28 13:25:48 +0000
commit	c798943a4a272f213d21295a837da06ed5fa9a51 (patch)
tree	618a074700e2d501beec4db66320e2e1f5a6f085
parent	43dc8e9012dd7c390f1299d1b653656c81ae2aa7 (diff)
download	ruby-c798943a4a272f213d21295a837da06ed5fa9a51.tar.gz