diff options
author | Yusuke Endoh <mame@ruby-lang.org> | 2019-08-07 01:53:56 +0900 |
---|---|---|
committer | aycabta <aycabta@gmail.com> | 2019-08-16 06:02:45 +0900 |
commit | 0a0760aa632f05bc04df395d0173580042d9f730 (patch) | |
tree | 2b287cf83c29bfbba5d517f9d14a63cfd73a0621 | |
parent | 9d2fed2ccd1724d1cf42a3075c20dcc418082761 (diff) | |
download | ruby-0a0760aa632f05bc04df395d0173580042d9f730.tar.gz |
Refactor and improve performance of RDoc::Markup::Parser
This change introduces a wrapper around StringScanner that is aware of
the current position (column and lineno).
It has two advantages: it is faster and more modular.
The old code frequently ran `@input.byteslice(0, byte_offset).length`
to get the current position, but this was painfully slow. This change
keeps track of the position at each scan, which roughly halves the
time of "Generating RI format into ..." in Ruby's `make rdoc`
(5.5 sec -> 3.0 sec).
In addition, the old code used four instance variables (`@input`, `@line`,
`@line_pos`, and `@s`) to track the position. This change factors them
out into MyStringScanner, so only one variable (`@s`) is now needed.
-rw-r--r-- | lib/rdoc/markup/parser.rb | 101 | ||||
-rw-r--r-- | lib/rdoc/tom_doc.rb | 13 | ||||
-rw-r--r-- | test/rdoc/test_rdoc_markup_parser.rb | 18 |
3 files changed, 65 insertions, 67 deletions
diff --git a/lib/rdoc/markup/parser.rb b/lib/rdoc/markup/parser.rb index 14f1f6c719..600eb841ac 100644 --- a/lib/rdoc/markup/parser.rb +++ b/lib/rdoc/markup/parser.rb @@ -80,10 +80,6 @@ class RDoc::Markup::Parser @binary_input = nil @current_token = nil @debug = false - @input = nil - @input_encoding = nil - @line = 0 - @line_pos = 0 @s = nil @tokens = [] end @@ -320,13 +316,6 @@ class RDoc::Markup::Parser end ## - # The character offset for the input string at the given +byte_offset+ - - def char_pos byte_offset - @input.byteslice(0, byte_offset).length - end - - ## # Pulls the next token from the stream. def get @@ -425,14 +414,53 @@ class RDoc::Markup::Parser end ## + # A simple wrapper of StringScanner that is aware of the current column and lineno + + class MyStringScanner + def initialize(input) + @line = @column = 0 + @s = StringScanner.new input + end + + def scan(re) + prev_pos = @s.pos + ret = @s.scan(re) + @column += ret.length if ret + ret + end + + def unscan(s) + @s.pos -= s.bytesize + @column -= s.length + end + + def pos + [@column, @line] + end + + def newline! + @column = 0 + @line += 1 + end + + def eos? + @s.eos? + end + + def matched + @s.matched + end + + def [](i) + @s[i] + end + end + + ## # Creates the StringScanner def setup_scanner input - @line = 0 - @line_pos = 0 - @input = input.dup - - @s = StringScanner.new input + @s = MyStringScanner.new input end ## @@ -467,31 +495,30 @@ class RDoc::Markup::Parser @tokens << case # [CR]LF => :NEWLINE when @s.scan(/\r?\n/) then - token = [:NEWLINE, @s.matched, *token_pos(pos)] - @line_pos = char_pos @s.pos - @line += 1 + token = [:NEWLINE, @s.matched, *pos] + @s.newline! 
token # === text => :HEADER then :TEXT when @s.scan(/(=+)(\s*)/) then level = @s[1].length - header = [:HEADER, level, *token_pos(pos)] + header = [:HEADER, level, *pos] if @s[2] =~ /^\r?\n/ then - @s.pos -= @s[2].length + @s.unscan(@s[2]) header else pos = @s.pos @s.scan(/.*/) @tokens << header - [:TEXT, @s.matched.sub(/\r$/, ''), *token_pos(pos)] + [:TEXT, @s.matched.sub(/\r$/, ''), *pos] end # --- (at least 3) and nothing else on the line => :RULE when @s.scan(/(-{3,}) *\r?$/) then - [:RULE, @s[1].length - 2, *token_pos(pos)] + [:RULE, @s[1].length - 2, *pos] # * or - followed by white space and text => :BULLET when @s.scan(/([*-]) +(\S)/) then - @s.pos -= @s[2].bytesize # unget \S - [:BULLET, @s[1], *token_pos(pos)] + @s.unscan(@s[2]) + [:BULLET, @s[1], *pos] # A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER when @s.scan(/([a-z]|\d+)\. +(\S)/i) then # FIXME if tab(s), the column will be wrong @@ -500,7 +527,7 @@ class RDoc::Markup::Parser # before (and provide a check for that at least in debug # mode) list_label = @s[1] - @s.pos -= @s[2].bytesize # unget \S + @s.unscan(@s[2]) list_type = case list_label when /[a-z]/ then :LALPHA @@ -509,24 +536,24 @@ class RDoc::Markup::Parser else raise ParseError, "BUG token #{list_label}" end - [list_type, list_label, *token_pos(pos)] + [list_type, list_label, *pos] # [text] followed by spaces or end of line => :LABEL when @s.scan(/\[(.*?)\]( +|\r?$)/) then - [:LABEL, @s[1], *token_pos(pos)] + [:LABEL, @s[1], *pos] # text:: followed by spaces or end of line => :NOTE when @s.scan(/(.*?)::( +|\r?$)/) then - [:NOTE, @s[1], *token_pos(pos)] + [:NOTE, @s[1], *pos] # >>> followed by end of line => :BLOCKQUOTE when @s.scan(/>>> *(\w+)?$/) then - [:BLOCKQUOTE, @s[1], *token_pos(pos)] + [:BLOCKQUOTE, @s[1], *pos] # anything else: :TEXT else @s.scan(/(.*?)( )?\r?$/) - token = [:TEXT, @s[1], *token_pos(pos)] + token = [:TEXT, @s[1], *pos] if @s[2] then @tokens << token - [:BREAK, @s[2], *token_pos(pos + @s[1].length)] + 
[:BREAK, @s[2], pos[0] + @s[1].length, pos[1]] else token end @@ -537,16 +564,6 @@ class RDoc::Markup::Parser end ## - # Calculates the column (by character) and line of the current token based - # on +byte_offset+. - - def token_pos byte_offset - offset = char_pos byte_offset - - [offset - @line_pos, @line] - end - - ## # Returns the current token to the token stream def unget diff --git a/lib/rdoc/tom_doc.rb b/lib/rdoc/tom_doc.rb index 625a6b5cfa..e161fcf42f 100644 --- a/lib/rdoc/tom_doc.rb +++ b/lib/rdoc/tom_doc.rb @@ -242,19 +242,18 @@ class RDoc::TomDoc < RDoc::Markup::Parser @tokens << case when @s.scan(/\r?\n/) then - token = [:NEWLINE, @s.matched, *token_pos(pos)] - @line_pos = char_pos @s.pos - @line += 1 + token = [:NEWLINE, @s.matched, *pos] + @s.newline! token when @s.scan(/(Examples|Signature)$/) then - @tokens << [:HEADER, 3, *token_pos(pos)] + @tokens << [:HEADER, 3, *pos] - [:TEXT, @s[1], *token_pos(pos)] + [:TEXT, @s[1], *pos] when @s.scan(/([:\w][\w\[\]]*)[ ]+- /) then - [:NOTE, @s[1], *token_pos(pos)] + [:NOTE, @s[1], *pos] else @s.scan(/.*/) - [:TEXT, @s.matched.sub(/\r$/, ''), *token_pos(pos)] + [:TEXT, @s.matched.sub(/\r$/, ''), *pos] end end diff --git a/test/rdoc/test_rdoc_markup_parser.rb b/test/rdoc/test_rdoc_markup_parser.rb index 344d67df39..b9705e19d1 100644 --- a/test/rdoc/test_rdoc_markup_parser.rb +++ b/test/rdoc/test_rdoc_markup_parser.rb @@ -22,15 +22,6 @@ class TestRDocMarkupParser < RDoc::TestCase assert_equal @RM::Heading.new(3, 'heading three'), parser.build_heading(3) end - def test_char_pos - parser = @RMP.new - s = parser.setup_scanner 'cät' - - s.scan(/\S+/) - - assert_equal 3, parser.char_pos(s.pos) - end - def test_get parser = util_parser @@ -1647,15 +1638,6 @@ Example heading: assert_equal expected, @RMP.tokenize(str) end - def test_token_pos - parser = @RMP.new - s = parser.setup_scanner 'cät' - - s.scan(/\S+/) - - assert_equal [3, 0], parser.token_pos(s.pos) - end - # HACK move to Verbatim test case def 
test_verbatim_normalize v = @RM::Verbatim.new "foo\n", "\n", "\n", "bar\n" |