diff options
Diffstat (limited to 'lib/rdoc/markup/parser.rb')
-rw-r--r-- | lib/rdoc/markup/parser.rb | 528 |
1 files changed, 528 insertions, 0 deletions
require 'strscan'
require 'rdoc/text'

##
# A recursive-descent parser for RDoc markup.
#
# The parser tokenizes an input string then parses the tokens into a Document.
# Documents can be converted into output formats by writing a visitor like
# RDoc::Markup::ToHTML.
#
# The parser only handles the block-level constructs Paragraph, List,
# ListItem, Heading, Verbatim, BlankLine and Rule.  Inline markup such as
# <tt>\+blah\+</tt> is handled separately by RDoc::Markup::AttributeManager.
#
# To see what markup the Parser implements read RDoc.  To see how to use
# RDoc markup to format text in your program read RDoc::Markup.
#
# == Token format
#
# Every token produced by #tokenize is an Array of the form
# <tt>[type, data, column, line]</tt>.  +column+ is the offset of the token
# from the start of its line and +line+ is a zero-based line number (see
# #token_pos).  Some constructs emit two tokens: a list marker is followed by
# a :SPACE token whose data is the width of the marker plus its trailing
# whitespace, and a :HEADER is followed by a :TEXT token with the heading
# text.

class RDoc::Markup::Parser

  include RDoc::Text

  ##
  # List token types

  LIST_TOKENS = [
    :BULLET,
    :LABEL,
    :LALPHA,
    :NOTE,
    :NUMBER,
    :UALPHA,
  ].freeze

  ##
  # Parser error subclass

  class Error < RuntimeError; end

  ##
  # Raised when the parser is unable to handle the given markup

  class ParseError < Error; end

  ##
  # Enables display of debugging information

  attr_accessor :debug

  ##
  # Token accessor

  attr_reader :tokens

  ##
  # Parses +str+ into a Document

  def self.parse str
    parser = new
    #parser.debug = true
    parser.tokenize str
    RDoc::Markup::Document.new(*parser.parse)
  end

  ##
  # Returns a token stream for +str+, for testing

  def self.tokenize str
    parser = new
    parser.tokenize str
    parser.tokens
  end

  ##
  # Creates a new Parser with an empty token stream.  See also ::parse

  def initialize
    @tokens = []
    @current_token = nil
    @debug = false

    @line = 0
    @line_pos = 0
  end

  ##
  # Builds a Heading of +level+.  The heading text is everything up to the
  # next NEWLINE (see #text); the NEWLINE itself is consumed.

  def build_heading level
    heading = RDoc::Markup::Heading.new level, text
    skip :NEWLINE

    heading
  end

  ##
  # Builds a List flush to +margin+.
  #
  # Consumes list markers until the stream ends, a non-list token is seen, a
  # marker appears to the left of +margin+, or a marker of a different list
  # type is seen.  Returns nil when no items were collected.

  def build_list margin
    p :list_start => margin if @debug

    list = RDoc::Markup::List.new

    until @tokens.empty? do
      type, data, column, = get

      case type
      when *LIST_TOKENS then
        list_type = type

        if column < margin then
          unget
          break
        end

        # a different kind of marker starts a new list
        if list.type and list.type != list_type then
          unget
          break
        end

        list.type = list_type

        case type
        when :NOTE, :LABEL then
          _, indent, = get # SPACE
          if :NEWLINE == peek_token.first then
            # The label sits alone on its line.  Look past the NEWLINE: when
            # the following line is indented at least as far as the label,
            # that indent becomes the item's indent.
            get
            peek_type, new_indent, peek_column, = peek_token
            indent = new_indent if
              peek_type == :INDENT and peek_column >= column
            unget
          end
        else
          # only labeled and note lists carry item data
          data = nil
          _, indent, = get
        end

        list_item = build_list_item(margin + indent, data)

        list << list_item if list_item
      else
        unget
        break
      end
    end

    p :list_end => margin if @debug

    return nil if list.empty?

    list
  end

  ##
  # Builds a ListItem that is flush to +indent+ with type +item_type+.
  #
  # Returns nil when the item collected no parts.  Raises ParseError on a
  # token type it does not handle.

  def build_list_item indent, item_type = nil
    p :list_item_start => [indent, item_type] if @debug

    list_item = RDoc::Markup::ListItem.new item_type

    until @tokens.empty? do
      type, data, column = get

      # The item ends at a token left of +indent+, unless that token is a
      # NEWLINE or an INDENT that reaches +indent+.
      if column < indent and
         not type == :NEWLINE and
         (type != :INDENT or data < indent) then
        unget
        break
      end

      case type
      when :INDENT then
        unget
        list_item.push(*parse(indent))
      when :TEXT then
        unget
        list_item << build_paragraph(indent)
      when :HEADER then
        list_item << build_heading(data)
      when :NEWLINE then
        list_item << RDoc::Markup::BlankLine.new
      when *LIST_TOKENS then
        unget
        list_item << build_list(column)
      else
        raise ParseError, "Unhandled token #{@current_token.inspect}"
      end
    end

    p :list_item_end => [indent, item_type] if @debug

    return nil if list_item.empty?

    # drop a leading blank line when the item has other content
    list_item.parts.shift if
      RDoc::Markup::BlankLine === list_item.parts.first and
      list_item.length > 1

    list_item
  end

  ##
  # Builds a Paragraph that is flush to +margin+.
  #
  # Collects TEXT tokens whose column equals +margin+, consuming the NEWLINE
  # after each one.  An INDENT equal to +margin+ that is followed by TEXT is
  # skipped; any other token ends the paragraph and is left on the stream.

  def build_paragraph margin
    p :paragraph_start => margin if @debug

    paragraph = RDoc::Markup::Paragraph.new

    until @tokens.empty? do
      type, data, column, = get

      case type
      when :INDENT then
        next if data == margin and peek_token[0] == :TEXT

        unget
        break
      when :TEXT then
        if column != margin then
          unget
          break
        end

        paragraph << data
        skip :NEWLINE
      else
        unget
        break
      end
    end

    p :paragraph_end => margin if @debug

    paragraph
  end

  ##
  # Builds a Verbatim that is flush to +margin+.
  #
  # Inside a verbatim section markup is not interpreted, so HEADER, RULE and
  # list marker tokens are converted back to the characters and spacing they
  # were made from.

  def build_verbatim margin
    p :verbatim_begin => margin if @debug
    verbatim = RDoc::Markup::Verbatim.new

    until @tokens.empty? do
      type, data, column, = get

      case type
      when :INDENT then
        # an indent at or left of the margin ends the verbatim section
        if margin >= data then
          unget
          break
        end

        indent = data - margin

        verbatim << ' ' * indent
      when :HEADER then
        verbatim << '=' * data

        # restore the whitespace between the marker and the next token
        _, _, peek_column, = peek_token
        peek_column ||= column + data
        verbatim << ' ' * (peek_column - column - data)
      when :RULE then
        # RULE data is the number of dashes minus two (see #tokenize)
        width = 2 + data
        verbatim << '-' * width

        _, _, peek_column, = peek_token
        peek_column ||= column + data + 2
        verbatim << ' ' * (peek_column - column - width)
      when :TEXT then
        verbatim << data
      when *LIST_TOKENS then
        if column <= margin then
          unget
          break
        end

        list_marker = case type
                      when :BULLET                   then '*'
                      when :LABEL                    then "[#{data}]"
                      when :LALPHA, :NUMBER, :UALPHA then "#{data}."
                      when :NOTE                     then "#{data}::"
                      end

        verbatim << list_marker

        _, data, = get # SPACE, data is marker width plus trailing whitespace

        verbatim << ' ' * (data - list_marker.length)
      when :NEWLINE then
        verbatim << data
        break unless [:INDENT, :NEWLINE].include? peek_token[0]
      else
        unget
        break
      end
    end

    verbatim.normalize

    p :verbatim_end => margin if @debug

    verbatim
  end

  ##
  # Pulls the next token from the stream.  Returns nil when the stream is
  # empty.

  def get
    @current_token = @tokens.shift
    p :get => @current_token if @debug
    @current_token
  end

  ##
  # Parses the tokens into an Array of document parts.
  #
  # +indent+ is the column the parsed content must be flush to.  Parsing
  # stops, leaving the offending token on the stream, when a token appears to
  # the left of +indent+.  Raises ParseError on a token type it does not
  # handle.

  def parse indent = 0
    p :parse_start => indent if @debug

    document = []

    until @tokens.empty? do
      type, data, column, = get

      if type != :INDENT and column < indent then
        unget
        break
      end

      case type
      when :HEADER then
        document << build_heading(data)
      when :INDENT then
        if indent > data then
          unget
          break
        elsif indent == data then
          next
        end

        # indented deeper than the current level: verbatim text
        unget
        document << build_verbatim(indent)
      when :NEWLINE then
        document << RDoc::Markup::BlankLine.new
        skip :NEWLINE, false
      when :RULE then
        document << RDoc::Markup::Rule.new(data)
        skip :NEWLINE
      when :TEXT then
        unget
        document << build_paragraph(indent)

        # we're done with this paragraph (indent mismatch)
        break if peek_token[0] == :TEXT
      when *LIST_TOKENS then
        unget

        list = build_list(indent)

        document << list if list

        # we're done with this list (indent mismatch)
        break if LIST_TOKENS.include? peek_token.first and indent > 0
      else
        type, data, column, line = @current_token
        raise ParseError,
              "Unhandled token #{type} (#{data.inspect}) at #{line}:#{column}"
      end
    end

    p :parse_end => indent if @debug

    document
  end

  ##
  # Returns the next token on the stream without modifying the stream.
  # Returns an empty Array when the stream is empty.

  def peek_token
    token = @tokens.first || []
    p :peek => token if @debug
    token
  end

  ##
  # Skips a token of +token_type+, optionally raising an error.
  #
  # Returns the skipped token when the next token is a +token_type+ and nil
  # at the end of the stream.  Otherwise the token is put back on the stream
  # and a ParseError is raised if +error+ is true.

  def skip token_type, error = true
    type, = get

    return unless type # end of stream

    return @current_token if token_type == type

    unget

    raise ParseError, "expected #{token_type} got #{@current_token.inspect}" if
      error
  end

  ##
  # Consumes tokens until NEWLINE and turns them back into text.
  #
  # The NEWLINE is left on the stream.  List markers are converted back to
  # the characters and spacing they were made from.  Raises ParseError on a
  # token type it does not handle.

  def text
    result = ''

    loop do
      type, data, = get

      result << case type
                when :BULLET then
                  _, space, = get # SPACE
                  "*#{' ' * (space - 1)}"
                when :LABEL then
                  _, space, = get # SPACE
                  "[#{data}]#{' ' * (space - data.length - 2)}"
                when :LALPHA, :NUMBER, :UALPHA then
                  _, space, = get # SPACE
                  "#{data}.#{' ' * (space - 2)}"
                when :NOTE then
                  _, space, = get # SPACE
                  "#{data}::#{' ' * (space - data.length - 2)}"
                when :TEXT then
                  data
                when :NEWLINE then
                  unget
                  break
                when nil then # end of stream
                  break
                else
                  raise ParseError, "unhandled token #{@current_token.inspect}"
                end
    end

    result
  end

  ##
  # Calculates the column and line of the current token based on +offset+.
  # Returns <tt>[column, line]</tt>.

  def token_pos offset
    [offset - @line_pos, @line]
  end

  ##
  # Turns text +input+ into a stream of tokens.
  #
  # Tokens are appended to #tokens.  Returns the parser itself.
  #
  # Input with CRLF line endings is accepted: a carriage return at the end of
  # a line is not included in TEXT token data, and rules are recognized when
  # followed by a carriage return.

  def tokenize input
    s = StringScanner.new input

    @line = 0
    @line_pos = 0

    until s.eos? do
      pos = s.pos

      @tokens << case
                 when s.scan(/\r?\n/) then
                   token = [:NEWLINE, s.matched, *token_pos(pos)]
                   @line_pos = s.pos
                   @line += 1
                   token
                 when s.scan(/ +/) then
                   [:INDENT, s.matched_size, *token_pos(pos)]
                 when s.scan(/(=+)\s+/) then
                   level = s[1].length
                   level = 6 if level > 6
                   @tokens << [:HEADER, level, *token_pos(pos)]

                   pos = s.pos
                   s.scan(/.*/)
                   # "." matches "\r", drop it so CRLF input matches LF input
                   [:TEXT, s.matched.sub(/\r\z/, ''), *token_pos(pos)]
                 when s.scan(/^(-{3,}) *\r?$/) then
                   # data is the number of dashes beyond the first two
                   [:RULE, s[1].length - 2, *token_pos(pos)]
                 when s.scan(/([*-])\s+/) then
                   @tokens << [:BULLET, :BULLET, *token_pos(pos)]
                   [:SPACE, s.matched_size, *token_pos(pos)]
                 when s.scan(/([a-z]|\d+)\.[ \t]+\S/i) then
                   list_label = s[1]
                   width      = s.matched_size - 1

                   s.pos -= 1 # unget \S

                   list_type = case list_label
                               when /[a-z]/ then :LALPHA
                               when /[A-Z]/ then :UALPHA
                               when /\d/    then :NUMBER
                               else
                                 raise ParseError, "BUG token #{list_label}"
                               end

                   @tokens << [list_type, list_label, *token_pos(pos)]
                   [:SPACE, width, *token_pos(pos)]
                 when s.scan(/\[(.*?)\]( +|$)/) then
                   @tokens << [:LABEL, s[1], *token_pos(pos)]
                   [:SPACE, s.matched_size, *token_pos(pos)]
                 when s.scan(/(.*?)::( +|$)/) then
                   @tokens << [:NOTE, s[1], *token_pos(pos)]
                   [:SPACE, s.matched_size, *token_pos(pos)]
                 else
                   s.scan(/.*/)
                   # "." matches "\r", drop it so CRLF input matches LF input
                   [:TEXT, s.matched.sub(/\r\z/, ''), *token_pos(pos)]
                 end
    end

    self
  end

  ##
  # Returns the current token or +token+ to the token stream.
  #
  # Raises Error when +token+ is equal to the token already at the head of
  # the stream.

  def unget token = @current_token
    p :unget => token if @debug
    raise Error, 'too many #ungets' if token == @tokens.first
    @tokens.unshift token if token
  end

end

require 'rdoc/markup/blank_line'
require 'rdoc/markup/document'
require 'rdoc/markup/heading'
require 'rdoc/markup/list'
require 'rdoc/markup/list_item'
require 'rdoc/markup/paragraph'
require 'rdoc/markup/rule'
require 'rdoc/markup/verbatim'