Diffstat (limited to 'tool/lrama/lib/lrama/lexer.rb')
-rw-r--r-- | tool/lrama/lib/lrama/lexer.rb | 434
1 file changed, 131 insertions, 303 deletions
diff --git a/tool/lrama/lib/lrama/lexer.rb b/tool/lrama/lib/lrama/lexer.rb
index 72ce90195f..926606f3b9 100644
--- a/tool/lrama/lib/lrama/lexer.rb
+++ b/tool/lrama/lib/lrama/lexer.rb
@@ -1,346 +1,174 @@
 require "strscan"
-require "lrama/report/duration"
 require "lrama/lexer/token"
 
 module Lrama
-  # Lexer for parse.y
   class Lexer
-    include Lrama::Report::Duration
-
-    # States
-    #
-    # See: https://www.gnu.org/software/bison/manual/html_node/Grammar-Outline.html
-    Initial = 0
-    Prologue = 1
-    BisonDeclarations = 2
-    GrammarRules = 3
-    Epilogue = 4
-
-    # Token types
-
-    attr_reader :prologue, :bison_declarations, :grammar_rules, :epilogue,
-                :bison_declarations_tokens, :grammar_rules_tokens
+    attr_accessor :status
+    attr_accessor :end_symbol
+
+    SYMBOLS = %w(%{ %} %% { } \[ \] : \| ;)
+    PERCENT_TOKENS = %w(
+      %union
+      %token
+      %type
+      %left
+      %right
+      %nonassoc
+      %expect
+      %define
+      %require
+      %printer
+      %lex-param
+      %parse-param
+      %initial-action
+      %precedence
+      %prec
+      %error-token
+    )
 
     def initialize(text)
-      @text = text
-      @state = Initial
-      # Array of texts
-      @prologue = []
-      @bison_declarations = []
-      @grammar_rules = []
-      @epilogue = []
-
-      @bison_declarations_tokens = []
-      @grammar_rules_tokens = []
-
-      @debug = false
+      @scanner = StringScanner.new(text)
+      @head = @scanner.pos
+      @line = 1
+      @status = :initial
+      @end_symbol = nil
+    end
 
-      report_duration(:lex) do
-        lex_text
-        lex_bison_declarations_tokens
-        lex_grammar_rules_tokens
+    def next_token
+      case @status
+      when :initial
+        lex_token
+      when :c_declaration
+        lex_c_code
       end
     end
 
-    private
-
-    def create_token(type, s_value, line, column)
-      t = Token.new(type: type, s_value: s_value)
-      t.line = line
-      t.column = column
-
-      return t
+    def line
+      @line
     end
 
-    # TODO: Remove this
-    def lex_text
-      @text.each_line.with_index(1) do |string, lineno|
-        case @state
-        when Initial
-          # Skip until "%{"
-          if string == "%{\n"
-            @state = Prologue
-            @prologue << ["", lineno]
-            next
-          end
-        when Prologue
-          # Between "%{" and "%}"
-          if string == "%}\n"
-            @state = BisonDeclarations
-            @prologue << ["", lineno]
-            next
-          end
-
-          @prologue << [string, lineno]
-        when BisonDeclarations
-          if string == "%%\n"
-            @state = GrammarRules
-            next
-          end
-
-          @bison_declarations << [string, lineno]
-        when GrammarRules
-          # Between "%%" and "%%"
-          if string == "%%\n"
-            @state = Epilogue
-            next
-          end
-
-          @grammar_rules << [string, lineno]
-        when Epilogue
-          @epilogue << [string, lineno]
-        else
-          raise "Unknown state: #{@state}"
-        end
-      end
+    def column
+      @scanner.pos - @head
     end
 
-    # See:
-    #   * https://www.gnu.org/software/bison/manual/html_node/Decl-Summary.html
-    #   * https://www.gnu.org/software/bison/manual/html_node/Symbol-Decls.html
-    #   * https://www.gnu.org/software/bison/manual/html_node/Empty-Rules.html
-    def lex_common(lines, tokens)
-      line = lines.first[1]
-      column = 0
-      ss = StringScanner.new(lines.map(&:first).join)
-
-      while !ss.eos? do
+    def lex_token
+      while !@scanner.eos? do
         case
-        when ss.scan(/\n/)
-          line += 1
-          column = ss.pos
-        when ss.scan(/\s+/)
-          # skip
-        when ss.scan(/;/)
-          tokens << create_token(Token::Semicolon, ss[0], line, ss.pos - column)
-        when ss.scan(/\|/)
-          tokens << create_token(Token::Bar, ss[0], line, ss.pos - column)
-        when ss.scan(/(\d+)/)
-          tokens << create_token(Token::Number, Integer(ss[0]), line, ss.pos - column)
-        when ss.scan(/(<[a-zA-Z0-9_]+>)/)
-          tokens << create_token(Token::Tag, ss[0], line, ss.pos - column)
-        when ss.scan(/([a-zA-Z_.][-a-zA-Z0-9_.]*)\[([a-zA-Z_.][-a-zA-Z0-9_.]*)\]\s*:/)
-          tokens << create_token(Token::Ident_Colon, ss[1], line, ss.pos - column)
-          tokens << create_token(Token::Named_Ref, ss[2], line, ss.pos - column)
-        when ss.scan(/([a-zA-Z_.][-a-zA-Z0-9_.]*)\s*:/)
-          tokens << create_token(Token::Ident_Colon, ss[1], line, ss.pos - column)
-        when ss.scan(/([a-zA-Z_.][-a-zA-Z0-9_.]*)/)
-          tokens << create_token(Token::Ident, ss[0], line, ss.pos - column)
-        when ss.scan(/\[([a-zA-Z_.][-a-zA-Z0-9_.]*)\]/)
-          tokens << create_token(Token::Named_Ref, ss[1], line, ss.pos - column)
-        when ss.scan(/%expect/)
-          tokens << create_token(Token::P_expect, ss[0], line, ss.pos - column)
-        when ss.scan(/%define/)
-          tokens << create_token(Token::P_define, ss[0], line, ss.pos - column)
-        when ss.scan(/%printer/)
-          tokens << create_token(Token::P_printer, ss[0], line, ss.pos - column)
-        when ss.scan(/%error-token/)
-          tokens << create_token(Token::P_error_token, ss[0], line, ss.pos - column)
-        when ss.scan(/%lex-param/)
-          tokens << create_token(Token::P_lex_param, ss[0], line, ss.pos - column)
-        when ss.scan(/%parse-param/)
-          tokens << create_token(Token::P_parse_param, ss[0], line, ss.pos - column)
-        when ss.scan(/%initial-action/)
-          tokens << create_token(Token::P_initial_action, ss[0], line, ss.pos - column)
-        when ss.scan(/%union/)
-          tokens << create_token(Token::P_union, ss[0], line, ss.pos - column)
-        when ss.scan(/%token/)
-          tokens << create_token(Token::P_token, ss[0], line, ss.pos - column)
-        when ss.scan(/%type/)
-          tokens << create_token(Token::P_type, ss[0], line, ss.pos - column)
-        when ss.scan(/%nonassoc/)
-          tokens << create_token(Token::P_nonassoc, ss[0], line, ss.pos - column)
-        when ss.scan(/%left/)
-          tokens << create_token(Token::P_left, ss[0], line, ss.pos - column)
-        when ss.scan(/%right/)
-          tokens << create_token(Token::P_right, ss[0], line, ss.pos - column)
-        when ss.scan(/%precedence/)
-          tokens << create_token(Token::P_precedence, ss[0], line, ss.pos - column)
-        when ss.scan(/%prec/)
-          tokens << create_token(Token::P_prec, ss[0], line, ss.pos - column)
-        when ss.scan(/{/)
-          token, line = lex_user_code(ss, line, ss.pos - column, lines)
-          tokens << token
-        when ss.scan(/"/)
-          string, line = lex_string(ss, "\"", line, lines)
-          token = create_token(Token::String, string, line, ss.pos - column)
-          tokens << token
-        when ss.scan(/\/\*/)
-          # TODO: Need to keep comment?
-          line = lex_comment(ss, line, lines, "")
-        when ss.scan(/\/\//)
-          line = lex_line_comment(ss, line, "")
-        when ss.scan(/'(.)'/)
-          tokens << create_token(Token::Char, ss[0], line, ss.pos - column)
-        when ss.scan(/'\\(.)'/) # '\\', '\t'
-          tokens << create_token(Token::Char, ss[0], line, ss.pos - column)
-        when ss.scan(/'\\(\d+)'/) # '\13'
-          tokens << create_token(Token::Char, ss[0], line, ss.pos - column)
-        when ss.scan(/%empty/)
-          # skip
+        when @scanner.scan(/\n/)
+          newline
+        when @scanner.scan(/\s+/)
+          # noop
+        when @scanner.scan(/\/\*/)
+          lex_comment
+        when @scanner.scan(/\/\//)
+          @scanner.scan_until(/\n/)
+          newline
+        when @scanner.scan(/%empty/)
+          # noop
         else
-          l = line - lines.first[1]
-          split = ss.string.split("\n")
-          col = ss.pos - split[0...l].join("\n").length
-          raise "Parse error (unknown token): #{split[l]} \"#{ss.string[ss.pos]}\" (#{line}: #{col})"
+          break
         end
       end
-    end
 
-    def lex_bison_declarations_tokens
-      lex_common(@bison_declarations, @bison_declarations_tokens)
+      @head_line = line
+      @head_column = column
+
+      case
+      when @scanner.eos?
+        return
+      when @scanner.scan(/#{SYMBOLS.join('|')}/)
+        return [@scanner.matched, @scanner.matched]
+      when @scanner.scan(/#{PERCENT_TOKENS.join('|')}/)
+        return [@scanner.matched, @scanner.matched]
+      when @scanner.scan(/<\w+>/)
+        return [:TAG, build_token(type: Token::Tag, s_value: @scanner.matched)]
+      when @scanner.scan(/'.'/)
+        return [:CHARACTER, build_token(type: Token::Char, s_value: @scanner.matched)]
+      when @scanner.scan(/'\\\\'|'\\b'|'\\t'|'\\f'|'\\r'|'\\n'|'\\v'|'\\13'/)
+        return [:CHARACTER, build_token(type: Token::Char, s_value: @scanner.matched)]
+      when @scanner.scan(/"/)
+        return [:STRING, %Q("#{@scanner.scan_until(/"/)})]
+      when @scanner.scan(/\d+/)
+        return [:INTEGER, Integer(@scanner.matched)]
+      when @scanner.scan(/([a-zA-Z_.][-a-zA-Z0-9_.]*)/)
+        token = build_token(type: Token::Ident, s_value: @scanner.matched)
+        type =
+          if @scanner.check(/\s*(\[\s*[a-zA-Z_.][-a-zA-Z0-9_.]*\s*\])?\s*:/)
+            :IDENT_COLON
+          else
+            :IDENTIFIER
+          end
+        return [type, token]
+      else
+        raise
+      end
     end
 
-    def lex_user_code(ss, line, column, lines)
-      first_line = line
-      first_column = column
-      debug("Enter lex_user_code: #{line}")
-      brace_count = 1
-      str = "{"
-      # Array of [type, $n, tag, first column, last column]
-      # TODO: Is it better to keep string, like "$$", and use gsub?
-      references = []
-
-      while !ss.eos? do
+    def lex_c_code
+      nested = 0
+      code = ''
+      while !@scanner.eos? do
        case
-        when ss.scan(/\n/)
-          line += 1
-        when ss.scan(/"/)
-          string, line = lex_string(ss, "\"", line, lines)
-          str << string
-          next
-        when ss.scan(/'/)
-          string, line = lex_string(ss, "'", line, lines)
-          str << string
-          next
-
-        # $ references
-        # It need to wrap an identifier with brackets to use ".-" for identifiers
-        when ss.scan(/\$(<[a-zA-Z0-9_]+>)?\$/) # $$, $<long>$
-          tag = ss[1] ? create_token(Token::Tag, ss[1], line, str.length) : nil
-          references << [:dollar, "$", tag, str.length, str.length + ss[0].length - 1]
-        when ss.scan(/\$(<[a-zA-Z0-9_]+>)?(\d+)/) # $1, $2, $<long>1
-          tag = ss[1] ? create_token(Token::Tag, ss[1], line, str.length) : nil
-          references << [:dollar, Integer(ss[2]), tag, str.length, str.length + ss[0].length - 1]
-        when ss.scan(/\$(<[a-zA-Z0-9_]+>)?([a-zA-Z_][a-zA-Z0-9_]*)/) # $foo, $expr, $<long>program (named reference without brackets)
-          tag = ss[1] ? create_token(Token::Tag, ss[1], line, str.length) : nil
-          references << [:dollar, ss[2], tag, str.length, str.length + ss[0].length - 1]
-        when ss.scan(/\$(<[a-zA-Z0-9_]+>)?\[([a-zA-Z_.][-a-zA-Z0-9_.]*)\]/) # $expr.right, $expr-right, $<long>program (named reference with brackets)
-          tag = ss[1] ? create_token(Token::Tag, ss[1], line, str.length) : nil
-          references << [:dollar, ss[2], tag, str.length, str.length + ss[0].length - 1]
-
-        # @ references
-        # It need to wrap an identifier with brackets to use ".-" for identifiers
-        when ss.scan(/@\$/) # @$
-          references << [:at, "$", nil, str.length, str.length + ss[0].length - 1]
-        when ss.scan(/@(\d+)/) # @1
-          references << [:at, Integer(ss[1]), nil, str.length, str.length + ss[0].length - 1]
-        when ss.scan(/@([a-zA-Z][a-zA-Z0-9_]*)/) # @foo, @expr (named reference without brackets)
-          references << [:at, ss[1], nil, str.length, str.length + ss[0].length - 1]
-        when ss.scan(/@\[([a-zA-Z_.][-a-zA-Z0-9_.]*)\]/) # @expr.right, @expr-right (named reference with brackets)
-          references << [:at, ss[1], nil, str.length, str.length + ss[0].length - 1]
-
-        when ss.scan(/{/)
-          brace_count += 1
-        when ss.scan(/}/)
-          brace_count -= 1
-
-          debug("Return lex_user_code: #{line}")
-          if brace_count == 0
-            str << ss[0]
-            user_code = Token.new(type: Token::User_code, s_value: str.freeze)
-            user_code.line = first_line
-            user_code.column = first_column
-            user_code.references = references
-            return [user_code, line]
+        when @scanner.scan(/{/)
+          code += @scanner.matched
+          nested += 1
+        when @scanner.scan(/}/)
+          if nested == 0 && @end_symbol == '}'
+            @scanner.unscan
+            return [:C_DECLARATION, build_token(type: Token::User_code, s_value: code, references: [])]
+          else
+            code += @scanner.matched
+            nested -= 1
           end
-        when ss.scan(/\/\*/)
-          str << ss[0]
-          line = lex_comment(ss, line, lines, str)
-        when ss.scan(/\/\//)
-          str << ss[0]
-          line = lex_line_comment(ss, line, str)
+        when @scanner.check(/#{@end_symbol}/)
+          return [:C_DECLARATION, build_token(type: Token::User_code, s_value: code, references: [])]
+        when @scanner.scan(/\n/)
+          code += @scanner.matched
+          newline
+        when @scanner.scan(/"/)
+          matched = @scanner.scan_until(/"/)
+          code += %Q("#{matched})
+          @line += matched.count("\n")
+        when @scanner.scan(/'/)
+          matched = @scanner.scan_until(/'/)
+          code += %Q('#{matched})
        else
-          # noop, just consume char
-          str << ss.getch
-          next
+          code += @scanner.getch
        end
-
-        str << ss[0]
      end
-
-      # Reach to end of input but brace does not match
-      l = line - lines.first[1]
-      raise "Parse error (brace mismatch): #{ss.string.split("\n")[l]} \"#{ss.string[ss.pos]}\" (#{line}: #{ss.pos})"
+      raise
     end
 
-    def lex_string(ss, terminator, line, lines)
-      debug("Enter lex_string: #{line}")
-
-      str = terminator.dup
-
-      while (c = ss.getch) do
-        str << c
-
-        case c
-        when "\n"
-          line += 1
-        when terminator
-          debug("Return lex_string: #{line}")
-          return [str, line]
-        else
-          # noop
-        end
-      end
-
-      # Reach to end of input but quote does not match
-      l = line - lines.first[1]
-      raise "Parse error (quote mismatch): #{ss.string.split("\n")[l]} \"#{ss.string[ss.pos]}\" (#{line}: #{ss.pos})"
-    end
+    private
 
-    # /* */ style comment
-    def lex_comment(ss, line, lines, str)
-      while !ss.eos? do
+    def lex_comment
+      while !@scanner.eos? do
        case
-        when ss.scan(/\n/)
-          line += 1
-        when ss.scan(/\*\//)
-          return line
+        when @scanner.scan(/\n/)
+          @line += 1
+          @head = @scanner.pos + 1
+        when @scanner.scan(/\*\//)
+          return
        else
-          str << ss.getch
-          next
+          @scanner.getch
        end
-
-        str << ss[0]
      end
-
-      # Reach to end of input but quote does not match
-      l = line - lines.first[1]
-      raise "Parse error (comment mismatch): #{ss.string.split("\n")[l]} \"#{ss.string[ss.pos]}\" (#{line}: #{ss.pos})"
    end
 
-    # // style comment
-    def lex_line_comment(ss, line, str)
-      while !ss.eos? do
-        case
-        when ss.scan(/\n/)
-          return line + 1
-        else
-          str << ss.getch
-          next
-        end
+    def build_token(type:, s_value:, **options)
+      token = Token.new(type: type, s_value: s_value)
+      token.line = @head_line
+      token.column = @head_column
+      options.each do |attr, value|
+        token.public_send("#{attr}=", value)
      end
-      line # Reach to end of input
-    end
-
-    def lex_grammar_rules_tokens
-      lex_common(@grammar_rules, @grammar_rules_tokens)
+
+      token
    end
 
-    def debug(msg)
-      return unless @debug
-      puts "#{msg}\n"
+    def newline
+      @line += 1
+      @head = @scanner.pos + 1
    end
  end
 end
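The rewrite replaces the old batch lexer, which sliced the grammar file into sections up front and tokenized each with lex_common, with a pull lexer: the caller asks for one token at a time via next_token, and when it reaches a user-code block it flips status to :c_declaration and sets end_symbol so the lexer knows where the C fragment stops. A minimal sketch of that handshake follows; the toy grammar and the driver loop are illustrative only (in lrama itself the generated parser issues these calls), and it assumes lrama's lib directory is on the load path:

    require "lrama/lexer"

    # Toy grammar input (hypothetical example, not from the diff).
    grammar = <<~GRAMMAR
      %token NUM
      %%
      program: NUM { $$ = $1; } ;
    GRAMMAR

    lexer = Lrama::Lexer.new(grammar)

    loop do
      type, value = lexer.next_token
      break unless type # next_token returns nil at end of input

      if type == '{'
        # A '{' opens a C action: switch the lexer into C mode and tell it
        # which symbol terminates the block, as the parser does.
        lexer.status = :c_declaration
        lexer.end_symbol = '}'
        p lexer.next_token # => [:C_DECLARATION, #<Lrama::Lexer::Token ...>]
        lexer.status = :initial
        lexer.end_symbol = nil
      else
        p [type, value, lexer.line, lexer.column]
      end
    end

Note that lex_c_code stops just before end_symbol (via check/unscan), so the closing '}' comes back as an ordinary symbol token on the next call.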