Diffstat (limited to 'tool/lrama/lib/lrama/lexer.rb')
-rw-r--r--  tool/lrama/lib/lrama/lexer.rb  434
1 file changed, 131 insertions(+), 303 deletions(-)
diff --git a/tool/lrama/lib/lrama/lexer.rb b/tool/lrama/lib/lrama/lexer.rb
index 72ce90195f..926606f3b9 100644
--- a/tool/lrama/lib/lrama/lexer.rb
+++ b/tool/lrama/lib/lrama/lexer.rb
@@ -1,346 +1,174 @@
require "strscan"
-require "lrama/report/duration"
require "lrama/lexer/token"
module Lrama
- # Lexer for parse.y
class Lexer
- include Lrama::Report::Duration
-
- # States
- #
- # See: https://www.gnu.org/software/bison/manual/html_node/Grammar-Outline.html
- Initial = 0
- Prologue = 1
- BisonDeclarations = 2
- GrammarRules = 3
- Epilogue = 4
-
- # Token types
-
- attr_reader :prologue, :bison_declarations, :grammar_rules, :epilogue,
- :bison_declarations_tokens, :grammar_rules_tokens
+ attr_accessor :status
+ attr_accessor :end_symbol
+
+ SYMBOLS = %w(%{ %} %% { } \[ \] : \| ;)
+ PERCENT_TOKENS = %w(
+ %union
+ %token
+ %type
+ %left
+ %right
+ %nonassoc
+ %expect
+ %define
+ %require
+ %printer
+ %lex-param
+ %parse-param
+ %initial-action
+ %precedence
+ %prec
+ %error-token
+ )
def initialize(text)
- @text = text
- @state = Initial
- # Array of texts
- @prologue = []
- @bison_declarations = []
- @grammar_rules = []
- @epilogue = []
-
- @bison_declarations_tokens = []
- @grammar_rules_tokens = []
-
- @debug = false
+ @scanner = StringScanner.new(text)
+ @head = @scanner.pos
+ @line = 1
+ @status = :initial
+ @end_symbol = nil
+ end
- report_duration(:lex) do
- lex_text
- lex_bison_declarations_tokens
- lex_grammar_rules_tokens
+ def next_token
+ case @status
+ when :initial
+ lex_token
+ when :c_declaration
+ lex_c_code
end
end
- private
-
- def create_token(type, s_value, line, column)
- t = Token.new(type: type, s_value: s_value)
- t.line = line
- t.column = column
-
- return t
+ def line
+ @line
end
- # TODO: Remove this
- def lex_text
- @text.each_line.with_index(1) do |string, lineno|
- case @state
- when Initial
- # Skip until "%{"
- if string == "%{\n"
- @state = Prologue
- @prologue << ["", lineno]
- next
- end
- when Prologue
- # Between "%{" and "%}"
- if string == "%}\n"
- @state = BisonDeclarations
- @prologue << ["", lineno]
- next
- end
-
- @prologue << [string, lineno]
- when BisonDeclarations
- if string == "%%\n"
- @state = GrammarRules
- next
- end
-
- @bison_declarations << [string, lineno]
- when GrammarRules
- # Between "%%" and "%%"
- if string == "%%\n"
- @state = Epilogue
- next
- end
-
- @grammar_rules << [string, lineno]
- when Epilogue
- @epilogue << [string, lineno]
- else
- raise "Unknown state: #{@state}"
- end
- end
+ def column
+ @scanner.pos - @head
end
- # See:
- # * https://www.gnu.org/software/bison/manual/html_node/Decl-Summary.html
- # * https://www.gnu.org/software/bison/manual/html_node/Symbol-Decls.html
- # * https://www.gnu.org/software/bison/manual/html_node/Empty-Rules.html
- def lex_common(lines, tokens)
- line = lines.first[1]
- column = 0
- ss = StringScanner.new(lines.map(&:first).join)
-
- while !ss.eos? do
+ def lex_token
+ while !@scanner.eos? do
case
- when ss.scan(/\n/)
- line += 1
- column = ss.pos
- when ss.scan(/\s+/)
- # skip
- when ss.scan(/;/)
- tokens << create_token(Token::Semicolon, ss[0], line, ss.pos - column)
- when ss.scan(/\|/)
- tokens << create_token(Token::Bar, ss[0], line, ss.pos - column)
- when ss.scan(/(\d+)/)
- tokens << create_token(Token::Number, Integer(ss[0]), line, ss.pos - column)
- when ss.scan(/(<[a-zA-Z0-9_]+>)/)
- tokens << create_token(Token::Tag, ss[0], line, ss.pos - column)
- when ss.scan(/([a-zA-Z_.][-a-zA-Z0-9_.]*)\[([a-zA-Z_.][-a-zA-Z0-9_.]*)\]\s*:/)
- tokens << create_token(Token::Ident_Colon, ss[1], line, ss.pos - column)
- tokens << create_token(Token::Named_Ref, ss[2], line, ss.pos - column)
- when ss.scan(/([a-zA-Z_.][-a-zA-Z0-9_.]*)\s*:/)
- tokens << create_token(Token::Ident_Colon, ss[1], line, ss.pos - column)
- when ss.scan(/([a-zA-Z_.][-a-zA-Z0-9_.]*)/)
- tokens << create_token(Token::Ident, ss[0], line, ss.pos - column)
- when ss.scan(/\[([a-zA-Z_.][-a-zA-Z0-9_.]*)\]/)
- tokens << create_token(Token::Named_Ref, ss[1], line, ss.pos - column)
- when ss.scan(/%expect/)
- tokens << create_token(Token::P_expect, ss[0], line, ss.pos - column)
- when ss.scan(/%define/)
- tokens << create_token(Token::P_define, ss[0], line, ss.pos - column)
- when ss.scan(/%printer/)
- tokens << create_token(Token::P_printer, ss[0], line, ss.pos - column)
- when ss.scan(/%error-token/)
- tokens << create_token(Token::P_error_token, ss[0], line, ss.pos - column)
- when ss.scan(/%lex-param/)
- tokens << create_token(Token::P_lex_param, ss[0], line, ss.pos - column)
- when ss.scan(/%parse-param/)
- tokens << create_token(Token::P_parse_param, ss[0], line, ss.pos - column)
- when ss.scan(/%initial-action/)
- tokens << create_token(Token::P_initial_action, ss[0], line, ss.pos - column)
- when ss.scan(/%union/)
- tokens << create_token(Token::P_union, ss[0], line, ss.pos - column)
- when ss.scan(/%token/)
- tokens << create_token(Token::P_token, ss[0], line, ss.pos - column)
- when ss.scan(/%type/)
- tokens << create_token(Token::P_type, ss[0], line, ss.pos - column)
- when ss.scan(/%nonassoc/)
- tokens << create_token(Token::P_nonassoc, ss[0], line, ss.pos - column)
- when ss.scan(/%left/)
- tokens << create_token(Token::P_left, ss[0], line, ss.pos - column)
- when ss.scan(/%right/)
- tokens << create_token(Token::P_right, ss[0], line, ss.pos - column)
- when ss.scan(/%precedence/)
- tokens << create_token(Token::P_precedence, ss[0], line, ss.pos - column)
- when ss.scan(/%prec/)
- tokens << create_token(Token::P_prec, ss[0], line, ss.pos - column)
- when ss.scan(/{/)
- token, line = lex_user_code(ss, line, ss.pos - column, lines)
- tokens << token
- when ss.scan(/"/)
- string, line = lex_string(ss, "\"", line, lines)
- token = create_token(Token::String, string, line, ss.pos - column)
- tokens << token
- when ss.scan(/\/\*/)
- # TODO: Need to keep comment?
- line = lex_comment(ss, line, lines, "")
- when ss.scan(/\/\//)
- line = lex_line_comment(ss, line, "")
- when ss.scan(/'(.)'/)
- tokens << create_token(Token::Char, ss[0], line, ss.pos - column)
- when ss.scan(/'\\(.)'/) # '\\', '\t'
- tokens << create_token(Token::Char, ss[0], line, ss.pos - column)
- when ss.scan(/'\\(\d+)'/) # '\13'
- tokens << create_token(Token::Char, ss[0], line, ss.pos - column)
- when ss.scan(/%empty/)
- # skip
+ when @scanner.scan(/\n/)
+ newline
+ when @scanner.scan(/\s+/)
+ # noop
+ when @scanner.scan(/\/\*/)
+ lex_comment
+ when @scanner.scan(/\/\//)
+ @scanner.scan_until(/\n/)
+ newline
+ when @scanner.scan(/%empty/)
+ # noop
else
- l = line - lines.first[1]
- split = ss.string.split("\n")
- col = ss.pos - split[0...l].join("\n").length
- raise "Parse error (unknown token): #{split[l]} \"#{ss.string[ss.pos]}\" (#{line}: #{col})"
+ break
end
end
- end
- def lex_bison_declarations_tokens
- lex_common(@bison_declarations, @bison_declarations_tokens)
+ @head_line = line
+ @head_column = column
+
+ case
+ when @scanner.eos?
+ return
+ when @scanner.scan(/#{SYMBOLS.join('|')}/)
+ return [@scanner.matched, @scanner.matched]
+ when @scanner.scan(/#{PERCENT_TOKENS.join('|')}/)
+ return [@scanner.matched, @scanner.matched]
+ when @scanner.scan(/<\w+>/)
+ return [:TAG, build_token(type: Token::Tag, s_value: @scanner.matched)]
+ when @scanner.scan(/'.'/)
+ return [:CHARACTER, build_token(type: Token::Char, s_value: @scanner.matched)]
+ when @scanner.scan(/'\\\\'|'\\b'|'\\t'|'\\f'|'\\r'|'\\n'|'\\v'|'\\13'/)
+ return [:CHARACTER, build_token(type: Token::Char, s_value: @scanner.matched)]
+ when @scanner.scan(/"/)
+ return [:STRING, %Q("#{@scanner.scan_until(/"/)})]
+ when @scanner.scan(/\d+/)
+ return [:INTEGER, Integer(@scanner.matched)]
+ when @scanner.scan(/([a-zA-Z_.][-a-zA-Z0-9_.]*)/)
+ token = build_token(type: Token::Ident, s_value: @scanner.matched)
+ type =
+ if @scanner.check(/\s*(\[\s*[a-zA-Z_.][-a-zA-Z0-9_.]*\s*\])?\s*:/)
+ :IDENT_COLON
+ else
+ :IDENTIFIER
+ end
+ return [type, token]
+ else
+ raise
+ end
end
- def lex_user_code(ss, line, column, lines)
- first_line = line
- first_column = column
- debug("Enter lex_user_code: #{line}")
- brace_count = 1
- str = "{"
- # Array of [type, $n, tag, first column, last column]
- # TODO: Is it better to keep string, like "$$", and use gsub?
- references = []
-
- while !ss.eos? do
+ def lex_c_code
+ nested = 0
+ code = ''
+ while !@scanner.eos? do
case
- when ss.scan(/\n/)
- line += 1
- when ss.scan(/"/)
- string, line = lex_string(ss, "\"", line, lines)
- str << string
- next
- when ss.scan(/'/)
- string, line = lex_string(ss, "'", line, lines)
- str << string
- next
-
- # $ references
- # It need to wrap an identifier with brackets to use ".-" for identifiers
- when ss.scan(/\$(<[a-zA-Z0-9_]+>)?\$/) # $$, $<long>$
- tag = ss[1] ? create_token(Token::Tag, ss[1], line, str.length) : nil
- references << [:dollar, "$", tag, str.length, str.length + ss[0].length - 1]
- when ss.scan(/\$(<[a-zA-Z0-9_]+>)?(\d+)/) # $1, $2, $<long>1
- tag = ss[1] ? create_token(Token::Tag, ss[1], line, str.length) : nil
- references << [:dollar, Integer(ss[2]), tag, str.length, str.length + ss[0].length - 1]
- when ss.scan(/\$(<[a-zA-Z0-9_]+>)?([a-zA-Z_][a-zA-Z0-9_]*)/) # $foo, $expr, $<long>program (named reference without brackets)
- tag = ss[1] ? create_token(Token::Tag, ss[1], line, str.length) : nil
- references << [:dollar, ss[2], tag, str.length, str.length + ss[0].length - 1]
- when ss.scan(/\$(<[a-zA-Z0-9_]+>)?\[([a-zA-Z_.][-a-zA-Z0-9_.]*)\]/) # $expr.right, $expr-right, $<long>program (named reference with brackets)
- tag = ss[1] ? create_token(Token::Tag, ss[1], line, str.length) : nil
- references << [:dollar, ss[2], tag, str.length, str.length + ss[0].length - 1]
-
- # @ references
- # It need to wrap an identifier with brackets to use ".-" for identifiers
- when ss.scan(/@\$/) # @$
- references << [:at, "$", nil, str.length, str.length + ss[0].length - 1]
- when ss.scan(/@(\d+)/) # @1
- references << [:at, Integer(ss[1]), nil, str.length, str.length + ss[0].length - 1]
- when ss.scan(/@([a-zA-Z][a-zA-Z0-9_]*)/) # @foo, @expr (named reference without brackets)
- references << [:at, ss[1], nil, str.length, str.length + ss[0].length - 1]
- when ss.scan(/@\[([a-zA-Z_.][-a-zA-Z0-9_.]*)\]/) # @expr.right, @expr-right (named reference with brackets)
- references << [:at, ss[1], nil, str.length, str.length + ss[0].length - 1]
-
- when ss.scan(/{/)
- brace_count += 1
- when ss.scan(/}/)
- brace_count -= 1
-
- debug("Return lex_user_code: #{line}")
- if brace_count == 0
- str << ss[0]
- user_code = Token.new(type: Token::User_code, s_value: str.freeze)
- user_code.line = first_line
- user_code.column = first_column
- user_code.references = references
- return [user_code, line]
+ when @scanner.scan(/{/)
+ code += @scanner.matched
+ nested += 1
+ when @scanner.scan(/}/)
+ if nested == 0 && @end_symbol == '}'
+ @scanner.unscan
+ return [:C_DECLARATION, build_token(type: Token::User_code, s_value: code, references: [])]
+ else
+ code += @scanner.matched
+ nested -= 1
end
- when ss.scan(/\/\*/)
- str << ss[0]
- line = lex_comment(ss, line, lines, str)
- when ss.scan(/\/\//)
- str << ss[0]
- line = lex_line_comment(ss, line, str)
+ when @scanner.check(/#{@end_symbol}/)
+ return [:C_DECLARATION, build_token(type: Token::User_code, s_value: code, references: [])]
+ when @scanner.scan(/\n/)
+ code += @scanner.matched
+ newline
+ when @scanner.scan(/"/)
+ matched = @scanner.scan_until(/"/)
+ code += %Q("#{matched})
+ @line += matched.count("\n")
+ when @scanner.scan(/'/)
+ matched = @scanner.scan_until(/'/)
+ code += %Q('#{matched})
else
- # noop, just consume char
- str << ss.getch
- next
+ code += @scanner.getch
end
-
- str << ss[0]
end
-
- # Reach to end of input but brace does not match
- l = line - lines.first[1]
- raise "Parse error (brace mismatch): #{ss.string.split("\n")[l]} \"#{ss.string[ss.pos]}\" (#{line}: #{ss.pos})"
+ raise
end
- def lex_string(ss, terminator, line, lines)
- debug("Enter lex_string: #{line}")
-
- str = terminator.dup
-
- while (c = ss.getch) do
- str << c
-
- case c
- when "\n"
- line += 1
- when terminator
- debug("Return lex_string: #{line}")
- return [str, line]
- else
- # noop
- end
- end
-
- # Reach to end of input but quote does not match
- l = line - lines.first[1]
- raise "Parse error (quote mismatch): #{ss.string.split("\n")[l]} \"#{ss.string[ss.pos]}\" (#{line}: #{ss.pos})"
- end
+ private
- # /* */ style comment
- def lex_comment(ss, line, lines, str)
- while !ss.eos? do
+ def lex_comment
+ while !@scanner.eos? do
case
- when ss.scan(/\n/)
- line += 1
- when ss.scan(/\*\//)
- return line
+ when @scanner.scan(/\n/)
+ @line += 1
+ @head = @scanner.pos + 1
+ when @scanner.scan(/\*\//)
+ return
else
- str << ss.getch
- next
+ @scanner.getch
end
-
- str << ss[0]
end
-
- # Reach to end of input but quote does not match
- l = line - lines.first[1]
- raise "Parse error (comment mismatch): #{ss.string.split("\n")[l]} \"#{ss.string[ss.pos]}\" (#{line}: #{ss.pos})"
end
- # // style comment
- def lex_line_comment(ss, line, str)
- while !ss.eos? do
- case
- when ss.scan(/\n/)
- return line + 1
- else
- str << ss.getch
- next
- end
+ def build_token(type:, s_value:, **options)
+ token = Token.new(type: type, s_value: s_value)
+ token.line = @head_line
+ token.column = @head_column
+ options.each do |attr, value|
+ token.public_send("#{attr}=", value)
end
- line # Reach to end of input
- end
-
- def lex_grammar_rules_tokens
- lex_common(@grammar_rules, @grammar_rules_tokens)
+ token
end
- def debug(msg)
- return unless @debug
- puts "#{msg}\n"
+ def newline
+ @line += 1
+ @head = @scanner.pos + 1
end
end
end
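
The diff replaces the eager, whole-file tokenizer (lex_text plus lex_common over pre-split sections) with a pull-based lexer: a parser calls next_token repeatedly and flips the public status and end_symbol accessors when it enters and leaves C code blocks, so lex_c_code can consume a balanced body up to the expected delimiter. A minimal driver sketch of that handshake follows; the "parse.y" input path and the break-on-"}" handling are illustrative assumptions (in lrama itself the generated parser performs these mode switches), and it assumes the lrama lib directory is on the load path.

require "lrama/lexer"

lexer = Lrama::Lexer.new(File.read("parse.y"))

while (token = lexer.next_token)
  type, value = token

  case lexer.status
  when :initial
    if type == "{"
      # An action block opens: make the next call lex the balanced C body
      # as a single C_DECLARATION token instead of grammar tokens.
      lexer.status = :c_declaration
      lexer.end_symbol = "}"
    end
  when :c_declaration
    if type == :C_DECLARATION
      # The body has been consumed; the closing "}" is lexed normally next.
      lexer.status = :initial
      lexer.end_symbol = nil
    end
  end

  puts "#{lexer.line}:#{lexer.column}\t#{type.inspect}\t#{value.inspect}"
end

Note that SYMBOLS and PERCENT_TOKENS matches return the matched string as both the token type and value (e.g. ["{", "{"]), while tags, characters, identifiers, and C bodies return a symbolic type paired with a Lexer::Token, which is why the sketch above can dispatch on a plain "{" string.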