#!/usr/bin/python3

# Copyright 2014 Jussi Pakkanen

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import sys


class ParseException(Exception):
    """Raised by Lexer.lex() when no token pattern matches the input.

    lineno is the 1-based line and colno the 0-based column of the
    first character that could not be tokenized.
    """

    def __init__(self, lineno, colno):
        # Pass a message up so str(exc) and tracebacks are informative.
        super().__init__(
            'Unrecognised input at line {}, column {}'.format(lineno, colno))
        self.lineno = lineno
        self.colno = colno


class Token:
    """A lexed token: its type id and the position where it starts.

    lineno is 1-based; colno is the 0-based offset from the start of
    that line.
    """

    def __init__(self, id, lineno, colno):
        self.id = id
        self.lineno = lineno
        self.colno = colno

    def __repr__(self):
        return 'Token({!r}, {!r}, {!r})'.format(
            self.id, self.lineno, self.colno)


class Lexer:
    """Regex-driven tokenizer that turns source text into Token objects."""

    def __init__(self):
        # Reserved words. NOTE: lex() does not consult this set; reserved
        # words are currently reported as plain 'id' tokens.
        self.keywords = {'true', 'false', 'if', 'else', 'elif',
                         'endif', 'and', 'or', 'not'}
        self.token_specification = [
            # Tried in order and the first match wins, so a longer pattern
            # must come before any pattern that matches its prefix
            # ("'''" before "'", '==' before '=').
            ('ignore', re.compile(r'[ \t]')),
            ('id', re.compile('[_a-zA-Z][_0-9a-zA-Z]*')),
            ('number', re.compile(r'\d+')),
            ('eol_cont', re.compile(r'\\\n')),
            ('eol', re.compile(r'\n')),
            # re.S lets '.' cross newlines; equivalent to '(.|\n)*?'.
            ('multiline_string', re.compile(r"'''.*?'''", re.S)),
            ('comment', re.compile(r'\#.*')),
            ('lparen', re.compile(r'\(')),
            ('rparen', re.compile(r'\)')),
            ('lbracket', re.compile(r'\[')),
            ('rbracket', re.compile(r'\]')),
            ('string', re.compile("'[^']*?'")),
            ('comma', re.compile(r',')),
            ('dot', re.compile(r'\.')),
            ('colon', re.compile(r':')),
            ('equal', re.compile(r'==')),
            ('assign', re.compile(r'=')),
            ('nequals', re.compile(r'\!=')),
        ]

    def lex(self, code):
        """Yield a Token for each lexical element of *code*, in order.

        Spaces and tabs produce no token.  'eol' and 'eol_cont' tokens are
        suppressed while inside unclosed parentheses or brackets, although
        the line counter still advances.  Comments are yielded as
        'comment' tokens.

        Raises ParseException at the first position where no pattern in
        self.token_specification matches.
        """
        lineno = 1
        line_start = 0  # index in code of the current line's first char
        loc = 0
        par_count = 0
        bracket_count = 0
        while loc < len(code):
            # Find the highest-priority pattern matching at loc.
            for tid, reg in self.token_specification:
                mo = reg.match(code, loc)
                if mo:
                    break
            else:
                # Nothing matched: report the failing position itself.
                raise ParseException(lineno, loc - line_start)

            curline = lineno
            col = mo.start() - line_start
            loc = mo.end()

            if tid == 'ignore':
                continue
            elif tid == 'lparen':
                par_count += 1
            elif tid == 'rparen':
                par_count -= 1
            elif tid == 'lbracket':
                bracket_count += 1
            elif tid == 'rbracket':
                bracket_count -= 1
            elif tid == 'multiline_string':
                lines = mo.group().split('\n')
                if len(lines) > 1:
                    # The token is reported on its first line (curline);
                    # tokens after it belong to the string's last line.
                    lineno += len(lines) - 1
                    line_start = mo.end() - len(lines[-1])
            elif tid == 'eol' or tid == 'eol_cont':
                lineno += 1
                line_start = loc
                if par_count > 0 or bracket_count > 0:
                    # Newlines inside (...) or [...] are not significant.
                    continue
            yield Token(tid, curline, col)


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('Usage:', sys.argv[0], '<source file>')
        sys.exit(1)
    with open(sys.argv[1]) as source_file:
        code = source_file.read()
    lexer = Lexer()
    try:
        for token in lexer.lex(code):
            print('Token:', token.id, 'Line:', token.lineno,
                  'Column:', token.colno)
    except ParseException as e:
        print('Error line', e.lineno, 'column', e.colno)