parent c7865cd98f
commit fc42ae0450
1 changed file with 106 additions and 0 deletions
@@ -0,0 +1,106 @@
#!/usr/bin/python3

# Copyright 2014 Jussi Pakkanen

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import sys

class ParseException(Exception):
    def __init__(self, lineno, colno):
        super().__init__()
        self.lineno = lineno
        self.colno = colno

class Token:
    def __init__(self, id, lineno, colno):
        self.id = id
        self.lineno = lineno
        self.colno = colno

class Lexer:
    def __init__(self):
        self.keywords = {'true', 'false', 'if', 'else', 'elif',
                         'endif', 'and', 'or', 'not'}
        self.token_specification = [
            # Need to be sorted longest to shortest.
            ('ignore', re.compile(r'[ \t]')),
            ('id', re.compile('[_a-zA-Z][_0-9a-zA-Z]*')),
            ('number', re.compile(r'\d+')),
            ('eol_cont', re.compile(r'\\\n')),
            ('eol', re.compile(r'\n')),
            ('multiline_string', re.compile(r"'''(.|\n)*?'''", re.M)),
            ('comment', re.compile(r'\#.*')),
            ('lparen', re.compile(r'\(')),
            ('rparen', re.compile(r'\)')),
            ('lbracket', re.compile(r'\[')),
            ('rbracket', re.compile(r'\]')),
            ('string', re.compile("'[^']*?'")),
            ('comma', re.compile(r',')),
            ('dot', re.compile(r'\.')),
            ('colon', re.compile(r':')),
            ('equal', re.compile(r'==')),
            ('assign', re.compile(r'=')),
            ('nequals', re.compile(r'\!=')),
        ]

    def lex(self, code):
        lineno = 1
        line_start = 0
        loc = 0
        par_count = 0
        bracket_count = 0
        col = 0
        while loc < len(code):
            matched = False
            for (tid, reg) in self.token_specification:
                mo = reg.match(code, loc)
                if mo:
                    curline = lineno
                    col = mo.start() - line_start
                    matched = True
                    loc = mo.end()
                    match_text = mo.group()
                    if tid == 'ignore':
                        break
                    elif tid == 'lparen':
                        par_count += 1
                    elif tid == 'rparen':
                        par_count -= 1
                    elif tid == 'lbracket':
                        bracket_count += 1
                    elif tid == 'rbracket':
                        bracket_count -= 1
                    elif tid == 'multiline_string':
                        lines = match_text.split('\n')
                        if len(lines) > 1:
                            lineno += len(lines) - 1
                            line_start = mo.end() - len(lines[-1])
                    elif tid == 'eol' or tid == 'eol_cont':
                        lineno += 1
                        line_start = loc
                        if par_count > 0 or bracket_count > 0:
                            break
                    yield Token(tid, curline, col)
            if not matched:
                raise ParseException(lineno, col)

if __name__ == '__main__':
    code = open(sys.argv[1]).read()
    lex = Lexer()
    try:
        for i in lex.lex(code):
            print('Token:', i.id, 'Line:', i.lineno, 'Column:', i.colno)
    except ParseException as e:
        print('Error line', e.lineno, 'column', e.colno)
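
For reference, a minimal usage sketch (not part of the commit): it assumes the file above is saved as lexer.py and importable, and it feeds the lexer an inline source string instead of a file path; the sample snippet is made up for illustration.

# Minimal sketch, assuming the file above is importable as lexer.py;
# the sample source string below is hypothetical.
from lexer import Lexer, ParseException

sample = "x = 3\nmessage('hello')\n"
try:
    for tok in Lexer().lex(sample):
        print('Token:', tok.id, 'Line:', tok.lineno, 'Column:', tok.colno)
except ParseException as e:
    print('Error line', e.lineno, 'column', e.colno)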