Some experiments with a self-written parser.

pull/15/head
Jussi Pakkanen 11 years ago
parent c7865cd98f
commit fc42ae0450
  1 changed file with 106 additions and 0 deletions:
      parsertest.py

@@ -0,0 +1,106 @@
#!/usr/bin/python3
# Copyright 2014 Jussi Pakkanen
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import sys
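# Experimental hand-written lexer: turns source text into a stream of Token
# objects, raising ParseException with the offending position on bad input.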
class ParseException(Exception):
    def __init__(self, lineno, colno):
        super().__init__()
        self.lineno = lineno
        self.colno = colno
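# A single lexed token: the token type name plus the line and column where it starts.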
class Token:
    def __init__(self, id, lineno, colno):
        self.id = id
        self.lineno = lineno
        self.colno = colno
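# Tokenises source text by trying a list of regular expressions in order.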
class Lexer:
    def __init__(self):
        # Reserved words; not yet used, so keywords currently come out as plain 'id' tokens.
        self.keywords = {'true', 'false', 'if', 'else', 'elif',
                         'endif', 'and', 'or', 'not'}
        self.token_specification = [
            # Need to be sorted longest to shortest.
            ('ignore', re.compile(r'[ \t]')),
            ('id', re.compile('[_a-zA-Z][_0-9a-zA-Z]*')),
            ('number', re.compile(r'\d+')),
            ('eol_cont', re.compile(r'\\\n')),
            ('eol', re.compile(r'\n')),
            ('multiline_string', re.compile(r"'''(.|\n)*?'''", re.M)),
            ('comment', re.compile(r'\#.*')),
            ('lparen', re.compile(r'\(')),
            ('rparen', re.compile(r'\)')),
            ('lbracket', re.compile(r'\[')),
            ('rbracket', re.compile(r'\]')),
            ('string', re.compile("'[^']*?'")),
            ('comma', re.compile(r',')),
            ('dot', re.compile(r'\.')),
            ('colon', re.compile(r':')),
            ('equal', re.compile(r'==')),
            ('assign', re.compile(r'=')),
            ('nequals', re.compile(r'\!=')),
        ]
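    # Generator that yields Tokens one at a time, tracking line and column
    # and suppressing newline tokens inside parentheses and brackets.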
    def lex(self, code):
        lineno = 1
        line_start = 0
        loc = 0
        par_count = 0
        bracket_count = 0
        col = 0
        while loc < len(code):
            matched = False
            for (tid, reg) in self.token_specification:
                mo = reg.match(code, loc)
                if mo:
                    curline = lineno
                    col = mo.start() - line_start
                    matched = True
                    loc = mo.end()
                    match_text = mo.group()
                    if tid == 'ignore':
                        break
                    elif tid == 'lparen':
                        par_count += 1
                    elif tid == 'rparen':
                        par_count -= 1
                    elif tid == 'lbracket':
                        bracket_count += 1
                    elif tid == 'rbracket':
                        bracket_count -= 1
                    elif tid == 'multiline_string':
                        # A multiline string may span several lines, so update
                        # the line counter and the start-of-line offset.
                        lines = match_text.split('\n')
                        if len(lines) > 1:
                            lineno += len(lines) - 1
                            line_start = mo.end() - len(lines[-1])
                    elif tid == 'eol' or tid == 'eol_cont':
                        lineno += 1
                        line_start = loc
                        # Newlines inside parentheses or brackets are not
                        # significant, so don't emit a token for them.
                        if par_count > 0 or bracket_count > 0:
                            break
                    yield Token(tid, curline, col)
                    break
            if not matched:
                raise ParseException(lineno, col)
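# Command-line driver: lex the file named on the command line and dump its tokens.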
if __name__ == '__main__':
    code = open(sys.argv[1]).read()
    lex = Lexer()
    try:
        for i in lex.lex(code):
            print('Token:', i.id, 'Line:', i.lineno, 'Column:', i.colno)
    except ParseException as e:
        print('Error line', e.lineno, 'column', e.colno)
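As a rough sketch of the output, assuming a hypothetical input file sample.txt whose only line is project('demo') followed by a newline, running ./parsertest.py sample.txt should print something like:

    Token: id Line: 1 Column: 0
    Token: lparen Line: 1 Column: 7
    Token: string Line: 1 Column: 8
    Token: rparen Line: 1 Column: 14
    Token: eol Line: 1 Column: 15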