Some experiments with a self-written parser.

pull/15/head
Jussi Pakkanen 11 years ago
parent c7865cd98f
commit fc42ae0450
  1 changed file with 106 additions and 0 deletions:
      parsertest.py

@@ -0,0 +1,106 @@
#!/usr/bin/python3
# Copyright 2014 Jussi Pakkanen
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import sys
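# Experimental hand-written lexer: turns source text into a stream of Token
# objects, raising ParseException with the offending position on bad input.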
class ParseException(Exception):
    def __init__(self, lineno, colno):
        super().__init__()
        self.lineno = lineno
        self.colno = colno
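# A single lexed token: the token type name plus the line and column where it starts.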
class Token:
    def __init__(self, id, lineno, colno):
        self.id = id
        self.lineno = lineno
        self.colno = colno
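# Tokenises source text by trying a list of regular expressions in order.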
class Lexer:
    def __init__(self):
        # Reserved words; not yet used, so keywords currently come out as plain 'id' tokens.
        self.keywords = {'true', 'false', 'if', 'else', 'elif',
                         'endif', 'and', 'or', 'not'}
        self.token_specification = [
            # Need to be sorted longest to shortest.
            ('ignore', re.compile(r'[ \t]')),
            ('id', re.compile('[_a-zA-Z][_0-9a-zA-Z]*')),
            ('number', re.compile(r'\d+')),
            ('eol_cont', re.compile(r'\\\n')),
            ('eol', re.compile(r'\n')),
            ('multiline_string', re.compile(r"'''(.|\n)*?'''", re.M)),
            ('comment', re.compile(r'\#.*')),
            ('lparen', re.compile(r'\(')),
            ('rparen', re.compile(r'\)')),
            ('lbracket', re.compile(r'\[')),
            ('rbracket', re.compile(r'\]')),
            ('string', re.compile("'[^']*?'")),
            ('comma', re.compile(r',')),
            ('dot', re.compile(r'\.')),
            ('colon', re.compile(r':')),
            ('equal', re.compile(r'==')),
            ('assign', re.compile(r'=')),
            ('nequals', re.compile(r'\!=')),
        ]
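    # Generator that yields Tokens one at a time, tracking line and column
    # and suppressing newline tokens inside parentheses and brackets.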
    def lex(self, code):
        lineno = 1
        line_start = 0
        loc = 0
        par_count = 0
        bracket_count = 0
        col = 0
        while loc < len(code):
            matched = False
            for (tid, reg) in self.token_specification:
                mo = reg.match(code, loc)
                if mo:
                    curline = lineno
                    col = mo.start() - line_start
                    matched = True
                    loc = mo.end()
                    match_text = mo.group()
                    if tid == 'ignore':
                        break
                    elif tid == 'lparen':
                        par_count += 1
                    elif tid == 'rparen':
                        par_count -= 1
                    elif tid == 'lbracket':
                        bracket_count += 1
                    elif tid == 'rbracket':
                        bracket_count -= 1
                    elif tid == 'multiline_string':
                        # A multiline string may span several lines, so update
                        # the line counter and the start-of-line offset.
                        lines = match_text.split('\n')
                        if len(lines) > 1:
                            lineno += len(lines) - 1
                            line_start = mo.end() - len(lines[-1])
                    elif tid == 'eol' or tid == 'eol_cont':
                        lineno += 1
                        line_start = loc
                        # Newlines inside parentheses or brackets are not
                        # significant, so don't emit a token for them.
                        if par_count > 0 or bracket_count > 0:
                            break
                    yield Token(tid, curline, col)
                    break
            if not matched:
                raise ParseException(lineno, col)
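# Command-line driver: lex the file named on the command line and dump its tokens.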
if __name__ == '__main__':
    code = open(sys.argv[1]).read()
    lex = Lexer()
    try:
        for i in lex.lex(code):
            print('Token:', i.id, 'Line:', i.lineno, 'Column:', i.colno)
    except ParseException as e:
        print('Error line', e.lineno, 'column', e.colno)
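As a rough sketch of the output, assuming a hypothetical input file sample.txt whose only line is project('demo') followed by a newline, running ./parsertest.py sample.txt should print something like:

    Token: id Line: 1 Column: 0
    Token: lparen Line: 1 Column: 7
    Token: string Line: 1 Column: 8
    Token: rparen Line: 1 Column: 14
    Token: eol Line: 1 Column: 15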