Added a very rudimentary fur-to-c compiler
[fur] / tokenization.py
1 import collections
2 import re
3
4 import util
5
# A lexical token: the token's category name (`type`) and the exact
# source text it matched (`match`).
Token = collections.namedtuple('Token', ['type', 'match'])
13
def _make_token_matcher(definition):
    """Build a matcher function from a ``(name, regex)`` definition pair.

    The returned matcher takes ``(index, source)`` and returns a triple
    ``(success, new_index, token)``: on success, ``token`` is a Token
    whose ``match`` is the text matched starting at ``index`` and
    ``new_index`` points just past it; on failure it returns
    ``(False, index, None)``.
    """
    name, regex = definition
    # Compile once per definition so the hot matching loop pays no
    # recompilation/lookup cost.
    regex_matcher = re.compile(regex)

    def token_matcher(index, source):
        # Match at `index` directly instead of slicing `source[index:]`,
        # which would copy the remainder of the source on every attempt
        # (quadratic over a long input). The patterns used here are
        # unanchored, so match(source, index) is equivalent.
        match = regex_matcher.match(source, index)

        if match is None:
            return False, index, None

        # match.end() is absolute, i.e. index + len(match.group()).
        return True, match.end(), Token(type=name, match=match.group())

    return token_matcher
27
28
# Token definitions, tried in order: the first matcher that succeeds at
# the current index wins, so more specific patterns should precede more
# general ones.
_TOKEN_MATCHERS = [
    _make_token_matcher(definition)
    for definition in [
        ('open_parenthese',                 r'\('),
        ('close_parenthese',                r'\)'),
        ('symbol',                          r'[a-z]+'),
        ('single_quoted_string_literal',    r"'.*?'"),
    ]
]
37
@util.force_generator
def tokenize(source):
    """Yield a Token for each lexeme in the fur source string `source`.

    Newlines between tokens are skipped. Raises Exception when a
    character cannot start any known token.
    """
    index = 0

    while index < len(source):
        # Skip insignificant characters (currently just newlines) BEFORE
        # attempting a match. The original skipped them only after a
        # successful match, so a *leading* newline raised "Unexpected
        # character"; skipping up front generalizes that while keeping
        # all previously-accepted inputs working identically.
        while index < len(source) and source[index] == '\n':
            index += 1

        if index == len(source):
            break

        success = False

        for matcher in _TOKEN_MATCHERS:
            success, index, token = matcher(index, source)

            if success:
                yield token
                break

        if not success:
            raise Exception('Unexpected character "{}"'.format(source[index]))
57
# Self-tests: run only when this module is executed directly.
if __name__ == '__main__':
    import unittest

    class TokenizeTests(unittest.TestCase):
        # NOTE(review): these assertions compare the result of tokenize()
        # directly against a list, which assumes util.force_generator
        # materializes the generator into a list-like value that compares
        # equal to a list -- confirm against util's implementation.

        def test_tokenizes_open_parenthese(self):
            self.assertEqual(
                tokenize('('),
                [Token(
                    type='open_parenthese',
                    match='(',
                )],
            )

        def test_tokenizes_close_parenthese(self):
            self.assertEqual(
                tokenize(')'),
                [Token(
                    type='close_parenthese',
                    match=')',
                )],
            )

        def test_tokenizes_symbol(self):
            self.assertEqual(
                tokenize('print'),
                [Token(
                    type='symbol',
                    match='print',
                )],
            )

        def test_tokenizes_single_quoted_string_literal(self):
            self.assertEqual(
                tokenize("'Hello, world'"),
                [Token(
                    type='single_quoted_string_literal',
                    match="'Hello, world'",
                )],
            )

        # Trailing newline is consumed by the whitespace-skipping loop in
        # tokenize() and produces no token of its own.
        def test_handles_trailing_newline(self):
            self.assertEqual(
                tokenize('print\n'),
                [Token(
                    type='symbol',
                    match='print',
                )],
            )

    unittest.main()
106
107     unittest.main()