Allocate Fur stacks on the C heap
[fur] / tokenization.py
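This file is the Fur tokenizer: it scans source text left to right, skipping spaces and # comments, and turns everything else into a tuple of Token namedtuples.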
import collections
import re

import util

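# A Token records the kind of lexeme matched ('type'), the matched text
# itself ('match'), the character offset where it begins ('index'), and
# the 1-based line it appears on ('line').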
Token = collections.namedtuple(
    'Token',
    [
        'type',
        'match',
        'index',
        'line',
    ],
)

def _make_token_matcher(definition):
    name, regex = definition
    regex_matcher = re.compile(regex)

    # On success, return the index just past the match and the matched
    # Token; on failure, return the index unchanged and no token.
    def token_matcher(index, source, line):
        match = regex_matcher.match(source[index:])

        if match is None:
            return False, index, None

        return (
            True,
            index + len(match.group()),
            Token(type=name, match=match.group(), index=index, line=line),
        )

    return token_matcher
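
# The definitions below are tried in order, so more specific patterns must
# come before patterns that could match a prefix of them: 'keyword' is
# listed before 'symbol' (and its lookahead keeps a symbol like
# 'definition' from matching as the keyword 'def'), and the two-character
# comparison operators are listed before 'assignment_operator'. As an
# example, the matcher built from ('comma', r',') returns
# (True, 1, Token(type='comma', match=',', index=0, line=1)) when called
# as matcher(0, ',', 1).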

_TOKEN_MATCHERS = [
    ('keyword',                         r'(def|do|else|end|if)(?![a-z_])'),
    ('open_bracket',                    r'\['),
    ('close_bracket',                   r'\]'),
    ('open_parenthese',                 r'\('),
    ('close_parenthese',                r'\)'),
    ('comma',                           r','),
    ('colon',                           r':'),
    ('period',                          r'\.'),
    ('integer_literal',                 r'\d+'),
    ('symbol',                          r'[a-z_]+'),
    ('single_quoted_string_literal',    r"'.*?'"),
    ('double_quoted_string_literal',    r'".*?"'),
    ('comparison_level_operator',       r'(<=|>=|==|!=|<|>)'),
    ('assignment_operator',             r'='),
    ('addition_level_operator',         r'(\+\+|\+|-)'),
    ('multiplication_level_operator',   r'(\*|//|%)'),
    ('newline',                         r'\n'),
]

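# Rebind _TOKEN_MATCHERS so each (name, regex) pair above is replaced by
# its compiled matcher function.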
_TOKEN_MATCHERS = list(map(_make_token_matcher, _TOKEN_MATCHERS))

@util.force_generator(tuple)
def tokenize(source):
    index = 0
    line = 1

    while index < len(source):
        # Spaces separate tokens but are not tokens themselves.
        if source[index] == ' ':
            index += 1
            continue

        # A '#' starts a comment that runs to the end of the line; the
        # newline itself is left to be tokenized, so the line count
        # stays correct.
        if source[index] == '#':
            while index < len(source) and source[index] != '\n':
                index += 1

            continue

        success = False

        for matcher in _TOKEN_MATCHERS:
            success, index, token = matcher(index, source, line)

            if success:
                yield token
                break

        if not success:
            raise Exception('Unexpected character "{}" on line {}'.format(
                source[index],
                line,
            ))

        if token.type == 'newline':
            line += 1
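
# As a worked example, tokenize('x = 1 + 2\n') produces, in order: a
# 'symbol' token for 'x', an 'assignment_operator' token for '=', an
# 'integer_literal' token for '1', an 'addition_level_operator' token
# for '+', a second 'integer_literal' token for '2', and a 'newline'
# token; the spaces are skipped and produce no tokens.
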
if __name__ == '__main__':
    import unittest

    class TokenizeTests(unittest.TestCase):
        def test_tokenizes_open_parenthese(self):
            self.assertEqual(
                tokenize('('),
                (Token(
                    type='open_parenthese',
                    match='(',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_close_parenthese(self):
            self.assertEqual(
                tokenize(')'),
                (Token(
                    type='close_parenthese',
                    match=')',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_symbol(self):
            self.assertEqual(
                tokenize('print'),
                (Token(
                    type='symbol',
                    match='print',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_single_quoted_string_literal(self):
            self.assertEqual(
                tokenize("'Hello, world'"),
                (Token(
                    type='single_quoted_string_literal',
                    match="'Hello, world'",
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_plus(self):
            self.assertEqual(
                tokenize('+'),
                (Token(
                    type='addition_level_operator',
                    match='+',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_minus(self):
            self.assertEqual(
                tokenize('-'),
                (Token(
                    type='addition_level_operator',
                    match='-',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_times(self):
            self.assertEqual(
                tokenize('*'),
                (Token(
                    type='multiplication_level_operator',
                    match='*',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_integer_divide(self):
            self.assertEqual(
                tokenize('//'),
                (Token(
                    type='multiplication_level_operator',
                    match='//',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_modular_divide(self):
            self.assertEqual(
                tokenize('%'),
                (Token(
                    type='multiplication_level_operator',
                    match='%',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_comma(self):
            self.assertEqual(
                tokenize(','),
                (Token(
                    type='comma',
                    match=',',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_assignment_operator(self):
            self.assertEqual(
                tokenize('='),
                (Token(
                    type='assignment_operator',
                    match='=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_equality_operator(self):
            self.assertEqual(
                tokenize('=='),
                (Token(
                    type='comparison_level_operator',
                    match='==',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_greater_than_or_equal_operator(self):
            self.assertEqual(
                tokenize('>='),
                (Token(
                    type='comparison_level_operator',
                    match='>=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_less_than_or_equal_operator(self):
            self.assertEqual(
                tokenize('<='),
                (Token(
                    type='comparison_level_operator',
                    match='<=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_greater_than_operator(self):
            self.assertEqual(
                tokenize('>'),
                (Token(
                    type='comparison_level_operator',
                    match='>',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_less_than_operator(self):
            self.assertEqual(
                tokenize('<'),
                (Token(
                    type='comparison_level_operator',
                    match='<',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_not_equal_operator(self):
            self.assertEqual(
                tokenize('!='),
                (Token(
                    type='comparison_level_operator',
                    match='!=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_newline(self):
            self.assertEqual(
                tokenize('\n'),
                (Token(
                    type='newline',
                    match='\n',
                    index=0,
                    line=1,
                ),),
            )

        def test_handles_leading_space(self):
            self.assertEqual(
                tokenize(' print'),
                (Token(
                    type='symbol',
                    match='print',
                    index=1,
                    line=1,
                ),),
            )

        def test_tokenizes_with_proper_line_numbers(self):
            self.assertEqual(
                tokenize('print\n('),
                (
                    Token(
                        type='symbol',
                        match='print',
                        index=0,
                        line=1,
                    ),
                    Token(
                        type='newline',
                        match='\n',
                        index=5,
                        line=1,
                    ),
                    Token(
                        type='open_parenthese',
                        match='(',
                        index=6,
                        line=2,
                    ),
                ),
            )


    unittest.main()
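
Running the file directly executes the embedded test suite:

    python tokenization.py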