Add symbol and structure support
[fur] / tokenization.py
import collections
import re

import util

Token = collections.namedtuple(
    'Token',
    (
        'type',
        'match',
        'metadata',
    ),
)

NodeMetadata = collections.namedtuple(
    'NodeMetadata',
    (
        'index',
        'line',
    ),
)

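# For reference: 'type' names the kind of token, 'match' holds the exact
# source text matched, and 'metadata' records where in the source the token
# appeared (0-based character index and 1-based line number).
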
def _make_token_matcher(definition):
    name, regex = definition
    regex_matcher = re.compile(regex)

    def token_matcher(index, source, line):
        match = regex_matcher.match(source[index:])

        if match is None:
            return False, index, None

        return (
            True,
            index + len(match.group()),
            Token(
                type=name,
                match=match.group(),
                metadata=NodeMetadata(
                    index=index,
                    line=line,
                ),
            ),
        )

    return token_matcher

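# A sketch of the resulting matcher's behavior: a matcher built from
# ('comma', r',') should return
#     (True, 1, Token(type='comma', match=',',
#                     metadata=NodeMetadata(index=0, line=1)))
# for the call matcher(0, ',', 1), and (False, index, None) whenever the
# regex does not match at the given index.
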
_TOKEN_MATCHERS = [
    ('keyword',                         r'(def|do|else|end|if|lambda)(?![a-z_])'),
    ('open_bracket',                    r'\['),
    ('close_bracket',                   r'\]'),
    ('open_parenthese',                 r'\('),
    ('close_parenthese',                r'\)'),
    ('comma',                           r','),
    ('colon',                           r':'),
    ('period',                          r'\.'),
    ('integer_literal',                 r'\d+'),
    ('symbol',                          r'[a-z_]+'),
    ('single_quoted_string_literal',    r"'.*?'"),
    ('double_quoted_string_literal',    r'".*?"'),
    ('comparison_level_operator',       r'(<=|>=|==|!=|<|>)'),
    ('assignment_operator',             r'='),
    ('addition_level_operator',         r'(\+\+|\+|-)'),
    ('multiplication_level_operator',   r'(\*|//|%)'),
    ('newline',                         r'\n'),
]

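# Note that order matters here: the first matcher that succeeds wins, so
# 'comparison_level_operator' must be tried before 'assignment_operator'
# (otherwise '==' would tokenize as two '=' tokens), and the 'keyword'
# pattern's (?![a-z_]) lookahead keeps identifiers like 'ifx' from being
# split into a keyword and a symbol.
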
_TOKEN_MATCHERS = list(map(_make_token_matcher, _TOKEN_MATCHERS))

@util.force_generator(tuple)
def tokenize(source):
    index = 0
    line = 1

    while index < len(source):
        if source[index] == ' ':
            index += 1
            continue

        if source[index] == '#':
            while index < len(source) and source[index] != '\n':
                index += 1

            continue

        success = False

        for matcher in _TOKEN_MATCHERS:
            success, index, token = matcher(index, source, line)

            if success:
                yield token
                break

        if not success:
            raise Exception('Unexpected character "{}" on line {}'.format(
                source[index],
                line,
            ))

        if token.type == 'newline':
            line += 1

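# A usage sketch (assuming util.force_generator(tuple) wraps the generator
# so that calling tokenize returns a tuple, as the tests below expect):
#
#     tokenize('print(42)')
#     => (Token(type='symbol', match='print', ...),
#         Token(type='open_parenthese', match='(', ...),
#         Token(type='integer_literal', match='42', ...),
#         Token(type='close_parenthese', match=')', ...))
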
if __name__ == '__main__':
    import unittest

    class TokenizeTests(unittest.TestCase):
        def test_tokenizes_open_parenthese(self):
            self.assertEqual(
                tokenize('('),
                (Token(
                    type='open_parenthese',
                    match='(',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_close_parenthese(self):
            self.assertEqual(
                tokenize(')'),
                (Token(
                    type='close_parenthese',
                    match=')',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_symbol(self):
            self.assertEqual(
                tokenize('print'),
                (Token(
                    type='symbol',
                    match='print',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_single_quoted_string_literal(self):
            self.assertEqual(
                tokenize("'Hello, world'"),
                (Token(
                    type='single_quoted_string_literal',
                    match="'Hello, world'",
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_plus(self):
            self.assertEqual(
                tokenize('+'),
                (Token(
                    type='addition_level_operator',
                    match='+',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_minus(self):
            self.assertEqual(
                tokenize('-'),
                (Token(
                    type='addition_level_operator',
                    match='-',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_times(self):
            self.assertEqual(
                tokenize('*'),
                (Token(
                    type='multiplication_level_operator',
                    match='*',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_integer_divide(self):
            self.assertEqual(
                tokenize('//'),
                (Token(
                    type='multiplication_level_operator',
                    match='//',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_modular_divide(self):
            self.assertEqual(
                tokenize('%'),
                (Token(
                    type='multiplication_level_operator',
                    match='%',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_comma(self):
            self.assertEqual(
                tokenize(','),
                (Token(
                    type='comma',
                    match=',',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_assignment_operator(self):
            self.assertEqual(
                tokenize('='),
                (Token(
                    type='assignment_operator',
                    match='=',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_equality_operator(self):
            self.assertEqual(
                tokenize('=='),
                (Token(
                    type='comparison_level_operator',
                    match='==',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_greater_than_or_equal_operator(self):
            self.assertEqual(
                tokenize('>='),
                (Token(
                    type='comparison_level_operator',
                    match='>=',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_less_than_or_equal_operator(self):
            self.assertEqual(
                tokenize('<='),
                (Token(
                    type='comparison_level_operator',
                    match='<=',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_greater_than_operator(self):
            self.assertEqual(
                tokenize('>'),
                (Token(
                    type='comparison_level_operator',
                    match='>',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_less_than_operator(self):
            self.assertEqual(
                tokenize('<'),
                (Token(
                    type='comparison_level_operator',
                    match='<',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_not_equal_operator(self):
            self.assertEqual(
                tokenize('!='),
                (Token(
                    type='comparison_level_operator',
                    match='!=',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_newline(self):
            self.assertEqual(
                tokenize('\n'),
                (Token(
                    type='newline',
                    match='\n',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_handles_leading_space(self):
            self.assertEqual(
                tokenize(' print'),
                (Token(
                    type='symbol',
                    match='print',
                    metadata=NodeMetadata(
                        index=1,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_with_proper_line_numbers(self):
            self.assertEqual(
                tokenize('print\n('),
                (
                    Token(
                        type='symbol',
                        match='print',
                        metadata=NodeMetadata(
                            index=0,
                            line=1,
                        ),
                    ),
                    Token(
                        type='newline',
                        match='\n',
                        metadata=NodeMetadata(
                            index=5,
                            line=1,
                        ),
                    ),
                    Token(
                        type='open_parenthese',
                        match='(',
                        metadata=NodeMetadata(
                            index=6,
                            line=2,
                        ),
                    ),
                ),
            )


    unittest.main()