Added list literals
[fur] / tokenization.py
import collections
import re

import util

# A token records its type, the exact source text it matched, the character
# offset where the match began, and the line it appeared on.
Token = collections.namedtuple(
    'Token',
    [
        'type',
        'match',
        'index',
        'line',
    ],
)

# Build a matcher function from a (name, regex) pair. The matcher attempts
# the regex at the given index and returns a (success, new_index, token)
# triple so the tokenizer can try matchers in order until one succeeds.
def _make_token_matcher(definition):
    name, regex = definition
    regex_matcher = re.compile(regex)

    def token_matcher(index, source, line):
        match = regex_matcher.match(source[index:])

        if match is None:
            return False, index, None

        return (
            True,
            index + len(match.group()),
            Token(type=name, match=match.group(), index=index, line=line),
        )

    return token_matcher

# Order matters here: keywords must be tried before symbols so that "if" is
# not tokenized as a symbol, and the multi-character "==" must be tried as a
# comparison operator before "=" can match as assignment.
_TOKEN_MATCHERS = [
    ('keyword',                         r'(def|do|else|end|if)(?![a-z_])'),
    ('open_bracket',                    r'\['),
    ('close_bracket',                   r'\]'),
    ('open_parenthese',                 r'\('),
    ('close_parenthese',                r'\)'),
    ('comma',                           r','),
    ('integer_literal',                 r'\d+'),
    ('symbol',                          r'[a-z_]+'),
    ('single_quoted_string_literal',    r"'.*?'"),
    ('comparison_level_operator',       r'(<=|>=|==|!=|<|>)'),
    ('assignment_operator',             r'='),
    ('addition_level_operator',         r'(\+|-)'),
    ('multiplication_level_operator',   r'(\*|//|%)'),
    ('newline',                         r'\n'),
]

_TOKEN_MATCHERS = list(map(_make_token_matcher, _TOKEN_MATCHERS))

@util.force_generator(tuple)
def tokenize(source):
    index = 0
    line = 1

    while index < len(source):
        # Skip spaces; they separate tokens but are not tokens themselves.
        if source[index] == ' ':
            index += 1
            continue

        # Skip comments: everything from '#' through the end of the line.
        if source[index] == '#':
            while index < len(source) and source[index] != '\n':
                index += 1

            continue

        success = False

        for matcher in _TOKEN_MATCHERS:
            success, index, token = matcher(index, source, line)

            if success:
                yield token
                break

        if not success:
            raise Exception('Unexpected character "{}" on line {}'.format(
                source[index],
                line,
            ))

        # A newline token belongs to the line it ends, so the line count is
        # only bumped after the token is emitted.
        if token.type == 'newline':
            line += 1

if __name__ == '__main__':
    import unittest

    class TokenizeTests(unittest.TestCase):
        def test_tokenizes_open_parenthese(self):
            self.assertEqual(
                tokenize('('),
                (Token(
                    type='open_parenthese',
                    match='(',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_close_parenthese(self):
            self.assertEqual(
                tokenize(')'),
                (Token(
                    type='close_parenthese',
                    match=')',
                    index=0,
                    line=1,
                ),),
            )

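        # Suggested tests (mirroring the parenthesis tests above) for the
        # bracket tokens that back the new list literals.
        def test_tokenizes_open_bracket(self):
            self.assertEqual(
                tokenize('['),
                (Token(
                    type='open_bracket',
                    match='[',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_close_bracket(self):
            self.assertEqual(
                tokenize(']'),
                (Token(
                    type='close_bracket',
                    match=']',
                    index=0,
                    line=1,
                ),),
            )
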
        def test_tokenizes_symbol(self):
            self.assertEqual(
                tokenize('print'),
                (Token(
                    type='symbol',
                    match='print',
                    index=0,
                    line=1,
                ),),
            )

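        # Suggested tests, following the symbol test above: keywords are
        # matched ahead of symbols, and digit runs become integer literals.
        def test_tokenizes_keyword(self):
            self.assertEqual(
                tokenize('if'),
                (Token(
                    type='keyword',
                    match='if',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_integer_literal(self):
            self.assertEqual(
                tokenize('42'),
                (Token(
                    type='integer_literal',
                    match='42',
                    index=0,
                    line=1,
                ),),
            )
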
        def test_tokenizes_single_quoted_string_literal(self):
            self.assertEqual(
                tokenize("'Hello, world'"),
                (Token(
                    type='single_quoted_string_literal',
                    match="'Hello, world'",
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_plus(self):
            self.assertEqual(
                tokenize('+'),
                (Token(
                    type='addition_level_operator',
                    match='+',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_minus(self):
            self.assertEqual(
                tokenize('-'),
                (Token(
                    type='addition_level_operator',
                    match='-',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_times(self):
            self.assertEqual(
                tokenize('*'),
                (Token(
                    type='multiplication_level_operator',
                    match='*',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_integer_divide(self):
            self.assertEqual(
                tokenize('//'),
                (Token(
                    type='multiplication_level_operator',
                    match='//',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_modular_divide(self):
            self.assertEqual(
                tokenize('%'),
                (Token(
                    type='multiplication_level_operator',
                    match='%',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_comma(self):
            self.assertEqual(
                tokenize(','),
                (Token(
                    type='comma',
                    match=',',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_assignment_operator(self):
            self.assertEqual(
                tokenize('='),
                (Token(
                    type='assignment_operator',
                    match='=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_equality_operator(self):
            self.assertEqual(
                tokenize('=='),
                (Token(
                    type='comparison_level_operator',
                    match='==',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_greater_than_or_equal_operator(self):
            self.assertEqual(
                tokenize('>='),
                (Token(
                    type='comparison_level_operator',
                    match='>=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_less_than_or_equal_operator(self):
            self.assertEqual(
                tokenize('<='),
                (Token(
                    type='comparison_level_operator',
                    match='<=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_greater_than_operator(self):
            self.assertEqual(
                tokenize('>'),
                (Token(
                    type='comparison_level_operator',
                    match='>',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_less_than_operator(self):
            self.assertEqual(
                tokenize('<'),
                (Token(
                    type='comparison_level_operator',
                    match='<',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_not_equal_operator(self):
            self.assertEqual(
                tokenize('!='),
                (Token(
                    type='comparison_level_operator',
                    match='!=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_newline(self):
            self.assertEqual(
                tokenize('\n'),
                (Token(
                    type='newline',
                    match='\n',
                    index=0,
                    line=1,
                ),),
            )

        def test_handles_leading_space(self):
            self.assertEqual(
                tokenize(' print'),
                (Token(
                    type='symbol',
                    match='print',
                    index=1,
                    line=1,
                ),),
            )

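        # Suggested test: a '#' comment is skipped through the end of the
        # line, so a comment-only source produces no tokens.
        def test_skips_comments(self):
            self.assertEqual(
                tokenize('# this is a comment'),
                (),
            )
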
        def test_tokenizes_with_proper_line_numbers(self):
            self.assertEqual(
                tokenize('print\n('),
                (
                    Token(
                        type='symbol',
                        match='print',
                        index=0,
                        line=1,
                    ),
                    Token(
                        type='newline',
                        match='\n',
                        index=5,
                        line=1,
                    ),
                    Token(
                        type='open_parenthese',
                        match='(',
                        index=6,
                        line=2,
                    ),
                ),
            )

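        # Suggested test: a character no matcher accepts should raise, and
        # the message names the offending character and its line.
        def test_raises_on_unexpected_character(self):
            with self.assertRaisesRegex(Exception, 'Unexpected character'):
                tokenize('!')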

    unittest.main()