Added if expression statements
[fur] / tokenization.py
import collections
import re

import util

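# A Token records which matcher produced it ('type'), the exact source text
# consumed ('match'), the character offset where the match begins ('index'),
# and the 1-based line number on which it begins ('line').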
Token = collections.namedtuple(
    'Token',
    [
        'type',
        'match',
        'index',
        'line',
    ],
)

# Given a (name, regex) pair, build a function that tries to match the regex
# at source[index:] and returns a (success, new_index, token) triple.
def _make_token_matcher(definition):
    name, regex = definition
    regex_matcher = re.compile(regex)

    def token_matcher(index, source, line):
        match = regex_matcher.match(source[index:])

        if match is None:
            return False, index, None

        return (
            True,
            index + len(match.group()),
            Token(type=name, match=match.group(), index=index, line=line),
        )

    return token_matcher

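# For illustration (a hypothetical example, not part of the original file):
#
#     matcher = _make_token_matcher(('integer_literal', r'\d+'))
#     matcher(0, '42', 1)
#     # -> (True, 2, Token(type='integer_literal', match='42', index=0, line=1))
#     matcher(0, 'abc', 1)
#     # -> (False, 0, None)
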
_TOKEN_MATCHERS = [
    ('keyword',                         r'(def|do|else|end|if)(?![a-z_])'),
    ('open_parenthese',                 r'\('),
    ('close_parenthese',                r'\)'),
    ('comma',                           r','),
    ('integer_literal',                 r'\d+'),
    ('symbol',                          r'[a-z_]+'),
    ('single_quoted_string_literal',    r"'.*?'"),
    ('comparison_level_operator',       r'(<=|>=|==|!=|<|>)'),
    ('assignment_operator',             r'='),
    ('addition_level_operator',         r'(\+|-)'),
    ('multiplication_level_operator',   r'(\*|//|%)'),
    ('newline',                         r'\n'),
]

_TOKEN_MATCHERS = list(map(_make_token_matcher, _TOKEN_MATCHERS))

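# Matcher order matters: 'keyword' is tried before 'symbol' so that 'if' is
# not consumed as a symbol, and 'comparison_level_operator' is tried before
# 'assignment_operator' so that '==' becomes one comparison token rather than
# two assignments. The (?![a-z_]) lookahead on keywords keeps a symbol like
# 'iffy' from being split into the keyword 'if' plus the symbol 'fy'.
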
@util.force_generator(tuple)
def tokenize(source):
    index = 0
    line = 1

    while index < len(source):
        # Skip spaces without emitting a token
        if source[index] == ' ':
            index += 1
            continue

        # Skip comments, which run from '#' to the end of the line
        if source[index] == '#':
            while index < len(source) and source[index] != '\n':
                index += 1

            continue

        success = False

        for matcher in _TOKEN_MATCHERS:
            success, index, token = matcher(index, source, line)

            if success:
                yield token
                break

        if not success:
            raise Exception('Unexpected character "{}" on line {}'.format(
                source[index],
                line,
            ))

        if token.type == 'newline':
            line += 1

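# A worked example (hypothetical, not part of the original file):
#
#     tokenize("print('hi')")
#     # -> (Token(type='symbol', match='print', index=0, line=1),
#     #     Token(type='open_parenthese', match='(', index=5, line=1),
#     #     Token(type='single_quoted_string_literal', match="'hi'", index=6, line=1),
#     #     Token(type='close_parenthese', match=')', index=10, line=1))
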
if __name__ == '__main__':
    import unittest

    class TokenizeTests(unittest.TestCase):
        def test_tokenizes_open_parenthese(self):
            self.assertEqual(
                tokenize('('),
                (Token(
                    type='open_parenthese',
                    match='(',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_close_parenthese(self):
            self.assertEqual(
                tokenize(')'),
                (Token(
                    type='close_parenthese',
                    match=')',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_symbol(self):
            self.assertEqual(
                tokenize('print'),
                (Token(
                    type='symbol',
                    match='print',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_single_quoted_string_literal(self):
            self.assertEqual(
                tokenize("'Hello, world'"),
                (Token(
                    type='single_quoted_string_literal',
                    match="'Hello, world'",
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_plus(self):
            self.assertEqual(
                tokenize('+'),
                (Token(
                    type='addition_level_operator',
                    match='+',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_minus(self):
            self.assertEqual(
                tokenize('-'),
                (Token(
                    type='addition_level_operator',
                    match='-',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_times(self):
            self.assertEqual(
                tokenize('*'),
                (Token(
                    type='multiplication_level_operator',
                    match='*',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_integer_divide(self):
            self.assertEqual(
                tokenize('//'),
                (Token(
                    type='multiplication_level_operator',
                    match='//',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_modular_divide(self):
            self.assertEqual(
                tokenize('%'),
                (Token(
                    type='multiplication_level_operator',
                    match='%',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_comma(self):
            self.assertEqual(
                tokenize(','),
                (Token(
                    type='comma',
                    match=',',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_assignment_operator(self):
            self.assertEqual(
                tokenize('='),
                (Token(
                    type='assignment_operator',
                    match='=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_equality_operator(self):
            self.assertEqual(
                tokenize('=='),
                (Token(
                    type='comparison_level_operator',
                    match='==',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_greater_than_or_equal_operator(self):
            self.assertEqual(
                tokenize('>='),
                (Token(
                    type='comparison_level_operator',
                    match='>=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_less_than_or_equal_operator(self):
            self.assertEqual(
                tokenize('<='),
                (Token(
                    type='comparison_level_operator',
                    match='<=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_greater_than_operator(self):
            self.assertEqual(
                tokenize('>'),
                (Token(
                    type='comparison_level_operator',
                    match='>',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_less_than_operator(self):
            self.assertEqual(
                tokenize('<'),
                (Token(
                    type='comparison_level_operator',
                    match='<',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_not_equal_operator(self):
            self.assertEqual(
                tokenize('!='),
                (Token(
                    type='comparison_level_operator',
                    match='!=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_newline(self):
            self.assertEqual(
                tokenize('\n'),
                (Token(
                    type='newline',
                    match='\n',
                    index=0,
                    line=1,
                ),),
            )

        def test_handles_leading_space(self):
            self.assertEqual(
                tokenize(' print'),
                (Token(
                    type='symbol',
                    match='print',
                    index=1,
                    line=1,
                ),),
            )

        def test_tokenizes_with_proper_line_numbers(self):
            self.assertEqual(
                tokenize('print\n('),
                (
                    Token(
                        type='symbol',
                        match='print',
                        index=0,
                        line=1,
                    ),
                    Token(
                        type='newline',
                        match='\n',
                        index=5,
                        line=1,
                    ),
                    Token(
                        type='open_parenthese',
                        match='(',
                        index=6,
                        line=2,
                    ),
                ),
            )


    unittest.main()
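
Running the file directly (python tokenization.py) executes the test suite above via unittest.main(). As a quick usage sketch outside the tests (a hypothetical demo script, not part of the fur repository; it assumes tokenization.py and its util dependency are importable):

# demo.py -- hypothetical driver for the tokenizer above
from tokenization import tokenize

# Print the type and matched text of every token in a small fur snippet.
for token in tokenize("x = 1 + 2\n"):
    print(token.type, repr(token.match))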