# Ignore symbol folders
# fur/tokenization.py
1 import collections
2 import re
3
4 import util
5
# A lexeme produced by tokenize(): its category name, the matched text, the
# character offset within the source string, and the 1-based source line.
Token = collections.namedtuple('Token', 'type match index line')
15
16 def _make_token_matcher(definition):
17     name, regex = definition
18     regex_matcher = re.compile(regex)
19
20     def token_matcher(index, source, line):
21         match = regex_matcher.match(source[index:])
22
23         if match is None:
24             return False, index, None
25
26         return (
27             True,
28             index + len(match.group()),
29             Token(type=name, match=match.group(), index=index, line=line),
30         )
31
32     return token_matcher
33
# Token definitions, tried in order: the first matcher that succeeds wins.
# Ordering is significant -- 'keyword' must precede 'symbol' (both match
# lowercase words), and the multi-character comparison operators must precede
# the single-character '=' assignment operator.
_TOKEN_MATCHERS = [
    _make_token_matcher(definition)
    for definition in (
        ('keyword',                         r'(def|end)(?![a-z_])'),
        ('open_parenthese',                 r'\('),
        ('close_parenthese',                r'\)'),
        ('comma',                           r','),
        ('integer_literal',                 r'\d+'),
        ('symbol',                          r'[a-z_]+'),
        ('single_quoted_string_literal',    r"'.*?'"),
        ('comparison_level_operator',       r'(<=|>=|==|!=|<|>)'),
        ('assignment_operator',             r'='),
        ('addition_level_operator',         r'(\+|-)'),
        ('multiplication_level_operator',   r'(\*|//|%)'),
        ('newline',                         r'\n'),
    )
]
50
@util.force_generator(tuple)
def tokenize(source):
    """Scan *source* and yield Token tuples in order of appearance.

    Spaces separate tokens but produce none themselves. Raises Exception on
    any character that no token pattern accepts. Line numbers are 1-based and
    advance after each 'newline' token.
    """
    position = 0
    line_number = 1

    while position < len(source):
        if source[position] == ' ':
            position += 1
            continue

        for matcher in _TOKEN_MATCHERS:
            matched, position, token = matcher(position, source, line_number)
            if matched:
                break
        else:
            # No pattern accepted the character at `position`.
            raise Exception('Unexpected character "{}" on line {}'.format(
                source[position],
                line_number,
            ))

        yield token

        if token.type == 'newline':
            line_number += 1
78
if __name__ == '__main__':
    import unittest

    class TokenizeTests(unittest.TestCase):
        def _assert_single_token(self, source, token_type, match, index=0, line=1):
            # Tokenizing `source` must yield exactly one token with these fields.
            self.assertEqual(
                tokenize(source),
                (Token(type=token_type, match=match, index=index, line=line),),
            )

        def test_tokenizes_open_parenthese(self):
            self._assert_single_token('(', 'open_parenthese', '(')

        def test_tokenizes_close_parenthese(self):
            self._assert_single_token(')', 'close_parenthese', ')')

        def test_tokenizes_symbol(self):
            self._assert_single_token('print', 'symbol', 'print')

        def test_tokenizes_single_quoted_string_literal(self):
            self._assert_single_token(
                "'Hello, world'",
                'single_quoted_string_literal',
                "'Hello, world'",
            )

        def test_tokenizes_plus(self):
            self._assert_single_token('+', 'addition_level_operator', '+')

        def test_tokenizes_minus(self):
            self._assert_single_token('-', 'addition_level_operator', '-')

        def test_tokenizes_times(self):
            self._assert_single_token('*', 'multiplication_level_operator', '*')

        def test_tokenizes_integer_divide(self):
            self._assert_single_token('//', 'multiplication_level_operator', '//')

        def test_tokenizes_modular_divide(self):
            self._assert_single_token('%', 'multiplication_level_operator', '%')

        def test_tokenizes_comma(self):
            self._assert_single_token(',', 'comma', ',')

        def test_tokenizes_assignment_operator(self):
            self._assert_single_token('=', 'assignment_operator', '=')

        def test_tokenizes_equality_operator(self):
            self._assert_single_token('==', 'comparison_level_operator', '==')

        def test_tokenizes_greater_than_or_equal_operator(self):
            self._assert_single_token('>=', 'comparison_level_operator', '>=')

        def test_tokenizes_less_than_or_equal_operator(self):
            self._assert_single_token('<=', 'comparison_level_operator', '<=')

        def test_tokenizes_greater_than_equal_operator(self):
            self._assert_single_token('>', 'comparison_level_operator', '>')

        def test_tokenizes_less_than_equal_operator(self):
            self._assert_single_token('<', 'comparison_level_operator', '<')

        def test_tokenizes_not_equal_operator(self):
            self._assert_single_token('!=', 'comparison_level_operator', '!=')

        def test_tokenizes_newline(self):
            self._assert_single_token('\n', 'newline', '\n')

        def test_handles_leading_space(self):
            self._assert_single_token(' print', 'symbol', 'print', index=1)

        def test_tokenizes_with_proper_line_numbers(self):
            # The newline itself is reported on line 1; tokens after it on line 2.
            self.assertEqual(
                tokenize('print\n('),
                (
                    Token(type='symbol', match='print', index=0, line=1),
                    Token(type='newline', match='\n', index=5, line=1),
                    Token(type='open_parenthese', match='(', index=6, line=2),
                ),
            )

    unittest.main()