Added support for comparison operators
fur/tokenization.py
1 import collections
2 import re
3
4 import util
5
# A single lexeme produced by tokenize():
#   type  -- the token-class name from the matcher definitions (e.g. 'symbol')
#   match -- the exact source text that was matched
#   index -- character offset of the match within the source string
#   line  -- 1-based line number on which the match starts
Token = collections.namedtuple('Token', ('type', 'match', 'index', 'line'))
15
def _make_token_matcher(definition):
    """Build a matcher function from a (token_type_name, regex) pair.

    The returned matcher has the signature (index, source, line) and returns
    a triple:
      (False, index, None)           -- the regex does not match at index
      (True, new_index, Token(...))  -- it matches; new_index is advanced
                                        past the matched text
    """
    name, regex = definition
    regex_matcher = re.compile(regex)

    def token_matcher(index, source, line):
        # Match at the current position via Pattern.match(string, pos):
        # unlike matching against source[index:], this does not copy the
        # remainder of the source on every attempt.
        match = regex_matcher.match(source, index)

        if match is None:
            return False, index, None

        return (
            True,
            match.end(),
            Token(type=name, match=match.group(), index=index, line=line),
        )

    return token_matcher
33
34
# Token definitions as (type name, regex) pairs. Order is significant: the
# first matching entry wins, so 'equality_level_operator' must come before
# 'assignment_operator' (otherwise '==' would tokenize as two '=' tokens).
_TOKEN_DEFINITIONS = (
    ('open_parenthese',                 r'\('),
    ('close_parenthese',                r'\)'),
    ('comma',                           r','),
    ('integer_literal',                 r'\d+'),
    ('symbol',                          r'[a-z]+'),
    ('single_quoted_string_literal',    r"'.*?'"),
    ('equality_level_operator',         r'(<=|>=|==|!=|<|>)'),
    ('addition_level_operator',         r'(\+|-)'),
    ('multiplication_level_operator',   r'(\*|//|%)'),
    ('assignment_operator',             r'='),
)

_TOKEN_MATCHERS = [_make_token_matcher(definition) for definition in _TOKEN_DEFINITIONS]
49
@util.force_generator(tuple)
def tokenize(source):
    """Yield a Token for each lexeme in the fur source string.

    Spaces are skipped; newlines are skipped and advance the line counter.
    Raises Exception at the first character no token matcher accepts.
    """
    index = 0
    line = 1

    while index < len(source):
        character = source[index]

        if character == ' ':
            index += 1
            continue

        # Handle newlines here, symmetrically with spaces. Previously they
        # were only consumed *after* a matched token, so a leading newline
        # (or a newline preceded by a space) raised "Unexpected character".
        if character == '\n':
            line += 1
            index += 1
            continue

        for matcher in _TOKEN_MATCHERS:
            success, index, token = matcher(index, source, line)

            if success:
                yield token
                break
        else:
            # No matcher accepted the character at this position.
            raise Exception('Unexpected character "{}"'.format(source[index]))
75
if __name__ == '__main__':
    import unittest

    class TokenizeTests(unittest.TestCase):
        def _assert_single_token(self, source, expected_type, expected_match, index=0, line=1):
            """Assert that *source* tokenizes to exactly one expected Token."""
            self.assertEqual(
                tokenize(source),
                (Token(
                    type=expected_type,
                    match=expected_match,
                    index=index,
                    line=line,
                ),),
            )

        def test_tokenizes_open_parenthese(self):
            self._assert_single_token('(', 'open_parenthese', '(')

        def test_tokenizes_close_parenthese(self):
            self._assert_single_token(')', 'close_parenthese', ')')

        def test_tokenizes_symbol(self):
            self._assert_single_token('print', 'symbol', 'print')

        def test_tokenizes_single_quoted_string_literal(self):
            self._assert_single_token(
                "'Hello, world'",
                'single_quoted_string_literal',
                "'Hello, world'",
            )

        def test_tokenizes_plus(self):
            self._assert_single_token('+', 'addition_level_operator', '+')

        def test_tokenizes_minus(self):
            self._assert_single_token('-', 'addition_level_operator', '-')

        def test_tokenizes_times(self):
            self._assert_single_token('*', 'multiplication_level_operator', '*')

        def test_tokenizes_integer_divide(self):
            self._assert_single_token('//', 'multiplication_level_operator', '//')

        def test_tokenizes_modular_divide(self):
            self._assert_single_token('%', 'multiplication_level_operator', '%')

        def test_tokenizes_comma(self):
            self._assert_single_token(',', 'comma', ',')

        def test_tokenizes_assignment_operator(self):
            self._assert_single_token('=', 'assignment_operator', '=')

        def test_tokenizes_equality_operator(self):
            self._assert_single_token('==', 'equality_level_operator', '==')

        def test_tokenizes_greater_than_or_equal_operator(self):
            self._assert_single_token('>=', 'equality_level_operator', '>=')

        def test_tokenizes_less_than_or_equal_operator(self):
            self._assert_single_token('<=', 'equality_level_operator', '<=')

        # Renamed from test_tokenizes_greater_than_equal_operator: it tests '>'.
        def test_tokenizes_greater_than_operator(self):
            self._assert_single_token('>', 'equality_level_operator', '>')

        # Renamed from test_tokenizes_less_than_equal_operator: it tests '<'.
        def test_tokenizes_less_than_operator(self):
            self._assert_single_token('<', 'equality_level_operator', '<')

        def test_tokenizes_not_equal_operator(self):
            self._assert_single_token('!=', 'equality_level_operator', '!=')

        def test_handles_trailing_newline(self):
            self._assert_single_token('print\n', 'symbol', 'print')

        def test_handles_leading_space(self):
            self._assert_single_token(' print', 'symbol', 'print', index=1)

        def test_tokenizes_with_proper_line_numbers(self):
            self.assertEqual(
                tokenize('print\n('),
                (
                    Token(
                        type='symbol',
                        match='print',
                        index=0,
                        line=1,
                    ),
                    Token(
                        type='open_parenthese',
                        match='(',
                        index=6,
                        line=2,
                    ),
                ),
            )

    unittest.main()