# Add double-quoted strings
# [fur] / tokenization.py
1 import collections
2 import re
3
4 import util
5
# A single lexical token: its category name, the exact matched text, the
# character offset where it starts in the source, and its 1-based line.
Token = collections.namedtuple('Token', 'type match index line')
15
16 def _make_token_matcher(definition):
17     name, regex = definition
18     regex_matcher = re.compile(regex)
19
20     def token_matcher(index, source, line):
21         match = regex_matcher.match(source[index:])
22
23         if match is None:
24             return False, index, None
25
26         return (
27             True,
28             index + len(match.group()),
29             Token(type=name, match=match.group(), index=index, line=line),
30         )
31
32     return token_matcher
33
# Ordered table of (token type, regex) definitions, compiled straight into
# matcher functions. Order is significant: matchers are tried first to last,
# so 'keyword' must precede 'symbol', '==' must precede '=', and '++' must
# precede '+'.
_TOKEN_MATCHERS = [
    _make_token_matcher(definition)
    for definition in [
        ('keyword',                         r'(def|do|else|end|if)(?![a-z_])'),
        ('open_bracket',                    r'\['),
        ('close_bracket',                   r'\]'),
        ('open_parenthese',                 r'\('),
        ('close_parenthese',                r'\)'),
        ('comma',                           r','),
        ('integer_literal',                 r'\d+'),
        ('symbol',                          r'[a-z_]+'),
        ('single_quoted_string_literal',    r"'.*?'"),
        ('double_quoted_string_literal',    r'".*?"'),
        ('comparison_level_operator',       r'(<=|>=|==|!=|<|>)'),
        ('assignment_operator',             r'='),
        ('addition_level_operator',         r'(\+\+|\+|-)'),
        ('multiplication_level_operator',   r'(\*|//|%)'),
        ('newline',                         r'\n'),
    ]
]
53
@util.force_generator(tuple)
def tokenize(source):
    """Yield a Token for each lexical element of the source string.

    Spaces and '#' comments are skipped without producing tokens; any
    character no matcher recognizes raises an Exception naming the
    character and its line.
    """
    position = 0
    line_number = 1

    while position < len(source):
        character = source[position]

        # Spaces separate tokens but are never emitted.
        if character == ' ':
            position += 1
            continue

        # A comment runs to end of line; the newline itself is left for
        # the newline matcher so line counting stays correct.
        if character == '#':
            while position < len(source) and source[position] != '\n':
                position += 1
            continue

        matched = False

        for matcher in _TOKEN_MATCHERS:
            matched, position, token = matcher(position, source, line_number)
            if matched:
                break

        if not matched:
            raise Exception('Unexpected character "{}" on line {}'.format(
                source[position],
                line_number,
            ))

        yield token

        if token.type == 'newline':
            line_number += 1
87
if __name__ == '__main__':
    import unittest

    class TokenizeTests(unittest.TestCase):
        def assert_single_token(self, source, token_type, match_text,
                                index=0, line=1):
            # Shared assertion: *source* tokenizes to exactly one token
            # with the given attributes.
            self.assertEqual(
                tokenize(source),
                (Token(
                    type=token_type,
                    match=match_text,
                    index=index,
                    line=line,
                ),),
            )

        def test_tokenizes_open_parenthese(self):
            self.assert_single_token('(', 'open_parenthese', '(')

        def test_tokenizes_close_parenthese(self):
            self.assert_single_token(')', 'close_parenthese', ')')

        def test_tokenizes_symbol(self):
            self.assert_single_token('print', 'symbol', 'print')

        def test_tokenizes_single_quoted_string_literal(self):
            self.assert_single_token(
                "'Hello, world'",
                'single_quoted_string_literal',
                "'Hello, world'",
            )

        def test_tokenizes_double_quoted_string_literal(self):
            # Covers the double_quoted_string_literal matcher, which
            # previously had no test.
            self.assert_single_token(
                '"Hello, world"',
                'double_quoted_string_literal',
                '"Hello, world"',
            )

        def test_tokenizes_plus(self):
            self.assert_single_token('+', 'addition_level_operator', '+')

        def test_tokenizes_minus(self):
            self.assert_single_token('-', 'addition_level_operator', '-')

        def test_tokenizes_times(self):
            self.assert_single_token('*', 'multiplication_level_operator', '*')

        def test_tokenizes_integer_divide(self):
            self.assert_single_token('//', 'multiplication_level_operator', '//')

        def test_tokenizes_modular_divide(self):
            self.assert_single_token('%', 'multiplication_level_operator', '%')

        def test_tokenizes_comma(self):
            self.assert_single_token(',', 'comma', ',')

        def test_tokenizes_assignment_operator(self):
            self.assert_single_token('=', 'assignment_operator', '=')

        def test_tokenizes_equality_operator(self):
            self.assert_single_token('==', 'comparison_level_operator', '==')

        def test_tokenizes_greater_than_or_equal_operator(self):
            self.assert_single_token('>=', 'comparison_level_operator', '>=')

        def test_tokenizes_less_than_or_equal_operator(self):
            self.assert_single_token('<=', 'comparison_level_operator', '<=')

        def test_tokenizes_greater_than_equal_operator(self):
            self.assert_single_token('>', 'comparison_level_operator', '>')

        def test_tokenizes_less_than_equal_operator(self):
            self.assert_single_token('<', 'comparison_level_operator', '<')

        def test_tokenizes_not_equal_operator(self):
            self.assert_single_token('!=', 'comparison_level_operator', '!=')

        def test_tokenizes_newline(self):
            self.assert_single_token('\n', 'newline', '\n')

        def test_handles_leading_space(self):
            self.assert_single_token(' print', 'symbol', 'print', index=1)

        def test_tokenizes_with_proper_line_numbers(self):
            self.assertEqual(
                tokenize('print\n('),
                (
                    Token(
                        type='symbol',
                        match='print',
                        index=0,
                        line=1,
                    ),
                    Token(
                        type='newline',
                        match='\n',
                        index=5,
                        line=1,
                    ),
                    Token(
                        type='open_parenthese',
                        match='(',
                        index=6,
                        line=2,
                    ),
                ),
            )

    unittest.main()