Some comments
[fur] / tokenization.py
1 import collections
2 import re
3
4 import util
5
# Immutable record describing a single token produced by the tokenizer.
#   type  -- name of the token definition whose pattern matched
#   match -- the exact text that the pattern matched
#   index -- offset in the source string where the match starts
#   line  -- line number the tokenizer had reached when the token was matched
Token = collections.namedtuple('Token', ['type', 'match', 'index', 'line'])
15
16 def _make_token_matcher(definition):
17     name, regex = definition
18     regex_matcher = re.compile(regex)
19
20     def token_matcher(index, source, line):
21         match = regex_matcher.match(source[index:])
22
23         if match is None:
24             return False, index, None
25
26         return (
27             True,
28             index + len(match.group()),
29             Token(type=name, match=match.group(), index=index, line=line),
30         )
31
32     return token_matcher
33
34
# One matcher function per (token name, regex) definition, kept in the order
# the definitions are listed below.
_TOKEN_MATCHERS = [
    _make_token_matcher(definition)
    for definition in (
        ('open_parenthese', r'\('),
        ('close_parenthese', r'\)'),
        ('comma', r','),
        ('assignment_operator', r'='),
        ('integer_literal', r'\d+'),
        ('symbol', r'[a-z]+'),
        ('single_quoted_string_literal', r"'.*?'"),
        ('addition_level_operator', r'(\+|-)'),
        ('multiplication_level_operator', r'(\*|//|%)'),
    )
]
48
@util.force_generator(tuple)
def tokenize(source):
    """Split ``source`` into ``Token`` records.

    The body is a generator; it is wrapped by ``util.force_generator(tuple)``
    (presumably collecting the yielded tokens into a tuple -- the unit tests
    in this file compare the result against tuples).

    Spaces and newlines between tokens are skipped. Each newline advances the
    line counter, which starts at 1 and is recorded on every token. At any
    other position the matchers in ``_TOKEN_MATCHERS`` are tried in order and
    the first one that succeeds produces the token.

    Raises:
        Exception: if no matcher recognizes the text at the current position.
    """
    index = 0
    line = 1
    length = len(source)

    while index < length:
        character = source[index]

        if character == ' ':
            index += 1
            continue

        # Newlines are skipped wherever they occur (leading, after spaces,
        # on blank lines), not only directly after a token, so line
        # numbers stay correct and such input no longer raises.
        if character == '\n':
            line += 1
            index += 1
            continue

        for matcher in _TOKEN_MATCHERS:
            # A failed matcher hands ``index`` back unchanged, so rebinding
            # it on every attempt is safe.
            success, index, token = matcher(index, source, line)

            if success:
                yield token
                break
        else:
            # The loop ended without ``break``: nothing matched here.
            raise Exception('Unexpected character "{}"'.format(source[index]))
74
if __name__ == '__main__':
    import unittest

    class TokenizeTests(unittest.TestCase):
        """Unit tests for tokenize(); run only when this file is executed directly."""

        def _assert_tokens(self, source, *expected):
            # Compare the tokenizer output with the expected tokens, given in
            # order as positional arguments (they arrive here as a tuple).
            self.assertEqual(tokenize(source), expected)

        def test_tokenizes_open_parenthese(self):
            self._assert_tokens(
                '(',
                Token(type='open_parenthese', match='(', index=0, line=1),
            )

        def test_tokenizes_close_parenthese(self):
            self._assert_tokens(
                ')',
                Token(type='close_parenthese', match=')', index=0, line=1),
            )

        def test_tokenizes_symbol(self):
            self._assert_tokens(
                'print',
                Token(type='symbol', match='print', index=0, line=1),
            )

        def test_tokenizes_single_quoted_string_literal(self):
            self._assert_tokens(
                "'Hello, world'",
                Token(
                    type='single_quoted_string_literal',
                    match="'Hello, world'",
                    index=0,
                    line=1,
                ),
            )

        def test_tokenizes_plus(self):
            self._assert_tokens(
                '+',
                Token(type='addition_level_operator', match='+', index=0, line=1),
            )

        def test_tokenizes_minus(self):
            self._assert_tokens(
                '-',
                Token(type='addition_level_operator', match='-', index=0, line=1),
            )

        def test_tokenizes_times(self):
            self._assert_tokens(
                '*',
                Token(type='multiplication_level_operator', match='*', index=0, line=1),
            )

        def test_tokenizes_integer_divide(self):
            self._assert_tokens(
                '//',
                Token(type='multiplication_level_operator', match='//', index=0, line=1),
            )

        def test_tokenizes_modular_divide(self):
            self._assert_tokens(
                '%',
                Token(type='multiplication_level_operator', match='%', index=0, line=1),
            )

        def test_tokenizes_comma(self):
            self._assert_tokens(
                ',',
                Token(type='comma', match=',', index=0, line=1),
            )

        def test_tokenizes_assignment_operator(self):
            self._assert_tokens(
                '=',
                Token(type='assignment_operator', match='=', index=0, line=1),
            )

        def test_handles_trailing_newline(self):
            # The newline after the symbol must not produce a token.
            self._assert_tokens(
                'print\n',
                Token(type='symbol', match='print', index=0, line=1),
            )

        def test_handles_leading_space(self):
            # The skipped space still counts towards the token's index.
            self._assert_tokens(
                ' print',
                Token(type='symbol', match='print', index=1, line=1),
            )

        def test_tokenizes_with_proper_line_numbers(self):
            # The token after the newline is reported on line 2.
            self._assert_tokens(
                'print\n(',
                Token(type='symbol', match='print', index=0, line=1),
                Token(type='open_parenthese', match='(', index=6, line=2),
            )

    unittest.main()