A pretty featureful commit:
fur/tokenization.py
import collections
import re

import util

Token = collections.namedtuple(
    'Token',
    [
        'type',
        'match',
        'index',
        'line',
    ],
)

def _make_token_matcher(definition):
    name, regex = definition
    regex_matcher = re.compile(regex)

    # Returns (success, new_index, token): on a match, new_index points just
    # past the matched text; on failure, index comes back unchanged.
    def token_matcher(index, source, line):
        match = regex_matcher.match(source[index:])

        if match is None:
            return False, index, None

        return (
            True,
            index + len(match.group()),
            Token(type=name, match=match.group(), index=index, line=line),
        )

    return token_matcher


# Matchers are tried in order, so the first listed pattern that matches wins.
_TOKEN_MATCHERS = [
    ('open_parenthese',                 r'\('),
    ('close_parenthese',                r'\)'),
    ('comma',                           r','),
    ('integer_literal',                 r'\d+'),
    ('symbol',                          r'[a-z]+'),
    ('single_quoted_string_literal',    r"'.*?'"),
    ('addition_level_operator',         r'(\+|-)'),
    ('multiplication_level_operator',   r'(\*|//|%)'),
]

_TOKEN_MATCHERS = list(map(_make_token_matcher, _TOKEN_MATCHERS))

@util.force_generator(tuple)
def tokenize(source):
    index = 0
    line = 1

    while index < len(source):
        # Skip insignificant whitespace between tokens.
        if source[index] == ' ':
            index += 1
            continue

        success = False

        for matcher in _TOKEN_MATCHERS:
            success, index, token = matcher(index, source, line)

            if success:
                yield token
                break

        if not success:
            raise Exception('Unexpected character "{}"'.format(source[index]))

        # Consume any newlines after the token, bumping the line counter.
        while index < len(source) and source[index] in set(['\n']):
            line += 1
            index += 1


if __name__ == '__main__':
    import unittest

    class TokenizeTests(unittest.TestCase):
        def test_tokenizes_open_parenthese(self):
            self.assertEqual(
                tokenize('('),
                (Token(
                    type='open_parenthese',
                    match='(',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_close_parenthese(self):
            self.assertEqual(
                tokenize(')'),
                (Token(
                    type='close_parenthese',
                    match=')',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_symbol(self):
            self.assertEqual(
                tokenize('print'),
                (Token(
                    type='symbol',
                    match='print',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_single_quoted_string_literal(self):
            self.assertEqual(
                tokenize("'Hello, world'"),
                (Token(
                    type='single_quoted_string_literal',
                    match="'Hello, world'",
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_plus(self):
            self.assertEqual(
                tokenize('+'),
                (Token(
                    type='addition_level_operator',
                    match='+',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_minus(self):
            self.assertEqual(
                tokenize('-'),
                (Token(
                    type='addition_level_operator',
                    match='-',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_times(self):
            self.assertEqual(
                tokenize('*'),
                (Token(
                    type='multiplication_level_operator',
                    match='*',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_integer_divide(self):
            self.assertEqual(
                tokenize('//'),
                (Token(
                    type='multiplication_level_operator',
                    match='//',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_modular_divide(self):
            self.assertEqual(
                tokenize('%'),
                (Token(
                    type='multiplication_level_operator',
                    match='%',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_comma(self):
            self.assertEqual(
                tokenize(','),
                (Token(
                    type='comma',
                    match=',',
                    index=0,
                    line=1,
                ),),
            )

        def test_handles_trailing_newline(self):
            self.assertEqual(
                tokenize('print\n'),
                (Token(
                    type='symbol',
                    match='print',
                    index=0,
                    line=1,
                ),),
            )

        def test_handles_leading_space(self):
            self.assertEqual(
                tokenize(' print'),
                (Token(
                    type='symbol',
                    match='print',
                    index=1,
                    line=1,
                ),),
            )

        def test_tokenizes_with_proper_line_numbers(self):
            self.assertEqual(
                tokenize('print\n('),
                (
                    Token(
                        type='symbol',
                        match='print',
                        index=0,
                        line=1,
                    ),
                    Token(
                        type='open_parenthese',
                        match='(',
                        index=6,
                        line=2,
                    ),
                ),
            )

    unittest.main()
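The util module doesn't appear in this commit, but the tests pin down what force_generator must do: tokenize is written as a generator, yet its return value compares equal to a plain tuple. Here's a minimal sketch of a decorator with that behavior; the name and call shape match the usage above, but the body is my assumption, not the actual util code:

import functools

def force_generator(constructor):
    # Assumed implementation: wrap a generator function so that every call
    # is eagerly drained into `constructor` (tuple, in tokenize's case).
    def decorator(generator_function):
        @functools.wraps(generator_function)
        def wrapper(*args, **kwargs):
            return constructor(generator_function(*args, **kwargs))
        return wrapper
    return decorator

Materializing the generator into a tuple is what lets the tests compare tokenize(...) directly against tuple literals with assertEqual. For a sense of the whole pipeline, running the tokenizer over a small program gives:

tokenize("print('Hello, world')")
# => (Token(type='symbol', match='print', index=0, line=1),
#     Token(type='open_parenthese', match='(', index=5, line=1),
#     Token(type='single_quoted_string_literal', match="'Hello, world'", index=6, line=1),
#     Token(type='close_parenthese', match=')', index=20, line=1))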