Added support for integers

Integer support starts at the tokenizer: tokenization.py gains an integer_literal token type, matched by the regex -?\s*\d+ (an optional minus sign, optionally followed by whitespace, then one or more digits).
[fur] / tokenization.py
import collections
import re

import util

Token = collections.namedtuple(
    'Token',
    [
        'type',
        'match',
    ],
)

# Build a matcher function from a (name, regex) definition. The matcher takes
# an index into the source and returns (success, new index, token or None).
def _make_token_matcher(definition):
    name, regex = definition
    regex_matcher = re.compile(regex)

    def token_matcher(index, source):
        match = regex_matcher.match(source[index:])

        if match is None:
            return False, index, None

        return True, index + len(match.group()), Token(type=name, match=match.group())

    return token_matcher


# (token type, regex) pairs, tried in the order listed.
_TOKEN_MATCHERS = [
    ('open_parenthese',                 r'\('),
    ('close_parenthese',                r'\)'),
    # An optional minus sign, optionally separated from the digits by whitespace.
    ('integer_literal',                 r'-?\s*\d+'),
    ('symbol',                          r'[a-z]+'),
    ('single_quoted_string_literal',    r"'.*?'"),
]

_TOKEN_MATCHERS = list(map(_make_token_matcher, _TOKEN_MATCHERS))

@util.force_generator
def tokenize(source):
    index = 0

    while index < len(source):
        success = False

        for matcher in _TOKEN_MATCHERS:
            success, index, token = matcher(index, source)

            if success:
                yield token
                break

        if not success:
            raise Exception('Unexpected character "{}"'.format(source[index]))

        # Skip newlines between tokens.
        while index < len(source) and source[index] in set(['\n']):
            index += 1

if __name__ == '__main__':
    import unittest

    class TokenizeTests(unittest.TestCase):
        def test_tokenizes_open_parenthese(self):
            self.assertEqual(
                tokenize('('),
                [Token(
                    type='open_parenthese',
                    match='(',
                )],
            )

        def test_tokenizes_close_parenthese(self):
            self.assertEqual(
                tokenize(')'),
                [Token(
                    type='close_parenthese',
                    match=')',
                )],
            )

        def test_tokenizes_symbol(self):
            self.assertEqual(
                tokenize('print'),
                [Token(
                    type='symbol',
                    match='print',
                )],
            )

        def test_tokenizes_single_quoted_string_literal(self):
            self.assertEqual(
                tokenize("'Hello, world'"),
                [Token(
                    type='single_quoted_string_literal',
                    match="'Hello, world'",
                )],
            )

        def test_handles_trailing_newline(self):
            self.assertEqual(
                tokenize('print\n'),
                [Token(
                    type='symbol',
                    match='print',
                )],
            )

    unittest.main()
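The commit doesn't include a test for the new token type. A minimal one in the style of the existing tests might look like the sketch below (not part of the commit; like the existing tests, it assumes util.force_generator materializes the generator into a list):

        # Illustrative only: a test for the new integer_literal token,
        # written to slot into TokenizeTests alongside the tests above.
        def test_tokenizes_integer_literal(self):
            self.assertEqual(
                tokenize('-7'),
                [Token(
                    type='integer_literal',
                    match='-7',
                )],
            )

Because the regex is -?\s*\d+, the minus sign (and any whitespace after it) is captured as part of the match, so '-7' comes back as a single integer_literal token rather than as a separate operator followed by a number.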