A pretty featureful commit:
fur/tokenization.py
import collections
import re

import util

Token = collections.namedtuple(
    'Token',
    [
        'type',
        'match',
        'index',
        'line',
    ],
)

def _make_token_matcher(definition):
    name, regex = definition
    regex_matcher = re.compile(regex)

    # Returns (success, new_index, token): on a match, new_index points just
    # past the matched text; on failure, index comes back unchanged.
    def token_matcher(index, source, line):
        match = regex_matcher.match(source[index:])

        if match is None:
            return False, index, None

        return (
            True,
            index + len(match.group()),
            Token(type=name, match=match.group(), index=index, line=line),
        )

    return token_matcher


# Matchers are tried in order, so the first listed pattern that matches wins.
_TOKEN_MATCHERS = [
    ('open_parenthese',                 r'\('),
    ('close_parenthese',                r'\)'),
    ('comma',                           r','),
    ('integer_literal',                 r'\d+'),
    ('symbol',                          r'[a-z]+'),
    ('single_quoted_string_literal',    r"'.*?'"),
    ('addition_level_operator',         r'(\+|-)'),
    ('multiplication_level_operator',   r'(\*|//|%)'),
]

_TOKEN_MATCHERS = list(map(_make_token_matcher, _TOKEN_MATCHERS))

@util.force_generator(tuple)
def tokenize(source):
    index = 0
    line = 1

    while index < len(source):
        # Skip insignificant whitespace between tokens.
        if source[index] == ' ':
            index += 1
            continue

        success = False

        for matcher in _TOKEN_MATCHERS:
            success, index, token = matcher(index, source, line)

            if success:
                yield token
                break

        if not success:
            raise Exception('Unexpected character "{}"'.format(source[index]))

        # Consume any newlines after the token, bumping the line counter.
        while index < len(source) and source[index] in set(['\n']):
            line += 1
            index += 1


if __name__ == '__main__':
    import unittest

    class TokenizeTests(unittest.TestCase):
        def test_tokenizes_open_parenthese(self):
            self.assertEqual(
                tokenize('('),
                (Token(
                    type='open_parenthese',
                    match='(',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_close_parenthese(self):
            self.assertEqual(
                tokenize(')'),
                (Token(
                    type='close_parenthese',
                    match=')',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_symbol(self):
            self.assertEqual(
                tokenize('print'),
                (Token(
                    type='symbol',
                    match='print',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_single_quoted_string_literal(self):
            self.assertEqual(
                tokenize("'Hello, world'"),
                (Token(
                    type='single_quoted_string_literal',
                    match="'Hello, world'",
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_plus(self):
            self.assertEqual(
                tokenize('+'),
                (Token(
                    type='addition_level_operator',
                    match='+',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_minus(self):
            self.assertEqual(
                tokenize('-'),
                (Token(
                    type='addition_level_operator',
                    match='-',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_times(self):
            self.assertEqual(
                tokenize('*'),
                (Token(
                    type='multiplication_level_operator',
                    match='*',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_integer_divide(self):
            self.assertEqual(
                tokenize('//'),
                (Token(
                    type='multiplication_level_operator',
                    match='//',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_modular_divide(self):
            self.assertEqual(
                tokenize('%'),
                (Token(
                    type='multiplication_level_operator',
                    match='%',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_comma(self):
            self.assertEqual(
                tokenize(','),
                (Token(
                    type='comma',
                    match=',',
                    index=0,
                    line=1,
                ),),
            )

        def test_handles_trailing_newline(self):
            self.assertEqual(
                tokenize('print\n'),
                (Token(
                    type='symbol',
                    match='print',
                    index=0,
                    line=1,
                ),),
            )

        def test_handles_leading_space(self):
            self.assertEqual(
                tokenize(' print'),
                (Token(
                    type='symbol',
                    match='print',
                    index=1,
                    line=1,
                ),),
            )

        def test_tokenizes_with_proper_line_numbers(self):
            self.assertEqual(
                tokenize('print\n('),
                (
                    Token(
                        type='symbol',
                        match='print',
                        index=0,
                        line=1,
                    ),
                    Token(
                        type='open_parenthese',
                        match='(',
                        index=6,
                        line=2,
                    ),
                ),
            )

    unittest.main()
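The util module doesn't appear in this commit, but the tests pin down what force_generator must do: tokenize is written as a generator, yet its return value compares equal to a plain tuple. Here's a minimal sketch of a decorator with that behavior; the name and call shape match the usage above, but the body is my assumption, not the actual util code:

import functools

def force_generator(constructor):
    # Assumed implementation: wrap a generator function so that every call
    # is eagerly drained into `constructor` (tuple, in tokenize's case).
    def decorator(generator_function):
        @functools.wraps(generator_function)
        def wrapper(*args, **kwargs):
            return constructor(generator_function(*args, **kwargs))
        return wrapper
    return decorator

Materializing the generator into a tuple is what lets the tests compare tokenize(...) directly against tuple literals with assertEqual. For a sense of the whole pipeline, running the tokenizer over a small program gives:

tokenize("print('Hello, world')")
# => (Token(type='symbol', match='print', index=0, line=1),
#     Token(type='open_parenthese', match='(', index=5, line=1),
#     Token(type='single_quoted_string_literal', match="'Hello, world'", index=6, line=1),
#     Token(type='close_parenthese', match=')', index=20, line=1))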