Add symbol and structure support
[fur] / tokenization.py
import collections
import re

import util

Token = collections.namedtuple(
    'Token',
    (
        'type',
        'match',
        'metadata',
    ),
)

NodeMetadata = collections.namedtuple(
    'NodeMetadata',
    (
        'index',
        'line',
    ),
)

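# For reference: 'type' names the kind of token, 'match' holds the exact
# source text matched, and 'metadata' records where in the source the token
# appeared (0-based character index and 1-based line number).
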
def _make_token_matcher(definition):
    name, regex = definition
    regex_matcher = re.compile(regex)

    def token_matcher(index, source, line):
        match = regex_matcher.match(source[index:])

        if match is None:
            return False, index, None

        return (
            True,
            index + len(match.group()),
            Token(
                type=name,
                match=match.group(),
                metadata=NodeMetadata(
                    index=index,
                    line=line,
                ),
            ),
        )

    return token_matcher

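# A sketch of the resulting matcher's behavior: a matcher built from
# ('comma', r',') should return
#     (True, 1, Token(type='comma', match=',',
#                     metadata=NodeMetadata(index=0, line=1)))
# for the call matcher(0, ',', 1), and (False, index, None) whenever the
# regex does not match at the given index.
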
_TOKEN_MATCHERS = [
    ('keyword',                         r'(def|do|else|end|if|lambda)(?![a-z_])'),
    ('open_bracket',                    r'\['),
    ('close_bracket',                   r'\]'),
    ('open_parenthese',                 r'\('),
    ('close_parenthese',                r'\)'),
    ('comma',                           r','),
    ('colon',                           r':'),
    ('period',                          r'\.'),
    ('integer_literal',                 r'\d+'),
    ('symbol',                          r'[a-z_]+'),
    ('single_quoted_string_literal',    r"'.*?'"),
    ('double_quoted_string_literal',    r'".*?"'),
    ('comparison_level_operator',       r'(<=|>=|==|!=|<|>)'),
    ('assignment_operator',             r'='),
    ('addition_level_operator',         r'(\+\+|\+|-)'),
    ('multiplication_level_operator',   r'(\*|//|%)'),
    ('newline',                         r'\n'),
]

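# Note that order matters here: the first matcher that succeeds wins, so
# 'comparison_level_operator' must be tried before 'assignment_operator'
# (otherwise '==' would tokenize as two '=' tokens), and the 'keyword'
# pattern's (?![a-z_]) lookahead keeps identifiers like 'ifx' from being
# split into a keyword and a symbol.
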
_TOKEN_MATCHERS = list(map(_make_token_matcher, _TOKEN_MATCHERS))

@util.force_generator(tuple)
def tokenize(source):
    index = 0
    line = 1

    while index < len(source):
        if source[index] == ' ':
            index += 1
            continue

        if source[index] == '#':
            while index < len(source) and source[index] != '\n':
                index += 1

            continue

        success = False

        for matcher in _TOKEN_MATCHERS:
            success, index, token = matcher(index, source, line)

            if success:
                yield token
                break

        if not success:
            raise Exception('Unexpected character "{}" on line {}'.format(
                source[index],
                line,
            ))

        if token.type == 'newline':
            line += 1

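# A usage sketch (assuming util.force_generator(tuple) wraps the generator
# so that calling tokenize returns a tuple, as the tests below expect):
#
#     tokenize('print(42)')
#     => (Token(type='symbol', match='print', ...),
#         Token(type='open_parenthese', match='(', ...),
#         Token(type='integer_literal', match='42', ...),
#         Token(type='close_parenthese', match=')', ...))
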
if __name__ == '__main__':
    import unittest

    class TokenizeTests(unittest.TestCase):
        def test_tokenizes_open_parenthese(self):
            self.assertEqual(
                tokenize('('),
                (Token(
                    type='open_parenthese',
                    match='(',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_close_parenthese(self):
            self.assertEqual(
                tokenize(')'),
                (Token(
                    type='close_parenthese',
                    match=')',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_symbol(self):
            self.assertEqual(
                tokenize('print'),
                (Token(
                    type='symbol',
                    match='print',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_single_quoted_string_literal(self):
            self.assertEqual(
                tokenize("'Hello, world'"),
                (Token(
                    type='single_quoted_string_literal',
                    match="'Hello, world'",
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_plus(self):
            self.assertEqual(
                tokenize('+'),
                (Token(
                    type='addition_level_operator',
                    match='+',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_minus(self):
            self.assertEqual(
                tokenize('-'),
                (Token(
                    type='addition_level_operator',
                    match='-',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_times(self):
            self.assertEqual(
                tokenize('*'),
                (Token(
                    type='multiplication_level_operator',
                    match='*',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_integer_divide(self):
            self.assertEqual(
                tokenize('//'),
                (Token(
                    type='multiplication_level_operator',
                    match='//',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_modular_divide(self):
            self.assertEqual(
                tokenize('%'),
                (Token(
                    type='multiplication_level_operator',
                    match='%',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_comma(self):
            self.assertEqual(
                tokenize(','),
                (Token(
                    type='comma',
                    match=',',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_assignment_operator(self):
            self.assertEqual(
                tokenize('='),
                (Token(
                    type='assignment_operator',
                    match='=',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_equality_operator(self):
            self.assertEqual(
                tokenize('=='),
                (Token(
                    type='comparison_level_operator',
                    match='==',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_greater_than_or_equal_operator(self):
            self.assertEqual(
                tokenize('>='),
                (Token(
                    type='comparison_level_operator',
                    match='>=',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_less_than_or_equal_operator(self):
            self.assertEqual(
                tokenize('<='),
                (Token(
                    type='comparison_level_operator',
                    match='<=',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_greater_than_operator(self):
            self.assertEqual(
                tokenize('>'),
                (Token(
                    type='comparison_level_operator',
                    match='>',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_less_than_operator(self):
            self.assertEqual(
                tokenize('<'),
                (Token(
                    type='comparison_level_operator',
                    match='<',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_not_equal_operator(self):
            self.assertEqual(
                tokenize('!='),
                (Token(
                    type='comparison_level_operator',
                    match='!=',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_newline(self):
            self.assertEqual(
                tokenize('\n'),
                (Token(
                    type='newline',
                    match='\n',
                    metadata=NodeMetadata(
                        index=0,
                        line=1,
                    ),
                ),),
            )

        def test_handles_leading_space(self):
            self.assertEqual(
                tokenize(' print'),
                (Token(
                    type='symbol',
                    match='print',
                    metadata=NodeMetadata(
                        index=1,
                        line=1,
                    ),
                ),),
            )

        def test_tokenizes_with_proper_line_numbers(self):
            self.assertEqual(
                tokenize('print\n('),
                (
                    Token(
                        type='symbol',
                        match='print',
                        metadata=NodeMetadata(
                            index=0,
                            line=1,
                        ),
                    ),
                    Token(
                        type='newline',
                        match='\n',
                        metadata=NodeMetadata(
                            index=5,
                            line=1,
                        ),
                    ),
                    Token(
                        type='open_parenthese',
                        match='(',
                        metadata=NodeMetadata(
                            index=6,
                            line=2,
                        ),
                    ),
                ),
            )


    unittest.main()