Added list literals
[fur] / tokenization.py
import collections
import re

import util

# A token records its type, the exact source text it matched, the character
# offset where the match began, and the line it appeared on.
Token = collections.namedtuple(
    'Token',
    [
        'type',
        'match',
        'index',
        'line',
    ],
)

# Build a matcher function from a (name, regex) pair. The matcher attempts
# the regex at the given index and returns a (success, new_index, token)
# triple so the tokenizer can try matchers in order until one succeeds.
def _make_token_matcher(definition):
    name, regex = definition
    regex_matcher = re.compile(regex)

    def token_matcher(index, source, line):
        match = regex_matcher.match(source[index:])

        if match is None:
            return False, index, None

        return (
            True,
            index + len(match.group()),
            Token(type=name, match=match.group(), index=index, line=line),
        )

    return token_matcher

# Order matters here: keywords must be tried before symbols so that "if" is
# not tokenized as a symbol, and the multi-character "==" must be tried as a
# comparison operator before "=" can match as assignment.
_TOKEN_MATCHERS = [
    ('keyword',                         r'(def|do|else|end|if)(?![a-z_])'),
    ('open_bracket',                    r'\['),
    ('close_bracket',                   r'\]'),
    ('open_parenthese',                 r'\('),
    ('close_parenthese',                r'\)'),
    ('comma',                           r','),
    ('integer_literal',                 r'\d+'),
    ('symbol',                          r'[a-z_]+'),
    ('single_quoted_string_literal',    r"'.*?'"),
    ('comparison_level_operator',       r'(<=|>=|==|!=|<|>)'),
    ('assignment_operator',             r'='),
    ('addition_level_operator',         r'(\+|-)'),
    ('multiplication_level_operator',   r'(\*|//|%)'),
    ('newline',                         r'\n'),
]

_TOKEN_MATCHERS = list(map(_make_token_matcher, _TOKEN_MATCHERS))

@util.force_generator(tuple)
def tokenize(source):
    index = 0
    line = 1

    while index < len(source):
        # Skip spaces; they separate tokens but are not tokens themselves.
        if source[index] == ' ':
            index += 1
            continue

        # Skip comments: everything from '#' through the end of the line.
        if source[index] == '#':
            while index < len(source) and source[index] != '\n':
                index += 1

            continue

        success = False

        for matcher in _TOKEN_MATCHERS:
            success, index, token = matcher(index, source, line)

            if success:
                yield token
                break

        if not success:
            raise Exception('Unexpected character "{}" on line {}'.format(
                source[index],
                line,
            ))

        # A newline token belongs to the line it ends, so the line count is
        # only bumped after the token is emitted.
        if token.type == 'newline':
            line += 1

if __name__ == '__main__':
    import unittest

    class TokenizeTests(unittest.TestCase):
        def test_tokenizes_open_parenthese(self):
            self.assertEqual(
                tokenize('('),
                (Token(
                    type='open_parenthese',
                    match='(',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_close_parenthese(self):
            self.assertEqual(
                tokenize(')'),
                (Token(
                    type='close_parenthese',
                    match=')',
                    index=0,
                    line=1,
                ),),
            )

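        # Suggested tests (mirroring the parenthesis tests above) for the
        # bracket tokens that back the new list literals.
        def test_tokenizes_open_bracket(self):
            self.assertEqual(
                tokenize('['),
                (Token(
                    type='open_bracket',
                    match='[',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_close_bracket(self):
            self.assertEqual(
                tokenize(']'),
                (Token(
                    type='close_bracket',
                    match=']',
                    index=0,
                    line=1,
                ),),
            )
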
        def test_tokenizes_symbol(self):
            self.assertEqual(
                tokenize('print'),
                (Token(
                    type='symbol',
                    match='print',
                    index=0,
                    line=1,
                ),),
            )

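        # Suggested tests, following the symbol test above: keywords are
        # matched ahead of symbols, and digit runs become integer literals.
        def test_tokenizes_keyword(self):
            self.assertEqual(
                tokenize('if'),
                (Token(
                    type='keyword',
                    match='if',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_integer_literal(self):
            self.assertEqual(
                tokenize('42'),
                (Token(
                    type='integer_literal',
                    match='42',
                    index=0,
                    line=1,
                ),),
            )
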
        def test_tokenizes_single_quoted_string_literal(self):
            self.assertEqual(
                tokenize("'Hello, world'"),
                (Token(
                    type='single_quoted_string_literal',
                    match="'Hello, world'",
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_plus(self):
            self.assertEqual(
                tokenize('+'),
                (Token(
                    type='addition_level_operator',
                    match='+',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_minus(self):
            self.assertEqual(
                tokenize('-'),
                (Token(
                    type='addition_level_operator',
                    match='-',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_times(self):
            self.assertEqual(
                tokenize('*'),
                (Token(
                    type='multiplication_level_operator',
                    match='*',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_integer_divide(self):
            self.assertEqual(
                tokenize('//'),
                (Token(
                    type='multiplication_level_operator',
                    match='//',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_modular_divide(self):
            self.assertEqual(
                tokenize('%'),
                (Token(
                    type='multiplication_level_operator',
                    match='%',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_comma(self):
            self.assertEqual(
                tokenize(','),
                (Token(
                    type='comma',
                    match=',',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_assignment_operator(self):
            self.assertEqual(
                tokenize('='),
                (Token(
                    type='assignment_operator',
                    match='=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_equality_operator(self):
            self.assertEqual(
                tokenize('=='),
                (Token(
                    type='comparison_level_operator',
                    match='==',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_greater_than_or_equal_operator(self):
            self.assertEqual(
                tokenize('>='),
                (Token(
                    type='comparison_level_operator',
                    match='>=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_less_than_or_equal_operator(self):
            self.assertEqual(
                tokenize('<='),
                (Token(
                    type='comparison_level_operator',
                    match='<=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_greater_than_operator(self):
            self.assertEqual(
                tokenize('>'),
                (Token(
                    type='comparison_level_operator',
                    match='>',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_less_than_operator(self):
            self.assertEqual(
                tokenize('<'),
                (Token(
                    type='comparison_level_operator',
                    match='<',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_not_equal_operator(self):
            self.assertEqual(
                tokenize('!='),
                (Token(
                    type='comparison_level_operator',
                    match='!=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_newline(self):
            self.assertEqual(
                tokenize('\n'),
                (Token(
                    type='newline',
                    match='\n',
                    index=0,
                    line=1,
                ),),
            )

        def test_handles_leading_space(self):
            self.assertEqual(
                tokenize(' print'),
                (Token(
                    type='symbol',
                    match='print',
                    index=1,
                    line=1,
                ),),
            )

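        # Suggested test: a '#' comment is skipped through the end of the
        # line, so a comment-only source produces no tokens.
        def test_skips_comments(self):
            self.assertEqual(
                tokenize('# this is a comment'),
                (),
            )
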
        def test_tokenizes_with_proper_line_numbers(self):
            self.assertEqual(
                tokenize('print\n('),
                (
                    Token(
                        type='symbol',
                        match='print',
                        index=0,
                        line=1,
                    ),
                    Token(
                        type='newline',
                        match='\n',
                        index=5,
                        line=1,
                    ),
                    Token(
                        type='open_parenthese',
                        match='(',
                        index=6,
                        line=2,
                    ),
                ),
            )

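        # Suggested test: a character no matcher accepts should raise, and
        # the message names the offending character and its line.
        def test_raises_on_unexpected_character(self):
            with self.assertRaisesRegex(Exception, 'Unexpected character'):
                tokenize('!')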

    unittest.main()