code.kerkeslager.com Git - fur/blob - tokenization.py

   1 import collections
   2 import re
   3
   4 import util
   5
   6 Token = collections.namedtuple(
   7     'Token',
   8     [
   9         'type',
  10         'match',
  11         'index',
  12         'line',
  13     ],
  14 )
  15
  16 def _make_token_matcher(definition):
  17     name, regex = definition
  18     regex_matcher = re.compile(regex)
  19
  20     def token_matcher(index, source, line):
  21         match = regex_matcher.match(source[index:])
  22
  23         if match is None:
  24             return False, index, None
  25
  26         return (
  27             True,
  28             index + len(match.group()),
  29             Token(type=name, match=match.group(), index=index, line=line),
  30         )
  31
  32     return token_matcher
  33
  34 _TOKEN_MATCHERS = [
  35     ('keyword',                         r'(def|do|else|end|if)(?![a-z_])'),
  36     ('open_bracket',                    r'\['),
  37     ('close_bracket',                   r'\]'),
  38     ('open_parenthese',                 r'\('),
  39     ('close_parenthese',                r'\)'),
  40     ('comma',                           r','),
  41     ('integer_literal',                 r'\d+'),
  42     ('symbol',                          r'[a-z_]+'),
  43     ('single_quoted_string_literal',    r"'.*?'"),
  44     ('double_quoted_string_literal',    r'".*?"'),
  45     ('comparison_level_operator',       r'(<=|>=|==|!=|<|>)'),
  46     ('assignment_operator',             r'='),
  47     ('addition_level_operator',         r'(\+\+|\+|-)'),
  48     ('multiplication_level_operator',   r'(\*|//|%)'),
  49     ('newline',                         r'\n'),
  50 ]
  51
  52 _TOKEN_MATCHERS = list(map(_make_token_matcher, _TOKEN_MATCHERS))
  53
  54 @util.force_generator(tuple)
  55 def tokenize(source):
  56     index = 0
  57     line = 1
  58
  59     while index < len(source):
  60         if source[index] == ' ':
  61             index += 1
  62             continue
  63
  64         if source[index] == '#':
  65             while index < len(source) and source[index] != '\n':
  66                 index += 1
  67
  68             continue
  69
  70         success = False
  71
  72         for matcher in _TOKEN_MATCHERS:
  73             success, index, token = matcher(index, source, line)
  74
  75             if success:
  76                 yield token
  77                 break
  78
  79         if not success:
  80             raise Exception('Unexpected character "{}" on line {}'.format(
  81                 source[index],
  82                 line,
  83             ))
  84
  85         if token.type == 'newline':
  86             line += 1
  87
  88 if __name__ == '__main__':
  89     import unittest
  90
  91     class TokenizeTests(unittest.TestCase):
  92         def test_tokenizes_open_parenthese(self):
  93             self.assertEqual(
  94                 tokenize('('),
  95                 (Token(
  96                     type='open_parenthese',
  97                     match='(',
  98                     index=0,
  99                     line=1,
 100                 ),),
 101             )
 102
 103         def test_tokenizes_close_parenthese(self):
 104             self.assertEqual(
 105                 tokenize(')'),
 106                 (Token(
 107                     type='close_parenthese',
 108                     match=')',
 109                     index=0,
 110                     line=1,
 111                 ),),
 112             )
 113
 114         def test_tokenizes_symbol(self):
 115             self.assertEqual(
 116                 tokenize('print'),
 117                 (Token(
 118                     type='symbol',
 119                     match='print',
 120                     index=0,
 121                     line=1,
 122                 ),),
 123             )
 124
 125         def test_tokenizes_single_quoted_string_literal(self):
 126             self.assertEqual(
 127                 tokenize("'Hello, world'"),
 128                 (Token(
 129                     type='single_quoted_string_literal',
 130                     match="'Hello, world'",
 131                     index=0,
 132                     line=1,
 133                 ),),
 134             )
 135
 136         def test_tokenizes_plus(self):
 137             self.assertEqual(
 138                 tokenize('+'),
 139                 (Token(
 140                     type='addition_level_operator',
 141                     match='+',
 142                     index=0,
 143                     line=1,
 144                 ),),
 145             )
 146
 147         def test_tokenizes_minus(self):
 148             self.assertEqual(
 149                 tokenize('-'),
 150                 (Token(
 151                     type='addition_level_operator',
 152                     match='-',
 153                     index=0,
 154                     line=1,
 155                 ),),
 156             )
 157
 158         def test_tokenizes_times(self):
 159             self.assertEqual(
 160                 tokenize('*'),
 161                 (Token(
 162                     type='multiplication_level_operator',
 163                     match='*',
 164                     index=0,
 165                     line=1,
 166                 ),),
 167             )
 168
 169         def test_tokenizes_integer_divide(self):
 170             self.assertEqual(
 171                 tokenize('//'),
 172                 (Token(
 173                     type='multiplication_level_operator',
 174                     match='//',
 175                     index=0,
 176                     line=1,
 177                 ),),
 178             )
 179
 180         def test_tokenizes_modular_divide(self):
 181             self.assertEqual(
 182                 tokenize('%'),
 183                 (Token(
 184                     type='multiplication_level_operator',
 185                     match='%',
 186                     index=0,
 187                     line=1,
 188                 ),),
 189             )
 190
 191         def test_tokenizes_comma(self):
 192             self.assertEqual(
 193                 tokenize(','),
 194                 (Token(
 195                     type='comma',
 196                     match=',',
 197                     index=0,
 198                     line=1,
 199                 ),),
 200             )
 201
 202         def test_tokenizes_assignment_operator(self):
 203             self.assertEqual(
 204                 tokenize('='),
 205                 (Token(
 206                     type='assignment_operator',
 207                     match='=',
 208                     index=0,
 209                     line=1,
 210                 ),),
 211             )
 212
 213         def test_tokenizes_equality_operator(self):
 214             self.assertEqual(
 215                 tokenize('=='),
 216                 (Token(
 217                     type='comparison_level_operator',
 218                     match='==',
 219                     index=0,
 220                     line=1,
 221                 ),),
 222             )
 223
 224         def test_tokenizes_greater_than_or_equal_operator(self):
 225             self.assertEqual(
 226                 tokenize('>='),
 227                 (Token(
 228                     type='comparison_level_operator',
 229                     match='>=',
 230                     index=0,
 231                     line=1,
 232                 ),),
 233             )
 234
 235         def test_tokenizes_less_than_or_equal_operator(self):
 236             self.assertEqual(
 237                 tokenize('<='),
 238                 (Token(
 239                     type='comparison_level_operator',
 240                     match='<=',
 241                     index=0,
 242                     line=1,
 243                 ),),
 244             )
 245
 246         def test_tokenizes_greater_than_equal_operator(self):
 247             self.assertEqual(
 248                 tokenize('>'),
 249                 (Token(
 250                     type='comparison_level_operator',
 251                     match='>',
 252                     index=0,
 253                     line=1,
 254                 ),),
 255             )
 256
 257         def test_tokenizes_less_than_equal_operator(self):
 258             self.assertEqual(
 259                 tokenize('<'),
 260                 (Token(
 261                     type='comparison_level_operator',
 262                     match='<',
 263                     index=0,
 264                     line=1,
 265                 ),),
 266             )
 267
 268         def test_tokenizes_not_equal_operator(self):
 269             self.assertEqual(
 270                 tokenize('!='),
 271                 (Token(
 272                     type='comparison_level_operator',
 273                     match='!=',
 274                     index=0,
 275                     line=1,
 276                 ),),
 277             )
 278
 279         def test_tokenizes_newline(self):
 280             self.assertEqual(
 281                 tokenize('\n'),
 282                 (Token(
 283                     type='newline',
 284                     match='\n',
 285                     index=0,
 286                     line=1,
 287                 ),),
 288             )
 289
 290         def test_handles_leading_space(self):
 291             self.assertEqual(
 292                 tokenize(' print'),
 293                 (Token(
 294                     type='symbol',
 295                     match='print',
 296                     index=1,
 297                     line=1,
 298                 ),),
 299             )
 300
 301         def test_tokenizes_with_proper_line_numbers(self):
 302             self.assertEqual(
 303                 tokenize('print\n('),
 304                 (
 305                     Token(
 306                         type='symbol',
 307                         match='print',
 308                         index=0,
 309                         line=1,
 310                     ),
 311                     Token(
 312                         type='newline',
 313                         match='\n',
 314                         index=5,
 315                         line=1,
 316                     ),
 317                     Token(
 318                         type='open_parenthese',
 319                         match='(',
 320                         index=6,
 321                         line=2,
 322                     ),
 323                 ),
 324             )
 325
 326
 327     unittest.main()