Added if expression statements
[fur] / tokenization.py
import collections
import re

import util

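# A Token records which matcher produced it ('type'), the exact source text
# consumed ('match'), the character offset where the match begins ('index'),
# and the 1-based line number on which it begins ('line').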
Token = collections.namedtuple(
    'Token',
    [
        'type',
        'match',
        'index',
        'line',
    ],
)

# Given a (name, regex) pair, build a function that tries to match the regex
# at source[index:] and returns a (success, new_index, token) triple.
def _make_token_matcher(definition):
    name, regex = definition
    regex_matcher = re.compile(regex)

    def token_matcher(index, source, line):
        match = regex_matcher.match(source[index:])

        if match is None:
            return False, index, None

        return (
            True,
            index + len(match.group()),
            Token(type=name, match=match.group(), index=index, line=line),
        )

    return token_matcher

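# For illustration (a hypothetical example, not part of the original file):
#
#     matcher = _make_token_matcher(('integer_literal', r'\d+'))
#     matcher(0, '42', 1)
#     # -> (True, 2, Token(type='integer_literal', match='42', index=0, line=1))
#     matcher(0, 'abc', 1)
#     # -> (False, 0, None)
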
_TOKEN_MATCHERS = [
    ('keyword',                         r'(def|do|else|end|if)(?![a-z_])'),
    ('open_parenthese',                 r'\('),
    ('close_parenthese',                r'\)'),
    ('comma',                           r','),
    ('integer_literal',                 r'\d+'),
    ('symbol',                          r'[a-z_]+'),
    ('single_quoted_string_literal',    r"'.*?'"),
    ('comparison_level_operator',       r'(<=|>=|==|!=|<|>)'),
    ('assignment_operator',             r'='),
    ('addition_level_operator',         r'(\+|-)'),
    ('multiplication_level_operator',   r'(\*|//|%)'),
    ('newline',                         r'\n'),
]

_TOKEN_MATCHERS = list(map(_make_token_matcher, _TOKEN_MATCHERS))

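# Matcher order matters: 'keyword' is tried before 'symbol' so that 'if' is
# not consumed as a symbol, and 'comparison_level_operator' is tried before
# 'assignment_operator' so that '==' becomes one comparison token rather than
# two assignments. The (?![a-z_]) lookahead on keywords keeps a symbol like
# 'iffy' from being split into the keyword 'if' plus the symbol 'fy'.
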
@util.force_generator(tuple)
def tokenize(source):
    index = 0
    line = 1

    while index < len(source):
        # Skip spaces without emitting a token
        if source[index] == ' ':
            index += 1
            continue

        # Skip comments, which run from '#' to the end of the line
        if source[index] == '#':
            while index < len(source) and source[index] != '\n':
                index += 1

            continue

        success = False

        for matcher in _TOKEN_MATCHERS:
            success, index, token = matcher(index, source, line)

            if success:
                yield token
                break

        if not success:
            raise Exception('Unexpected character "{}" on line {}'.format(
                source[index],
                line,
            ))

        if token.type == 'newline':
            line += 1

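# A worked example (hypothetical, not part of the original file):
#
#     tokenize("print('hi')")
#     # -> (Token(type='symbol', match='print', index=0, line=1),
#     #     Token(type='open_parenthese', match='(', index=5, line=1),
#     #     Token(type='single_quoted_string_literal', match="'hi'", index=6, line=1),
#     #     Token(type='close_parenthese', match=')', index=10, line=1))
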
if __name__ == '__main__':
    import unittest

    class TokenizeTests(unittest.TestCase):
        def test_tokenizes_open_parenthese(self):
            self.assertEqual(
                tokenize('('),
                (Token(
                    type='open_parenthese',
                    match='(',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_close_parenthese(self):
            self.assertEqual(
                tokenize(')'),
                (Token(
                    type='close_parenthese',
                    match=')',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_symbol(self):
            self.assertEqual(
                tokenize('print'),
                (Token(
                    type='symbol',
                    match='print',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_single_quoted_string_literal(self):
            self.assertEqual(
                tokenize("'Hello, world'"),
                (Token(
                    type='single_quoted_string_literal',
                    match="'Hello, world'",
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_plus(self):
            self.assertEqual(
                tokenize('+'),
                (Token(
                    type='addition_level_operator',
                    match='+',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_minus(self):
            self.assertEqual(
                tokenize('-'),
                (Token(
                    type='addition_level_operator',
                    match='-',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_times(self):
            self.assertEqual(
                tokenize('*'),
                (Token(
                    type='multiplication_level_operator',
                    match='*',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_integer_divide(self):
            self.assertEqual(
                tokenize('//'),
                (Token(
                    type='multiplication_level_operator',
                    match='//',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_modular_divide(self):
            self.assertEqual(
                tokenize('%'),
                (Token(
                    type='multiplication_level_operator',
                    match='%',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_comma(self):
            self.assertEqual(
                tokenize(','),
                (Token(
                    type='comma',
                    match=',',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_assignment_operator(self):
            self.assertEqual(
                tokenize('='),
                (Token(
                    type='assignment_operator',
                    match='=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_equality_operator(self):
            self.assertEqual(
                tokenize('=='),
                (Token(
                    type='comparison_level_operator',
                    match='==',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_greater_than_or_equal_operator(self):
            self.assertEqual(
                tokenize('>='),
                (Token(
                    type='comparison_level_operator',
                    match='>=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_less_than_or_equal_operator(self):
            self.assertEqual(
                tokenize('<='),
                (Token(
                    type='comparison_level_operator',
                    match='<=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_greater_than_operator(self):
            self.assertEqual(
                tokenize('>'),
                (Token(
                    type='comparison_level_operator',
                    match='>',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_less_than_operator(self):
            self.assertEqual(
                tokenize('<'),
                (Token(
                    type='comparison_level_operator',
                    match='<',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_not_equal_operator(self):
            self.assertEqual(
                tokenize('!='),
                (Token(
                    type='comparison_level_operator',
                    match='!=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_newline(self):
            self.assertEqual(
                tokenize('\n'),
                (Token(
                    type='newline',
                    match='\n',
                    index=0,
                    line=1,
                ),),
            )

        def test_handles_leading_space(self):
            self.assertEqual(
                tokenize(' print'),
                (Token(
                    type='symbol',
                    match='print',
                    index=1,
                    line=1,
                ),),
            )

        def test_tokenizes_with_proper_line_numbers(self):
            self.assertEqual(
                tokenize('print\n('),
                (
                    Token(
                        type='symbol',
                        match='print',
                        index=0,
                        line=1,
                    ),
                    Token(
                        type='newline',
                        match='\n',
                        index=5,
                        line=1,
                    ),
                    Token(
                        type='open_parenthese',
                        match='(',
                        index=6,
                        line=2,
                    ),
                ),
            )


    unittest.main()
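
Running the file directly (python tokenization.py) executes the test suite above via unittest.main(). As a quick usage sketch outside the tests (a hypothetical demo script, not part of the fur repository; it assumes tokenization.py and its util dependency are importable):

# demo.py -- hypothetical driver for the tokenizer above
from tokenization import tokenize

# Print the type and matched text of every token in a small fur snippet.
for token in tokenize("x = 1 + 2\n"):
    print(token.type, repr(token.match))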