Allocate Fur stacks on the C heap
[fur] / tokenization.py
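This file is the Fur tokenizer: it scans source text left to right, skipping spaces and # comments, and turns everything else into a tuple of Token namedtuples.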
import collections
import re

import util

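# A Token records the kind of lexeme matched ('type'), the matched text
# itself ('match'), the character offset where it begins ('index'), and
# the 1-based line it appears on ('line').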
Token = collections.namedtuple(
    'Token',
    [
        'type',
        'match',
        'index',
        'line',
    ],
)

def _make_token_matcher(definition):
    name, regex = definition
    regex_matcher = re.compile(regex)

    # On success, return the index just past the match and the matched
    # Token; on failure, return the index unchanged and no token.
    def token_matcher(index, source, line):
        match = regex_matcher.match(source[index:])

        if match is None:
            return False, index, None

        return (
            True,
            index + len(match.group()),
            Token(type=name, match=match.group(), index=index, line=line),
        )

    return token_matcher
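
# The definitions below are tried in order, so more specific patterns must
# come before patterns that could match a prefix of them: 'keyword' is
# listed before 'symbol' (and its lookahead keeps a symbol like
# 'definition' from matching as the keyword 'def'), and the two-character
# comparison operators are listed before 'assignment_operator'. As an
# example, the matcher built from ('comma', r',') returns
# (True, 1, Token(type='comma', match=',', index=0, line=1)) when called
# as matcher(0, ',', 1).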

_TOKEN_MATCHERS = [
    ('keyword',                         r'(def|do|else|end|if)(?![a-z_])'),
    ('open_bracket',                    r'\['),
    ('close_bracket',                   r'\]'),
    ('open_parenthese',                 r'\('),
    ('close_parenthese',                r'\)'),
    ('comma',                           r','),
    ('colon',                           r':'),
    ('period',                          r'\.'),
    ('integer_literal',                 r'\d+'),
    ('symbol',                          r'[a-z_]+'),
    ('single_quoted_string_literal',    r"'.*?'"),
    ('double_quoted_string_literal',    r'".*?"'),
    ('comparison_level_operator',       r'(<=|>=|==|!=|<|>)'),
    ('assignment_operator',             r'='),
    ('addition_level_operator',         r'(\+\+|\+|-)'),
    ('multiplication_level_operator',   r'(\*|//|%)'),
    ('newline',                         r'\n'),
]

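# Rebind _TOKEN_MATCHERS so each (name, regex) pair above is replaced by
# its compiled matcher function.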
_TOKEN_MATCHERS = list(map(_make_token_matcher, _TOKEN_MATCHERS))

@util.force_generator(tuple)
def tokenize(source):
    index = 0
    line = 1

    while index < len(source):
        # Spaces separate tokens but are not tokens themselves.
        if source[index] == ' ':
            index += 1
            continue

        # A '#' starts a comment that runs to the end of the line; the
        # newline itself is left to be tokenized, so the line count
        # stays correct.
        if source[index] == '#':
            while index < len(source) and source[index] != '\n':
                index += 1

            continue

        success = False

        for matcher in _TOKEN_MATCHERS:
            success, index, token = matcher(index, source, line)

            if success:
                yield token
                break

        if not success:
            raise Exception('Unexpected character "{}" on line {}'.format(
                source[index],
                line,
            ))

        if token.type == 'newline':
            line += 1
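
# As a worked example, tokenize('x = 1 + 2\n') produces, in order: a
# 'symbol' token for 'x', an 'assignment_operator' token for '=', an
# 'integer_literal' token for '1', an 'addition_level_operator' token
# for '+', a second 'integer_literal' token for '2', and a 'newline'
# token; the spaces are skipped and produce no tokens.
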
if __name__ == '__main__':
    import unittest

    class TokenizeTests(unittest.TestCase):
        def test_tokenizes_open_parenthese(self):
            self.assertEqual(
                tokenize('('),
                (Token(
                    type='open_parenthese',
                    match='(',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_close_parenthese(self):
            self.assertEqual(
                tokenize(')'),
                (Token(
                    type='close_parenthese',
                    match=')',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_symbol(self):
            self.assertEqual(
                tokenize('print'),
                (Token(
                    type='symbol',
                    match='print',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_single_quoted_string_literal(self):
            self.assertEqual(
                tokenize("'Hello, world'"),
                (Token(
                    type='single_quoted_string_literal',
                    match="'Hello, world'",
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_plus(self):
            self.assertEqual(
                tokenize('+'),
                (Token(
                    type='addition_level_operator',
                    match='+',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_minus(self):
            self.assertEqual(
                tokenize('-'),
                (Token(
                    type='addition_level_operator',
                    match='-',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_times(self):
            self.assertEqual(
                tokenize('*'),
                (Token(
                    type='multiplication_level_operator',
                    match='*',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_integer_divide(self):
            self.assertEqual(
                tokenize('//'),
                (Token(
                    type='multiplication_level_operator',
                    match='//',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_modular_divide(self):
            self.assertEqual(
                tokenize('%'),
                (Token(
                    type='multiplication_level_operator',
                    match='%',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_comma(self):
            self.assertEqual(
                tokenize(','),
                (Token(
                    type='comma',
                    match=',',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_assignment_operator(self):
            self.assertEqual(
                tokenize('='),
                (Token(
                    type='assignment_operator',
                    match='=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_equality_operator(self):
            self.assertEqual(
                tokenize('=='),
                (Token(
                    type='comparison_level_operator',
                    match='==',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_greater_than_or_equal_operator(self):
            self.assertEqual(
                tokenize('>='),
                (Token(
                    type='comparison_level_operator',
                    match='>=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_less_than_or_equal_operator(self):
            self.assertEqual(
                tokenize('<='),
                (Token(
                    type='comparison_level_operator',
                    match='<=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_greater_than_operator(self):
            self.assertEqual(
                tokenize('>'),
                (Token(
                    type='comparison_level_operator',
                    match='>',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_less_than_operator(self):
            self.assertEqual(
                tokenize('<'),
                (Token(
                    type='comparison_level_operator',
                    match='<',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_not_equal_operator(self):
            self.assertEqual(
                tokenize('!='),
                (Token(
                    type='comparison_level_operator',
                    match='!=',
                    index=0,
                    line=1,
                ),),
            )

        def test_tokenizes_newline(self):
            self.assertEqual(
                tokenize('\n'),
                (Token(
                    type='newline',
                    match='\n',
                    index=0,
                    line=1,
                ),),
            )

        def test_handles_leading_space(self):
            self.assertEqual(
                tokenize(' print'),
                (Token(
                    type='symbol',
                    match='print',
                    index=1,
                    line=1,
                ),),
            )

        def test_tokenizes_with_proper_line_numbers(self):
            self.assertEqual(
                tokenize('print\n('),
                (
                    Token(
                        type='symbol',
                        match='print',
                        index=0,
                        line=1,
                    ),
                    Token(
                        type='newline',
                        match='\n',
                        index=5,
                        line=1,
                    ),
                    Token(
                        type='open_parenthese',
                        match='(',
                        index=6,
                        line=2,
                    ),
                ),
            )


    unittest.main()
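
Running the file directly executes the embedded test suite:

    python tokenization.py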