# Ignore symbol folders
# fur/tokenization.py
1 import collections
2 import re
3
4 import util
5
# A lexeme produced by tokenize(): its category name, the matched text, the
# character offset within the source string, and the 1-based source line.
Token = collections.namedtuple('Token', 'type match index line')
15
16 def _make_token_matcher(definition):
17     name, regex = definition
18     regex_matcher = re.compile(regex)
19
20     def token_matcher(index, source, line):
21         match = regex_matcher.match(source[index:])
22
23         if match is None:
24             return False, index, None
25
26         return (
27             True,
28             index + len(match.group()),
29             Token(type=name, match=match.group(), index=index, line=line),
30         )
31
32     return token_matcher
33
# Token definitions, tried in order: the first matcher that succeeds wins.
# Ordering is significant -- 'keyword' must precede 'symbol' (both match
# lowercase words), and the multi-character comparison operators must precede
# the single-character '=' assignment operator.
_TOKEN_MATCHERS = [
    _make_token_matcher(definition)
    for definition in (
        ('keyword',                         r'(def|end)(?![a-z_])'),
        ('open_parenthese',                 r'\('),
        ('close_parenthese',                r'\)'),
        ('comma',                           r','),
        ('integer_literal',                 r'\d+'),
        ('symbol',                          r'[a-z_]+'),
        ('single_quoted_string_literal',    r"'.*?'"),
        ('comparison_level_operator',       r'(<=|>=|==|!=|<|>)'),
        ('assignment_operator',             r'='),
        ('addition_level_operator',         r'(\+|-)'),
        ('multiplication_level_operator',   r'(\*|//|%)'),
        ('newline',                         r'\n'),
    )
]
50
@util.force_generator(tuple)
def tokenize(source):
    """Scan *source* and yield Token tuples in order of appearance.

    Spaces separate tokens but produce none themselves. Raises Exception on
    any character that no token pattern accepts. Line numbers are 1-based and
    advance after each 'newline' token.
    """
    position = 0
    line_number = 1

    while position < len(source):
        if source[position] == ' ':
            position += 1
            continue

        for matcher in _TOKEN_MATCHERS:
            matched, position, token = matcher(position, source, line_number)
            if matched:
                break
        else:
            # No pattern accepted the character at `position`.
            raise Exception('Unexpected character "{}" on line {}'.format(
                source[position],
                line_number,
            ))

        yield token

        if token.type == 'newline':
            line_number += 1
78
if __name__ == '__main__':
    import unittest

    class TokenizeTests(unittest.TestCase):
        def _assert_single_token(self, source, token_type, match, index=0, line=1):
            # Tokenizing `source` must yield exactly one token with these fields.
            self.assertEqual(
                tokenize(source),
                (Token(type=token_type, match=match, index=index, line=line),),
            )

        def test_tokenizes_open_parenthese(self):
            self._assert_single_token('(', 'open_parenthese', '(')

        def test_tokenizes_close_parenthese(self):
            self._assert_single_token(')', 'close_parenthese', ')')

        def test_tokenizes_symbol(self):
            self._assert_single_token('print', 'symbol', 'print')

        def test_tokenizes_single_quoted_string_literal(self):
            self._assert_single_token(
                "'Hello, world'",
                'single_quoted_string_literal',
                "'Hello, world'",
            )

        def test_tokenizes_plus(self):
            self._assert_single_token('+', 'addition_level_operator', '+')

        def test_tokenizes_minus(self):
            self._assert_single_token('-', 'addition_level_operator', '-')

        def test_tokenizes_times(self):
            self._assert_single_token('*', 'multiplication_level_operator', '*')

        def test_tokenizes_integer_divide(self):
            self._assert_single_token('//', 'multiplication_level_operator', '//')

        def test_tokenizes_modular_divide(self):
            self._assert_single_token('%', 'multiplication_level_operator', '%')

        def test_tokenizes_comma(self):
            self._assert_single_token(',', 'comma', ',')

        def test_tokenizes_assignment_operator(self):
            self._assert_single_token('=', 'assignment_operator', '=')

        def test_tokenizes_equality_operator(self):
            self._assert_single_token('==', 'comparison_level_operator', '==')

        def test_tokenizes_greater_than_or_equal_operator(self):
            self._assert_single_token('>=', 'comparison_level_operator', '>=')

        def test_tokenizes_less_than_or_equal_operator(self):
            self._assert_single_token('<=', 'comparison_level_operator', '<=')

        def test_tokenizes_greater_than_equal_operator(self):
            self._assert_single_token('>', 'comparison_level_operator', '>')

        def test_tokenizes_less_than_equal_operator(self):
            self._assert_single_token('<', 'comparison_level_operator', '<')

        def test_tokenizes_not_equal_operator(self):
            self._assert_single_token('!=', 'comparison_level_operator', '!=')

        def test_tokenizes_newline(self):
            self._assert_single_token('\n', 'newline', '\n')

        def test_handles_leading_space(self):
            self._assert_single_token(' print', 'symbol', 'print', index=1)

        def test_tokenizes_with_proper_line_numbers(self):
            # The newline itself is reported on line 1; tokens after it on line 2.
            self.assertEqual(
                tokenize('print\n('),
                (
                    Token(type='symbol', match='print', index=0, line=1),
                    Token(type='newline', match='\n', index=5, line=1),
                    Token(type='open_parenthese', match='(', index=6, line=2),
                ),
            )

    unittest.main()