Added support for comparison operators
fur/tokenization.py
1 import collections
2 import re
3
4 import util
5
# A single lexeme produced by tokenize():
#   type  -- the token-class name from the matcher definitions (e.g. 'symbol')
#   match -- the exact source text that was matched
#   index -- character offset of the match within the source string
#   line  -- 1-based line number on which the match starts
Token = collections.namedtuple('Token', ('type', 'match', 'index', 'line'))
15
def _make_token_matcher(definition):
    """Build a matcher function from a (token_type_name, regex) pair.

    The returned matcher has the signature (index, source, line) and returns
    a triple:
      (False, index, None)           -- the regex does not match at index
      (True, new_index, Token(...))  -- it matches; new_index is advanced
                                        past the matched text
    """
    name, regex = definition
    regex_matcher = re.compile(regex)

    def token_matcher(index, source, line):
        # Match at the current position via Pattern.match(string, pos):
        # unlike matching against source[index:], this does not copy the
        # remainder of the source on every attempt.
        match = regex_matcher.match(source, index)

        if match is None:
            return False, index, None

        return (
            True,
            match.end(),
            Token(type=name, match=match.group(), index=index, line=line),
        )

    return token_matcher
33
34
# Token definitions as (type name, regex) pairs. Order is significant: the
# first matching entry wins, so 'equality_level_operator' must come before
# 'assignment_operator' (otherwise '==' would tokenize as two '=' tokens).
_TOKEN_DEFINITIONS = (
    ('open_parenthese',                 r'\('),
    ('close_parenthese',                r'\)'),
    ('comma',                           r','),
    ('integer_literal',                 r'\d+'),
    ('symbol',                          r'[a-z]+'),
    ('single_quoted_string_literal',    r"'.*?'"),
    ('equality_level_operator',         r'(<=|>=|==|!=|<|>)'),
    ('addition_level_operator',         r'(\+|-)'),
    ('multiplication_level_operator',   r'(\*|//|%)'),
    ('assignment_operator',             r'='),
)

_TOKEN_MATCHERS = [_make_token_matcher(definition) for definition in _TOKEN_DEFINITIONS]
49
@util.force_generator(tuple)
def tokenize(source):
    """Yield a Token for each lexeme in the fur source string.

    Spaces are skipped; newlines are skipped and advance the line counter.
    Raises Exception at the first character no token matcher accepts.
    """
    index = 0
    line = 1

    while index < len(source):
        character = source[index]

        if character == ' ':
            index += 1
            continue

        # Handle newlines here, symmetrically with spaces. Previously they
        # were only consumed *after* a matched token, so a leading newline
        # (or a newline preceded by a space) raised "Unexpected character".
        if character == '\n':
            line += 1
            index += 1
            continue

        for matcher in _TOKEN_MATCHERS:
            success, index, token = matcher(index, source, line)

            if success:
                yield token
                break
        else:
            # No matcher accepted the character at this position.
            raise Exception('Unexpected character "{}"'.format(source[index]))
75
if __name__ == '__main__':
    import unittest

    class TokenizeTests(unittest.TestCase):
        def _assert_single_token(self, source, expected_type, expected_match, index=0, line=1):
            """Assert that *source* tokenizes to exactly one expected Token."""
            self.assertEqual(
                tokenize(source),
                (Token(
                    type=expected_type,
                    match=expected_match,
                    index=index,
                    line=line,
                ),),
            )

        def test_tokenizes_open_parenthese(self):
            self._assert_single_token('(', 'open_parenthese', '(')

        def test_tokenizes_close_parenthese(self):
            self._assert_single_token(')', 'close_parenthese', ')')

        def test_tokenizes_symbol(self):
            self._assert_single_token('print', 'symbol', 'print')

        def test_tokenizes_single_quoted_string_literal(self):
            self._assert_single_token(
                "'Hello, world'",
                'single_quoted_string_literal',
                "'Hello, world'",
            )

        def test_tokenizes_plus(self):
            self._assert_single_token('+', 'addition_level_operator', '+')

        def test_tokenizes_minus(self):
            self._assert_single_token('-', 'addition_level_operator', '-')

        def test_tokenizes_times(self):
            self._assert_single_token('*', 'multiplication_level_operator', '*')

        def test_tokenizes_integer_divide(self):
            self._assert_single_token('//', 'multiplication_level_operator', '//')

        def test_tokenizes_modular_divide(self):
            self._assert_single_token('%', 'multiplication_level_operator', '%')

        def test_tokenizes_comma(self):
            self._assert_single_token(',', 'comma', ',')

        def test_tokenizes_assignment_operator(self):
            self._assert_single_token('=', 'assignment_operator', '=')

        def test_tokenizes_equality_operator(self):
            self._assert_single_token('==', 'equality_level_operator', '==')

        def test_tokenizes_greater_than_or_equal_operator(self):
            self._assert_single_token('>=', 'equality_level_operator', '>=')

        def test_tokenizes_less_than_or_equal_operator(self):
            self._assert_single_token('<=', 'equality_level_operator', '<=')

        # Renamed from test_tokenizes_greater_than_equal_operator: it tests '>'.
        def test_tokenizes_greater_than_operator(self):
            self._assert_single_token('>', 'equality_level_operator', '>')

        # Renamed from test_tokenizes_less_than_equal_operator: it tests '<'.
        def test_tokenizes_less_than_operator(self):
            self._assert_single_token('<', 'equality_level_operator', '<')

        def test_tokenizes_not_equal_operator(self):
            self._assert_single_token('!=', 'equality_level_operator', '!=')

        def test_handles_trailing_newline(self):
            self._assert_single_token('print\n', 'symbol', 'print')

        def test_handles_leading_space(self):
            self._assert_single_token(' print', 'symbol', 'print', index=1)

        def test_tokenizes_with_proper_line_numbers(self):
            self.assertEqual(
                tokenize('print\n('),
                (
                    Token(
                        type='symbol',
                        match='print',
                        index=0,
                        line=1,
                    ),
                    Token(
                        type='open_parenthese',
                        match='(',
                        index=6,
                        line=2,
                    ),
                ),
            )

    unittest.main()