code.kerkeslager.com Git - fur/blob - tokenization.py

   1 import collections
   2 import re
   3
   4 import util
   5
   6 Token = collections.namedtuple(
   7     'Token',
   8     [
   9         'type',
  10         'match',
  11         'index',
  12         'line',
  13     ],
  14 )
  15
  16 def _make_token_matcher(definition):
  17     name, regex = definition
  18     regex_matcher = re.compile(regex)
  19
  20     def token_matcher(index, source, line):
  21         match = regex_matcher.match(source[index:])
  22
  23         if match is None:
  24             return False, index, None
  25
  26         return (
  27             True,
  28             index + len(match.group()),
  29             Token(type=name, match=match.group(), index=index, line=line),
  30         )
  31
  32     return token_matcher
  33
  34
  35 _TOKEN_MATCHERS = [
  36     ('open_parenthese',                 r'\('),
  37     ('close_parenthese',                r'\)'),
  38     ('comma',                           r','),
  39     ('assignment_operator',             r'='),
  40     ('integer_literal',                 r'\d+'),
  41     ('symbol',                          r'[a-z]+'),
  42     ('single_quoted_string_literal',    r"'.*?'"),
  43     ('addition_level_operator',         r'(\+|-)'),
  44     ('multiplication_level_operator',  r'(\*|//|%)'),
  45 ]
  46
  47 _TOKEN_MATCHERS = list(map(_make_token_matcher, _TOKEN_MATCHERS))
  48
  49 @util.force_generator(tuple)
  50 def tokenize(source):
  51     index = 0
  52     line = 1
  53
  54     while index < len(source):
  55         if source[index] == ' ':
  56             index += 1
  57             continue
  58
  59         success = False
  60
  61         for matcher in _TOKEN_MATCHERS:
  62             success, index, token = matcher(index, source, line)
  63
  64             if success:
  65                 yield token
  66                 break
  67
  68         if not success:
  69             raise Exception('Unexpected character "{}"'.format(source[index]))
  70
  71         while index < len(source) and source[index] in set(['\n']):
  72             line += 1
  73             index += 1
  74
  75 if __name__ == '__main__':
  76     import unittest
  77
  78     class TokenizeTests(unittest.TestCase):
  79         def test_tokenizes_open_parenthese(self):
  80             self.assertEqual(
  81                 tokenize('('),
  82                 (Token(
  83                     type='open_parenthese',
  84                     match='(',
  85                     index=0,
  86                     line=1,
  87                 ),),
  88             )
  89
  90         def test_tokenizes_close_parenthese(self):
  91             self.assertEqual(
  92                 tokenize(')'),
  93                 (Token(
  94                     type='close_parenthese',
  95                     match=')',
  96                     index=0,
  97                     line=1,
  98                 ),),
  99             )
 100
 101         def test_tokenizes_symbol(self):
 102             self.assertEqual(
 103                 tokenize('print'),
 104                 (Token(
 105                     type='symbol',
 106                     match='print',
 107                     index=0,
 108                     line=1,
 109                 ),),
 110             )
 111
 112         def test_tokenizes_single_quoted_string_literal(self):
 113             self.assertEqual(
 114                 tokenize("'Hello, world'"),
 115                 (Token(
 116                     type='single_quoted_string_literal',
 117                     match="'Hello, world'",
 118                     index=0,
 119                     line=1,
 120                 ),),
 121             )
 122
 123         def test_tokenizes_plus(self):
 124             self.assertEqual(
 125                 tokenize('+'),
 126                 (Token(
 127                     type='addition_level_operator',
 128                     match='+',
 129                     index=0,
 130                     line=1,
 131                 ),),
 132             )
 133
 134         def test_tokenizes_minus(self):
 135             self.assertEqual(
 136                 tokenize('-'),
 137                 (Token(
 138                     type='addition_level_operator',
 139                     match='-',
 140                     index=0,
 141                     line=1,
 142                 ),),
 143             )
 144
 145         def test_tokenizes_times(self):
 146             self.assertEqual(
 147                 tokenize('*'),
 148                 (Token(
 149                     type='multiplication_level_operator',
 150                     match='*',
 151                     index=0,
 152                     line=1,
 153                 ),),
 154             )
 155
 156         def test_tokenizes_integer_divide(self):
 157             self.assertEqual(
 158                 tokenize('//'),
 159                 (Token(
 160                     type='multiplication_level_operator',
 161                     match='//',
 162                     index=0,
 163                     line=1,
 164                 ),),
 165             )
 166
 167         def test_tokenizes_modular_divide(self):
 168             self.assertEqual(
 169                 tokenize('%'),
 170                 (Token(
 171                     type='multiplication_level_operator',
 172                     match='%',
 173                     index=0,
 174                     line=1,
 175                 ),),
 176             )
 177
 178         def test_tokenizes_comma(self):
 179             self.assertEqual(
 180                 tokenize(','),
 181                 (Token(
 182                     type='comma',
 183                     match=',',
 184                     index=0,
 185                     line=1,
 186                 ),),
 187             )
 188
 189         def test_tokenizes_assignment_operator(self):
 190             self.assertEqual(
 191                 tokenize('='),
 192                 (Token(
 193                     type='assignment_operator',
 194                     match='=',
 195                     index=0,
 196                     line=1,
 197                 ),),
 198             )
 199
 200         def test_handles_trailing_newline(self):
 201             self.assertEqual(
 202                 tokenize('print\n'),
 203                 (Token(
 204                     type='symbol',
 205                     match='print',
 206                     index=0,
 207                     line=1,
 208                 ),),
 209             )
 210
 211         def test_handles_leading_space(self):
 212             self.assertEqual(
 213                 tokenize(' print'),
 214                 (Token(
 215                     type='symbol',
 216                     match='print',
 217                     index=1,
 218                     line=1,
 219                 ),),
 220             )
 221
 222         def test_tokenizes_with_proper_line_numbers(self):
 223             self.assertEqual(
 224                 tokenize('print\n('),
 225                 (
 226                     Token(
 227                         type='symbol',
 228                         match='print',
 229                         index=0,
 230                         line=1,
 231                     ),
 232                     Token(
 233                         type='open_parenthese',
 234                         match='(',
 235                         index=6,
 236                         line=2,
 237                     ),
 238                 ),
 239             )
 240
 241
 242     unittest.main()