Add basic math
[fur] / tokenization.py
1 import collections
2 import re
3
4 import util
5
# Token: one lexical unit produced by tokenize().
#   type  -- name of the matcher that produced it, e.g. 'symbol'
#   match -- the exact source text consumed
#   index -- character offset of the match within the source string
Token = collections.namedtuple('Token', 'type match index')
14
def _make_token_matcher(definition):
    """Compile a (name, regex) pair into a matcher function.

    The returned function has the signature matcher(index, source) and
    returns a (success, new_index, token) triple: on a match at index,
    success is True, new_index points just past the matched text, and
    token is a Token; otherwise (False, index, None).
    """
    token_type, pattern = definition
    compiled = re.compile(pattern)

    def matcher(index, source):
        result = compiled.match(source[index:])

        if result is None:
            return False, index, None

        lexeme = result.group()
        token = Token(type=token_type, match=lexeme, index=index)
        return True, index + len(lexeme), token

    return matcher
32
33
# Ordered (token type, regex) definitions. Order matters: matchers are
# tried first-to-last, so earlier entries win when two patterns could
# match at the same position.
_TOKEN_MATCHER_DEFINITIONS = (
    ('open_parenthese',                 r'\('),
    ('close_parenthese',                r'\)'),
    ('integer_literal',                 r'\d+'),
    ('symbol',                          r'[a-z]+'),
    ('single_quoted_string_literal',    r"'.*?'"),
    ('addition_level_operator',         r'(\+|-)'),
    ('multiplication_level_operator',   r'(\*|//|%)'),
)

# Compiled matcher functions, in the same priority order. Kept under the
# original name so tokenize() below is unaffected; a distinct name for the
# raw definitions avoids rebinding one name to two different value kinds.
_TOKEN_MATCHERS = [
    _make_token_matcher(definition)
    for definition in _TOKEN_MATCHER_DEFINITIONS
]
45
@util.force_generator(tuple)
def tokenize(source):
    """Yield Token tuples for each lexical unit in *source*.

    Spaces and newlines between tokens are skipped. Raises Exception on
    the first character no matcher can consume.

    Fixes over the previous version: newlines were only consumed *after*
    a successful token, so a leading (or doubled) newline crashed with
    "Unexpected character"; whitespace handling was also split across two
    places, and a throwaway set(['\n']) was rebuilt every iteration.
    """
    index = 0

    while index < len(source):
        # Skip insignificant whitespace wherever it appears.
        if source[index] in ' \n':
            index += 1
            continue

        for matcher in _TOKEN_MATCHERS:
            success, index, token = matcher(index, source)

            if success:
                yield token
                break
        else:
            # No matcher consumed anything at this position.
            raise Exception('Unexpected character "{}"'.format(source[index]))
69
if __name__ == '__main__':
    import unittest

    class TokenizeTests(unittest.TestCase):
        # Helper (not collected by unittest -- name does not start with
        # 'test'): asserts that tokenizing `source` yields exactly one
        # token with the given fields.
        def assert_single_token(self, source, type, match, index=0):
            self.assertEqual(
                tokenize(source),
                (Token(type=type, match=match, index=index),),
            )

        def test_tokenizes_open_parenthese(self):
            self.assert_single_token('(', 'open_parenthese', '(')

        def test_tokenizes_close_parenthese(self):
            self.assert_single_token(')', 'close_parenthese', ')')

        # New: integer_literal had a matcher but no test.
        def test_tokenizes_integer_literal(self):
            self.assert_single_token('42', 'integer_literal', '42')

        def test_tokenizes_symbol(self):
            self.assert_single_token('print', 'symbol', 'print')

        def test_tokenizes_single_quoted_string_literal(self):
            self.assert_single_token(
                "'Hello, world'",
                'single_quoted_string_literal',
                "'Hello, world'",
            )

        def test_tokenizes_plus(self):
            self.assert_single_token('+', 'addition_level_operator', '+')

        def test_tokenizes_minus(self):
            self.assert_single_token('-', 'addition_level_operator', '-')

        def test_tokenizes_times(self):
            self.assert_single_token('*', 'multiplication_level_operator', '*')

        def test_tokenizes_integer_divide(self):
            self.assert_single_token('//', 'multiplication_level_operator', '//')

        def test_tokenizes_modular_divide(self):
            self.assert_single_token('%', 'multiplication_level_operator', '%')

        def test_handles_trailing_newline(self):
            self.assert_single_token('print\n', 'symbol', 'print')

        def test_handles_leading_space(self):
            self.assert_single_token(' print', 'symbol', 'print', index=1)

    unittest.main()