'type',
'match',
'index',
+ 'line',
],
)
name, regex = definition
regex_matcher = re.compile(regex)
- def token_matcher(index, source):
+ def token_matcher(index, source, line):
match = regex_matcher.match(source[index:])
if match is None:
return (
True,
index + len(match.group()),
- Token(type=name, match=match.group(), index=index),
+ Token(type=name, match=match.group(), index=index, line=line),
)
return token_matcher
_TOKEN_MATCHERS = [
('open_parenthese', r'\('),
('close_parenthese', r'\)'),
+ ('comma', r','),
('integer_literal', r'\d+'),
('symbol', r'[a-z]+'),
('single_quoted_string_literal', r"'.*?'"),
@util.force_generator(tuple)
def tokenize(source):
index = 0
+ line = 1
while index < len(source):
if source[index] == ' ':
success = False
for matcher in _TOKEN_MATCHERS:
- success, index, token = matcher(index, source)
+ success, index, token = matcher(index, source, line)
if success:
yield token
raise Exception('Unexpected character "{}"'.format(source[index]))
while index < len(source) and source[index] in set(['\n']):
+ line += 1
index += 1
if __name__ == '__main__':
type='open_parenthese',
match='(',
index=0,
+ line=1,
),),
)
type='close_parenthese',
match=')',
index=0,
+ line=1,
),),
)
type='symbol',
match='print',
index=0,
+ line=1,
),),
)
type='single_quoted_string_literal',
match="'Hello, world'",
index=0,
+ line=1,
),),
)
type='addition_level_operator',
match='+',
index=0,
+ line=1,
),),
)
type='addition_level_operator',
match='-',
index=0,
+ line=1,
),),
)
type='multiplication_level_operator',
match='*',
index=0,
+ line=1,
),),
)
type='multiplication_level_operator',
match='//',
index=0,
+ line=1,
),),
)
type='multiplication_level_operator',
match='%',
index=0,
+ line=1,
),),
)
+ def test_tokenizes_comma(self):
+ """A lone ',' lexes to a single 'comma' token at index 0, line 1."""
+ self.assertEqual(
+ tokenize(','),
+ (Token(
+ type='comma',
+ match=',',
+ index=0,
+ line=1,
+ ),),
+ )
+
+
def test_handles_trailing_newline(self):
self.assertEqual(
tokenize('print\n'),
type='symbol',
match='print',
index=0,
+ line=1,
),),
)
type='symbol',
match='print',
index=1,
+ line=1,
),),
)
+ def test_tokenizes_with_proper_line_numbers(self):
+ """Tokens after a newline report an incremented line number.
+
+ 'index' is the absolute character offset into the source (the '(' sits
+ at offset 6, after 'print' plus the '\n'), while 'line' is 1-based and
+ advances past the newline.
+ """
+ self.assertEqual(
+ tokenize('print\n('),
+ (
+ Token(
+ type='symbol',
+ match='print',
+ index=0,
+ line=1,
+ ),
+ Token(
+ type='open_parenthese',
+ match='(',
+ index=6,
+ line=2,
+ ),
+ ),
+ )
+
+
unittest.main()