[
'type',
'match',
+ 'index',
+ 'line',
],
)
name, regex = definition
regex_matcher = re.compile(regex)
def token_matcher(index, source, line):
    """Attempt to lex this token type at `source[index]`.

    Closes over `name` and `regex_matcher` from the enclosing factory
    (see the surrounding `_make_token_matcher`).  Returns a triple
    `(success, new_index, token_or_None)`; on failure the index is
    returned unchanged so the caller can try the next matcher.
    """
    match = regex_matcher.match(source[index:])

    if match is None:
        return False, index, None

    return (
        True,
        index + len(match.group()),
        Token(type=name, match=match.group(), index=index, line=line),
    )
return token_matcher
-
_TOKEN_MATCHERS = [
+ ('keyword', r'(def|end)(?![a-z_])'),
('open_parenthese', r'\('),
('close_parenthese', r'\)'),
- ('integer_literal', r'-?\s*\d+'),
- ('symbol', r'[a-z]+'),
+ ('comma', r','),
+ ('integer_literal', r'\d+'),
+ ('symbol', r'[a-z_]+'),
('single_quoted_string_literal', r"'.*?'"),
+ ('comparison_level_operator', r'(<=|>=|==|!=|<|>)'),
+ ('assignment_operator', r'='),
+ ('addition_level_operator', r'(\+|-)'),
+ ('multiplication_level_operator', r'(\*|//|%)'),
+ ('newline', r'\n'),
]
# Replace each (name, regex) definition with its compiled matcher closure.
_TOKEN_MATCHERS = [_make_token_matcher(definition) for definition in _TOKEN_MATCHERS]
@util.force_generator(tuple)
def tokenize(source):
    """Lex `source` into a sequence of Tokens.

    Each Token records its type, matched text, absolute character index,
    and 1-based line number.  Raises Exception on the first character no
    matcher recognizes, reporting the offending character and line.
    """
    index = 0
    line = 1

    while index < len(source):
        # NOTE(review): only plain spaces are skipped between tokens; a
        # tab would fall through all matchers and raise — confirm intended.
        if source[index] == ' ':
            index += 1
            continue

        success = False

        for matcher in _TOKEN_MATCHERS:
            success, index, token = matcher(index, source, line)

            if success:
                yield token
                break

        if not success:
            raise Exception('Unexpected character "{}" on line {}'.format(
                source[index],
                line,
            ))

        # A newline token is reported on the line it terminates; the line
        # counter advances only after it has been yielded.
        if token.type == 'newline':
            line += 1
if __name__ == '__main__':
import unittest
def test_tokenizes_open_parenthese(self):
    # '(' lexes to a single open_parenthese token at index 0, line 1.
    self.assertEqual(
        tokenize('('),
        (Token(
            type='open_parenthese',
            match='(',
            index=0,
            line=1,
        ),),
    )
def test_tokenizes_close_parenthese(self):
    # ')' lexes to a single close_parenthese token at index 0, line 1.
    self.assertEqual(
        tokenize(')'),
        (Token(
            type='close_parenthese',
            match=')',
            index=0,
            line=1,
        ),),
    )
def test_tokenizes_symbol(self):
    # A bare identifier lexes to one symbol token.
    self.assertEqual(
        tokenize('print'),
        (Token(
            type='symbol',
            match='print',
            index=0,
            line=1,
        ),),
    )
def test_tokenizes_single_quoted_string_literal(self):
    # The match includes the surrounding quotes.
    self.assertEqual(
        tokenize("'Hello, world'"),
        (Token(
            type='single_quoted_string_literal',
            match="'Hello, world'",
            index=0,
            line=1,
        ),),
    )
+
def test_tokenizes_plus(self):
    # '+' is an addition-level operator.
    self.assertEqual(
        tokenize('+'),
        (Token(
            type='addition_level_operator',
            match='+',
            index=0,
            line=1,
        ),),
    )
+
def test_tokenizes_minus(self):
    # '-' is an addition-level operator (integer literals are unsigned).
    self.assertEqual(
        tokenize('-'),
        (Token(
            type='addition_level_operator',
            match='-',
            index=0,
            line=1,
        ),),
    )
+
def test_tokenizes_times(self):
    # '*' is a multiplication-level operator.
    self.assertEqual(
        tokenize('*'),
        (Token(
            type='multiplication_level_operator',
            match='*',
            index=0,
            line=1,
        ),),
    )
+
def test_tokenizes_integer_divide(self):
    # '//' lexes as one two-character operator, not two tokens.
    self.assertEqual(
        tokenize('//'),
        (Token(
            type='multiplication_level_operator',
            match='//',
            index=0,
            line=1,
        ),),
    )
+
def test_tokenizes_modular_divide(self):
    # '%' is a multiplication-level operator.
    self.assertEqual(
        tokenize('%'),
        (Token(
            type='multiplication_level_operator',
            match='%',
            index=0,
            line=1,
        ),),
    )
+
def test_tokenizes_comma(self):
    # ',' lexes to a single comma token.
    self.assertEqual(
        tokenize(','),
        (Token(
            type='comma',
            match=',',
            index=0,
            line=1,
        ),),
    )
+
def test_tokenizes_assignment_operator(self):
    # A lone '=' is assignment (comparison patterns are tried first).
    self.assertEqual(
        tokenize('='),
        (Token(
            type='assignment_operator',
            match='=',
            index=0,
            line=1,
        ),),
    )
def test_tokenizes_equality_operator(self):
    # '==' must lex as one comparison token, not two assignments.
    self.assertEqual(
        tokenize('=='),
        (Token(
            type='comparison_level_operator',
            match='==',
            index=0,
            line=1,
        ),),
    )
+
def test_tokenizes_greater_than_or_equal_operator(self):
    # '>=' lexes as one two-character comparison token.
    self.assertEqual(
        tokenize('>='),
        (Token(
            type='comparison_level_operator',
            match='>=',
            index=0,
            line=1,
        ),),
    )
+
def test_tokenizes_less_than_or_equal_operator(self):
    # '<=' lexes as one two-character comparison token.
    self.assertEqual(
        tokenize('<='),
        (Token(
            type='comparison_level_operator',
            match='<=',
            index=0,
            line=1,
        ),),
    )
+
def test_tokenizes_greater_than_operator(self):
    # Renamed from test_tokenizes_greater_than_equal_operator: this case
    # lexes plain '>'; '>=' has its own _or_equal_ test.
    self.assertEqual(
        tokenize('>'),
        (Token(
            type='comparison_level_operator',
            match='>',
            index=0,
            line=1,
        ),),
    )
+
def test_tokenizes_less_than_operator(self):
    # Renamed from test_tokenizes_less_than_equal_operator: this case
    # lexes plain '<'; '<=' has its own _or_equal_ test.
    self.assertEqual(
        tokenize('<'),
        (Token(
            type='comparison_level_operator',
            match='<',
            index=0,
            line=1,
        ),),
    )
+
def test_tokenizes_not_equal_operator(self):
    # '!=' lexes as one comparison token ('!' alone is not a token).
    self.assertEqual(
        tokenize('!='),
        (Token(
            type='comparison_level_operator',
            match='!=',
            index=0,
            line=1,
        ),),
    )
+
def test_tokenizes_newline(self):
    # A newline is a real token, reported on the line it terminates.
    self.assertEqual(
        tokenize('\n'),
        (Token(
            type='newline',
            match='\n',
            index=0,
            line=1,
        ),),
    )
+
def test_handles_leading_space(self):
    # Spaces are skipped but still consume indexes: the symbol starts at 1.
    self.assertEqual(
        tokenize(' print'),
        (Token(
            type='symbol',
            match='print',
            index=1,
            line=1,
        ),),
    )
+
def test_tokenizes_with_proper_line_numbers(self):
    # The newline token carries line 1 (the line it ends); the token
    # after it starts line 2.  Indexes are absolute character offsets.
    self.assertEqual(
        tokenize('print\n('),
        (
            Token(
                type='symbol',
                match='print',
                index=0,
                line=1,
            ),
            Token(
                type='newline',
                match='\n',
                index=5,
                line=1,
            ),
            Token(
                type='open_parenthese',
                match='(',
                index=6,
                line=2,
            ),
        ),
    )
+
# Discover and run the TestCase methods defined above.
unittest.main()