# A lexical token: the matcher name that produced it (``type``), the exact
# matched text (``match``), and its source position (``metadata``).
Token = collections.namedtuple(
    'Token',
    (
        'type',
        'match',
        'metadata',
    ),
)

# Source-position information attached to tokens (and shareable by parse
# nodes): zero-based character ``index`` and one-based ``line`` number.
NodeMetadata = collections.namedtuple(
    'NodeMetadata',
    (
        'index',
        'line',
    ),
)
def _make_token_matcher(definition):
    """Build a matcher closure from a ``(name, regex)`` definition.

    The returned callable has the tokenizer's matcher protocol:
    ``token_matcher(index, source, line)`` returns a 3-tuple
    ``(success, new_index, token_or_None)``.
    """
    name, regex = definition
    # Compile once here so every call of the closure reuses the pattern.
    regex_matcher = re.compile(regex)

    def token_matcher(index, source, line):
        match = regex_matcher.match(source[index:])

        # BUG FIX: the previous version returned the success tuple inside
        # this branch (crashing on ``match.group()`` when match is None and
        # implicitly returning None on success, which breaks the caller's
        # 3-tuple unpack). Fail fast without consuming input instead.
        if match is None:
            return False, index, None

        return (
            True,
            index + len(match.group()),
            Token(
                type=name,
                match=match.group(),
                metadata=NodeMetadata(
                    index=index,
                    line=line,
                ),
            ),
        )

    return token_matcher
-
# Ordered token definitions; order matters because the first matcher to
# succeed wins: 'keyword' must precede 'symbol' (the lookahead keeps e.g.
# 'define' from matching the 'def' keyword), and multi-character operators
# ('<=', '==', '++', '//') must precede their single-character prefixes.
_TOKEN_MATCHERS = [
    ('keyword', r'(def|do|else|end|if|lambda)(?![a-z_])'),
    ('open_bracket', r'\['),
    ('close_bracket', r'\]'),
    ('open_parenthese', r'\('),
    ('close_parenthese', r'\)'),
    ('comma', r','),
    ('colon', r':'),
    ('period', r'\.'),
    ('integer_literal', r'\d+'),
    ('symbol', r'[a-z_]+'),
    ('single_quoted_string_literal', r"'.*?'"),
    ('double_quoted_string_literal', r'".*?"'),
    ('comparison_level_operator', r'(<=|>=|==|!=|<|>)'),
    ('assignment_operator', r'='),
    ('addition_level_operator', r'(\+\+|\+|-)'),
    ('multiplication_level_operator', r'(\*|//|%)'),
    ('newline', r'\n'),
]

# Replace the raw definitions with compiled matcher closures.
_TOKEN_MATCHERS = list(map(_make_token_matcher, _TOKEN_MATCHERS))
@util.force_generator(tuple)
def tokenize(source):
    """Yield a Token for each lexeme in *source*.

    Spaces are skipped; ``#`` comments are skipped up to (but not
    including) the terminating newline, so the newline still produces its
    own token. Raises Exception naming the offending character and line
    when no matcher accepts the input.
    """
    index = 0
    line = 1  # one-based; recorded in token metadata and error messages
    while index < len(source):
        # Spaces separate tokens but never become tokens themselves.
        if source[index] == ' ':
            index += 1
            continue

        # Comment: consume to end of line, leaving the '\n' for the
        # newline matcher below.
        if source[index] == '#':
            while index < len(source) and source[index] != '\n':
                index += 1

            continue

        success = False
        for matcher in _TOKEN_MATCHERS:
            success, index, token = matcher(index, source, line)
            if success:
                yield token
                break

        if not success:
            raise Exception('Unexpected character "{}" on line {}'.format(
                source[index],
                line,
            ))

        # The newline token itself belongs to the line it terminates;
        # only tokens after it are on the next line.
        if token.type == 'newline':
            line += 1
if __name__ == '__main__':
import unittest
type='open_parenthese',
match='(',
index=0,
+ line=1,
),),
)
type='close_parenthese',
match=')',
index=0,
+ line=1,
),),
)
type='symbol',
match='print',
index=0,
+ line=1,
),),
)
type='single_quoted_string_literal',
match="'Hello, world'",
index=0,
+ line=1,
),),
)
type='addition_level_operator',
match='+',
index=0,
+ line=1,
),),
)
type='addition_level_operator',
match='-',
index=0,
+ line=1,
),),
)
type='multiplication_level_operator',
match='*',
index=0,
+ line=1,
),),
)
type='multiplication_level_operator',
match='//',
index=0,
+ line=1,
),),
)
type='multiplication_level_operator',
match='%',
index=0,
+ line=1,
),),
)
- def test_handles_trailing_newline(self):
+ def test_tokenizes_comma(self):
self.assertEqual(
- tokenize('print\n'),
+ tokenize(','),
(Token(
- type='symbol',
- match='print',
+ type='comma',
+ match=',',
+ index=0,
+ line=1,
+ ),),
+ )
+
+ def test_tokenizes_assignment_operator(self):
+ self.assertEqual(
+ tokenize('='),
+ (Token(
+ type='assignment_operator',
+ match='=',
index=0,
+ line=1,
+ ),),
+ )
+
+ def test_tokenizes_equality_operator(self):
+ self.assertEqual(
+ tokenize('=='),
+ (Token(
+ type='comparison_level_operator',
+ match='==',
+ index=0,
+ line=1,
+ ),),
+ )
+
+ def test_tokenizes_greater_than_or_equal_operator(self):
+ self.assertEqual(
+ tokenize('>='),
+ (Token(
+ type='comparison_level_operator',
+ match='>=',
+ index=0,
+ line=1,
+ ),),
+ )
+
+ def test_tokenizes_less_than_or_equal_operator(self):
+ self.assertEqual(
+ tokenize('<='),
+ (Token(
+ type='comparison_level_operator',
+ match='<=',
+ index=0,
+ line=1,
+ ),),
+ )
+
+ def test_tokenizes_greater_than_equal_operator(self):
+ self.assertEqual(
+ tokenize('>'),
+ (Token(
+ type='comparison_level_operator',
+ match='>',
+ index=0,
+ line=1,
+ ),),
+ )
+
+ def test_tokenizes_less_than_equal_operator(self):
+ self.assertEqual(
+ tokenize('<'),
+ (Token(
+ type='comparison_level_operator',
+ match='<',
+ index=0,
+ line=1,
+ ),),
+ )
+
+ def test_tokenizes_not_equal_operator(self):
+ self.assertEqual(
+ tokenize('!='),
+ (Token(
+ type='comparison_level_operator',
+ match='!=',
+ index=0,
+ line=1,
+ ),),
+ )
+
+ def test_tokenizes_newline(self):
+ self.assertEqual(
+ tokenize('\n'),
+ (Token(
+ type='newline',
+ match='\n',
+ index=0,
+ line=1,
),),
)
type='symbol',
match='print',
index=1,
+ line=1,
),),
)
+ def test_tokenizes_with_proper_line_numbers(self):
+ self.assertEqual(
+ tokenize('print\n('),
+ (
+ Token(
+ type='symbol',
+ match='print',
+ index=0,
+ line=1,
+ ),
+ Token(
+ type='newline',
+ match='\n',
+ index=5,
+ line=1,
+ ),
+ Token(
+ type='open_parenthese',
+ match='(',
+ index=6,
+ line=2,
+ ),
+ ),
+ )
+
+
unittest.main()