diff --git a/tokenization.py b/tokenization.py
index 10ed80f..819a0de 100644
--- a/tokenization.py
+++ b/tokenization.py
@@ -5,56 +5,101 @@ import util
 
 Token = collections.namedtuple(
     'Token',
-    [
+    (
         'type',
         'match',
-    ],
+        'metadata',
+    ),
+)
+
+NodeMetadata = collections.namedtuple(
+    'NodeMetadata',
+    (
+        'index',
+        'line',
+    ),
 )
 
 def _make_token_matcher(definition):
     name, regex = definition
     regex_matcher = re.compile(regex)
 
-    def token_matcher(index, source):
+    def token_matcher(index, source, line):
         match = regex_matcher.match(source[index:])
 
         if match is None:
             return False, index, None
 
-        return True, index + len(match.group()), Token(type=name, match=match.group())
+        return (
+            True,
+            index + len(match.group()),
+            Token(
+                type=name,
+                match=match.group(),
+                metadata=NodeMetadata(
+                    index=index,
+                    line=line,
+                ),
+            ),
+        )
 
     return token_matcher
 
-
 _TOKEN_MATCHERS = [
+    ('keyword', r'(def|do|else|end|if|lambda)(?![a-z_])'),
+    ('open_bracket', r'\['),
+    ('close_bracket', r'\]'),
     ('open_parenthese', r'\('),
     ('close_parenthese', r'\)'),
-    ('integer_literal', r'-?\s*\d+'),
-    ('symbol', r'[a-z]+'),
+    ('comma', r','),
+    ('colon', r':'),
+    ('period', r'\.'),
+    ('integer_literal', r'\d+'),
+    ('symbol', r'[a-z_]+'),
     ('single_quoted_string_literal', r"'.*?'"),
+    ('double_quoted_string_literal', r'".*?"'),
+    ('comparison_level_operator', r'(<=|>=|==|!=|<|>)'),
+    ('assignment_operator', r'='),
+    ('addition_level_operator', r'(\+\+|\+|-)'),
+    ('multiplication_level_operator', r'(\*|//|%)'),
+    ('newline', r'\n'),
 ]
 
 _TOKEN_MATCHERS = list(map(_make_token_matcher, _TOKEN_MATCHERS))
 
-@util.force_generator
+@util.force_generator(tuple)
 def tokenize(source):
     index = 0
+    line = 1
 
     while index < len(source):
+        if source[index] == ' ':
+            index += 1
+            continue
+
+        if source[index] == '#':
+            while index < len(source) and source[index] != '\n':
+                index += 1
+
+            continue
+
         success = False
 
         for matcher in _TOKEN_MATCHERS:
-            success, index, token = matcher(index, source)
+            success, index, token = matcher(index, source, line)
 
             if success:
                 yield token
                 break
 
         if not success:
-            raise Exception('Unexpected character "{}"'.format(source[index]))
+            raise Exception('Unexpected character "{}" on line {}'.format(
+                source[index],
+                line,
+            ))
 
-        while index < len(source) and source[index] in set(['\n']):
-            index += 1
+        if token.type == 'newline':
+            line += 1
 
 if __name__ == '__main__':
     import unittest
 
@@ -63,46 +108,280 @@ if __name__ == '__main__':
     def test_tokenizes_open_parenthese(self):
         self.assertEqual(
             tokenize('('),
-            [Token(
+            (Token(
                 type='open_parenthese',
                 match='(',
-            )],
+                metadata=NodeMetadata(
+                    index=0,
+                    line=1,
+                ),
+            ),),
         )
 
     def test_tokenizes_close_parenthese(self):
         self.assertEqual(
             tokenize(')'),
-            [Token(
+            (Token(
                 type='close_parenthese',
                 match=')',
-            )],
+                metadata=NodeMetadata(
+                    index=0,
+                    line=1,
+                ),
+            ),),
         )
 
     def test_tokenizes_symbol(self):
         self.assertEqual(
             tokenize('print'),
-            [Token(
+            (Token(
                 type='symbol',
                 match='print',
-            )],
+                metadata=NodeMetadata(
+                    index=0,
+                    line=1,
+                ),
+            ),),
         )
 
     def test_tokenizes_single_quoted_string_literal(self):
         self.assertEqual(
             tokenize("'Hello, world'"),
-            [Token(
+            (Token(
                 type='single_quoted_string_literal',
                 match="'Hello, world'",
-            )],
+                metadata=NodeMetadata(
+                    index=0,
+                    line=1,
+                ),
+            ),),
+        )
+
+    def test_tokenizes_plus(self):
+        self.assertEqual(
+            tokenize('+'),
+            (Token(
+                type='addition_level_operator',
+                match='+',
+                metadata=NodeMetadata(
+                    index=0,
+                    line=1,
+                ),
+            ),),
+        )
+
+    def test_tokenizes_minus(self):
+        self.assertEqual(
+            tokenize('-'),
+            (Token(
+                type='addition_level_operator',
+                match='-',
+                metadata=NodeMetadata(
+                    index=0,
+                    line=1,
+                ),
+            ),),
+        )
+
+    def test_tokenizes_times(self):
+        self.assertEqual(
+            tokenize('*'),
+            (Token(
+                type='multiplication_level_operator',
+                match='*',
+                metadata=NodeMetadata(
+                    index=0,
+                    line=1,
+                ),
+            ),),
+        )
+
+    def test_tokenizes_integer_divide(self):
+        self.assertEqual(
+            tokenize('//'),
+            (Token(
+                type='multiplication_level_operator',
+                match='//',
+                metadata=NodeMetadata(
+                    index=0,
+                    line=1,
+                ),
+            ),),
+        )
+
+    def test_tokenizes_modular_divide(self):
+        self.assertEqual(
+            tokenize('%'),
+            (Token(
+                type='multiplication_level_operator',
+                match='%',
+                metadata=NodeMetadata(
+                    index=0,
+                    line=1,
+                ),
+            ),),
+        )
+
+    def test_tokenizes_comma(self):
+        self.assertEqual(
+            tokenize(','),
+            (Token(
+                type='comma',
+                match=',',
+                metadata=NodeMetadata(
+                    index=0,
+                    line=1,
+                ),
+            ),),
+        )
+
+    def test_tokenizes_assignment_operator(self):
+        self.assertEqual(
+            tokenize('='),
+            (Token(
+                type='assignment_operator',
+                match='=',
+                metadata=NodeMetadata(
+                    index=0,
+                    line=1,
+                ),
+            ),),
+        )
+
+    def test_tokenizes_equality_operator(self):
+        self.assertEqual(
+            tokenize('=='),
+            (Token(
+                type='comparison_level_operator',
+                match='==',
+                metadata=NodeMetadata(
+                    index=0,
+                    line=1,
+                ),
+            ),),
+        )
+
+    def test_tokenizes_greater_than_or_equal_operator(self):
+        self.assertEqual(
+            tokenize('>='),
+            (Token(
+                type='comparison_level_operator',
+                match='>=',
+                metadata=NodeMetadata(
+                    index=0,
+                    line=1,
+                ),
+            ),),
+        )
+
+    def test_tokenizes_less_than_or_equal_operator(self):
+        self.assertEqual(
+            tokenize('<='),
+            (Token(
+                type='comparison_level_operator',
+                match='<=',
+                metadata=NodeMetadata(
+                    index=0,
+                    line=1,
+                ),
+            ),),
+        )
+
+    def test_tokenizes_greater_than_operator(self):
+        self.assertEqual(
+            tokenize('>'),
+            (Token(
+                type='comparison_level_operator',
+                match='>',
+                metadata=NodeMetadata(
+                    index=0,
+                    line=1,
+                ),
+            ),),
         )
 
-    def test_handles_trailing_newline(self):
+    def test_tokenizes_less_than_operator(self):
         self.assertEqual(
-            tokenize('print\n'),
-            [Token(
+            tokenize('<'),
+            (Token(
+                type='comparison_level_operator',
+                match='<',
+                metadata=NodeMetadata(
+                    index=0,
+                    line=1,
+                ),
+            ),),
+        )
+
+    def test_tokenizes_not_equal_operator(self):
+        self.assertEqual(
+            tokenize('!='),
+            (Token(
+                type='comparison_level_operator',
+                match='!=',
+                metadata=NodeMetadata(
+                    index=0,
+                    line=1,
+                ),
+            ),),
+        )
+
+    def test_tokenizes_newline(self):
+        self.assertEqual(
+            tokenize('\n'),
+            (Token(
+                type='newline',
+                match='\n',
+                metadata=NodeMetadata(
+                    index=0,
+                    line=1,
+                ),
+            ),),
+        )
+
+    def test_handles_leading_space(self):
+        self.assertEqual(
+            tokenize(' print'),
+            (Token(
                 type='symbol',
                 match='print',
-            )],
+                metadata=NodeMetadata(
+                    index=1,
+                    line=1,
+                ),
+            ),),
+        )
+
+    def test_tokenizes_with_proper_line_numbers(self):
+        self.assertEqual(
+            tokenize('print\n('),
+            (
+                Token(
+                    type='symbol',
+                    match='print',
+                    metadata=NodeMetadata(
+                        index=0,
+                        line=1,
+                    ),
+                ),
+                Token(
+                    type='newline',
+                    match='\n',
+                    metadata=NodeMetadata(
+                        index=5,
+                        line=1,
+                    ),
+                ),
+                Token(
+                    type='open_parenthese',
+                    match='(',
+                    metadata=NodeMetadata(
+                        index=6,
+                        line=2,
+                    ),
+                ),
+            ),
         )
+
 unittest.main()
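
A minimal usage sketch of the tokenizer after this change, for reference. It assumes tokenization.py is importable as a module, that util.force_generator(tuple) materializes the generator into a tuple (as the updated tests imply), and uses a hypothetical sample source string:

# Minimal sketch; assumes util.force_generator(tuple) collects the yielded
# tokens into a tuple, as the updated tests imply.
from tokenization import tokenize

for token in tokenize("x = 1 + 2 # a comment\n"):
    print(token.type, repr(token.match), token.metadata.line)

# Spaces and the '#' comment are skipped; the newline is emitted as a token:
#   symbol 'x' 1
#   assignment_operator '=' 1
#   integer_literal '1' 1
#   addition_level_operator '+' 1
#   integer_literal '2' 1
#   newline '\n' 1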