Implement negatives, use typedef
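
Negative integer literals are no longer matched by the tokenizer
(integer_literal was r'-?\s*\d+', now r'\d+'); '-' is tokenized as an
addition_level_operator, with negation presumably handled at parse
time. Each Token now carries a NodeMetadata(index, line) tuple so
errors can report the offending line.

A minimal sketch of the resulting token stream (illustrative output;
assumes util.force_generator(tuple) collects the generator into a
tuple):

    >>> tokenize('-1')
    (Token(type='addition_level_operator', match='-', metadata=NodeMetadata(index=0, line=1)),
     Token(type='integer_literal', match='1', metadata=NodeMetadata(index=1, line=1)))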
diff --git a/tokenization.py b/tokenization.py
index 7733ab7..819a0de 100644
--- a/tokenization.py
+++ b/tokenization.py
@@ -5,33 +5,64 @@ import util
 
 Token = collections.namedtuple(
     'Token',
-    [
+    (
         'type',
         'match',
-    ],
+        'metadata',
+    ),
+)
+
+NodeMetadata = collections.namedtuple(
+    'NodeMetadata',
+    (
+        'index',
+        'line',
+    ),
 )
 
 def _make_token_matcher(definition):
     name, regex = definition
     regex_matcher = re.compile(regex)
 
-    def token_matcher(index, source):
+    def token_matcher(index, source, line):
         match = regex_matcher.match(source[index:])
 
         if match is None:
             return False, index, None
 
-        return True, index + len(match.group()), Token(type=name, match=match.group())
+        return (
+            True,
+            index + len(match.group()),
+            Token(
+                type=name,
+                match=match.group(),
+                metadata=NodeMetadata(
+                    index=index,
+                    line=line,
+                ),
+            ),
+        )
 
     return token_matcher
 
-
 _TOKEN_MATCHERS = [
+    ('keyword',                         r'(def|do|else|end|if|lambda)(?![a-z_])'),
+    ('open_bracket',                    r'\['),
+    ('close_bracket',                   r'\]'),
     ('open_parenthese',                 r'\('),
     ('close_parenthese',                r'\)'),
-    ('integer_literal',                 r'-?\s*\d+'),
-    ('symbol',                          r'[a-z]+'),
+    ('comma',                           r','),
+    ('colon',                           r':'),
+    ('period',                          r'\.'),
+    ('integer_literal',                 r'\d+'),
+    ('symbol',                          r'[a-z_]+'),
     ('single_quoted_string_literal',    r"'.*?'"),
+    ('double_quoted_string_literal',    r'".*?"'),
+    ('comparison_level_operator',       r'(<=|>=|==|!=|<|>)'),
+    ('assignment_operator',             r'='),
+    ('addition_level_operator',         r'(\+\+|\+|-)'),
+    ('multiplication_level_operator',   r'(\*|//|%)'),
+    ('newline',                         r'\n'),
 ]
 
 _TOKEN_MATCHERS = list(map(_make_token_matcher, _TOKEN_MATCHERS))
@@ -39,22 +70,36 @@ _TOKEN_MATCHERS = list(map(_make_token_matcher, _TOKEN_MATCHERS))
 @util.force_generator(tuple)
 def tokenize(source):
     index = 0
+    line = 1
 
     while index < len(source):
+        if source[index] == ' ':
+            index += 1
+            continue
+
+        if source[index] == '#':
+            while index < len(source) and source[index] != '\n':
+                index += 1
+
+            continue
+
         success = False
 
         for matcher in _TOKEN_MATCHERS:
-            success, index, token = matcher(index, source)
+            success, index, token = matcher(index, source, line)
 
             if success:
                 yield token
                 break
 
         if not success:
-            raise Exception('Unexpected character "{}"'.format(source[index]))
+            raise Exception('Unexpected character "{}" on line {}'.format(
+                source[index],
+                line,
+            ))
 
-        while index < len(source) and source[index] in set(['\n']):
-            index += 1
+        if token.type == 'newline':
+            line += 1
 
 if __name__ == '__main__':
     import unittest
@@ -63,46 +108,214 @@ if __name__ == '__main__':
         def test_tokenizes_open_parenthese(self):
             self.assertEqual(
                 tokenize('('),
-                [Token(
+                (Token(
                     type='open_parenthese',
                     match='(',
-                )],
+                    metadata=NodeMetadata(index=0, line=1),
+                ),),
             )
 
         def test_tokenizes_close_parenthese(self):
             self.assertEqual(
                 tokenize(')'),
-                [Token(
+                (Token(
                     type='close_parenthese',
                     match=')',
-                )],
+                    metadata=NodeMetadata(index=0, line=1),
+                ),),
             )
 
         def test_tokenizes_symbol(self):
             self.assertEqual(
                 tokenize('print'),
-                [Token(
+                (Token(
                     type='symbol',
                     match='print',
-                )],
+                    metadata=NodeMetadata(index=0, line=1),
+                ),),
             )
 
         def test_tokenizes_single_quoted_string_literal(self):
             self.assertEqual(
                 tokenize("'Hello, world'"),
-                [Token(
+                (Token(
                     type='single_quoted_string_literal',
                     match="'Hello, world'",
-                )],
+                    metadata=NodeMetadata(index=0, line=1),
+                ),),
+            )
+
+        def test_tokenizes_plus(self):
+            self.assertEqual(
+                tokenize('+'),
+                (Token(
+                    type='addition_level_operator',
+                    match='+',
+                    metadata=NodeMetadata(index=0, line=1),
+                ),),
+            )
+
+        def test_tokenizes_minus(self):
+            self.assertEqual(
+                tokenize('-'),
+                (Token(
+                    type='addition_level_operator',
+                    match='-',
+                    metadata=NodeMetadata(index=0, line=1),
+                ),),
+            )
+
+        def test_tokenizes_times(self):
+            self.assertEqual(
+                tokenize('*'),
+                (Token(
+                    type='multiplication_level_operator',
+                    match='*',
+                    metadata=NodeMetadata(index=0, line=1),
+                ),),
+            )
+
+        def test_tokenizes_integer_divide(self):
+            self.assertEqual(
+                tokenize('//'),
+                (Token(
+                    type='multiplication_level_operator',
+                    match='//',
+                    metadata=NodeMetadata(index=0, line=1),
+                ),),
+            )
+
+        def test_tokenizes_modular_divide(self):
+            self.assertEqual(
+                tokenize('%'),
+                (Token(
+                    type='multiplication_level_operator',
+                    match='%',
+                    metadata=NodeMetadata(index=0, line=1),
+                ),),
+            )
+
+        def test_tokenizes_comma(self):
+            self.assertEqual(
+                tokenize(','),
+                (Token(
+                    type='comma',
+                    match=',',
+                    metadata=NodeMetadata(index=0, line=1),
+                ),),
+            )
+
+        def test_tokenizes_assignment_operator(self):
+            self.assertEqual(
+                tokenize('='),
+                (Token(
+                    type='assignment_operator',
+                    match='=',
+                    metadata=NodeMetadata(index=0, line=1),
+                ),),
+            )
+
+        def test_tokenizes_equality_operator(self):
+            self.assertEqual(
+                tokenize('=='),
+                (Token(
+                    type='comparison_level_operator',
+                    match='==',
+                    metadata=NodeMetadata(index=0, line=1),
+                ),),
+            )
+
+        def test_tokenizes_greater_than_or_equal_operator(self):
+            self.assertEqual(
+                tokenize('>='),
+                (Token(
+                    type='comparison_level_operator',
+                    match='>=',
+                    metadata=NodeMetadata(index=0, line=1),
+                ),),
+            )
+
+        def test_tokenizes_less_than_or_equal_operator(self):
+            self.assertEqual(
+                tokenize('<='),
+                (Token(
+                    type='comparison_level_operator',
+                    match='<=',
+                    metadata=NodeMetadata(index=0, line=1),
+                ),),
+            )
+
+        def test_tokenizes_greater_than_operator(self):
+            self.assertEqual(
+                tokenize('>'),
+                (Token(
+                    type='comparison_level_operator',
+                    match='>',
+                    metadata=NodeMetadata(index=0, line=1),
+                ),),
             )
 
-        def test_handles_trailing_newline(self):
+        def test_tokenizes_less_than_operator(self):
             self.assertEqual(
-                tokenize('print\n'),
-                [Token(
+                tokenize('<'),
+                (Token(
+                    type='comparison_level_operator',
+                    match='<',
+                    metadata=NodeMetadata(index=0, line=1),
+                ),),
+            )
+
+        def test_tokenizes_not_equal_operator(self):
+            self.assertEqual(
+                tokenize('!='),
+                (Token(
+                    type='comparison_level_operator',
+                    match='!=',
+                    metadata=NodeMetadata(index=0, line=1),
+                ),),
+            )
+
+        def test_tokenizes_newline(self):
+            self.assertEqual(
+                tokenize('\n'),
+                (Token(
+                    type='newline',
+                    match='\n',
+                    metadata=NodeMetadata(index=0, line=1),
+                ),),
+            )
+
+        def test_handles_leading_space(self):
+            self.assertEqual(
+                tokenize(' print'),
+                (Token(
                     type='symbol',
                     match='print',
-                )],
+                    metadata=NodeMetadata(index=1, line=1),
+                ),),
+            )
+
+        def test_tokenizes_with_proper_line_numbers(self):
+            self.assertEqual(
+                tokenize('print\n('),
+                (
+                    Token(
+                        type='symbol',
+                        match='print',
+                        metadata=NodeMetadata(index=0, line=1),
+                    ),
+                    Token(
+                        type='newline',
+                        match='\n',
+                        metadata=NodeMetadata(index=5, line=1),
+                    ),
+                    Token(
+                        type='open_parenthese',
+                        match='(',
+                        metadata=NodeMetadata(index=6, line=2),
+                    ),
+                ),
             )
 
+
     unittest.main()
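
A quick check of the new line-aware error reporting (hypothetical
session; assumes util.force_generator(tuple) evaluates the generator
eagerly, so the exception is raised at the call site):

    >>> tokenize('print\n  @')
    Traceback (most recent call last):
      ...
    Exception: Unexpected character "@" on line 2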