Add a diff utility written in Python
authorDavid Kerkeslager <kerkeslager@gmail.com>
Sun, 4 Aug 2019 16:55:01 +0000 (12:55 -0400)
committerDavid Kerkeslager <kerkeslager@gmail.com>
Fri, 13 Sep 2019 06:56:40 +0000 (02:56 -0400)
diff/after.txt [new file with mode: 0644]
diff/before.txt [new file with mode: 0644]
diff/diff.py [new file with mode: 0644]

diff --git a/diff/after.txt b/diff/after.txt
new file mode 100644 (file)
index 0000000..dc81f09
--- /dev/null
@@ -0,0 +1,29 @@
+This is an important
+notice! It should
+therefore be located at
+the beginning of this
+document!
+
+This part of the
+document has stayed the
+same from version to
+version.  It shouldn't
+be shown if it doesn't
+change.  Otherwise, that
+would not be helping to
+compress the size of the
+changes.
+
+It is important to spell
+check this document. On
+the other hand, a
+misspelled word isn't
+the end of the world.
+Nothing in the rest of
+this paragraph needs to
+be changed. Things can
+be added after it.
+
+This paragraph contains
+important new additions
+to this document.
diff --git a/diff/before.txt b/diff/before.txt
new file mode 100644 (file)
index 0000000..a413be2
--- /dev/null
@@ -0,0 +1,24 @@
+This part of the
+document has stayed the
+same from version to
+version.  It shouldn't
+be shown if it doesn't
+change.  Otherwise, that
+would not be helping to
+compress the size of the
+changes.
+
+This paragraph contains
+text that is outdated.
+It will be deleted in the
+near future.
+
+It is important to spell
+check this dokument. On
+the other hand, a
+misspelled word isn't
+the end of the world.
+Nothing in the rest of
+this paragraph needs to
+be changed. Things can
+be added after it.
diff --git a/diff/diff.py b/diff/diff.py
new file mode 100644 (file)
index 0000000..b5da09a
--- /dev/null
@@ -0,0 +1,123 @@
+import collections
+import functools
+import sys
+
+def memoize(f):
+    memo = {}
+    sentinel = object()
+
+    @functools.wraps(f)
+    def wrapped(*args):
+        result = memo.get(args, sentinel)
+
+        if result is sentinel:
+            result = f(*args)
+            memo[args] = result
+
+        return result
+
+    return wrapped
+
+@memoize
+def lcs(a, b, i=None, j=None):
+    if i is None and j is None:
+        i = len(a)
+        j = len(b)
+
+    if i == 0 or j == 0:
+        return ()
+
+    if a[i - 1] == b[j - 1]:
+        return lcs(a, b, i - 1, j - 1) + (a[i - 1],)
+
+    x = lcs(a, b, i, j - 1)
+    y = lcs(a, b, i - 1, j)
+
+    if len(x) > len(y):
+        return x
+    else:
+        return y
+
+def changes(a, b, c):
+    c = lcs(a, b)
+
+    i, j, k = 0, 0, 0
+
+    while i < len(a) or j < len(b):
+        while i < len(a) and (k == len(c) or a[i] != c[k]):
+            yield ('d', i, a[i])
+            i += 1
+
+        while j < len(b) and (k == len(c) or b[j] != c[k]):
+            yield ('a', j, b[j])
+            j += 1
+
+        while i < len(a) and j < len(b) and a[i] == b[j]:
+            assert a[i] == c[k]
+            assert b[j] == c[k]
+
+            i += 1
+            j += 1
+            k += 1
+
+Block = collections.namedtuple(
+    'Block',
+    (
+        'kind',
+        'start',
+        'finish',
+        'lines',
+    ),
+)
+
+def compress_block(b):
+    return Block(
+        kind=b[0][0],
+        start=b[0][1],
+        finish=b[-1][1],
+        lines=tuple(ch[2] for ch in b),
+    )
+
+def format_block(b):
+    direction = '>' if b.kind == 'a' else '<'
+
+    def format_line(l):
+        return '{} {}'.format(direction, l)
+
+    return '{}{},{}\n{}'.format(
+        b.kind,
+        b.start,
+        b.finish,
+        '\n'.join(format_line(l) for l in b.lines),
+    )
+
+def diff(a, b):
+    c = lcs(a, b)
+
+    block = []
+    i = 0
+
+    for change in changes(a, b, c):
+        if len(block) == 0:
+            block.append(change)
+
+        elif block[-1][0] == change[0] and block[-1][1] + 1 == change[1]:
+            block.append(change)
+
+        else:
+            yield compress_block(block)
+            block = [change]
+
+    yield compress_block(block)
+
+before_file_path = sys.argv[1]
+after_file_path = sys.argv[2]
+
+with open(before_file_path, 'r') as before_file:
+    before = tuple(l.rstrip() for l in before_file.readlines())
+
+with open(after_file_path, 'r') as after_file:
+    after = tuple(l.rstrip() for l in after_file.readlines())
+
+for block in diff(before, after):
+    print(format_block(block))