From 174082a837be659555a19538ec8c6571a394e095 Mon Sep 17 00:00:00 2001
From: David Kerkeslager
Date: Sat, 2 Apr 2016 15:34:07 -0400
Subject: [PATCH] Commit initial version of the binary serializer/deserializer

---
 .gitignore      |   4 +
 don/__init__.py |   8 ++
 don/binary.py   | 323 ++++++++++++++++++++++++++++++++++++++++++++++++
 don/string.py   |   5 +
 test_don.py     | 136 ++++++++++++++++++++
 5 files changed, 476 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 don/__init__.py
 create mode 100644 don/binary.py
 create mode 100644 don/string.py
 create mode 100644 test_don.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a61d7c1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+__pycache__/
+*.swp
+*.swo
+*.pyc
diff --git a/don/__init__.py b/don/__init__.py
new file mode 100644
index 0000000..ac70fdb
--- /dev/null
+++ b/don/__init__.py
@@ -0,0 +1,8 @@
+import don.binary as binary
+import don.string as string
+
+def binary_to_string(b):
+    return string.serialize(binary.deserialize(b))
+
+def string_to_binary(s):
+    return binary.serialize(string.deserialize(s))
diff --git a/don/binary.py b/don/binary.py
new file mode 100644
index 0000000..f4942dd
--- /dev/null
+++ b/don/binary.py
@@ -0,0 +1,323 @@
+"""Binary serializer/deserializer for DON.
+
+Every value is written as a one-byte type tag followed by a tag-specific
+payload. Multi-byte numbers use network (big-endian) byte order.
+"""
+import collections
+import struct
+
+VOID = 0x00
+TRUE = 0x01
+FALSE = 0x02
+BOOL = (TRUE, FALSE)
+INT8 = 0x10
+INT16 = 0x11
+INT32 = 0x12
+INT64 = 0x13
+FLOAT = 0x20
+DOUBLE = 0x21
+BINARY = 0x30
+UTF8 = 0x31
+UTF16 = 0x32
+UTF32 = 0x33
+LIST = 0x40
+DICTIONARY = 0x41
+
+DEFAULT_INTEGER_ENCODING = INT32
+DEFAULT_DECIMAL_ENCODING = DOUBLE
+DEFAULT_STRING_ENCODING = UTF8
+
+TaggedObject = collections.namedtuple('TaggedObject', ['tag', 'value'])
+
+_TYPES_TO_TAGS = {
+    int: DEFAULT_INTEGER_ENCODING,
+    float: DEFAULT_DECIMAL_ENCODING,
+    bytes: BINARY,
+    str: DEFAULT_STRING_ENCODING,
+    list: LIST,
+    dict: DICTIONARY,
+    collections.OrderedDict: DICTIONARY,
+}
+
+def tag(o):
+    """Wrap o in a TaggedObject, choosing the default tag for its type.
+
+    A TaggedObject is returned unchanged, so callers can force a specific
+    encoding (e.g. INT64 or UTF16). Raises KeyError for unsupported types.
+    """
+    if isinstance(o, TaggedObject):
+        return o
+
+    if o is None:
+        return TaggedObject(tag = VOID, value = o)
+
+    if o is True:
+        return TaggedObject(tag = TRUE, value = o)
+
+    if o is False:
+        return TaggedObject(tag = FALSE, value = o)
+
+    return TaggedObject(tag = _TYPES_TO_TAGS[type(o)], value = o)
+
+def serialize_tag_only_type(o):
+    """Return the empty payload of a value fully described by its tag."""
+    return b''
+
+def make_serializer_from_pack_format_string(pfs):
+    """Build a payload serializer that packs one value with struct format pfs."""
+    def serializer(i):
+        return struct.pack(pfs, i)
+    return serializer
+
+def make_string_serializer_from_encoder(e):
+    """Build a serializer writing a 32-bit byte length, then the encoded bytes."""
+    def serializer(s):
+        encoded = e(s)
+        return struct.pack('!I', len(encoded)) + encoded
+    return serializer
+
+def serialize_list(items):
+    """Serialize a list as item tag, byte length, item count, then payloads.
+
+    The item tag is written once for the whole list, so every item must carry
+    the same tag. An empty list uses VOID as its item tag.
+
+    Raises ValueError if the items do not all share one tag.
+    """
+    items = [tag(i) for i in items]
+
+    if len(items) == 0:
+        item_tag = VOID
+    else:
+        item_tag = items[0].tag
+
+    for i in items:
+        if i.tag != item_tag:
+            raise ValueError(
+                'List items must share one tag: 0x{:02x} != 0x{:02x}'.format(i.tag, item_tag)
+            )
+
+    item_serializer = _SERIALIZERS[item_tag]
+    items = [item_serializer(i.value) for i in items]
+    item_length = len(items)
+    items = b''.join(items)
+    byte_length = len(items)
+    return struct.pack('!BII', item_tag, byte_length, item_length) + items
+
+def serialize_dict(d):
+    """Serialize a mapping as byte length, item count, then key/value pairs.
+
+    Keys are written as length-prefixed UTF-8 with no tag byte; each value is
+    written as a complete tagged object.
+
+    Raises TypeError if a key is not a str.
+    """
+    key_serializer = _SERIALIZERS[UTF8]
+    parts = []
+
+    for key, value in d.items():
+        if not isinstance(key, str):
+            raise TypeError('Dictionary keys must be str, got {!r}'.format(key))
+        parts.append(key_serializer(key) + serialize(value))
+
+    serialized = b''.join(parts)
+    return struct.pack('!II', len(serialized), len(parts)) + serialized
+
+_SERIALIZERS = {
+    VOID: serialize_tag_only_type,
+    TRUE: serialize_tag_only_type,
+    FALSE: serialize_tag_only_type,
+    INT8: make_serializer_from_pack_format_string('!b'),
+    INT16: make_serializer_from_pack_format_string('!h'),
+    INT32: make_serializer_from_pack_format_string('!i'),
+    INT64: make_serializer_from_pack_format_string('!q'),
+    FLOAT: make_serializer_from_pack_format_string('!f'),
+    DOUBLE: make_serializer_from_pack_format_string('!d'),
+    BINARY: make_string_serializer_from_encoder(lambda b: b),
+    UTF8: make_string_serializer_from_encoder(lambda s: s.encode('utf-8')),
+    UTF16: make_string_serializer_from_encoder(lambda s: s.encode('utf-16')),
+    UTF32: make_string_serializer_from_encoder(lambda s: s.encode('utf-32')),
+    LIST: serialize_list,
+    DICTIONARY: serialize_dict,
+}
+
+def serialize(o):
+    """Serialize o (a plain value or TaggedObject) as a tag byte plus payload."""
+    o = tag(o)
+    return struct.pack('!B', o.tag) + _SERIALIZERS[o.tag](o.value)
+
+ParseResult = collections.namedtuple(
+    'ParseResult',
+    [
+        'success',
+        'value',
+        'remaining',
+    ],
+)
+
+_FAILED_PARSE_RESULT = ParseResult(success = False, value = None, remaining = None)
+
+_BYTE_SIZES_TO_UNPACK_FORMATS = {
+    1: '!b',
+    2: '!h',
+    4: '!i',
+    8: '!q',
+}
+
+def make_integer_parser(size_in_bytes):
+    """Build a parser for a signed big-endian integer of size_in_bytes bytes."""
+    unpack_format = _BYTE_SIZES_TO_UNPACK_FORMATS[size_in_bytes]
+
+    def integer_parser(source):
+        value = struct.unpack(unpack_format, source[:size_in_bytes])[0]
+        remaining = source[size_in_bytes:]
+
+        return ParseResult(success = True, value = value, remaining = remaining)
+
+    return integer_parser
+
+def binary32_parser(source):
+    """Parse a big-endian binary32 float from the front of source."""
+    return ParseResult(
+        success = True,
+        value = struct.unpack('!f', source[:4])[0],
+        remaining = source[4:],
+    )
+
+def binary64_parser(source):
+    """Parse a big-endian binary64 float from the front of source."""
+    return ParseResult(
+        success = True,
+        value = struct.unpack('!d', source[:8])[0],
+        remaining = source[8:],
+    )
+
+def make_string_parser(decoder):
+    """Build a parser for a 32-bit byte length followed by that many bytes."""
+    def string_parser(source):
+        length = struct.unpack('!I', source[:4])[0]
+        source = source[4:]
+        return ParseResult(
+            success = True,
+            value = decoder(source[:length]),
+            remaining = source[length:],
+        )
+
+    return string_parser
+
+def list_parser(source):
+    """Parse a list payload; the parsed value is a lazy iterator of items.
+
+    Items are decoded on demand, so malformed item data is only reported
+    (as ValueError or struct.error) while the iterator is being consumed.
+    """
+    item_tag = source[0]
+    parser = _TAGS_TO_PARSERS[item_tag]
+
+    source = source[1:]
+    byte_length, items_length = struct.unpack('!II', source[:8])
+    source = source[8:]
+
+    remaining = source[byte_length:]
+    source = source[:byte_length]
+
+    def item_iterator(source):
+        # Walk the declared item count, not the remaining bytes: tag-only
+        # items (VOID, TRUE, FALSE) have empty payloads, and a failed item
+        # parse must stop the loop instead of spinning on the same bytes.
+        for _ in range(items_length):
+            parse_result = parser(source)
+
+            if not parse_result.success:
+                raise ValueError('Failed to parse list item')
+
+            yield parse_result.value
+            source = parse_result.remaining
+
+        if len(source) > 0:
+            raise ValueError('List has {} unparsed trailing bytes'.format(len(source)))
+
+    return ParseResult(
+        success = True,
+        value = item_iterator(source),
+        remaining = remaining,
+    )
+
+def dictionary_parser(source):
+    """Parse a dictionary payload into an OrderedDict in serialized order.
+
+    Raises ValueError if the entry count does not match the declared count.
+    """
+    key_parser = _TAGS_TO_PARSERS[UTF8]
+
+    byte_length, item_length = struct.unpack('!II', source[:8])
+    source = source[8:]
+
+    remaining = source[byte_length:]
+    source = source[:byte_length]
+
+    def kvp_iterator(source):
+        count = 0
+
+        while len(source) > 0:
+            count += 1
+            key_parse_result = key_parser(source)
+            key, source = key_parse_result.value, key_parse_result.remaining
+            value_parse_result = _object_parser(source)
+            value, source = value_parse_result.value, value_parse_result.remaining
+
+            yield key, value
+
+        if count != item_length:
+            raise ValueError(
+                'Dictionary declared {} items but contained {}'.format(item_length, count)
+            )
+
+    return ParseResult(
+        success = True,
+        value = collections.OrderedDict(kvp_iterator(source)),
+        remaining = remaining,
+    )
+
+
+_TAGS_TO_PARSERS = {
+    VOID: lambda r: ParseResult(True, None, r),
+    TRUE: lambda r: ParseResult(True, True, r),
+    FALSE: lambda r: ParseResult(True, False, r),
+    INT8: make_integer_parser(1),
+    INT16: make_integer_parser(2),
+    INT32: make_integer_parser(4),
+    INT64: make_integer_parser(8),
+    FLOAT: binary32_parser,
+    DOUBLE: binary64_parser,
+    BINARY: make_string_parser(lambda b : b),
+    UTF8: make_string_parser(lambda b : b.decode('utf-8')),
+    UTF16: make_string_parser(lambda b : b.decode('utf-16')),
+    UTF32: make_string_parser(lambda b : b.decode('utf-32')),
+    LIST: list_parser,
+    DICTIONARY: dictionary_parser,
+}
+
+def _object_parser(source):
+    """Parse one tagged object: dispatch on the tag byte, parse the rest."""
+    return _TAGS_TO_PARSERS[source[0]](source[1:])
+
+def _parse(parser, source, consume_all = True):
+    """Run parser over source and return the parsed value.
+
+    Raises Exception if parsing fails, or if consume_all is true and bytes
+    remain after the parsed value.
+    """
+    result = parser(source)
+
+    if not result.success:
+        raise Exception('Failed to parse: {}'.format(source))
+
+    if consume_all and result.remaining != b'':
+        raise Exception('Unparsed trailing bytes: {}'.format(result.remaining))
+
+    return result.value
+
+def deserialize(b):
+    """Deserialize exactly one object from the bytes b."""
+    return _parse(_object_parser, b)
diff --git a/don/string.py b/don/string.py
new file mode 100644
index 0000000..a005098
--- /dev/null
+++ b/don/string.py
@@ -0,0 +1,5 @@
+def serialize(o):
+    pass
+
+def deserialize(s):
+    pass
diff --git a/test_don.py b/test_don.py
new file mode 100644
index 0000000..8541e74
--- /dev/null
+++ b/test_don.py
@@ -0,0 +1,136 @@
+import collections
+import unittest
+
+import don
+import don.binary as binary
+import don.string as string
+
+class TestBinarySerialize(unittest.TestCase):
+    def test_serializes_null(self):
+        self.assertEqual(binary.serialize(None), b'\x00')
+
+    def test_serializes_true(self):
+        self.assertEqual(binary.serialize(True), b'\x01')
+
+    def test_serializes_false(self):
+        self.assertEqual(binary.serialize(False), b'\x02')
+
+    def test_serializes_integers_in_32_bit_twos_complement_with_network_byte_order(self):
+        self.assertEqual(binary.serialize(-2147483648), b'\x12\x80\x00\x00\x00')
+        self.assertEqual(binary.serialize(-1), b'\x12\xff\xff\xff\xff')
+        self.assertEqual(binary.serialize(0), b'\x12\x00\x00\x00\x00')
+        self.assertEqual(binary.serialize(1), b'\x12\x00\x00\x00\x01')
+        self.assertEqual(binary.serialize(2147483647), b'\x12\x7f\xff\xff\xff')
+
+    def test_serializes_explicitly_tagged_64_bit_integers(self):
+        self.assertEqual(binary.serialize(binary.TaggedObject(binary.INT64, 1)), b'\x13\x00\x00\x00\x00\x00\x00\x00\x01')
+
+    def test_serializes_floats_into_binary64_with_network_byte_order(self):
+        self.assertEqual(binary.serialize(1.0), b'\x21\x3f\xf0\x00\x00\x00\x00\x00\x00')
+        self.assertEqual(binary.serialize(2.0), b'\x21\x40\x00\x00\x00\x00\x00\x00\x00')
+        self.assertEqual(binary.serialize(-2.0), b'\x21\xc0\x00\x00\x00\x00\x00\x00\x00')
+        self.assertEqual(binary.serialize(0.5), b'\x21\x3f\xe0\x00\x00\x00\x00\x00\x00')
+        self.assertEqual(binary.serialize(2.0 ** -1074), b'\x21\x00\x00\x00\x00\x00\x00\x00\x01')
+        self.assertEqual(binary.serialize(2.0 ** -1022), b'\x21\x00\x10\x00\x00\x00\x00\x00\x00')
+        self.assertEqual(binary.serialize(0.0), b'\x21\x00\x00\x00\x00\x00\x00\x00\x00')
+
+    def test_serializes_binary(self):
+        self.assertEqual(binary.serialize(b'\xde\xad\xbe\xef'), b'\x30\x00\x00\x00\x04\xde\xad\xbe\xef')
+
+    def test_serializes_utf8(self):
+        self.assertEqual(binary.serialize('Hello, world'), b'\x31\x00\x00\x00\x0cHello, world')
+        self.assertEqual(binary.serialize('世界'), b'\x31\x00\x00\x00\x06\xe4\xb8\x96\xe7\x95\x8c')
+
+    def test_serializes_list(self):
+        self.assertEqual(binary.serialize([]), b'\x40\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+        self.assertEqual(binary.serialize([1,2,3]), b'\x40\x12\x00\x00\x00\x0c\x00\x00\x00\x03\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x03')
+        self.assertEqual(binary.serialize(['Hello, world', 'Goodnight, moon']), b'\x40\x31\x00\x00\x00#\x00\x00\x00\x02\x00\x00\x00\x0cHello, world\x00\x00\x00\x0fGoodnight, moon')
+        self.assertEqual(binary.serialize([1.618, 2.718, 3.142]), b'\x40\x21\x00\x00\x00\x18\x00\x00\x00\x03?\xf9\xe3S\xf7\xce\xd9\x17@\x05\xbev\xc8\xb49X@\t"\xd0\xe5`A\x89')
+
+    def test_rejects_lists_with_mixed_tags(self):
+        with self.assertRaises(ValueError):
+            binary.serialize([1, 'one'])
+
+    def test_serializes_dictionary(self):
+        self.assertEqual(binary.serialize({}), b'\x41\x00\x00\x00\x00\x00\x00\x00\x00')
+        self.assertEqual(binary.serialize(collections.OrderedDict([
+            ('foo',42),
+            ('bar',3.14),
+            ('baz','qux'),
+        ])), b'A\x00\x00\x00+\x00\x00\x00\x03\x00\x00\x00\x03foo\x12\x00\x00\x00*\x00\x00\x00\x03bar!@\t\x1e\xb8Q\xeb\x85\x1f\x00\x00\x00\x03baz1\x00\x00\x00\x03qux')
+
+class TestBinaryDeserialize(unittest.TestCase):
+    def test_deserializes_null(self):
+        self.assertEqual(binary.deserialize(b'\x00'), None)
+
+    def test_deserializes_true(self):
+        self.assertEqual(binary.deserialize(b'\x01'), True)
+
+    def test_deserializes_false(self):
+        self.assertEqual(binary.deserialize(b'\x02'), False)
+
+    def test_deserializes_8_bit_twos_complement_with_network_byte_order(self):
+        self.assertEqual(binary.deserialize(b'\x10\x80'), -128)
+        self.assertEqual(binary.deserialize(b'\x10\xff'), -1)
+        self.assertEqual(binary.deserialize(b'\x10\x00'), 0)
+        self.assertEqual(binary.deserialize(b'\x10\x01'), 1)
+        self.assertEqual(binary.deserialize(b'\x10\x7f'), 127)
+
+    def test_deserializes_16_bit_twos_complement_with_network_byte_order(self):
+        self.assertEqual(binary.deserialize(b'\x11\x80\x00'), -32768)
+        self.assertEqual(binary.deserialize(b'\x11\xff\xff'), -1)
+        self.assertEqual(binary.deserialize(b'\x11\x00\x00'), 0)
+        self.assertEqual(binary.deserialize(b'\x11\x00\x01'), 1)
+        self.assertEqual(binary.deserialize(b'\x11\x7f\xff'), 32767)
+
+    def test_deserializes_32_bit_twos_complement_with_network_byte_order(self):
+        self.assertEqual(binary.deserialize(b'\x12\x80\x00\x00\x00'), -2147483648)
+        self.assertEqual(binary.deserialize(b'\x12\xff\xff\xff\xff'), -1)
+        self.assertEqual(binary.deserialize(b'\x12\x00\x00\x00\x00'), 0)
+        self.assertEqual(binary.deserialize(b'\x12\x00\x00\x00\x01'), 1)
+        self.assertEqual(binary.deserialize(b'\x12\x7f\xff\xff\xff'), 2147483647)
+
+    def test_deserializes_64_bit_twos_complement_with_network_byte_order(self):
+        self.assertEqual(binary.deserialize(b'\x13\x80\x00\x00\x00\x00\x00\x00\x00'), -9223372036854775808)
+        self.assertEqual(binary.deserialize(b'\x13\xff\xff\xff\xff\xff\xff\xff\xff'), -1)
+        self.assertEqual(binary.deserialize(b'\x13\x00\x00\x00\x00\x00\x00\x00\x00'), 0)
+        self.assertEqual(binary.deserialize(b'\x13\x00\x00\x00\x00\x00\x00\x00\x01'), 1)
+        self.assertEqual(binary.deserialize(b'\x13\x7f\xff\xff\xff\xff\xff\xff\xff'), 9223372036854775807)
+
+    def test_deserializes_binary64_as_float(self):
+        self.assertEqual(binary.deserialize(b'\x21\x3f\xf0\x00\x00\x00\x00\x00\x00'), 1.0)
+        self.assertEqual(binary.deserialize(b'\x21\x40\x00\x00\x00\x00\x00\x00\x00'), 2.0)
+        self.assertEqual(binary.deserialize(b'\x21\xc0\x00\x00\x00\x00\x00\x00\x00'), -2.0)
+        self.assertEqual(binary.deserialize(b'\x21\x3f\xe0\x00\x00\x00\x00\x00\x00'), 0.5)
+        self.assertEqual(binary.deserialize(b'\x21\x00\x00\x00\x00\x00\x00\x00\x01'), 2.0 ** -1074)
+        self.assertEqual(binary.deserialize(b'\x21\x00\x10\x00\x00\x00\x00\x00\x00'), 2.0 ** -1022)
+        self.assertEqual(binary.deserialize(b'\x21\x00\x00\x00\x00\x00\x00\x00\x00'), 0.0)
+
+    def test_deserializes_binary32_as_float(self):
+        self.assertEqual(binary.deserialize(b'\x20\x3f\xc0\x00\x00'), 1.5)
+
+    def test_deserializes_binary(self):
+        self.assertEqual(binary.deserialize(b'\x30\x00\x00\x00\x04\xde\xad\xbe\xef'), b'\xde\xad\xbe\xef')
+
+    def test_deserializes_utf8(self):
+        self.assertEqual(binary.deserialize(b'\x31\x00\x00\x00\x0cHello, world'), 'Hello, world')
+        self.assertEqual(binary.deserialize(b'\x31\x00\x00\x00\x06\xe4\xb8\x96\xe7\x95\x8c'), '世界')
+
+    def test_deserializes_lists(self):
+        self.assertEqual(list(binary.deserialize(b'\x40\x12\x00\x00\x00\x00\x00\x00\x00\x00')), [])
+        self.assertEqual(list(binary.deserialize(b'\x40\x12\x00\x00\x00\x0c\x00\x00\x00\x03\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00\x03')), [1,2,3])
+        self.assertEqual(list(binary.deserialize(b'\x40\x00\x00\x00\x00\x00\x00\x00\x00\x02')), [None, None])
+
+    def test_deserializes_dictionaries(self):
+        self.assertEqual(binary.deserialize(b'\x41\x00\x00\x00\x00\x00\x00\x00\x00'), collections.OrderedDict([]))
+        self.assertEqual(binary.deserialize(b'\x41\x00\x00\x00\x1b\x00\x00\x00\x02\x00\x00\x00\x03foo\x12\x00\x00\x00\x2a\x00\x00\x00\x03bar\x31\x00\x00\x00\x03baz'), collections.OrderedDict([('foo',42), ('bar','baz')]))
+
+
+class TestStringSerialize(unittest.TestCase):
+    pass
+
+class TestStringDeserialize(unittest.TestCase):
+    pass
+
+if __name__ == '__main__':
+    unittest.main()
-- 
2.20.1