From: David Kerkeslager Date: Mon, 4 Apr 2016 14:46:25 +0000 (-0400) Subject: Add a string serializer X-Git-Url: https://code.kerkeslager.com/?a=commitdiff_plain;h=30c21d4218bece6ae4d3671d6e02f1421c816976;p=ton Add a string serializer --- diff --git a/TODO b/TODO new file mode 100644 index 0000000..ccb756b --- /dev/null +++ b/TODO @@ -0,0 +1,30 @@ +The following needs doing before a 1.0 release: +1. String serialization settings (defaults): + a. Width (80) + b. Maintain tags for default types (true) + c. Handling of non-displayable and non-ascii characters +2. String deserialization settings (defaults): + a. Default string type (utf-8) + b. Default integer type (int32) + c. Default decimal type (double) + d. Maintain tags universally, not at all, or only for non-default types (universally) +3. Binary serialization settings (defaults): + a. Default string type (utf-8) + b. Default integer type (int32) + c. Default decimal type (double) +4. Binary deserialization settings (defaults): + a. Maintain tags universally, not at all, or only for non-default types (universally) + b. Return lists as iterables or as lists (lists) + c. Return dictionaries as OrderedDicts, dicts, lists of key-value pairs, or iterables of key-value pairs (OrderedDict) +5. Add unsigned integer types. +6. Think about type signatures for string serialization (do we want 1i32 or 1int32?). +7. Include the string encoding byte for dictionary keys. + a. It adds only 1 byte per key. + b. It prevents excessively long keys in utf16 friendly languages. +8. Consider limiting key string length to 2^16 or even 2^8: + a. It saves 2 or 3 bytes per key, and keys are typically short. + b. Perhaps we should create a struct type with this feature. +9. Add a rationale doc that includes rationale for the technical decisions made. +10. Binary deserialization needs tooling for lazy deserialization and deserializing from buffers. +11. String escaping in string serialization and deserialization. +12. 
diff --git a/don/__init__.py b/don/__init__.py index eb0ca78..6f8bf47 100644 --- a/don/__init__.py +++ b/don/__init__.py @@ -1,3 +1,4 @@ +import binascii import collections import struct @@ -34,7 +35,7 @@ _TYPES_TO_TAGS = { collections.OrderedDict: DICTIONARY, } -def tag(o): +def _tag(o): if isinstance(o, TaggedObject): return o @@ -49,41 +50,41 @@ def tag(o): return TaggedObject(tag = _TYPES_TO_TAGS[type(o)], value = o) -def serialize_tag_only_type(o): +def _binary_serialize_tag_only_type(o): return b'' -def make_serializer_from_pack_format_string(pfs): +def _pack_format_string_to_binary_serializer(pfs): def serializer(i): return struct.pack(pfs, i) return serializer -def make_string_serializer_from_encoder(e): +def _encoder_to_binary_serializer(e): def serializer(s): encoded = e(s) return struct.pack('!I', len(encoded)) + encoded return serializer -def serialize_list(items): +def _binary_serialize_list(items): # TODO Enforce that items are all the same type - items = [tag(i) for i in items] + items = [_tag(i) for i in items] if len(items) == 0: item_tag = VOID else: item_tag = items[0].tag - item_serializer = _SERIALIZERS[item_tag] + item_serializer = _BINARY_SERIALIZERS[item_tag] items = [item_serializer(i.value) for i in items] item_length = len(items) items = b''.join(items) byte_length = len(items) return struct.pack('!BII', item_tag, byte_length, item_length) + items -def serialize_dict(d): +def _binary_serialize_dict(d): item_length = 0 serialized = b'' - key_serializer = _SERIALIZERS[UTF8] + key_serializer = _BINARY_SERIALIZERS[UTF8] for key, value in d.items(): assert isinstance(key, str) @@ -93,26 +94,26 @@ def serialize_dict(d): byte_length = len(serialized) return struct.pack('!II', byte_length, item_length) + serialized -_SERIALIZERS = { - VOID: serialize_tag_only_type, - TRUE: serialize_tag_only_type, - FALSE: serialize_tag_only_type, - INT8: make_serializer_from_pack_format_string('!b'), - INT16: make_serializer_from_pack_format_string('!h'), - 
INT32: make_serializer_from_pack_format_string('!i'), - FLOAT: make_serializer_from_pack_format_string('!f'), - DOUBLE: make_serializer_from_pack_format_string('!d'), - BINARY: make_string_serializer_from_encoder(lambda b: b), - UTF8: make_string_serializer_from_encoder(lambda s: s.encode('utf-8')), - UTF16: make_string_serializer_from_encoder(lambda s: s.encode('utf-16')), - UTF32: make_string_serializer_from_encoder(lambda s: s.encode('utf-32')), - LIST: serialize_list, - DICTIONARY: serialize_dict, +_BINARY_SERIALIZERS = { + VOID: _binary_serialize_tag_only_type, + TRUE: _binary_serialize_tag_only_type, + FALSE: _binary_serialize_tag_only_type, + INT8: _pack_format_string_to_binary_serializer('!b'), + INT16: _pack_format_string_to_binary_serializer('!h'), + INT32: _pack_format_string_to_binary_serializer('!i'), + FLOAT: _pack_format_string_to_binary_serializer('!f'), + DOUBLE: _pack_format_string_to_binary_serializer('!d'), + BINARY: _encoder_to_binary_serializer(lambda b: b), + UTF8: _encoder_to_binary_serializer(lambda s: s.encode('utf-8')), + UTF16: _encoder_to_binary_serializer(lambda s: s.encode('utf-16')), + UTF32: _encoder_to_binary_serializer(lambda s: s.encode('utf-32')), + LIST: _binary_serialize_list, + DICTIONARY: _binary_serialize_dict, } def _binary_serialize(o): - o = tag(o) - return struct.pack('!B', o.tag) + _SERIALIZERS[o.tag](o.value) + o = _tag(o) + return struct.pack('!B', o.tag) + _BINARY_SERIALIZERS[o.tag](o.value) ParseResult = collections.namedtuple( 'ParseResult', @@ -162,7 +163,7 @@ def make_string_parser(decoder): return string_parser -def list_parser(source): +def _list_parser(source): tag = source[0] parser = _TAGS_TO_PARSERS[tag] @@ -235,7 +236,7 @@ _TAGS_TO_PARSERS = { UTF8: make_string_parser(lambda b : b.decode('utf-8')), UTF16: make_string_parser(lambda b : b.decode('utf-16')), UTF32: make_string_parser(lambda b : b.decode('utf-32')), - LIST: list_parser, + LIST: _list_parser, DICTIONARY: dictionary_parser, } @@ -253,6 +254,65 
@@ def _parse(parser, source, consume_all = True): def _binary_deserialize(b): return _parse(_object_parser, b) +def _integer_size_to_string_serializer(integer_size): + minimum = -(2 ** (integer_size - 1)) + maximum = 2 ** (integer_size - 1) - 1 + + def serializer(integer): + assert minimum <= integer and integer <= maximum + return '{}i{}'.format(integer, integer_size) + + return serializer + +def _serialize_float(f): + return '{}f'.format(f) + +def _serialize_double(d): + return '{}d'.format(d) + +def _serialize_binary(b): + return '"{}"b'.format(binascii.hexlify(b).decode('ascii')) + +def _utf_encoding_to_serializer(utf_encoding): + def serializer(s): + return '"{}"{}'.format(s, utf_encoding) + + return serializer + +def _string_serialize_list(l): + return '[{}]'.format(', '.join(map(_string_serialize, l))) + +def _string_serialize_dictionary(d): + def serialize_kvp(kvp): + return _string_serialize(kvp[0]) + ': ' + _string_serialize(kvp[1]) + return '{ ' + ', '.join(map(serialize_kvp, d.items())) + ' }' + +_STRING_SERIALIZERS = { + VOID: lambda o: 'null', + TRUE: lambda o: 'true', + FALSE: lambda o: 'false', + INT8: _integer_size_to_string_serializer(8), + INT16: _integer_size_to_string_serializer(16), + INT32: _integer_size_to_string_serializer(32), + INT64: _integer_size_to_string_serializer(64), + FLOAT: _serialize_float, + DOUBLE: _serialize_double, + BINARY: _serialize_binary, + UTF8: _utf_encoding_to_serializer('utf8'), + UTF16: _utf_encoding_to_serializer('utf16'), + UTF32: _utf_encoding_to_serializer('utf32'), + LIST: _string_serialize_list, + DICTIONARY: _string_serialize_dictionary, +} + +def _string_serialize(o): + o = _tag(o) + + return _STRING_SERIALIZERS[o.tag](o.value) + +def _string_deserialize(o): + pass + Serializer = collections.namedtuple('Serializer', ['serialize', 'deserialize']) binary = Serializer( @@ -260,6 +320,11 @@ binary = Serializer( deserialize = _binary_deserialize, ) +string = Serializer( + serialize = _string_serialize, + 
deserialize = _string_deserialize, +) + def binary_to_string(b): return string.serialize(binary.deserialize(b)) diff --git a/test_don.py b/test_don.py index 1eb8f5d..6d5ed81 100644 --- a/test_don.py +++ b/test_don.py @@ -1,9 +1,7 @@ import collections import unittest -import don -from don import binary -from don import string +from don import * class TestBinarySerialize(unittest.TestCase): def test_serializes_null(self): @@ -116,7 +114,64 @@ class TestBinaryDeserialize(unittest.TestCase): class TestStringSerialize(unittest.TestCase): - pass + def test_serializes_null(self): + self.assertEqual(string.serialize(None), 'null') + + def test_serializes_true(self): + self.assertEqual(string.serialize(True), 'true') + + def test_serializes_false(self): + self.assertEqual(string.serialize(False), 'false') + + def test_serializes_int8(self): + self.assertEqual(string.serialize(TaggedObject(INT8, 1)), '1i8') + self.assertEqual(string.serialize(TaggedObject(INT8, -1)), '-1i8') + self.assertEqual(string.serialize(TaggedObject(INT8, 42)), '42i8') + + def test_serializes_int16(self): + self.assertEqual(string.serialize(TaggedObject(INT16, 1)), '1i16') + self.assertEqual(string.serialize(TaggedObject(INT16, -1)), '-1i16') + self.assertEqual(string.serialize(TaggedObject(INT16, 42)), '42i16') + + def test_serializes_int32(self): + self.assertEqual(string.serialize(TaggedObject(INT32, 1)), '1i32') + self.assertEqual(string.serialize(TaggedObject(INT32, -1)), '-1i32') + self.assertEqual(string.serialize(TaggedObject(INT32, 42)), '42i32') + + def test_serializes_int64(self): + self.assertEqual(string.serialize(TaggedObject(INT64, 1)), '1i64') + self.assertEqual(string.serialize(TaggedObject(INT64, -1)), '-1i64') + self.assertEqual(string.serialize(TaggedObject(INT64, 42)), '42i64') + + def test_serializes_float(self): + self.assertEqual(string.serialize(TaggedObject(FLOAT, 1.0)), '1.0f') + + def test_serializes_double(self): + self.assertEqual(string.serialize(TaggedObject(DOUBLE, 
1.0)), '1.0d') + + def test_serializes_binary(self): + self.assertEqual(string.serialize(TaggedObject(BINARY, b'\xde\xad\xbe\xef')), '"deadbeef"b') + + def test_serializes_utf8(self): + self.assertEqual(string.serialize(TaggedObject(UTF8, 'Hello, world')), '"Hello, world"utf8') + + def test_serializes_utf16(self): + self.assertEqual(string.serialize(TaggedObject(UTF16, 'Hello, world')), '"Hello, world"utf16') + + def test_serializes_utf32(self): + self.assertEqual(string.serialize(TaggedObject(UTF32, 'Hello, world')), '"Hello, world"utf32') + + def test_serializes_list(self): + self.assertEqual(string.serialize(TaggedObject(LIST, [1,2,3])), '[1i32, 2i32, 3i32]') + + def test_serializes_dictionary(self): + self.assertEqual( + string.serialize(TaggedObject(DICTIONARY, collections.OrderedDict([ + ('foo', 1), + ('bar', 'baz'), + ]))), + '{ "foo"utf8: 1i32, "bar"utf8: "baz"utf8 }' + ) class TestStringDeserialize(unittest.TestCase): pass