From 7cbd1a28137f09d5633d23d9159aae0b4450ec45 Mon Sep 17 00:00:00 2001 From: Tony Garnock-Jones Date: Thu, 16 Mar 2023 20:55:49 +0100 Subject: [PATCH] Docs for preserves.binary; better encoding annotations with canonicalization --- implementations/python/mkdocs.yml | 6 +- implementations/python/preserves/binary.py | 186 +++++++++++++++--- implementations/python/preserves/text.py | 4 +- implementations/python/preserves/values.py | 16 +- .../python/tests/test_preserves.py | 8 +- 5 files changed, 185 insertions(+), 35 deletions(-) diff --git a/implementations/python/mkdocs.yml b/implementations/python/mkdocs.yml index d0fa910..8a7701b 100644 --- a/implementations/python/mkdocs.yml +++ b/implementations/python/mkdocs.yml @@ -3,7 +3,11 @@ theme: name: material plugins: - search - - mkdocstrings + - mkdocstrings: + handlers: + python: + options: + merge_init_into_class: true - macros: include_dir: ../../_includes markdown_extensions: diff --git a/implementations/python/preserves/binary.py b/implementations/python/preserves/binary.py index 7f2904c..ee0b267 100644 --- a/implementations/python/preserves/binary.py +++ b/implementations/python/preserves/binary.py @@ -23,14 +23,108 @@ from .error import * from .compat import basestring_, ord_ class BinaryCodec(object): - """TODO""" pass class Decoder(BinaryCodec): - """TODO""" + """Implementation of a decoder for the machine-oriented binary Preserves syntax. + + Args: + packet (bytes): + initial contents of the input buffer; may subsequently be extended by calling + [extend][preserves.binary.Decoder.extend]. + + include_annotations (bool): + if `True`, wrap each value and subvalue in an + [Annotated][preserves.values.Annotated] object. + + decode_embedded: + function accepting a `Value` and returning a possibly-decoded form of that value + suitable for placing into an [Embedded][preserves.values.Embedded] object. 
+ + Normal usage is to supply a buffer, and keep calling [next][preserves.binary.Decoder.next] + until a [ShortPacket][preserves.error.ShortPacket] exception is raised: + + ```python + >>> d = Decoder(b'\\xa0{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84') + >>> d.next() + 123 + >>> d.next() + 'hello' + >>> d.next() + () + >>> d.next() + Traceback (most recent call last): + ... + preserves.error.ShortPacket: Short packet + + ``` + + Alternatively, keep calling [try_next][preserves.binary.Decoder.try_next] until it yields + `None`, which is not in the domain of Preserves `Value`s: + + ```python + >>> d = Decoder(b'\\xa0{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84') + >>> d.try_next() + 123 + >>> d.try_next() + 'hello' + >>> d.try_next() + () + >>> d.try_next() + + ``` + + For convenience, [Decoder][preserves.binary.Decoder] implements the iterator interface, + backing it with [try_next][preserves.binary.Decoder.try_next], so you can simply iterate + over all complete values in an input: + + ```python + >>> d = Decoder(b'\\xa0{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84') + >>> list(d) + [123, 'hello', ()] + + ``` + + ```python + >>> for v in Decoder(b'\\xa0{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84'): + ... 
print(repr(v)) + 123 + 'hello' + () + + ``` + + Supply `include_annotations=True` to read annotations alongside the annotated values: + + ```python + >>> d = Decoder(b'\\xa0{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84', include_annotations=True) + >>> list(d) + [123, 'hello', @#x ()] + + ``` + + If you are incrementally reading from, say, a socket, you can use + [extend][preserves.binary.Decoder.extend] to add new input as it becomes available: + + ```python + >>> d = Decoder(b'\\xa0{\\xb1\\x05he') + >>> d.try_next() + 123 + >>> d.try_next() # returns None because the input is incomplete + >>> d.extend(b'llo') + >>> d.try_next() + 'hello' + >>> d.try_next() + + ``` + + Attributes: + packet (bytes): buffered input waiting to be processed + index (int): read position within `packet` + + """ def __init__(self, packet=b'', include_annotations=False, decode_embedded=lambda x: x): - """TODO""" super(Decoder, self).__init__() self.packet = packet self.index = 0 @@ -38,7 +132,8 @@ class Decoder(BinaryCodec): self.decode_embedded = decode_embedded def extend(self, data): - """TODO""" + """Appends `data` to the remaining bytes in `self.packet`, trimming already-processed + bytes from the front of `self.packet` and resetting `self.index` to zero.""" self.packet = self.packet[self.index:] + data self.index = 0 @@ -92,7 +187,11 @@ class Decoder(BinaryCodec): return v def next(self): - """TODO""" + """Reads the next complete `Value` from the internal buffer, raising + [ShortPacket][preserves.error.ShortPacket] if too few bytes are available, or + [DecodeError][preserves.error.DecodeError] if the input is invalid somehow. 
+ + """ tag = self.nextbyte() if tag == 0x80: return self.wrap(False) if tag == 0x81: return self.wrap(True) @@ -123,7 +222,8 @@ class Decoder(BinaryCodec): raise DecodeError('Invalid tag: ' + hex(tag)) def try_next(self): - """TODO""" + """Like [next][preserves.binary.Decoder.next], but returns `None` instead of raising + [ShortPacket][preserves.error.ShortPacket].""" start = self.index try: return self.next() @@ -132,7 +232,6 @@ class Decoder(BinaryCodec): return None def __iter__(self): - """TODO""" return self def __next__(self): @@ -142,26 +241,71 @@ class Decoder(BinaryCodec): return v def decode(bs, **kwargs): - """TODO""" + """Yields the first complete encoded value from `bs`, passing `kwargs` through to the + [Decoder][preserves.binary.Decoder] constructor. Raises exceptions as per + [next][preserves.binary.Decoder.next]. + + Args: + bs (bytes): encoded data to decode + + """ return Decoder(packet=bs, **kwargs).next() def decode_with_annotations(bs, **kwargs): - """TODO""" + """Like [decode][preserves.binary.decode], but supplying `include_annotations=True` to the + [Decoder][preserves.binary.Decoder] constructor.""" return Decoder(packet=bs, include_annotations=True, **kwargs).next() class Encoder(BinaryCodec): """Implementation of an encoder for the machine-oriented binary Preserves syntax. + ```python + >>> e = Encoder() + >>> e.append(123) + >>> e.append('hello') + >>> e.append(annotate([], Symbol('x'))) + >>> e.contents() + b'\\xa0{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84' + + ``` + + Args: + encode_embedded: + function accepting an [Embedded][preserves.values.Embedded].embeddedValue and + returning a `Value` for serialization. + + canonicalize (bool): + if `True`, ensures the serialized data are in [canonical + form](https://preserves.dev/canonical-binary.html). This is slightly more work than + producing potentially-non-canonical output. 
+ + include_annotations (bool | None): + if `None`, includes annotations in the output only when `canonicalize` is `False`, + because [canonical serialization of values demands omission of + annotations](https://preserves.dev/canonical-binary.html). If explicitly `True` or + `False`, however, annotations will be included resp. excluded no matter the + `canonicalize` setting. This can be used to get canonical ordering + (`canonicalize=True`) *and* annotations (`include_annotations=True`). + + Attributes: + buffer (bytearray): accumulator for the output of the encoder + """ - def __init__(self, encode_embedded=lambda x: x, canonicalize=False): - """TODO""" + def __init__(self, + encode_embedded=lambda x: x, + canonicalize=False, + include_annotations=None): super(Encoder, self).__init__() self.buffer = bytearray() self._encode_embedded = encode_embedded self._canonicalize = canonicalize + if include_annotations is None: + self.include_annotations = not self._canonicalize + else: + self.include_annotations = include_annotations def reset(self): - """TODO""" + """Clears `self.buffer` to a fresh empty `bytearray`.""" self.buffer = bytearray() def encode_embedded(self, v): @@ -170,7 +314,7 @@ class Encoder(BinaryCodec): return self._encode_embedded(v) def contents(self): - """TODO""" + """Returns a `bytes` constructed from the contents of `self.buffer`.""" return bytes(self.buffer) def varint(self, v): @@ -208,7 +352,7 @@ class Encoder(BinaryCodec): if not self._canonicalize: self.encodevalues(6, v) else: - c = Canonicalizer(self._encode_embedded) + c = Canonicalizer(self._encode_embedded, self.include_annotations) for i in v: c.entry([i]) c.emit_entries(self, 6) @@ -216,12 +360,12 @@ class Encoder(BinaryCodec): if not self._canonicalize: self.encodevalues(7, list(dict_kvs(v))) else: - c = Canonicalizer(self._encode_embedded) + c = Canonicalizer(self._encode_embedded, self.include_annotations) for (kk, vv) in v.items(): c.entry([kk, vv]) c.emit_entries(self, 7) def 
append(self, v): - """TODO""" + """Extend `self.buffer` with an encoding of `v`.""" v = preserve(v) if hasattr(v, '__preserve_write_binary__'): v.__preserve_write_binary__(self) @@ -265,8 +409,8 @@ class Encoder(BinaryCodec): raise TypeError('Cannot preserves-encode: ' + repr(v)) class Canonicalizer: - def __init__(self, encode_embedded): - self.encoder = Encoder(encode_embedded, canonicalize=True) + def __init__(self, encode_embedded, include_annotations): + self.encoder = Encoder(encode_embedded, canonicalize=True, include_annotations=include_annotations) self.entries = [] def entry(self, pieces): @@ -281,10 +425,8 @@ class Canonicalizer: outer_encoder.buffer.append(0x84) def encode(v, **kwargs): - """Encode a single `Value` v to a byte string. Any kwargs are passed on to the underlying - [Encoder][preserves.binary.Encoder] constructor. - - """ + """Encode a single `Value` `v` to a byte string. Any supplied `kwargs` are passed on to the + underlying [Encoder][preserves.binary.Encoder] constructor.""" e = Encoder(**kwargs) e.append(v) return e.contents() diff --git a/implementations/python/preserves/text.py b/implementations/python/preserves/text.py index 9924636..f30e281 100644 --- a/implementations/python/preserves/text.py +++ b/implementations/python/preserves/text.py @@ -305,7 +305,8 @@ class Formatter(TextCodec): format_embedded=lambda x: x, indent=None, with_commas=False, - trailing_comma=False): + trailing_comma=False, + include_annotations=True): """TODO""" super(Formatter, self).__init__() self.indent_delta = 0 if indent is None else indent @@ -314,6 +315,7 @@ class Formatter(TextCodec): self.trailing_comma = trailing_comma self.chunks = [] self._format_embedded = format_embedded + self.include_annotations = include_annotations def format_embedded(self, v): if self._format_embedded is None: diff --git a/implementations/python/preserves/values.py b/implementations/python/preserves/values.py index d3e7225..e731040 100644 --- 
a/implementations/python/preserves/values.py +++ b/implementations/python/preserves/values.py @@ -290,16 +290,18 @@ class Annotated(object): self.item = item def __preserve_write_binary__(self, encoder): - for a in self.annotations: - encoder.buffer.append(0x85) - encoder.append(a) + if encoder.include_annotations: + for a in self.annotations: + encoder.buffer.append(0x85) + encoder.append(a) encoder.append(self.item) def __preserve_write_text__(self, formatter): - for a in self.annotations: - formatter.chunks.append('@') - formatter.append(a) - formatter.chunks.append(' ') + if formatter.include_annotations: + for a in self.annotations: + formatter.chunks.append('@') + formatter.append(a) + formatter.chunks.append(' ') formatter.append(self.item) def strip(self, depth=inf): diff --git a/implementations/python/tests/test_preserves.py b/implementations/python/tests/test_preserves.py index ecd2b6c..856caba 100644 --- a/implementations/python/tests/test_preserves.py +++ b/implementations/python/tests/test_preserves.py @@ -261,7 +261,7 @@ def install_test(d, variant, tName, binaryForm, annotatedTextForm): def test_back(self): self.assertPreservesEqual(self.DS(binaryForm), back) def test_back_ann(self): self.assertPreservesEqual(self.D(self.E(annotatedTextForm)), annotatedTextForm) def test_encode(self): self.assertPreservesEqual(self.E(forward), binaryForm) - def test_encode_canonical(self): self.assertPreservesEqual(self.EC(annotatedTextForm), binaryForm) + def test_encode_nondet(self): self.assertPreservesEqual(self.ENONDET(annotatedTextForm), binaryForm) def test_encode_ann(self): self.assertPreservesEqual(self.E(annotatedTextForm), binaryForm) add_method(d, tName, test_match_expected) add_method(d, tName, test_roundtrip) @@ -271,7 +271,7 @@ def install_test(d, variant, tName, binaryForm, annotatedTextForm): if variant in ['normal']: add_method(d, tName, test_encode) if variant in ['nondeterministic']: - add_method(d, tName, test_encode_canonical) + add_method(d, 
tName, test_encode_nondet) if variant in ['normal', 'nondeterministic']: add_method(d, tName, test_encode_ann) @@ -323,8 +323,8 @@ class CommonTestSuite(PreservesTestCase): def E(self, v): return encode(v, encode_embedded=lambda x: x) - def EC(self, v): - return encode(v, encode_embedded=lambda x: x, canonicalize=True) + def ENONDET(self, v): + return encode(v, encode_embedded=lambda x: x, canonicalize=True, include_annotations=True) class RecordTests(PreservesTestCase): def test_getters(self):