From 7cbd1a28137f09d5633d23d9159aae0b4450ec45 Mon Sep 17 00:00:00 2001
From: Tony Garnock-Jones <tonyg@leastfixedpoint.com>
Date: Thu, 16 Mar 2023 20:55:49 +0100
Subject: [PATCH] Docs for preserves.binary; better encoding annotations with
 canonicalization

---
 implementations/python/mkdocs.yml             |   6 +-
 implementations/python/preserves/binary.py    | 186 +++++++++++++++---
 implementations/python/preserves/text.py      |   4 +-
 implementations/python/preserves/values.py    |  16 +-
 .../python/tests/test_preserves.py            |   8 +-
 5 files changed, 185 insertions(+), 35 deletions(-)

diff --git a/implementations/python/mkdocs.yml b/implementations/python/mkdocs.yml
index d0fa910..8a7701b 100644
--- a/implementations/python/mkdocs.yml
+++ b/implementations/python/mkdocs.yml
@@ -3,7 +3,11 @@ theme:
   name: material
 plugins:
   - search
-  - mkdocstrings
+  - mkdocstrings:
+      handlers:
+        python:
+          options:
+            merge_init_into_class: true
   - macros:
       include_dir: ../../_includes
 markdown_extensions:
diff --git a/implementations/python/preserves/binary.py b/implementations/python/preserves/binary.py
index 7f2904c..ee0b267 100644
--- a/implementations/python/preserves/binary.py
+++ b/implementations/python/preserves/binary.py
@@ -23,14 +23,108 @@ from .error import *
 from .compat import basestring_, ord_
 
 class BinaryCodec(object):
-    """TODO"""
     pass
 
 class Decoder(BinaryCodec):
-    """TODO"""
+    """Implementation of a decoder for the machine-oriented binary Preserves syntax.
+
+    Args:
+        packet (bytes):
+            initial contents of the input buffer; may subsequently be extended by calling
+            [extend][preserves.binary.Decoder.extend].
+
+        include_annotations (bool):
+            if `True`, wrap each value and subvalue in an
+            [Annotated][preserves.values.Annotated] object.
+
+        decode_embedded:
+            function accepting a `Value` and returning a possibly-decoded form of that value
+            suitable for placing into an [Embedded][preserves.values.Embedded] object.
+
+    Normal usage is to supply a buffer, and keep calling [next][preserves.binary.Decoder.next]
+    until a [ShortPacket][preserves.error.ShortPacket] exception is raised:
+
+    ```python
+    >>> d = Decoder(b'\\xa0{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84')
+    >>> d.next()
+    123
+    >>> d.next()
+    'hello'
+    >>> d.next()
+    ()
+    >>> d.next()
+    Traceback (most recent call last):
+      ...
+    preserves.error.ShortPacket: Short packet
+
+    ```
+
+    Alternatively, keep calling [try_next][preserves.binary.Decoder.try_next] until it yields
+    `None`, which is not in the domain of Preserves `Value`s:
+
+    ```python
+    >>> d = Decoder(b'\\xa0{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84')
+    >>> d.try_next()
+    123
+    >>> d.try_next()
+    'hello'
+    >>> d.try_next()
+    ()
+    >>> d.try_next()
+
+    ```
+
+    For convenience, [Decoder][preserves.binary.Decoder] implements the iterator interface,
+    backing it with [try_next][preserves.binary.Decoder.try_next], so you can simply iterate
+    over all complete values in an input:
+
+    ```python
+    >>> d = Decoder(b'\\xa0{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84')
+    >>> list(d)
+    [123, 'hello', ()]
+
+    ```
+
+    ```python
+    >>> for v in Decoder(b'\\xa0{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84'):
+    ...     print(repr(v))
+    123
+    'hello'
+    ()
+
+    ```
+
+    Supply `include_annotations=True` to read annotations alongside the annotated values:
+
+    ```python
+    >>> d = Decoder(b'\\xa0{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84', include_annotations=True)
+    >>> list(d)
+    [123, 'hello', @#x ()]
+
+    ```
+
+    If you are incrementally reading from, say, a socket, you can use
+    [extend][preserves.binary.Decoder.extend] to add new input as it becomes available:
+
+    ```python
+    >>> d = Decoder(b'\\xa0{\\xb1\\x05he')
+    >>> d.try_next()
+    123
+    >>> d.try_next() # returns None because the input is incomplete
+    >>> d.extend(b'llo')
+    >>> d.try_next()
+    'hello'
+    >>> d.try_next()
+
+    ```
+
+    Attributes:
+        packet (bytes): buffered input waiting to be processed
+        index (int): read position within `packet`
+
+    """
 
     def __init__(self, packet=b'', include_annotations=False, decode_embedded=lambda x: x):
-        """TODO"""
         super(Decoder, self).__init__()
         self.packet = packet
         self.index = 0
@@ -38,7 +132,8 @@ class Decoder(BinaryCodec):
         self.decode_embedded = decode_embedded
 
     def extend(self, data):
-        """TODO"""
+        """Appends `data` to the remaining bytes in `self.packet`, trimming already-processed
+        bytes from the front of `self.packet` and resetting `self.index` to zero."""
         self.packet = self.packet[self.index:] + data
         self.index = 0
 
@@ -92,7 +187,11 @@ class Decoder(BinaryCodec):
         return v
 
     def next(self):
-        """TODO"""
+        """Reads the next complete `Value` from the internal buffer, raising
+        [ShortPacket][preserves.error.ShortPacket] if too few bytes are available, or
+        [DecodeError][preserves.error.DecodeError] if the input is invalid somehow.
+
+        """
         tag = self.nextbyte()
         if tag == 0x80: return self.wrap(False)
         if tag == 0x81: return self.wrap(True)
@@ -123,7 +222,8 @@ class Decoder(BinaryCodec):
         raise DecodeError('Invalid tag: ' + hex(tag))
 
     def try_next(self):
-        """TODO"""
+        """Like [next][preserves.binary.Decoder.next], but returns `None` instead of raising
+        [ShortPacket][preserves.error.ShortPacket]."""
         start = self.index
         try:
             return self.next()
@@ -132,7 +232,6 @@ class Decoder(BinaryCodec):
             return None
 
     def __iter__(self):
-        """TODO"""
         return self
 
     def __next__(self):
@@ -142,26 +241,71 @@ class Decoder(BinaryCodec):
         return v
 
 def decode(bs, **kwargs):
-    """TODO"""
+    """Returns the first complete encoded value from `bs`, passing `kwargs` through to the
+    [Decoder][preserves.binary.Decoder] constructor. Raises exceptions as per
+    [next][preserves.binary.Decoder.next].
+
+    Args:
+        bs (bytes): encoded data to decode
+
+    """
     return Decoder(packet=bs, **kwargs).next()
 
 def decode_with_annotations(bs, **kwargs):
-    """TODO"""
+    """Like [decode][preserves.binary.decode], but supplying `include_annotations=True` to the
+    [Decoder][preserves.binary.Decoder] constructor."""
     return Decoder(packet=bs, include_annotations=True, **kwargs).next()
 
 class Encoder(BinaryCodec):
     """Implementation of an encoder for the machine-oriented binary Preserves syntax.
 
+    ```python
+    >>> e = Encoder()
+    >>> e.append(123)
+    >>> e.append('hello')
+    >>> e.append(annotate([], Symbol('x')))
+    >>> e.contents()
+    b'\\xa0{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84'
+
+    ```
+
+    Args:
+        encode_embedded:
+            function accepting an [Embedded][preserves.values.Embedded].embeddedValue and
+            returning a `Value` for serialization.
+
+        canonicalize (bool):
+            if `True`, ensures the serialized data are in [canonical
+            form](https://preserves.dev/canonical-binary.html). This is slightly more work than
+            producing potentially-non-canonical output.
+
+        include_annotations (bool | None):
+            if `None`, includes annotations in the output only when `canonicalize` is `False`,
+            because [canonical serialization of values demands omission of
+            annotations](https://preserves.dev/canonical-binary.html). If explicitly `True` or
+            `False`, however, annotations will be included or excluded, respectively, no matter the
+            `canonicalize` setting. This can be used to get canonical ordering
+            (`canonicalize=True`) *and* annotations (`include_annotations=True`).
+
+    Attributes:
+        buffer (bytearray): accumulator for the output of the encoder
+
     """
-    def __init__(self, encode_embedded=lambda x: x, canonicalize=False):
-        """TODO"""
+    def __init__(self,
+                 encode_embedded=lambda x: x,
+                 canonicalize=False,
+                 include_annotations=None):
         super(Encoder, self).__init__()
         self.buffer = bytearray()
         self._encode_embedded = encode_embedded
         self._canonicalize = canonicalize
+        if include_annotations is None:
+            self.include_annotations = not self._canonicalize
+        else:
+            self.include_annotations = include_annotations
 
     def reset(self):
-        """TODO"""
+        """Clears `self.buffer` to a fresh empty `bytearray`."""
         self.buffer = bytearray()
 
     def encode_embedded(self, v):
@@ -170,7 +314,7 @@ class Encoder(BinaryCodec):
         return self._encode_embedded(v)
 
     def contents(self):
-        """TODO"""
+        """Returns a `bytes` constructed from the contents of `self.buffer`."""
         return bytes(self.buffer)
 
     def varint(self, v):
@@ -208,7 +352,7 @@ class Encoder(BinaryCodec):
         if not self._canonicalize:
             self.encodevalues(6, v)
         else:
-            c = Canonicalizer(self._encode_embedded)
+            c = Canonicalizer(self._encode_embedded, self.include_annotations)
             for i in v: c.entry([i])
             c.emit_entries(self, 6)
 
@@ -216,12 +360,12 @@ class Encoder(BinaryCodec):
         if not self._canonicalize:
             self.encodevalues(7, list(dict_kvs(v)))
         else:
-            c = Canonicalizer(self._encode_embedded)
+            c = Canonicalizer(self._encode_embedded, self.include_annotations)
             for (kk, vv) in v.items(): c.entry([kk, vv])
             c.emit_entries(self, 7)
 
     def append(self, v):
-        """TODO"""
+        """Extend `self.buffer` with an encoding of `v`."""
         v = preserve(v)
         if hasattr(v, '__preserve_write_binary__'):
             v.__preserve_write_binary__(self)
@@ -265,8 +409,8 @@ class Encoder(BinaryCodec):
         raise TypeError('Cannot preserves-encode: ' + repr(v))
 
 class Canonicalizer:
-    def __init__(self, encode_embedded):
-        self.encoder = Encoder(encode_embedded, canonicalize=True)
+    def __init__(self, encode_embedded, include_annotations):
+        self.encoder = Encoder(encode_embedded, canonicalize=True, include_annotations=include_annotations)
         self.entries = []
 
     def entry(self, pieces):
@@ -281,10 +425,8 @@ class Canonicalizer:
         outer_encoder.buffer.append(0x84)
 
 def encode(v, **kwargs):
-    """Encode a single `Value` v to a byte string. Any kwargs are passed on to the underlying
-    [Encoder][preserves.binary.Encoder] constructor.
-
-    """
+    """Encode a single `Value` `v` to a byte string. Any supplied `kwargs` are passed on to the
+    underlying [Encoder][preserves.binary.Encoder] constructor."""
     e = Encoder(**kwargs)
     e.append(v)
     return e.contents()
diff --git a/implementations/python/preserves/text.py b/implementations/python/preserves/text.py
index 9924636..f30e281 100644
--- a/implementations/python/preserves/text.py
+++ b/implementations/python/preserves/text.py
@@ -305,7 +305,8 @@ class Formatter(TextCodec):
                  format_embedded=lambda x: x,
                  indent=None,
                  with_commas=False,
-                 trailing_comma=False):
+                 trailing_comma=False,
+                 include_annotations=True):
         """TODO"""
         super(Formatter, self).__init__()
         self.indent_delta = 0 if indent is None else indent
@@ -314,6 +315,7 @@ class Formatter(TextCodec):
         self.trailing_comma = trailing_comma
         self.chunks = []
         self._format_embedded = format_embedded
+        self.include_annotations = include_annotations
 
     def format_embedded(self, v):
         if self._format_embedded is None:
diff --git a/implementations/python/preserves/values.py b/implementations/python/preserves/values.py
index d3e7225..e731040 100644
--- a/implementations/python/preserves/values.py
+++ b/implementations/python/preserves/values.py
@@ -290,16 +290,18 @@ class Annotated(object):
         self.item = item
 
     def __preserve_write_binary__(self, encoder):
-        for a in self.annotations:
-            encoder.buffer.append(0x85)
-            encoder.append(a)
+        if encoder.include_annotations:
+            for a in self.annotations:
+                encoder.buffer.append(0x85)
+                encoder.append(a)
         encoder.append(self.item)
 
     def __preserve_write_text__(self, formatter):
-        for a in self.annotations:
-            formatter.chunks.append('@')
-            formatter.append(a)
-            formatter.chunks.append(' ')
+        if formatter.include_annotations:
+            for a in self.annotations:
+                formatter.chunks.append('@')
+                formatter.append(a)
+                formatter.chunks.append(' ')
         formatter.append(self.item)
 
     def strip(self, depth=inf):
diff --git a/implementations/python/tests/test_preserves.py b/implementations/python/tests/test_preserves.py
index ecd2b6c..856caba 100644
--- a/implementations/python/tests/test_preserves.py
+++ b/implementations/python/tests/test_preserves.py
@@ -261,7 +261,7 @@ def install_test(d, variant, tName, binaryForm, annotatedTextForm):
     def test_back(self): self.assertPreservesEqual(self.DS(binaryForm), back)
     def test_back_ann(self): self.assertPreservesEqual(self.D(self.E(annotatedTextForm)), annotatedTextForm)
     def test_encode(self): self.assertPreservesEqual(self.E(forward), binaryForm)
-    def test_encode_canonical(self): self.assertPreservesEqual(self.EC(annotatedTextForm), binaryForm)
+    def test_encode_nondet(self): self.assertPreservesEqual(self.ENONDET(annotatedTextForm), binaryForm)
     def test_encode_ann(self): self.assertPreservesEqual(self.E(annotatedTextForm), binaryForm)
     add_method(d, tName, test_match_expected)
     add_method(d, tName, test_roundtrip)
@@ -271,7 +271,7 @@ def install_test(d, variant, tName, binaryForm, annotatedTextForm):
     if variant in ['normal']:
         add_method(d, tName, test_encode)
     if variant in ['nondeterministic']:
-        add_method(d, tName, test_encode_canonical)
+        add_method(d, tName, test_encode_nondet)
     if variant in ['normal', 'nondeterministic']:
         add_method(d, tName, test_encode_ann)
 
@@ -323,8 +323,8 @@ class CommonTestSuite(PreservesTestCase):
     def E(self, v):
         return encode(v, encode_embedded=lambda x: x)
 
-    def EC(self, v):
-        return encode(v, encode_embedded=lambda x: x, canonicalize=True)
+    def ENONDET(self, v):
+        return encode(v, encode_embedded=lambda x: x, canonicalize=True, include_annotations=True)
 
 class RecordTests(PreservesTestCase):
     def test_getters(self):