preserves/implementations/python/preserves/binary.py

"""The [preserves.binary][] module implements the [Preserves machine-oriented binary
syntax](https://preserves.dev/preserves-binary.html).

The main entry points are functions [encode][preserves.binary.encode],
[canonicalize][preserves.binary.canonicalize], [decode][preserves.binary.decode], and
[decode_with_annotations][preserves.binary.decode_with_annotations].

```python
>>> encode(Record(Symbol('hi'), []))
b'\\xb4\\xb3\\x02hi\\x84'
>>> decode(b'\\xb4\\xb3\\x02hi\\x84')
#hi()

```

"""

import numbers
import struct

from .values import *
from .error import *
from .compat import basestring_, ord_

class BinaryCodec(object):
    """Empty marker base class shared by the binary-syntax `Decoder` and `Encoder`."""

class Decoder(BinaryCodec):
    """Implementation of a decoder for the machine-oriented binary Preserves syntax.

    Args:
        packet (bytes):
            initial contents of the input buffer; may subsequently be extended by calling
            [extend][preserves.binary.Decoder.extend].

        include_annotations (bool):
            if `True`, wrap each value and subvalue in an
            [Annotated][preserves.values.Annotated] object.

        decode_embedded:
            function accepting a `Value` and returning a possibly-decoded form of that value
            suitable for placing into an [Embedded][preserves.values.Embedded] object.

    Normal usage is to supply a buffer, and keep calling [next][preserves.binary.Decoder.next]
    until a [ShortPacket][preserves.error.ShortPacket] exception is raised:

    ```python
    >>> d = Decoder(b'\\xb0\\x01{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84')
    >>> d.next()
    123
    >>> d.next()
    'hello'
    >>> d.next()
    ()
    >>> d.next()
    Traceback (most recent call last):
      ...
    preserves.error.ShortPacket: Short packet

    ```

    Alternatively, keep calling [try_next][preserves.binary.Decoder.try_next] until it yields
    `None`, which is not in the domain of Preserves `Value`s:

    ```python
    >>> d = Decoder(b'\\xb0\\x01{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84')
    >>> d.try_next()
    123
    >>> d.try_next()
    'hello'
    >>> d.try_next()
    ()
    >>> d.try_next()

    ```

    For convenience, [Decoder][preserves.binary.Decoder] implements the iterator interface,
    backing it with [try_next][preserves.binary.Decoder.try_next], so you can simply iterate
    over all complete values in an input:

    ```python
    >>> d = Decoder(b'\\xb0\\x01{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84')
    >>> list(d)
    [123, 'hello', ()]

    ```

    ```python
    >>> for v in Decoder(b'\\xb0\\x01{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84'):
    ...     print(repr(v))
    123
    'hello'
    ()

    ```

    Supply `include_annotations=True` to read annotations alongside the annotated values:

    ```python
    >>> d = Decoder(b'\\xb0\\x01{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84', include_annotations=True)
    >>> list(d)
    [123, 'hello', @#x ()]

    ```

    If you are incrementally reading from, say, a socket, you can use
    [extend][preserves.binary.Decoder.extend] to add new input as it becomes available:

    ```python
    >>> d = Decoder(b'\\xb0\\x01{\\xb1\\x05he')
    >>> d.try_next()
    123
    >>> d.try_next() # returns None because the input is incomplete
    >>> d.extend(b'llo')
    >>> d.try_next()
    'hello'
    >>> d.try_next()

    ```

    Attributes:
        packet (bytes): buffered input waiting to be processed
        index (int): read position within `packet`

    """

    def __init__(self, packet=b'', include_annotations=False, decode_embedded=lambda x: x):
        super(Decoder, self).__init__()
        self.packet = packet
        self.index = 0
        self.include_annotations = include_annotations
        self.decode_embedded = decode_embedded

    def extend(self, data):
        """Appends `data` to the remaining bytes in `self.packet`, trimming already-processed
        bytes from the front of `self.packet` and resetting `self.index` to zero."""
        self.packet = self.packet[self.index:] + data
        self.index = 0

    def nextbyte(self):
        """Consumes one byte from `self.packet` and returns it as an integer, raising
        [ShortPacket][preserves.error.ShortPacket] if the buffer is exhausted."""
        if self.index >= len(self.packet):
            raise ShortPacket('Short packet')
        self.index = self.index + 1
        return ord_(self.packet[self.index - 1])

    def nextbytes(self, n):
        """Consumes and returns the next `n` bytes of `self.packet`, raising
        [ShortPacket][preserves.error.ShortPacket] (without consuming anything) if fewer than
        `n` bytes remain."""
        start = self.index
        end = start + n
        if end > len(self.packet):
            raise ShortPacket('Short packet')
        self.index = end
        return self.packet[start : end]

    def varint(self):
        """Reads a variable-length unsigned integer: base-128 digits, least-significant
        first, where every byte except the last has its high bit set."""
        # Deliberately iterative. The number of continuation bytes is dictated by the input,
        # so spending one stack frame per byte would let a long run of them exhaust the
        # interpreter's recursion limit instead of decoding (or raising ShortPacket).
        result = 0
        shift = 0
        while True:
            b = self.nextbyte()
            if b < 128:
                return result | (b << shift)
            result |= (b - 128) << shift
            shift += 7

    def peekend(self):
        """Returns `True`, consuming the byte, if the next byte is the end marker `0x84`;
        otherwise returns `False` and leaves the read position where it was."""
        matched = (self.nextbyte() == 0x84)
        if not matched:
            self.index = self.index - 1
        return matched

    def nextvalues(self):
        """Reads values up to (and consuming) the next end marker, returning them as a list."""
        result = []
        while not self.peekend():
            result.append(self.next())
        return result

    def nextint(self, n):
        """Reads an `n`-byte big-endian two's-complement integer; zero bytes denote `0`."""
        if n == 0: return 0
        acc = self.nextbyte()
        # The high bit of the first byte is the sign bit.
        if acc & 0x80: acc = acc - 256
        for _i in range(n - 1):
            acc = (acc << 8) | self.nextbyte()
        return acc

    def wrap(self, v):
        """Wraps `v` in an [Annotated][preserves.values.Annotated] when annotations are being
        retained; otherwise returns `v` unchanged."""
        return Annotated(v) if self.include_annotations else v

    def unshift_annotation(self, a, v):
        """Prepends annotation `a` to `v`'s annotations when annotations are being retained
        (otherwise `a` is discarded), and returns `v`."""
        if self.include_annotations:
            v.annotations.insert(0, a)
        return v

    def next(self):
        """Reads the next complete `Value` from the internal buffer, raising
        [ShortPacket][preserves.error.ShortPacket] if too few bytes are available, or
        [DecodeError][preserves.error.DecodeError] if the input is invalid somehow.

        """
        tag = self.nextbyte()
        # Booleans.
        if tag == 0x80: return self.wrap(False)
        if tag == 0x81: return self.wrap(True)
        # An end marker is only legal inside a compound; those consume it via `peekend`.
        if tag == 0x84: raise DecodeError('Unexpected end-of-stream marker')
        # Annotation: the annotation value, followed by the value it annotates.
        if tag == 0x85:
            a = self.next()
            v = self.next()
            return self.unshift_annotation(a, v)
        # Embedded value.
        if tag == 0x86:
            if self.decode_embedded is None:
                raise DecodeError('No decode_embedded function supplied')
            return self.wrap(Embedded(self.decode_embedded(self.next())))
        # Floating point: a length byte, which must be 8, then a big-endian double.
        if tag == 0x87:
            count = self.nextbyte()
            if count == 8: return self.wrap(struct.unpack('>d', self.nextbytes(8))[0])
            raise DecodeError('Invalid IEEE754 size')
        # Length-prefixed atoms: integer, string, byte string, symbol.
        if tag == 0xb0: return self.wrap(self.nextint(self.varint()))
        if tag == 0xb1: return self.wrap(self.nextbytes(self.varint()).decode('utf-8'))
        if tag == 0xb2: return self.wrap(self.nextbytes(self.varint()))
        if tag == 0xb3: return self.wrap(Symbol(self.nextbytes(self.varint()).decode('utf-8')))
        # End-marker-terminated compounds: record, sequence, set, dictionary.
        if tag == 0xb4:
            vs = self.nextvalues()
            if not vs: raise DecodeError('Too few elements in encoded record')
            return self.wrap(Record(vs[0], vs[1:]))
        if tag == 0xb5: return self.wrap(tuple(self.nextvalues()))
        if tag == 0xb6: return self.wrap(frozenset(self.nextvalues()))
        if tag == 0xb7: return self.wrap(ImmutableDict.from_kvs(self.nextvalues()))
        raise DecodeError('Invalid tag: ' + hex(tag))

    def try_next(self):
        """Like [next][preserves.binary.Decoder.next], but returns `None` instead of raising
        [ShortPacket][preserves.error.ShortPacket]."""
        start = self.index
        try:
            return self.next()
        except ShortPacket:
            # Rewind so the partially-read value can be retried once more input arrives.
            self.index = start
            return None

    def __iter__(self):
        return self

    def __next__(self):
        # `None` is not a Preserves `Value`, so it unambiguously signals "no complete value".
        v = self.try_next()
        if v is None:
            raise StopIteration
        return v

def decode(bs, **kwargs):
    """Returns the first complete value encoded at the start of `bs`.

    Any `kwargs` are handed to the [Decoder][preserves.binary.Decoder] constructor, and
    exceptions propagate exactly as from [next][preserves.binary.Decoder.next].

    Args:
        bs (bytes): encoded data to decode

    """
    decoder = Decoder(packet=bs, **kwargs)
    return decoder.next()

def decode_with_annotations(bs, **kwargs):
    """Returns the first complete value in `bs`, as [decode][preserves.binary.decode] does,
    but constructs the [Decoder][preserves.binary.Decoder] with `include_annotations=True`."""
    decoder = Decoder(packet=bs, include_annotations=True, **kwargs)
    return decoder.next()

class Encoder(BinaryCodec):
    """Implementation of an encoder for the machine-oriented binary Preserves syntax.

    ```python
    >>> e = Encoder()
    >>> e.append(123)
    >>> e.append('hello')
    >>> e.append(annotate([], Symbol('x')))
    >>> e.contents()
    b'\\xb0\\x01{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84'

    ```

    Args:
        encode_embedded:
            function accepting an [Embedded][preserves.values.Embedded].embeddedValue and
            returning a `Value` for serialization.

        canonicalize (bool):
            if `True`, ensures the serialized data are in [canonical
            form](https://preserves.dev/canonical-binary.html). This is slightly more work than
            producing potentially-non-canonical output.

        include_annotations (bool | None):
            if `None`, includes annotations in the output only when `canonicalize` is `False`,
            because [canonical serialization of values demands omission of
            annotations](https://preserves.dev/canonical-binary.html). If explicitly `True` or
            `False`, however, annotations will be included resp. excluded no matter the
            `canonicalize` setting. This can be used to get canonical ordering
            (`canonicalize=True`) *and* annotations (`include_annotations=True`).

    Attributes:
        buffer (bytearray): accumulator for the output of the encoder

    """
    def __init__(self,
                 encode_embedded=lambda x: x,
                 canonicalize=False,
                 include_annotations=None):
        super(Encoder, self).__init__()
        self.buffer = bytearray()
        self._encode_embedded = encode_embedded
        self._canonicalize = canonicalize
        if include_annotations is None:
            self.include_annotations = not self._canonicalize
        else:
            self.include_annotations = include_annotations

    def reset(self):
        """Clears `self.buffer` to a fresh empty `bytearray`."""
        self.buffer = bytearray()

    def encode_embedded(self, v):
        """Applies the configured `encode_embedded` function to `v`, raising `EncodeError` if
        none was supplied."""
        if self._encode_embedded is None:
            raise EncodeError('No encode_embedded function supplied')
        return self._encode_embedded(v)

    def contents(self):
        """Returns a `bytes` constructed from the contents of `self.buffer`."""
        return bytes(self.buffer)

    def varint(self, v):
        """Appends `v` as a variable-length unsigned integer: base-128 digits,
        least-significant first, with the high bit set on every byte except the last."""
        while v >= 128:
            self.buffer.append((v % 128) + 128)
            v = v // 128
        self.buffer.append(v)

    def encodeint(self, v):
        """Appends integer `v`: tag `0xb0`, a varint byte count, then that many bytes of
        big-endian two's-complement. Zero is encoded with a byte count of zero."""
        self.buffer.append(0xb0)
        if v == 0:
            bytecount = 0
        else:
            # Magnitude bits plus one sign bit, rounded up to whole bytes.
            bitcount = (~v if v < 0 else v).bit_length() + 1
            bytecount = (bitcount + 7) // 8
        self.varint(bytecount)
        # Fill from the least-significant end with a loop rather than recursion, so that
        # integers needing more bytes than the interpreter's recursion limit still encode.
        octets = bytearray(bytecount)
        for position in range(bytecount - 1, -1, -1):
            octets[position] = v & 255
            v = v >> 8
        self.buffer.extend(octets)

    def encodevalues(self, tag, items):
        """Appends `tag`, the encoding of each of `items` in order, then the end marker."""
        self.buffer.append(tag)
        for i in items: self.append(i)
        self.buffer.append(0x84)

    def encodebytes(self, tag, bs):
        """Appends `tag`, the length of `bs` as a varint, then `bs` itself."""
        self.buffer.append(tag)
        self.varint(len(bs))
        self.buffer.extend(bs)

    def encodeset(self, v):
        """Appends set `v` under tag `0xb6`; when canonicalizing, the members are emitted in
        the sorted order of their encodings."""
        if not self._canonicalize:
            self.encodevalues(0xb6, v)
        else:
            c = Canonicalizer(self._encode_embedded, self.include_annotations)
            for i in v: c.entry([i])
            c.emit_entries(self, 0xb6)

    def encodedict(self, v):
        """Appends dictionary `v` under tag `0xb7`; when canonicalizing, the key/value pairs
        are emitted in the sorted order of their encodings."""
        if not self._canonicalize:
            self.encodevalues(0xb7, list(dict_kvs(v)))
        else:
            c = Canonicalizer(self._encode_embedded, self.include_annotations)
            for (kk, vv) in v.items(): c.entry([kk, vv])
            c.emit_entries(self, 0xb7)

    def append(self, v):
        """Extend `self.buffer` with an encoding of `v`.

        Objects offering `__preserve_write_binary__` serialize themselves. Any other object
        that is not one of the directly-supported types but is iterable is written as a
        sequence; anything else is passed to `cannot_encode`, which raises `TypeError`.
        """
        v = preserve(v)
        if hasattr(v, '__preserve_write_binary__'):
            v.__preserve_write_binary__(self)
        elif v is False:
            self.buffer.append(0x80)
        elif v is True:
            self.buffer.append(0x81)
        elif isinstance(v, float):
            # Tag, length byte (8), then a big-endian IEEE754 double.
            self.buffer.append(0x87)
            self.buffer.append(8)
            self.buffer.extend(struct.pack('>d', v))
        elif isinstance(v, numbers.Number):
            self.encodeint(v)
        elif isinstance(v, bytes):
            self.encodebytes(0xb2, v)
        elif isinstance(v, basestring_):
            self.encodebytes(0xb1, v.encode('utf-8'))
        elif isinstance(v, (list, tuple)):
            self.encodevalues(0xb5, v)
        elif isinstance(v, (set, frozenset)):
            self.encodeset(v)
        elif isinstance(v, dict):
            self.encodedict(v)
        else:
            try:
                i = iter(v)
            except TypeError:
                i = None
            if i is None:
                self.cannot_encode(v)
            else:
                self.encodevalues(0xb5, i)

    def cannot_encode(self, v):
        """Raises `TypeError` reporting that `v` cannot be encoded."""
        raise TypeError('Cannot preserves-encode: ' + repr(v))

class Canonicalizer:
    """Gathers separately-encoded entries of a set or dictionary so that they can be written
    out in the sorted order of their encoded bytes."""

    def __init__(self, encode_embedded, include_annotations):
        # Scratch encoder, reused for every entry; nested values are canonicalized too.
        self.encoder = Encoder(
            encode_embedded,
            canonicalize=True,
            include_annotations=include_annotations)
        self.entries = []

    def entry(self, pieces):
        """Encodes `pieces` back-to-back into a single byte string and records it."""
        scratch = self.encoder
        for piece in pieces:
            scratch.append(piece)
        encoded = scratch.contents()
        scratch.reset()
        self.entries.append(encoded)

    def emit_entries(self, outer_encoder, tag):
        """Appends `tag`, the recorded entries in sorted order, and the end marker `0x84` to
        `outer_encoder.buffer`."""
        out = outer_encoder.buffer
        out.append(tag)
        for encoded in sorted(self.entries):
            out.extend(encoded)
        out.append(0x84)

def encode(v, **kwargs):
    """Serializes the single `Value` `v` and returns the resulting byte string. All `kwargs`
    are forwarded to the [Encoder][preserves.binary.Encoder] constructor."""
    encoder = Encoder(**kwargs)
    encoder.append(v)
    output = encoder.contents()
    return output

def canonicalize(v, **kwargs):
    """Serializes `v` exactly as [encode][preserves.binary.encode] does, but with
    `canonicalize=True` passed to the [Encoder][preserves.binary.Encoder] constructor.

    """
    canonical_bytes = encode(v, canonicalize=True, **kwargs)
    return canonical_bytes