438 lines
14 KiB
Python
438 lines
14 KiB
Python
"""The [preserves.binary][] module implements the [Preserves machine-oriented binary
|
|
syntax](https://preserves.dev/preserves-binary.html).
|
|
|
|
The main entry points are functions [encode][preserves.binary.encode],
|
|
[canonicalize][preserves.binary.canonicalize], [decode][preserves.binary.decode], and
|
|
[decode_with_annotations][preserves.binary.decode_with_annotations].
|
|
|
|
```python
|
|
>>> encode(Record(Symbol('hi'), []))
|
|
b'\\xb4\\xb3\\x02hi\\x84'
|
|
>>> decode(b'\\xb4\\xb3\\x02hi\\x84')
|
|
#hi()
|
|
|
|
```
|
|
|
|
"""
|
|
|
|
import numbers
|
|
import struct
|
|
|
|
from .values import *
|
|
from .error import *
|
|
from .compat import basestring_, ord_
|
|
|
|
class BinaryCodec(object):
    """Common base class of the binary [Decoder][preserves.binary.Decoder] and
    [Encoder][preserves.binary.Encoder]; defines no behaviour of its own."""
|
|
|
|
class Decoder(BinaryCodec):
    """Implementation of a decoder for the machine-oriented binary Preserves syntax.

    Args:
        packet (bytes):
            initial contents of the input buffer; may subsequently be extended by calling
            [extend][preserves.binary.Decoder.extend].

        include_annotations (bool):
            if `True`, wrap each value and subvalue in an
            [Annotated][preserves.values.Annotated] object.

        decode_embedded:
            function accepting a `Value` and returning a possibly-decoded form of that value
            suitable for placing into an [Embedded][preserves.values.Embedded] object.

    Normal usage is to supply a buffer, and keep calling [next][preserves.binary.Decoder.next]
    until a [ShortPacket][preserves.error.ShortPacket] exception is raised:

    ```python
    >>> d = Decoder(b'\\xb0\\x01{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84')
    >>> d.next()
    123
    >>> d.next()
    'hello'
    >>> d.next()
    ()
    >>> d.next()
    Traceback (most recent call last):
      ...
    preserves.error.ShortPacket: Short packet

    ```

    Alternatively, keep calling [try_next][preserves.binary.Decoder.try_next] until it yields
    `None`, which is not in the domain of Preserves `Value`s:

    ```python
    >>> d = Decoder(b'\\xb0\\x01{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84')
    >>> d.try_next()
    123
    >>> d.try_next()
    'hello'
    >>> d.try_next()
    ()
    >>> d.try_next()

    ```

    For convenience, [Decoder][preserves.binary.Decoder] implements the iterator interface,
    backing it with [try_next][preserves.binary.Decoder.try_next], so you can simply iterate
    over all complete values in an input:

    ```python
    >>> d = Decoder(b'\\xb0\\x01{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84')
    >>> list(d)
    [123, 'hello', ()]

    ```

    ```python
    >>> for v in Decoder(b'\\xb0\\x01{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84'):
    ...     print(repr(v))
    123
    'hello'
    ()

    ```

    Supply `include_annotations=True` to read annotations alongside the annotated values:

    ```python
    >>> d = Decoder(b'\\xb0\\x01{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84', include_annotations=True)
    >>> list(d)
    [123, 'hello', @#x ()]

    ```

    If you are incrementally reading from, say, a socket, you can use
    [extend][preserves.binary.Decoder.extend] to add new input as it becomes available:

    ```python
    >>> d = Decoder(b'\\xb0\\x01{\\xb1\\x05he')
    >>> d.try_next()
    123
    >>> d.try_next() # returns None because the input is incomplete
    >>> d.extend(b'llo')
    >>> d.try_next()
    'hello'
    >>> d.try_next()

    ```

    Attributes:
        packet (bytes): buffered input waiting to be processed
        index (int): read position within `packet`

    """

    def __init__(self, packet=b'', include_annotations=False, decode_embedded=lambda x: x):
        super(Decoder, self).__init__()
        self.packet = packet
        self.index = 0
        self.include_annotations = include_annotations
        self.decode_embedded = decode_embedded

    def extend(self, data):
        """Appends `data` to the remaining bytes in `self.packet`, trimming already-processed
        bytes from the front of `self.packet` and resetting `self.index` to zero."""
        self.packet = self.packet[self.index:] + data
        self.index = 0

    def nextbyte(self):
        """Consumes one byte of input and returns it (passed through `ord_`), raising
        [ShortPacket][preserves.error.ShortPacket] if the buffer is exhausted."""
        if self.index >= len(self.packet):
            raise ShortPacket('Short packet')
        self.index = self.index + 1
        return ord_(self.packet[self.index - 1])

    def nextbytes(self, n):
        """Consumes and returns the next `n` bytes of input as a slice of `self.packet`,
        raising [ShortPacket][preserves.error.ShortPacket] (without consuming anything) if
        fewer than `n` bytes remain."""
        start = self.index
        end = start + n
        if end > len(self.packet):
            raise ShortPacket('Short packet')
        self.index = end
        return self.packet[start : end]

    def varint(self):
        """Reads a base-128 varint: each byte contributes its low seven bits, least
        significant group first, and a byte below 128 terminates the number.

        Implemented as a loop rather than by recursion so that a long run of continuation
        bytes in the input cannot exhaust the interpreter's recursion limit.
        """
        acc = 0
        shift = 0
        while True:
            b = self.nextbyte()
            if b < 128:
                return acc + (b << shift)
            acc += (b - 128) << shift
            shift += 7

    def peekend(self):
        """Returns `True`, consuming one byte, if the next byte is the end marker `0x84`;
        otherwise restores the read position and returns `False`."""
        matched = (self.nextbyte() == 0x84)
        if not matched:
            # Not an end marker: step back so the byte is re-read as the next value's tag.
            self.index = self.index - 1
        return matched

    def nextvalues(self):
        """Reads values with [next][preserves.binary.Decoder.next] until an end marker is
        seen, returning them as a list."""
        result = []
        while not self.peekend():
            result.append(self.next())
        return result

    def nextint(self, n):
        """Reads `n` bytes as a big-endian two's-complement signed integer; zero bytes
        denote the integer 0."""
        if n == 0: return 0
        acc = self.nextbyte()
        # The top bit of the first byte carries the sign.
        if acc & 0x80: acc = acc - 256
        for _i in range(n - 1):
            acc = (acc << 8) | self.nextbyte()
        return acc

    def wrap(self, v):
        """Returns `v` wrapped in `Annotated` when `self.include_annotations` is set, and
        `v` itself otherwise."""
        return Annotated(v) if self.include_annotations else v

    def unshift_annotation(self, a, v):
        """When `self.include_annotations` is set, inserts `a` at the front of
        `v.annotations`; returns `v` in either case."""
        if self.include_annotations:
            v.annotations.insert(0, a)
        return v

    def next(self):
        """Reads the next complete `Value` from the internal buffer, raising
        [ShortPacket][preserves.error.ShortPacket] if too few bytes are available, or
        [DecodeError][preserves.error.DecodeError] if the input is invalid somehow.

        """
        tag = self.nextbyte()
        if tag == 0x80: return self.wrap(False)
        if tag == 0x81: return self.wrap(True)
        if tag == 0x84: raise DecodeError('Unexpected end-of-stream marker')
        if tag == 0x85:
            # Annotation: the annotation value comes first, then the value it annotates.
            a = self.next()
            v = self.next()
            return self.unshift_annotation(a, v)
        if tag == 0x86:
            if self.decode_embedded is None:
                raise DecodeError('No decode_embedded function supplied')
            return self.wrap(Embedded(self.decode_embedded(self.next())))
        if tag == 0x87:
            # Floating point: a length byte followed by the payload; only 8-byte
            # big-endian doubles are accepted.
            count = self.nextbyte()
            if count == 8: return self.wrap(struct.unpack('>d', self.nextbytes(8))[0])
            raise DecodeError('Invalid IEEE754 size')
        # 0xb0-0xb3 are length-prefixed atoms: integer, UTF-8 string, bytes, symbol.
        if tag == 0xb0: return self.wrap(self.nextint(self.varint()))
        if tag == 0xb1: return self.wrap(self.nextbytes(self.varint()).decode('utf-8'))
        if tag == 0xb2: return self.wrap(self.nextbytes(self.varint()))
        if tag == 0xb3: return self.wrap(Symbol(self.nextbytes(self.varint()).decode('utf-8')))
        # 0xb4-0xb7 are compounds terminated by an end marker.
        if tag == 0xb4:
            vs = self.nextvalues()
            if not vs: raise DecodeError('Too few elements in encoded record')
            # The first element is the record's label; the rest are its fields.
            return self.wrap(Record(vs[0], vs[1:]))
        if tag == 0xb5: return self.wrap(tuple(self.nextvalues()))
        if tag == 0xb6: return self.wrap(frozenset(self.nextvalues()))
        if tag == 0xb7: return self.wrap(ImmutableDict.from_kvs(self.nextvalues()))
        raise DecodeError('Invalid tag: ' + hex(tag))

    def try_next(self):
        """Like [next][preserves.binary.Decoder.next], but returns `None` instead of raising
        [ShortPacket][preserves.error.ShortPacket]."""
        start = self.index
        try:
            return self.next()
        except ShortPacket:
            # Rewind so the partial value is re-read once more input has been supplied.
            self.index = start
            return None

    def __iter__(self):
        return self

    def __next__(self):
        v = self.try_next()
        if v is None:
            raise StopIteration
        return v
|
|
|
|
def decode(bs, **kwargs):
    """Decodes and returns the first complete encoded value found in `bs`.

    A fresh [Decoder][preserves.binary.Decoder] is constructed over `bs`, with `kwargs`
    forwarded to its constructor, and its [next][preserves.binary.Decoder.next] method is
    called once; whatever that call raises propagates to the caller.

    Args:
        bs (bytes): encoded data to decode

    """
    decoder = Decoder(packet=bs, **kwargs)
    return decoder.next()
|
|
|
|
def decode_with_annotations(bs, **kwargs):
    """Decodes the first complete value in `bs` exactly as [decode][preserves.binary.decode]
    does, except that the [Decoder][preserves.binary.Decoder] is constructed with
    `include_annotations=True`."""
    annotating_decoder = Decoder(packet=bs, include_annotations=True, **kwargs)
    return annotating_decoder.next()
|
|
|
|
class Encoder(BinaryCodec):
    """Implementation of an encoder for the machine-oriented binary Preserves syntax.

    ```python
    >>> e = Encoder()
    >>> e.append(123)
    >>> e.append('hello')
    >>> e.append(annotate([], Symbol('x')))
    >>> e.contents()
    b'\\xb0\\x01{\\xb1\\x05hello\\x85\\xb3\\x01x\\xb5\\x84'

    ```

    Args:
        encode_embedded:
            function accepting an [Embedded][preserves.values.Embedded].embeddedValue and
            returning a `Value` for serialization.

        canonicalize (bool):
            if `True`, ensures the serialized data are in [canonical
            form](https://preserves.dev/canonical-binary.html). This is slightly more work than
            producing potentially-non-canonical output.

        include_annotations (bool | None):
            if `None`, includes annotations in the output only when `canonicalize` is `False`,
            because [canonical serialization of values demands omission of
            annotations](https://preserves.dev/canonical-binary.html). If explicitly `True` or
            `False`, however, annotations will be included resp. excluded no matter the
            `canonicalize` setting. This can be used to get canonical ordering
            (`canonicalize=True`) *and* annotations (`include_annotations=True`).

    Attributes:
        buffer (bytearray): accumulator for the output of the encoder

    """
    def __init__(self,
                 encode_embedded=lambda x: x,
                 canonicalize=False,
                 include_annotations=None):
        super(Encoder, self).__init__()
        self.buffer = bytearray()
        self._encode_embedded = encode_embedded
        self._canonicalize = canonicalize
        if include_annotations is None:
            self.include_annotations = not self._canonicalize
        else:
            self.include_annotations = include_annotations

    def reset(self):
        """Clears `self.buffer` to a fresh empty `bytearray`."""
        self.buffer = bytearray()

    def encode_embedded(self, v):
        """Applies the `encode_embedded` function given to the constructor to `v`, raising
        `EncodeError` if `None` was supplied in its place."""
        if self._encode_embedded is None:
            raise EncodeError('No encode_embedded function supplied')
        return self._encode_embedded(v)

    def contents(self):
        """Returns a `bytes` constructed from the contents of `self.buffer`."""
        return bytes(self.buffer)

    def varint(self, v):
        """Appends `v` to `self.buffer` as a base-128 varint: seven bits per byte, least
        significant group first, with 128 added to every byte except the last."""
        if v < 128:
            self.buffer.append(v)
        else:
            self.buffer.append((v % 128) + 128)
            self.varint(v // 128)

    def encodeint(self, v):
        """Appends the integer `v`: tag `0xb0`, a varint byte count, then that many bytes
        of big-endian two's-complement representation (none at all for zero)."""
        self.buffer.append(0xb0)
        if v == 0:
            bytecount = 0
        else:
            # One bit beyond the magnitude is needed to hold the sign.
            bitcount = (~v if v < 0 else v).bit_length() + 1
            bytecount = (bitcount + 7) // 8
        self.varint(bytecount)
        # Emit the bytes most-significant first with a loop; recursing once per byte would
        # hit the interpreter's recursion limit for very large integers.
        for shift in range((bytecount - 1) * 8, -1, -8):
            self.buffer.append((v >> shift) & 255)

    def encodevalues(self, tag, items):
        """Appends `tag`, the encoding of each element of `items` in iteration order, and
        the end marker `0x84`."""
        self.buffer.append(tag)
        for i in items: self.append(i)
        self.buffer.append(0x84)

    def encodebytes(self, tag, bs):
        """Appends `tag`, the length of `bs` as a varint, and then `bs` itself."""
        self.buffer.append(tag)
        self.varint(len(bs))
        self.buffer.extend(bs)

    def encodeset(self, v):
        """Appends the set `v` under tag `0xb6`; when canonicalizing, the elements are
        encoded separately and emitted via a `Canonicalizer`."""
        if not self._canonicalize:
            self.encodevalues(0xb6, v)
        else:
            c = Canonicalizer(self._encode_embedded, self.include_annotations)
            for i in v: c.entry([i])
            c.emit_entries(self, 0xb6)

    def encodedict(self, v):
        """Appends the dictionary `v` under tag `0xb7`; when canonicalizing, each key/value
        pair is encoded as one entry and emitted via a `Canonicalizer`."""
        if not self._canonicalize:
            self.encodevalues(0xb7, list(dict_kvs(v)))
        else:
            c = Canonicalizer(self._encode_embedded, self.include_annotations)
            for (kk, vv) in v.items(): c.entry([kk, vv])
            c.emit_entries(self, 0xb7)

    def append(self, v):
        """Extend `self.buffer` with an encoding of `v`."""
        v = preserve(v)
        if hasattr(v, '__preserve_write_binary__'):
            # The value supplies its own binary serialization.
            v.__preserve_write_binary__(self)
        elif v is False:
            self.buffer.append(0x80)
        elif v is True:
            self.buffer.append(0x81)
        elif isinstance(v, float):
            # Tag, payload length (8), then a big-endian double.
            self.buffer.append(0x87)
            self.buffer.append(8)
            self.buffer.extend(struct.pack('>d', v))
        elif isinstance(v, numbers.Number):
            self.encodeint(v)
        elif isinstance(v, bytes):
            self.encodebytes(0xb2, v)
        elif isinstance(v, basestring_):
            self.encodebytes(0xb1, v.encode('utf-8'))
        elif isinstance(v, (list, tuple)):
            self.encodevalues(0xb5, v)
        elif isinstance(v, (set, frozenset)):
            self.encodeset(v)
        elif isinstance(v, dict):
            self.encodedict(v)
        else:
            # Any other iterable is written as a sequence; everything else is rejected.
            try:
                i = iter(v)
            except TypeError:
                i = None
            if i is None:
                self.cannot_encode(v)
            else:
                self.encodevalues(0xb5, i)

    def cannot_encode(self, v):
        """Raises `TypeError` to report that `v` has no encoding."""
        raise TypeError('Cannot preserves-encode: ' + repr(v))
|
|
|
|
class Canonicalizer:
    """Collects separately-encoded entries of a set or dictionary and writes them to an
    outer encoder in sorted byte order."""

    def __init__(self, encode_embedded, include_annotations):
        # Scratch encoder used to serialize each entry on its own; it always canonicalizes.
        self.encoder = Encoder(encode_embedded, canonicalize=True, include_annotations=include_annotations)
        # Encoded entries (one `bytes` object each) gathered so far.
        self.entries = []

    def entry(self, pieces):
        """Encodes every element of `pieces` back to back as a single entry and records
        the resulting bytes, leaving the scratch encoder empty again."""
        scratch = self.encoder
        for piece in pieces:
            scratch.append(piece)
        encoded = scratch.contents()
        scratch.reset()
        self.entries.append(encoded)

    def emit_entries(self, outer_encoder, tag):
        """Writes `tag`, all recorded entries in sorted order, and the end marker `0x84`
        to `outer_encoder.buffer`."""
        out = outer_encoder.buffer
        out.append(tag)
        out.extend(b''.join(sorted(self.entries)))
        out.append(0x84)
|
|
|
|
def encode(v, **kwargs):
    """Serializes the single `Value` `v` and returns the resulting byte string. All
    `kwargs` are handed unchanged to the [Encoder][preserves.binary.Encoder] constructor."""
    encoder = Encoder(**kwargs)
    encoder.append(v)
    return encoder.contents()
|
|
|
|
def canonicalize(v, **kwargs):
    """Serializes `v` like [encode][preserves.binary.encode] does, but with
    `canonicalize=True` passed to the [Encoder][preserves.binary.Encoder] constructor.

    """
    canonical_bytes = encode(v, canonicalize=True, **kwargs)
    return canonical_bytes
|