From d497d9e6d1a52a244edf9ca151afa3eddd09dd8b Mon Sep 17 00:00:00 2001 From: Tony Garnock-Jones Date: Sat, 29 Sep 2018 17:22:34 +0100 Subject: [PATCH] Split out preserves into its own repository --- syndicate/mc/Makefile | 2 - syndicate/mc/codec.md | 662 --------------- syndicate/mc/preserve.css | 61 -- syndicate/mc/preserve.md | 1411 -------------------------------- syndicate/mc/preserve.py | 385 --------- syndicate/mc/preserve.rkt | 856 ------------------- syndicate/mc/test_preserve.py | 171 ---- syndicate/mc/udp-dataspace.rkt | 2 +- 8 files changed, 1 insertion(+), 3549 deletions(-) delete mode 100644 syndicate/mc/Makefile delete mode 100644 syndicate/mc/codec.md delete mode 100644 syndicate/mc/preserve.css delete mode 100644 syndicate/mc/preserve.md delete mode 100644 syndicate/mc/preserve.py delete mode 100644 syndicate/mc/preserve.rkt delete mode 100644 syndicate/mc/test_preserve.py diff --git a/syndicate/mc/Makefile b/syndicate/mc/Makefile deleted file mode 100644 index b7f6a8d..0000000 --- a/syndicate/mc/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -preserve.pdf: preserve.md preserve.css - google-chrome --headless --disable-gpu --print-to-pdf=$@ http://localhost:4000/preserve.html diff --git a/syndicate/mc/codec.md b/syndicate/mc/codec.md deleted file mode 100644 index 51e8076..0000000 --- a/syndicate/mc/codec.md +++ /dev/null @@ -1,662 +0,0 @@ ---- ---- - - -2018-06-04 20:31:48 TODO: cwebber's email comments -2018-06-04 20:31:51 TODO: look at https://github.com/imbal/rson and at clojure EDN -2018-06-05 10:32:02 ... and at http://json-schema.org/latest/json-schema-core.html#rfc.section.4.2 - -# SPKI CAT: SPKI S-Expressions with Canonical Atom Tags - -Tony Garnock-Jones -Christopher Lemmer Webber -May 2018 -Version 0.0.1 - - ________________ - / \ - /\__/\ / boo! \ - / \ \ i'm very spki! \ - \=\_^__^_/= ___/\__________________/ - |/ \ - \\ | | / - <_|--|_> - -## Introduction - -This document proposes a language-neutral JSON-like *data type*, along -with a robust equivalence relation ("semantics") and a total ordering -over inhabitants of the type.[^tjson] - - [^tjson]: [TJSON](https://www.tjson.org/) has a similar aim: - “different on-the-wire representations of an object correspond to - the same typed data object” - ([source](https://news.ycombinator.com/item?id=12860143)); “TJSON - is defined as a serialization format on top of a JSON-like data - model” ([source](https://news.ycombinator.com/item?id=12860401)). - -It then suggests conventions for encoding common data formats in terms -of the proposed data type. - -Finally, it proposes concrete *syntax* for the data type, offering a -language-neutral transfer syntax (based on -[Rivest's S-Expressions][sexp.txt] as used in [SPKI/SDSI][spki]) and -suggesting possible language-specific representations for the data -type's inhabitants. - - [sexp.txt]: http://people.csail.mit.edu/rivest/Sexp.txt - [spki]: http://world.std.com/~cme/html/spki.html - -### Why not Just Use JSON? - - - -JSON offers *syntax* for numbers, strings, booleans, null, arrays and -string-keyed maps. However, it offers no *semantics* for the syntax: -it is left to each implementation to determine how to treat each JSON -term. This causes -[interoperability](http://seriot.ch/parsing_json.php) and even -[security](http://seriot.ch/parsing_json.php) issues. - -Specifically, JSON does not: - - - assign any meaning to numbers,[^meaning-ieee-double] - - determine how strings are to be compared,[^string-key-comparison] - - determine whether object key ordering is significant, or - - determine whether duplicate object keys are permitted, what it - would mean if they were, or how to determine a duplicate in the - first place. - -In short, JSON syntax doesn't *denote* anything.[^xml-infoset] [^other-formats] - - [^meaning-ieee-double]: - [Section 6 of RFC 7159](https://tools.ietf.org/html/rfc7159#section-6) - does go so far as to indicate “good interoperability can be - achieved” by imagining that parsers are able reliably to - understand the syntax of numbers as denoting an IEEE 754 - double-precision floating-point value. - - [^string-key-comparison]: - [Section 8.3 of RFC 7159](https://tools.ietf.org/html/rfc7159#section-8.3) - suggests that *if* an implementation compares strings used as - object keys “code unit by code unit”, then it will interoperate - with *other such implementations*, but neither requires this - behaviour nor discusses comparisons of strings used in other - contexts. - - [^xml-infoset]: The XML world has the concept of - [XML infoset](https://www.w3.org/TR/xml-infoset/). Loosely - speaking, XML infoset is the *denotation* of an XML document; the - *meaning* of the document. - - [^other-formats]: Most other recent data languages are like JSON in - specifying only a syntax with no associated semantics. While some - do make a sketch of a semantics, the result is often - underspecified (e.g. in terms of how strings are to be compared), - overly machine-oriented (e.g. treating 32-bit integers as - fundamentally distinct from 64-bit integers and from - floating-point numbers), overly fine (e.g. giving visibility to - the order in which map entries are written), or all three. - -Some examples: - - - are the JSON values `1`, `1.0`, and `1e0` the same or different? - - are the JSON values `1.0` and `1.0000000000000001` the same or different? - - are the JSON strings `"päron"` (UTF-8 `70c3a4726f6e`) and `"päron"` - (UTF-8 `7061cc88726f6e`) the same or different? - - are the JSON objects `{"a":1, "b":2}` and `{"b":2, "a":1}` the same - or different? - - which, if any, of `{"a":1, "a":2}`, `{"a":1}` and `{"a":2}` are the - same? Are all three legal? - - are `{"päron":1}` and `{"päron":1}` the same or different? - -Different JSON implementations give different answers to these -questions. The JSON specifications are silent on these questions. - -There are other minor problems with JSON having to do with its syntax. -Examples include its relative verbosity and its lack of support for -binary data. - -## Starting with Semantics - -Taking inspiration from functional programming, we start with a -definition of the *values* that we want to work with and give them -meaning independent of their syntax. We will treat syntax separately, -later in this document. - -We will want our data type to accommodate *atoms* (numbers and text), -*products* (both tuples and sequences), and *labelled -sums*.[^zephyr-asdl] It should also include *keyed maps*. We should -avoid unnecessary restrictions such as machine-oriented fixed-width -integer or floating-point values where possible. - - [^zephyr-asdl]: This design was loosely inspired by Zephyr ASDL (h/t - [Darius Bacon](https://twitter.com/abecedarius/status/993545767884226561)), - which doesn't offer much in the way of atoms, but offers - general-purpose labelled sums and products. See D. C. Wang, A. W. - Appel, J. L. Korn, and C. S. Serra, “The Zephyr Abstract Syntax - Description Language,” in USENIX Conference on Domain-Specific - Languages, 1997, pp. 213–228. - [PDF available.](https://www.usenix.org/legacy/publications/library/proceedings/dsl97/full_papers/wang/wang.pdf) - -### Values - -A `Value` is one of: - - - a `ByteString` for general-purpose non-numeric atomic data,[^byte-string-rationale] - - a `Number` for integers and rational numbers, - - a `List` for general-purpose variable-length sequences, - - a `Map` for general-purpose variable-size key/value maps, or - - a `Record` for tagging a tuple of values with an intended interpretation. - -We define a total order over `Value`s: Every `ByteString` is less than -the other kinds of `Value`; every `Number` is less than any `List`, -`Map` or `Record`, but greater than any `ByteString`; and so on. - -That is, `ByteString < Number < List < Map < Record`. - -Two values of the same kind are compared using kind-specific rules, -given below. - -Two `Value`s are equal if neither is less than the other according to -the total order. - - [^byte-string-rationale]: Why include `ByteString`, when we could - instead use a reserved `Record` along with a `List` of `Number`s? - ((TODO: Actually decide about this! Similarly, why include `Map` - rather than a restricted form of `List` with a `Record`? I think - the answer has to do with the arbitrariness of the label we'd - pick: unless *extremely* carefully chosen (i.e. number 0 (ideally - even `-Inf`!) for byte strings, number 1 for map, and have the - order go `Number < Record < List`), they would mess up the - prettiness of the ordering. Though... we could ultimately reduce - this to `Number` and `Record`, and have a family of `#"list"` and - `#"map"` `Record`s...)) - -### Byte strings - -A `ByteString` is an ordered sequence of zero or more integers in the -inclusive range [0..255]. - -`ByteString`s are compared lexicographically. - -We will write examples of `ByteString`s that contain only ASCII -characters using “`#"`” as an opening quote mark and “`"`” as a -closing quote mark. - -**Examples.** The `ByteString` containing the three ASCII characters -`A`, `B` and `C` is written as `#"ABC"`. The empty `ByteString` is -written as `#""`. **N.B.** Despite appearances, these are *binary* -data. - -### Numbers - -A `Number` is a signed rational number of finite precision whose -magnitude can be exactly represented in base two with a finite number -of digits. This includes integers of arbitrary width as well as (for -example) the non-infinite non-NaN IEEE 754 floating-point values. - -`Number`s are compared as mathematical numbers. - -We will write examples of `Number`s using standard mathematical -notation. - -**Examples.** 10, -6, 0.5, -3/2, 33/192, -1.202E4567. - -**Non-examples.** NaN (the clue is in the name!), ∞ (not finite), -0.2 (cannot be exactly represented with a finite number of binary -digits), 1/7 (likewise), 2+*i*3 (not rational), √2 (likewise). - -### Lists - -A `List` is an ordered sequence of zero or more `Value`s. - -`List`s are compared lexicographically, appealing to the ordering on -`Value`s for comparisons at each position in the `List`s. - -### Maps - -A `Map` is an *unordered* collection of zero or more pairs of -`Value`s. Each pair comprises a *key* and a *value*. Keys in a `Map` -must be pairwise distinct. - -Instances of `Map` are compared by lexicographic comparison of the -sequences resulting from ordering each `Map`'s pairs in ascending -order by key. ((TODO: Is this a good idea? Is it clearly-enough -written? An alternative approach is to compare first by the *count* of -pairs, and only if the count is the same, start comparing the pairs -themselves.)) - -### Records - -A `Record` is a tuple of one or more `Value`s. The first in the tuple -is called the *label* of the `Record`, and the other elements of the -tuple are called its *fields*. - -`Record` labels are *usually* `ByteString`s, but can be any kind of -`Value`.[^iri-labels] - - [^iri-labels]: It is occasionally (but seldom) necessary to - interpret such `ByteString` labels as UTF-8 encoded IRIs. Where a - label can be read as a relative IRI, it is notionally interpreted - with respect to the IRI `http://spki-cat.org/` ((TODO: - placeholder)); where a label can be read as an absolute IRI, it - stands for that IRI; and otherwise, it cannot be read as an IRI at - all, and so the label simply stands for itself - for its own - `Value`. - -`Record`s are compared lexicographically as if they were just tuples; -that is, first by their labels, and then by the remainder of their -fields. - -We will write examples of `Record`s with `ByteString` labels entirely -composed of ASCII characters as their label followed by their -parenthesised, comma-separated fields. - -**Examples.** The `Record` with label `#"foo"` and fields 1, 2 and 3 -is written `#"foo"(1, 2, 3)`; the `Record` with label `#"void"` and no -fields is written `#"void"()`. - -## Conventions for Common Data Types - -The `Value` data type is essentially an abstract S-Expression, able to -represent semi-structured data over `ByteString` and `Number` atoms. - -However, users need a wide variety of data types for representing -domain-specific values such as text, calendrical values, machine -words, IEEE 754 floating-point values, booleans, and so on. - -We use appropriately-labelled `Record`s to denote these -domain-specific data types. - -All of these conventions are optional. They form a layer atop the core -`Value` structure. Non-domain-specific tools do not in general need to -treat them specially. - -**Validity.** Many of the labels we will describe in this section come - with side-conditions on the contents of labelled `Record`s. It is - possible to construct an instance of `Value` that violates these - side-conditions without ceasing to be a `Value` or becoming - unrepresentable. However, we say that such a `Value` is *invalid* - because it fails to honour the necessary side-conditions. - Implementations *SHOULD* allow two modes of working: one which - treats all `Value`s identically, without regard for side-conditions, - and one which enforces validity (i.e. side-conditions) when reading, - writing, or constructing `Value`s. - -### Text - -A `Text` is a `Record` labelled with the `ByteString` `#"utf-8"` and -having a single field that is also a `ByteString`. The field *MUST* be -valid UTF-8. - -We will write examples of `Text`s that contain Unicode text using -“`"`” as both an opening and closing quote mark. - -**Examples.** The `Text` containing the three Unicode code points `z` -(0x7A), `水` (0x6C34) and `𝄞` (0x1D11E) is written as `"z水𝄞"`. - -**Normalization forms.** Unicode defines multiple -[normalization forms](http://unicode.org/reports/tr15/) for text. The -ordering and equivalence relations defined for `Value`s mean that, for -Unicode text, the UTF-8 encoded byte-level form of a text is used in -comparisons.[^utf8-is-awesome] In order for users to unambiguously -signal or require a particular normalization form, we define a -`NormalizedText`, which is a `Record` labelled with -`#"unicode-normalization"` and having two fields, the first of which -is a `Text` specifying the normalization form used (e.g. `"nfc"`, -`"nfd"`, `"nfkc"`, `"nfkd"`), and the second of which is a `Text` -whose underlying representation *MUST* be normalized according to the -named normalization form. - - [^utf8-is-awesome]: Happily, the design of UTF-8 is such that this - gives the same result as a lexicographic code-point-by-code-point - comparison! - -**IRIs.** (URIs, URLs, URNs, etc.) An `IRI` is a `Record` labelled -with `#"iri"` and having one field, a `Text` which is the IRI itself -and which *MUST* be a valid absolute or relative IRI. - -**Symbols.** Programming languages like Lisp and Prolog frequently use -string-like values called *symbols*. A `Symbol` is a `Record` labelled -with `#"symbol"` and having one field, a `Text`. - -### Numbers - -The definition of `Number` captures all integers and all -finitely-representable floating-point values. However, in certain -circumstances it can be valuable to assert that a number inhabits a -particular range, such as a fixed-width machine word or an IEEE 754 -floating-point value. - -**Fixed-width machine words.** (16-, 32- and 64-bit) A family of -labels `i`*n* and `u`*n* for *n* ∈ {16,32,64} denote *n*-bit-wide -signed and unsigned range restrictions, respectively. Records with -these labels *MUST* have one field, a `Number`, which *MUST* fall -within the appropriate range. That is, to be valid, - - in `#"i16"(`*x*`)`, -32768 <= *x* <= 32767, and ⌊*x*⌋ = *x*. - - in `#"u16"(`*x*`)`, 0 <= *x* <= 65535, and ⌊*x*⌋ = *x*. - - in `#"i32"(`*x*`)`, -2147483648 <= *x* <= 2147483647, and ⌊*x*⌋ = *x*. - - etc. - -**IEEE 754 floating-point.** (single- and double-precision) The labels -`f32` and `f64` denote single- and double-precision IEEE 754 -floating-point values, respectively. Records with these labels *MUST* -have one field. This field *MUST* either be a `Number`, which *MUST* -fall within the appropriate representable range, or one of the records -`#"nan"()`, `#"+inf"()` or `#"-inf"()`. - -### Anonymous Tuples and Unit - -A `Tuple` is a `Record` with label `#"tuple"` and zero or more fields, -denoting an anonymous tuple of values. - -The 0-ary tuple, `#"tuple"()`, denotes the empty tuple, sometimes -called "unit" or "void" (but *not* e.g. JavaScript's "undefined" -value). - -### Booleans, Null and Undefined - -The two 0-ary `Record`s `#"true"()` and `#"false"()` denote the "true" -and "false" Boolean values, respectively. - -Tony Hoare's -"[billion-dollar mistake](https://en.wikipedia.org/wiki/Tony_Hoare#Apologies_and_retractions)" -can be represented with the 0-ary `Record` `#"null"()`. An "undefined" -value can be represented as `#"undefined"()`. - -### Dates and Times - -Dates, times, moments, and timestamps can be represented with a -`Record` with label `#"rfc3339"` having a single field, a `Text`, -which *MUST* conform to one of the `full-date`, `partial-time`, -`full-time`, or `date-time` productions of -[section 5.6 of RFC 3339](https://tools.ietf.org/html/rfc3339#section-5.6). - -## Syntax - -Now we have discussed `Value`s and their meanings, we may turn to -techniques for *representing* `Value`s for communication or storage. - -The syntax we have used for the examples so far is inadequate in many -ways, not least of which is that it cannot represent every `Value`. - -Separation of the meaning of a piece of syntax from the syntax itself -opens the door to domain-specific syntaxes, all equivalent and -interconvertible.[^asn1] With a robust semantic foundation, -connections to other data languages can also be made. - - [^asn1]: Those who remember - [ASN.1](https://www.itu.int/en/ITU-T/asn1/Pages/introduction.aspx) - will recall BER, DER, PER, CER, XER and so on, each appropriate to - a different setting. Similarly, - [Rivest's S-Expression design][sexp.txt] offers a human-friendly - syntax, a syntax robust to network-induced message corruption, and - an unambiguous, simple and easily-parsed machine-friendly syntax - for the same underlying values. - -### Transfer syntax: S-Expressions - -For now, we limit our attention to an easily-parsed, easily-produced -machine-readable syntax by mapping our `Value`s to the canonical form -of [Rivest's S-Expressions][sexp.txt].[^why-not-spki-sexps] - - [^why-not-spki-sexps]: Why not just use Rivest's S-Expressions as - they are? While they include binary data and sequences, and an - obvious equivalence for them exists, they lack numbers *per se* as - well as any kind of unordered structure such as sets or maps. In - addition, while "display hints" allow labelling of binary data - with an intended interpretation, they cannot be attached to any - other kind of structure, and the "hint" itself can only be a - binary blob. - -#### Byte strings - -`ByteString`s map to byte-string S-Expressions. - -**Examples.** - - What we have been writing above as `#"ABC"` would be represented as - the S-Expression `3:ABC`. - - The empty `ByteString` is represented by the S-Expression `0:`. - -#### Numbers - -Numbers are the most complicated values to represent as an -S-Expression. - -((TODO: Consider cutting complexity by e.g. representing a `Number` as -a sign bit, a little-endian blob of the integer part of the number, -and a little-endian blob of the fractional part of the number. Lots of -trailing/leading zeros for very large/small numbers!)) - -We represent `Number`s using a sign-magnitude format, where the -magnitude is written using a little-endian, twos-complement binary -[*significand*](https://en.wikipedia.org/wiki/Significand) and a -(signed) *shift amount*. - -In essence, we use a generalized, variable-width form of binary IEEE -floating-point representation. - -Let `N` be the `Number` to represent as an S-Expression. - -The sign bit is 0 when `N` is zero or positive, and 1 when `N` is -negative. - -The magnitude of `N` can be viewed as an infinite sequence of bits -with a fraction-separator mark placed somewhere in the sequence, - -``` -···00.000 b_0 b_1 ··· b_{k-1} b_k ··· b_{n-1} 000000··· -···000000 b_0 b_1 ··· b_{k-1} . b_k ··· b_{n-1} 000000··· -···000000 b_0 b_1 ··· b_{k-1} b_k ··· b_{n-1} 000.00··· -``` - -where `b_0` is the leftmost (most significant) and `b_{n-1}` the -rightmost (least significant) non-zero bit. - -Let `k`, the position of the fraction-separator mark, be `i` when it -is immediately to the left of `b_i` for some `i`, generalizing to -negative values when it is to the left of `b_0` and values greater -than `n-1` when it is to the right of `b_{n-1}`. - -For example, `k` will be: - - 0 when the fraction-separator is immediately (i.e. zero bits) to the left of `b_0`; - - -3 (as in the first example above) when it is three bits left of `b_0`; - - `n` when it is immediately (i.e. zero bits) to the right of `b_{n-1}`; - - `n`+3 when it is three bits to the right of `b_{n-1}`. - -The unpadded significand is `b_0 b_1 ··· b_{n-1}`. - -When `k` < `n`, the shift `z`=`k-n` and the significand is: - - the unpadded significand, - - with the sign bit appended to it on the right, and then - - padded on the left with zeroes until it is a whole number of octets wide. - -When `k` ≥ `n`, the shift `z`=`8×⌊(k-n)/8⌋` and the significand is: - - the unpadded significand, - - padded on the right with `(k-n) mod 8` zeroes, - - with the sign bit then appended on the right, and then - - padded on the left with zeroes until it is a whole number of octets wide. - -Now, let `s`=2`z` if `z` is zero or positive, or `s`=2|`z`|+1 if `z` -is negative. - -Finally, the S-Expression form of `N` is: - - `(4:*num [SIGNIFICAND] [SHIFT])`, if `s`≠0; or - - `(4:*num [SIGNIFICAND])`, if `s`=0 but the significand contains non-zero bits; or - - `(4:*num)`, if `s`=0 and the significand contains no non-zero bits; - -where - - `[SIGNIFICAND]` stands for a byte-string S-Expression containing a little-endian representation of the significand, and - - `[SHIFT]` stands for a byte-string S-Expression containing a little-endian representation of `s`. - -**Examples.** (Shown using the hexadecimal representation of -byte-strings from -[section 4.4 of Rivest's S-Expression specification][sexp.txt] in -places.) - - `N`=0 → `(4:*num)` - - `N`=1 → `(4:*num#02#)` - - `N`=-1 → `(4:*num#03#)` - - `N`=10₁₀=1010.0₂ → `n`=3, `k`=4, `z`=0, `s`=0 → `(4:*num#14#)` - - `N`=2560₁₀=101000000000.0₂ → `n`=3, `k`=12, `z`=8, `s`=16 → `(4:*num#14##10#)` - - `N`=-2560₁₀=-101000000000.0₂ → `n`=3, `k`=12, `z`=8, `s`=16 → `(4:*num#15##10#)` - - `N`=-6₁₀=-110.0₂ → `n`=2, `k`=3, `z`=0, `s`=0 → `(4:*num#0D#)` - - `N`=0.5₁₀=0.1₂ → `n`=1, `k`=0, `z`=-1, `s`=3 → `(4:*num#02##03#)` - - `N`=-3/2₁₀=-1.1₂ → `n`=2, `k`=1, `z`=-1, `s`=3 → `(4:*num#07##03#)` - - `N`=33/192₁₀=0.001011₂ → `n`=4, `k`=-2, `z`=-6, `s`=7 → `(4:*num#16##07#)` - - `N`=-1.202E4567=1011011001···000₂ (15172 binary digits, the last 4565 of which are zero) → `n`=10607, `k`=15172, `z`=4560, `s`=9120 → `(4:*num#41828E···24CD16##A023#)` - -((TODO: figure out what this algorithm would actually look like in, -say, C, Python and Racket.)) - -#### Lists - -A `List` maps to an S-Expression list of representations of its -elements, with the byte-string S-Expression `5:*list` prepended. - -**Examples.** - - The `List` containing the `ByteString`s `#"a"`, `#"b"`, and `#"c"` - would be represented as the S-Expression `(5:*list1:a1:b1:c)`. - - The empty `List` is represented by the S-Expression `(5:*list)`. - -#### Maps - -A `Map` is represented by an S-Expression list of representations of -the `Map`'s key-value pairs, with the byte-string `4:*map` prepended. - -Each key-value pair is represented by a two-element S-Expression list -containing representations of the key and the value, in that order. - -The key-value pairs *MUST* be ordered by `Value`-order of their keys. - -**Examples.** - - The `Map` containing entries mapping `#"a"` to `#"d"` and `#"c"` to - `#"b"` is represented by `(4:*map(1:a1:d)(1:c1:b))`. - - The `Map` containing an entry mapping the empty list to a "true" - Boolean value is represented by `(4:*map((5:*list)(4:true)))`. - - The empty `Map` is represented by `(4:*map)`. - -**Non-examples.** - - The S-Expression `(4:*map(1:c1:b)(1:a1:d))` is invalid, because its - key-value pairs are not in `Value`-order by key: `#"c"` > `#"a"`. - - The S-Expression `(4:*map1:a1:d1:c1:b)` is invalid, because its - key-value pairs appear "flattened" in the outer list, rather than - each appearing in a two-element list of its own. - -#### Records - -A `Record` is represented by an S-Expression list of its fields, -prepended by: - - - the representation of its label, if its label is a `ByteString` and - does not begin with byte 42 (ASCII "`*`"); or - - the S-Expression `1:*` followed by the representation of the - `Record`'s label, otherwise. - -**Examples.** - - The `Text` `"hello-world"` is represented by the S-Expression - `(5:utf-811:hello-world)`. - - The `IRI` denoting `http://www.w3.org/` is represented by the - S-Expression `(3:iri(5:utf-818:http://www.w3.org/))`. - - The `Record` `#"*"()` is represented by the S-Expression - `(1:*1:*)`. - - The `Record` `#"*foo"(#"*bar")` is represented by the S-Expression - `(1:*4:*foo4:*bar)`. - - The `Record` with the empty list as its label and no fields is - represented by the S-Expression `(1:*(5:*list))`. - - `(7:rfc3339(5:utf-83:foo))` represents a well-formed `Value` that - is a `Record` with `#"rfc3339"` as its label, and a single `Text` - field. While it is a perfectly reasonable `Value`, it does *not* - represent a valid date or time, since the `Text` `"foo"` does not - conform to any of the RFC 3339 productions enumerated above. - -**Non-examples.** - - `((5:*list))` is not a representation of the `Record` with the - empty list as its label and no fields, because that `Record` has a - non-`ByteString` as its label, mandating a `1:*` prefix on its - S-Expression representation. - - `(4:*foo4:*bar)` does not represent the `Record` - `#"*foo"(#"*bar")`, because the label `#"*foo"` begins with "`*`", - mandating a `1:*` prefix on the `Record`'s S-Expression - representation. - -## Examples - -((TODO: Give some examples of large and small SPKI-CAT documents, -perhaps translated from various JSON blobs floating around the -internet.)) - -## Representing Values in Programming Languages - -We have given a definition of `Value` and its semantics, and proposed -a concrete syntax for communicating and storing `Value`s. We now turn -to **suggested** representations of `Value`s as *programming-language -values* for various programming languages. - -### JavaScript - - - `ByteString` ↔ `Uint8Array` - - `Number` ↔ numbers, problematically; bignums, perhaps; other?? TODO - - `List` ↔ `Array` - - `Map` ↔ `Object` - - `Record` ↔ an instance of something like `Record` below, unless the label is... - - `#"utf-8"` ↔ `String` - - `#"true"` ↔ `true` - - `#"false"` ↔ `false` - - `#"null"` ↔ `null` - - `#"undefined"` ↔ the undefined value - - `#"rfc3339"` ↔ `Date`, if the `Record`'s field matches the `date-time` RFC 3339 production - -```javascript -function Record(label, ...fields) { - this.label = label; - this.fields = fields; -} -``` - -### Scheme/Racket - - - `ByteString` ↔ byte vector (Racket: "Bytes") - - `Number` ↔ numbers - - `List` ↔ (where possible, immutable) list - - `Map` ↔ hash-table - - `Record` ↔ a structure (Racket: a "prefab struct"), unless the label is... - - `#"utf-8"` ↔ a string - - `#"true"` ↔ `#t` - - `#"false"` ↔ `#f` - - `#"symbol"` ↔ a symbol - -### Java - - - `ByteString` ↔ `byte[]` - - `Number` ↔ numbers, problematically; bignums, perhaps; other?? TODO - - `List` ↔ `java.util.List` - - `Map` ↔ `java.util.Map` - - `Record` ↔ an instance of something like `Record` below, unless the label is... - - `#"utf-8"` ↔ `java.lang.String` - - `#"true"` ↔ `java.lang.Boolean.TRUE` - - `#"false"` ↔ `java.lang.Boolean.FALSE` - - `#"null"` ↔ a special singleton object, but *not* Java's `null` - - `#"rfc3339"` ↔ `java.util.{Date,Time,Timestamp}`, according to which RFC 3339 production the `Record`'s field matches - -### Erlang - - - `ByteString` ↔ a binary - - `Number` ↔ numbers, probably; TODO - - `List` ↔ a list - - `Map` ↔ a [map](http://erlang.org/doc/reference_manual/data_types.html#id77432) (new in Erlang/OTP R17) - - `Record` ↔ a tuple with the label in the first position, and the fields in subsequent positions, unless the label is... - - `#"true"` ↔ `true` - - `#"false"` ↔ `false` - - `#"null"` ↔ `null` - - `#"undefined"` ↔ `undefined` - - `#"symbol"` ↔ the `Text` field converted to an Erlang atom, if - some kind of an "unsafe" mode is set on the decoder (because - Erlang atoms are not GC'd); otherwise like any other kind of - `Record` - ---- diff --git a/syndicate/mc/preserve.css b/syndicate/mc/preserve.css deleted file mode 100644 index 35d1881..0000000 --- a/syndicate/mc/preserve.css +++ /dev/null @@ -1,61 +0,0 @@ -body { - font-family: palatino, "Palatino Linotype", "Palatino LT STD", "URW Palladio L", "TeX Gyre Pagella", serif; -} -@media screen { - body { padding-top: 2rem; max-width: 40em; margin: auto; font-size: 120%; } - hr { display: none; } -} -@media print { - @page { size: letter; margin: 4rem 0rem 4.333rem 0rem; } - body { margin-left: 4.5rem; margin-right: 4.5rem; } - h1, h2 { page-break-before: always; margin-top: 0; } - h1:first-of-type, h2:first-of-type { page-break-before: auto; } - hr+* { page-break-before: always; margin-top: 0; } - hr { display: none; } -} -h1, h2, h3, h4, h5, h6 { color: #4f81bd; } -h2 { border-bottom: solid #4f81bd 1px; } -pre, code { background-color: #eee; font-family: "DejaVu Sans Mono", monospace; } -code { font-size: 75%; } -pre { padding: 0.33rem; } - -body { - counter-reset: section 0 subsection 0 appendix 0; -} -h2:before, h3:before { - text-align: right; - display: inline-block; - position: relative; - right: 2.33em; - font-size: 75%; - text-align: right; - width: 2em; - margin-right: -2em; - height: 0; -} -h2:before { - counter-increment: section; - content: counter(section) ". "; -} -h2 { - counter-reset: subsection 0; -} -h3:before { - counter-increment: subsection; - content: counter(section) "." counter(subsection) ". "; -} - -h2[id^="appendix-"]:before { - counter-increment: appendix; - content: counter(appendix,upper-latin) ". "; -} -h2[id^="appendix-"] ~ h3:before { - counter-increment: subsection; - content: counter(appendix,upper-latin) "." counter(subsection) ". "; -} - -h2#notes:before { - display: none; -} - -.footnotes > ol { padding: 0; font-size: 90%; } diff --git a/syndicate/mc/preserve.md b/syndicate/mc/preserve.md deleted file mode 100644 index 8a36058..0000000 --- a/syndicate/mc/preserve.md +++ /dev/null @@ -1,1411 +0,0 @@ ---- ---- -Preserves: an Expressive Data Language - - -# Preserves: an Expressive Data Language - -Tony Garnock-Jones -September 2018. Version 0.0.3. - - [sexp.txt]: http://people.csail.mit.edu/rivest/Sexp.txt - [spki]: http://world.std.com/~cme/html/spki.html - [varint]: https://developers.google.com/protocol-buffers/docs/encoding#varints - [erlang-map]: http://erlang.org/doc/reference_manual/data_types.html#map - [abnf]: https://tools.ietf.org/html/rfc7405 - -This document proposes a data model and serialization format called -*Preserves*. - -Preserves supports *records* with user-defined *labels*. This makes it -more expressive[^macro-expressiveness] than most data languages in use -on the web and allows it to easily represent the *labelled sums of -products* as seen in many functional programming languages. - -Preserves also supports the usual suite of atomic and compound data -types, in particular including *binary* data as a distinct type from -text strings. - -Finally, Preserves defines precisely how to *compare* two values. -Comparison is based on the data model, not on syntax or on data -structures of any particular implementation language. - - [^macro-expressiveness]: By "expressive" I mean *macro-expressive* - in the sense of Felleisen's 1991 paper, "On the Expressive Power - of Programming Languages". - - Roughly speaking, there's no way in a JSON document to introduce a - new kind of information (such as binary data, or a date-stamp, or - a "person" object) in an *unambiguous way* without *global - agreement* from every potential consumer of the document. With an - extensible labelled record type, there is. - - Felleisen, Matthias. “On the Expressive Power of Programming - Languages.” Science of Computer Programming 17, no. 1--3 (1991): - 35–75. - -## Starting with Semantics - -Taking inspiration from functional programming, we start with a -definition of the *values* that we want to work with and give them -meaning independent of their syntax. When we write examples of values, -we will do so using the [textual syntax](#textual-syntax) defined -later in this document. - -Our `Value`s fall into two broad categories: *atomic* and *compound* -data. - - Value = Atom - | Compound - - Atom = Boolean - | Float - | Double - | SignedInteger - | String - | ByteString - | Symbol - - Compound = Record - | Sequence - | Set - | Dictionary - -**Total order.** As we go, we will -incrementally specify a total order over `Value`s. Two values of the -same kind are compared using kind-specific rules. The ordering among -values of different kinds is essentially arbitrary, but having a total -order is convenient for many tasks, so we define it as -follows:[^ordering-by-syntax] - - (Values) Atom < Compound - - (Compounds) Record < Sequence < Set < Dictionary - - (Atoms) Boolean < Float < Double < SignedInteger - < String < ByteString < Symbol - - [^ordering-by-syntax]: The observant reader may note that the - ordering here is (almost) the same as that implied by the tagging - scheme used in the concrete binary syntax for `Value`s. (The - exception is the syntax for small integers near zero.) - -**Equivalence.** Two `Value`s are equal if -neither is less than the other according to the total order. - -### Signed integers. - -A `SignedInteger` is a signed integer of arbitrary width. -`SignedInteger`s are compared as mathematical integers. - -**Examples.** 10; -6; 0. - -**Non-examples.** NaN (the clue is in the name!); ∞ (not finite); 0.2 -(not an integer); 1/7 (likewise); 2+*i*3 (likewise); √2 (likewise). - -### Unicode strings. - -A `String` is a sequence of Unicode -[code-point](http://www.unicode.org/glossary/#code_point)s. `String`s -are compared lexicographically, code-point by -code-point.[^utf8-is-awesome] - - [^utf8-is-awesome]: Happily, the design of UTF-8 is such that this - gives the same result as a lexicographic byte-by-byte comparison - of the UTF-8 encoding of a string! - -**Examples.** `"Hello world"`, an eleven-code-point string; `"z水𝄞"`, -the string containing the three Unicode code-points `z` (0x7A), `水` -(0x6C34) and `𝄞` (0x1D11E); `""`, the empty string. - -### Binary data. - -A `ByteString` is an ordered sequence of zero or more eight-bit bytes. -`ByteString`s are compared lexicographically. - -**Examples.** `#""`, the empty `ByteString`; `#"ABC"`, the -`ByteString` containing the integers 65, 66 and 67 (corresponding to -ASCII characters `A`, `B` and `C`). **N.B.** Despite appearances, -these are *binary* data. - -### Symbols. - -Programming languages like Lisp and Prolog frequently use string-like -values called *symbols*. Here, a `Symbol` is, like a `String`, a -sequence of Unicode code-points representing an identifier of some -kind. `Symbol`s are also compared lexicographically by code-point. - -**Examples.** `hello-world`; `utf8-string`; `exact-integer?`. - -### Booleans. - -There are exactly two `Boolean` values, “false” and “true”. The -“false” value compares less-than the “true” value. We write `#false` -for “false”, and `#true` for “true”. - -### IEEE floating-point values. - -A `Float` is a single-precision IEEE 754 floating-point value; a -`Double` is a double-precision IEEE 754 floating-point value. -`Float`s, `Double`s and `SignedInteger`s are considered disjoint, and -so by the rules [above](#total-order), every `Float` is less than -every `Double`, and every `SignedInteger` is greater than both. Two -`Float`s or two `Double`s are to be ordered by the `totalOrder` -predicate defined in section 5.10 of -[IEEE Std 754-2008](https://dx.doi.org/10.1109/IEEESTD.2008.4610935). -We write examples using a fractional part and/or an exponent to -distinguish them from `SignedInteger`s. An additional suffix `f` -distinguishes `Float`s from `Double`s. - -**Examples.** 10.0f; -6.0; 0.0f; 0.5; -1.202e300. - -**Non-examples.** 10, -6, and 0, because writing them this way -indicates `SignedInteger`s, not `Float`s or `Double`s. - -### Records. - -A `Record` is a *labelled* tuple of zero or more `Value`s, called the -record's *fields*. A record's label is itself a `Value`, though it -will usually be a `Symbol`.[^extensibility] [^iri-labels] `Record`s -are compared lexicographically as if they were just tuples; that is, -first by their labels, and then by the remainder of their fields. - - [^extensibility]: The [Racket](https://racket-lang.org/) programming - language defines - “[prefab](http://docs.racket-lang.org/guide/define-struct.html#(part._prefab-struct))” - structure types, which map well to our `Record`s. Racket supports - record extensibility by encoding record supertypes into record - labels as specially-formatted lists. - - [^iri-labels]: It is occasionally (but seldom) necessary to - interpret such `Symbol` labels as UTF-8 encoded IRIs. Where a - label can be read as a relative IRI, it is notionally interpreted - with respect to the IRI - `urn:uuid:6bf094a6-20f1-4887-ada7-46834a9b5b34`; where a label can - be read as an absolute IRI, it stands for that IRI; and otherwise, - it cannot be read as an IRI at all, and so the label simply stands - for itself—for its own `Value`. - -**Examples.** `foo(1 2 3)`, a `Record` with label `foo` and fields 1, -2 and 3; `void()`, a `Record` with label `void` and no fields. - -**Non-examples.** `()`, because it lacks a label; `void`, because it -lacks even an empty tuple of fields. - -### Sequences. - -A `Sequence` is a general-purpose, variable-length ordered sequence of -zero or more `Value`s. `Sequence`s are compared lexicographically. - -**Examples.** `[]`, the empty sequence; `[1 2 3]`, the sequence of -`SignedInteger`s 1, 2 and 3. - -### Sets. - -A `Set` is an unordered finite set of `Value`s. It contains no -duplicate values, following the [equivalence relation](#equivalence) -induced by the total order on `Value`s. Two `Set`s are compared by -sorting their elements ascending using the [total order](#total-order) -and comparing the resulting `Sequence`s. - -**Examples.** `#set{}`, the empty set; `#set{#set{}}`, the set -containing only the empty set; `{4 "hello" (void) 9.0f}`, the set -containing 4, the string `"hello"`, the record with label `void` and -no fields, and the `Float` denoting the number 9.0; `{1 1.0f}`, the -set containing a `SignedInteger` and a `Float`; `{mime(application/xml -#"") mime(application/xml #"")}`, a set containing two -different `mime` records.[^mime-xml-difference] - - [^mime-xml-difference]: The two XML documents `` and `` - differ by bytewise comparison, and thus yield different record - values, even though under the semantics of XML they denote - identical XML infoset. - -**Non-examples.** `{1 1}`, because it contains multiple equivalent -`Value`s; `{}`, because without the `#set` marker, it denotes the -empty dictionary. - -### Dictionaries. - -A `Dictionary` is an unordered finite collection of pairs of `Value`s. -Each pair comprises a *key* and a *value*. Keys in a `Dictionary` must -be pairwise distinct. Instances of `Dictionary` are compared by -lexicographic comparison of the sequences resulting from ordering each -`Dictionary`'s pairs in ascending order by key. - -**Examples.** `{}`, the empty dictionary; `{a: 1}`, the dictionary -mapping the `Symbol` `a` to the `SignedInteger` 1; `{[1 2 3]: a}`, -mapping `[1 2 3]` to `a`; `{"hi": 0, hi: 0, there: []}`, having a -`String` and two `Symbol` keys, and `SignedInteger` and `Sequence` -values. - -**Non-examples.** `{a:1 b:2 a:3}`, because it contains duplicate -keys; `{[7 8]:[] [7 8]:99}`, for the same reason. - -## Textual Syntax - -Now we have discussed `Value`s and their meanings, we may turn to -techniques for *representing* `Value`s for communication or storage. - -In this section, we use [case-sensitive ABNF][abnf] to define a -textual syntax that is easy for people to read and -write.[^json-superset] Most of the examples in this document are -written using this syntax. In the following section, we will define an -equivalent compact machine-readable syntax. - - [^json-superset]: The grammar of the textual syntax is a superset of - JSON, with the slightly unusual feature that `true`, `false`, and - `null` are all read as `Symbol`s, and that `SignedInteger`s are - never read as `Double`s. - -### Character set - -[ABNF][abnf] allows easy definition of US-ASCII-based languages. -However, Preserves is a Unicode-based language. Therefore, we -reinterpret ABNF as a grammar for recognising sequences of Unicode -code points. - -Textual syntax for a `Value` *SHOULD* be encoded using UTF-8 where -possible. - -### Whitespace - -Whitespace is defined as any number of spaces, tabs, carriage returns, -line feeds, comments, or commas. A comment is a semicolon followed by -the unicode code points up to and including the next carriage return -or line feed. - - ws = *(%x20 / %x09 / newline / comment / ",") - newline = CR / LF - comment = ";" *(WSP / nonnl) newline - nonnl = - -### Grammar - -Standalone documents containing textual representations of `Value`s may have trailing whitespace. - - Document = Value ws - -Any `Value` may be preceded by whitespace. - - Value = ws (Record / Collection / Atom / Compact) - Collection = Sequence / Dictionary / Set - Atom = Boolean / Float / Double / SignedInteger / - String / ByteString / Symbol - -Each `Record` is its label-`Value` followed by a parenthesised -grouping of its field-`Value`s. Whitespace is not permitted between -the label and the open-parenthesis. - - Record = Value "(" *Value ws ")" - -`Sequence`s are enclosed in square brackets. `Dictionary` values are -curly-brace-enclosed colon-separated pairs of values. `Set`s are -written either as a simple curly-brace-enclosed non-empty sequence of -values, or as a possibly-empty sequence of values enclosed by the -tokens `#set{` and `}`.[^printing-collections] - - Sequence = "[" *Value ws "]" - Dictionary = "{" *(Value ws ":" Value) ws "}" - Set = %s"#set{" *Value ws "}" / "{" 1*Value ws "}" - - [^printing-collections]: **Implementation note.** When implementing - printing of `Value`s using the textual syntax, consider supporting - (a) optional pretty-printing with indentation, (b) optional - JSON-compatible print mode for that subset of `Value` that is - compatible with JSON, and (c) optional submodes for no commas, - commas separating, and commas terminating elements or key/value - pairs within a collection. - -Any `Value` may be represented using the -[compact binary syntax](#compact-binary-syntax) by directly prefixing -the binary form of the `Value` with ASCII `SOH` (`%x01`), or by -enclosing a hexadecimal representation of the binary form of the -`Value` in the tokens `#hexvalue{` and `}`.[^rationale-switch-to-binary] - - Compact = %x01 / %s"#hexvalue{" *(ws / HEXDIG) ws "}" - - [^rationale-switch-to-binary]: **Rationale.** The textual syntax - cannot express every `Value`: specifically, it cannot express the - several million floating-point NaNs, or the two floating-point - Infinities. Since the compact binary format for `Value`s expresses - each `Value` with precision, embedding binary `Value`s solves the - problem. - -`Boolean`s are the simple literal strings `#true` and `#false`. - - Boolean = %s"#true" / %s"#false" - -Numeric data follow the -[JSON grammar](https://tools.ietf.org/html/rfc8259#section-6), with -the addition of a trailing "f" distinguishing `Float` from `Double` -values. `Float`s and `Double`s always have either a fractional part or -an exponent part, where `SignedInteger`s never have -either.[^reading-and-writing-floats-accurately] -[^arbitrary-precision-signedinteger] - - Float = flt %i"f" - Double = flt - SignedInteger = int - - digit1-9 = %x31-39 - nat = %x30 / ( digit1-9 *DIGIT ) - int = ["-"] nat - frac = "." 1*DIGIT - exp = %i"e" ["-"/"+"] 1*DIGIT - flt = int (frac exp / frac / exp) - - [^reading-and-writing-floats-accurately]: **Implementation note.** - Your language's standard library likely has a good routine for - converting between decimal notation and IEEE 754 floating-point. - However, if not, or if you are interested in the challenges of - accurately reading and writing floating point numbers, see the - excellent matched pair of 1990 papers by Clinger and Steele & - White, and a recent follow-up by Jaffer: - - Clinger, William D. ‘How to Read Floating Point Numbers - Accurately’. In Proc. PLDI. White Plains, New York, 1990. - . - - Steele, Guy L., Jr., and Jon L. White. ‘How to Print - Floating-Point Numbers Accurately’. In Proc. PLDI. White Plains, - New York, 1990. . - - Jaffer, Aubrey. ‘Easy Accurate Reading and Writing of - Floating-Point Numbers’. ArXiv:1310.8121 [Cs], 27 October 2013. - . - - [^arbitrary-precision-signedinteger]: **Implementation note.** Be - aware when implementing reading and writing of `SignedInteger`s - that the data model *requires* arbitrary-precision integers. Your - I/O routines must not truncate precision either when reading or - writing a `SignedInteger`. - -`String`s are, -[as in JSON](https://tools.ietf.org/html/rfc8259#section-7), possibly -escaped text surrounded by double quotes. The escaping rules are the -same as for JSON.[^string-json-correspondence] [^escaping-surrogate-pairs] - - String = %x22 *char %x22 - char = unescaped / %x7C / escape (escaped / %x22 / %s"u" 4HEXDIG) - unescaped = %x20-21 / %x23-5B / %x5D-7B / %x7D-10FFFF - escape = %x5C ; \ - escaped = ( %x5C / ; \ reverse solidus U+005C - %x2F / ; / solidus U+002F - %x62 / ; b backspace U+0008 - %x66 / ; f form feed U+000C - %x6E / ; n line feed U+000A - %x72 / ; r carriage return U+000D - %x74 ) ; t tab U+0009 - - [^string-json-correspondence]: The grammar for `String` has the same - effect as the - [JSON](https://tools.ietf.org/html/rfc8259#section-7) grammar for - `string`. Some auxiliary definitions (e.g. `escaped`) are lifted - largely unmodified from the text of RFC 8259. - - [^escaping-surrogate-pairs]: In particular, note JSON's rules around - the use of surrogate pairs for code points not in the Basic - Multilingual Plane. We encourage implementations to avoid escaping - such characters when producing output, and instead to rely on the - UTF-8 encoding of the entire document to handle them correctly. - -A `ByteString` may be written in any of three different forms. - -The first is similar to a `String`, but prepended with a hash sign -`#`. In addition, only Unicode code points overlapping with printable -7-bit ASCII are permitted unescaped inside such a `ByteString`; other -byte values must be escaped by prepending a two-digit hexadecimal -value with `\x`. - - ByteString = "#" %x22 *binchar %x22 - binchar = binunescaped / escape (escaped / %x22 / %s"x" 2HEXDIG) - binunescaped = %x20-21 / %x23-5B / %x5D-7E - -The second is as a sequence of pairs of hexadecimal digits interleaved -with whitespace and surrounded by `#hex{` and `}`. - - ByteString =/ %s"#hex{" *(ws / 2HEXDIG) ws "}" - -The third is as a sequence of -[Base64](https://tools.ietf.org/html/rfc4648) characters, interleaved -with whitespace and surrounded by `#base64{` and `}`. Plain and -URL-safe Base64 characters are allowed. - - ByteString =/ %s"#base64{" *(ws / base64char) ws "}" / - base64char = %x41-5A / %x61-7A / %x30-39 / "+" / "/" / "-" / "_" / "=" - -A `Symbol` may be written in a "bare" form[^cf-sexp-token] so long as -it conforms to certain restrictions on the characters appearing in the -symbol. Alternatively, it may be written in a quoted form. The quoted -form is much the same as the syntax for `String`s, including embedded -escape syntax, except using a bar or pipe character (`|`) instead of a -double quote mark. - - Symbol = symstart *symcont / "|" *symchar "|" - symstart = ALPHA / sympunct / symunicode - symcont = ALPHA / sympunct / symunicode / DIGIT / "-" / "." - sympunct = "~" / "!" / "@" / "$" / "%" / "^" / "&" / "*" / - "?" / "_" / "=" / "+" / "<" / ">" / "/" - symchar = unescaped / %x22 / escape (escaped / %x7C / %s"u" 4HEXDIG) - symunicode = - - [^cf-sexp-token]: Compare with the [SPKI S-expression][sexp.txt] - definition of "token representation", and with the - [R6RS definition of identifiers](http://www.r6rs.org/final/html/r6rs/r6rs-Z-H-7.html#node_sec_4.2.4). - -## Compact Binary Syntax - -A `Repr` is an encoding, or representation, of a specific `Value`. -Each `Repr` comprises one or more bytes describing first the kind of -represented `Value` and the length of the representation, and then the -encoded details of the `Value` itself. - -For a value `v`, we write `[[v]]` for the `Repr` of v. - -### Type and Length representation - -Each `Repr` takes one of three possible forms: - - - (A) a fixed-length form, used for simple values such as `Boolean`s - or `Float`s. - - - (B) a variable-length form with length specified up-front, used for - almost all `Record`s as well as for most `Sequence`s and `String`s, - when their sizes are known at the time serialization begins. - - - (C) a variable-length streaming form with unknown or unpredictable - length, used only seldom for `Record`s, since the number of fields - in a `Record` is usually statically known, but sometimes used for - `Sequence`s, `String`s etc., such as in cases when serialization - begins before the number of elements or bytes in the corresponding - `Value` is known. - -Applications may choose between formats B and C depending on their -needs at serialization time. - -#### The lead byte - -Every `Repr` starts with a *lead byte*, constructed by -`leadbyte(t,n,m)`, where `t`,`n`∈{0,1,2,3} and 0≤`m`<16: - - leadbyte(t,n,m) = [t*64 + n*16 + m] - -The arguments `t` and `n` describe the rest of the -representation:[^some-encodings-unused] - - [^some-encodings-unused]: Some encodings are unused. All such - encodings are reserved for future versions of this specification. - - - `t`=0, `n`=0 (format A) represents an `Atom` with fixed-length binary representation. - - `t`=0, `n`=1 (format A) represents certain small `SignedInteger`s. - - `t`=0, `n`=2 (format C) is a Stream Start byte. - - `t`=0, `n`=3 (format C) is a Stream End byte. - - `t`=1 (format B) represents an `Atom` with variable-length binary representation. - - `t`=2 (format B) represents a `Record`. - - `t`=3 (format B) represents a `Sequence`, `Set` or `Dictionary`. - -#### Encoding data of fixed length (format A) - -Each specific type of data defines its own rules for this format. - -#### Encoding data of known length (format B) - -A `Repr` where the length of the `Value` to be encoded is variable but -known uses the value of `m` in `leadbyte` to encode its length. The -length counts *bytes* for atomic `Value`s, but counts *contained -values* for compound `Value`s. - - - A length `l` between 0 and 14 is represented using `leadbyte` with - `m=l`. - - A length of 15 or greater is represented by `m=15` and additional - bytes describing the length following the lead byte. - -The function `header(t,n,m)` yields an appropriate sequence of bytes -describing a `Repr`'s type and length when `t`, `n` and `m` are -appropriate non-negative integers: - - header(t,n,m) = leadbyte(t,n,m) when m < 15 - or leadbyte(t,n,15) ++ varint(m) otherwise - -The additional length bytes are formatted as -[base 128 varints][varint]. We write `varint(m)` for the -varint-encoding of `m`. Quoting the [Google Protocol Buffers][varint] -definition, - -> Each byte in a varint, except the last byte, has the most -> significant bit (msb) set – this indicates that there are further -> bytes to come. The lower 7 bits of each byte are used to store the -> two's complement representation of the number in groups of 7 bits, -> least significant group first. - -**Examples.** - - - The varint representation of 15 is just the byte 15. - - 300 (binary, grouped into 7-bit chunks, `10 0101100`) varint-encodes to the two bytes 172 and 2. - - 1000000000 (binary `11 1011100 1101011 0010100 0000000`) varint-encodes to bytes 128, 148, 235, 220, and 3. - -#### Streaming data of unknown length (format C) - -A `Repr` where the length of the `Value` to be encoded is variable and -not known at the time serialization of the `Value` starts is encoded -by a single Stream Start (“open”) byte, followed by zero or more -*chunks*, followed by a matching Stream End (“close”) byte: - - open(t,n) = leadbyte(0,2, t*4 + n) - close(t,n) = leadbyte(0,3, t*4 + n) - -For a `Repr` of a `Value` containing binary data, each chunk is to be -a format B `Repr` of a `ByteString`, no matter the type of the overall -`Repr`. - -For a `Repr` of a `Value` containing other `Value`s, each chunk is to -be a single `Repr`. - -### Records - -Format B (known length): - - [[ L(F_1...F_m) ]] = header(2,3,m+1) ++ [[L]] ++ [[F_1]] ++...++ [[F_m]] - -For `m` fields, `m+1` is supplied to `header`, to account for the -encoding of the record label. - -Format C (streaming): - - [[ L(F_1...F_m) ]] = open(2,3) ++ [[L]] ++ [[F_1]] ++...++ [[F_m]] ++ close(2,3) - -Applications *SHOULD* prefer the known-length format for encoding -`Record`s. - -#### Application-specific short form for labels - -Any given protocol using Preserves may additionally define an -interpretation for `n`∈{0,1,2}, mapping each *short form label -number* `n` to a specific record label. When encoding `m` fields with -short form label number `n`, format B becomes - - header(2,n,m) ++ [[F_1]] ++...++ [[F_m]] - -and format C becomes - - open(2,n) ++ [[F_1]] ++...++ [[F_m]] ++ close(2,n) - -**Examples.** For example, a protocol may choose to map records -labelled `void` to `n=0`, making - - [[void()]] = header(2,0,0) = [0x80] - -or it may map records labelled `person` to short form label number 1, -making - - [[person("Dr", "Elizabeth", "Blackwell")]] - = header(2,1,3) ++ [["Dr"]] ++ [["Elizabeth"]] ++ [["Blackwell"]] - = [0x93] ++ [["Dr"]] ++ [["Elizabeth"]] ++ [["Blackwell"]] - -for format B, or - - = open(2,1) ++ [["Dr"]] ++ [["Elizabeth"]] ++ [["Blackwell"]] ++ close(2,1) - = [0x29] ++ [["Dr"]] ++ [["Elizabeth"]] ++ [["Blackwell"]] ++ [0x39] - -for format C. - -### Sequences, Sets and Dictionaries - -Format B (known length): - - [[ [X_1...X_m] ]] = header(3,0,m) ++ [[X_1]] ++...++ [[X_m]] - [[ #set{X_1...X_m} ]] = header(3,1,m) ++ [[X_1]] ++...++ [[X_m]] - [[ {K_1:V_1...K_m:V_m} ]] = header(3,2,m*2) ++ [[K_1]] ++ [[V_1]] ++... - ++ [[K_m]] ++ [[V_m]] - -Note that `m*2` is given to `header` for a `Dictionary`, since there -are two `Value`s in each key-value pair. - -Format C (streaming): - - [[ [X_1...X_m] ]] = open(3,0) ++ [[X_1]] ++...++ [[X_m]] ++ close(3,0) - [[ #set{X_1...X_m} ]] = open(3,1) ++ [[X_1]] ++...++ [[X_m]] ++ close(3,1) - [[ {K_1:V_1...K_m:V_m} ]] = open(3,2) ++ [[K_1]] ++ [[V_1]] ++... - ++ [[K_m]] ++ [[V_m]] ++ close(3,2) - -Applications may use whichever format suits their needs on a -case-by-case basis. - -There is *no* ordering requirement on the `X_i` elements or -`K_i`/`V_i` pairs.[^no-sorting-rationale] They may appear in any -order. - - [^no-sorting-rationale]: In the BitTorrent encoding format, - [bencoding](http://www.bittorrent.org/beps/bep_0003.html#bencoding), - dictionary key/value pairs must be sorted by key. This is a - necessary step for ensuring serialization of `Value`s is - canonical. We do not require that key/value pairs (or set - elements) be in sorted order for serialized `Value`s, because (a) - where canonicalization is used for cryptographic signatures, it is - more reliable to simply retain the exact binary form of the signed - document than to depend on canonical de- and re-serialization, and - (b) sorting keys or elements makes no sense in streaming - serialization formats. - - However, a quality implementation may wish to offer the programmer - the option of serializing with set elements and dictionary keys in - sorted order. - -Note that `header(3,3,m)` and `open(3,3)`/`close(3,3)` are unused and reserved. - -### SignedIntegers - -Format B/A (known length/fixed-size): - - [[ x ]] when x ∈ SignedInteger = header(1,0,m) ++ intbytes(x) if x<-3 ∨ 13≤x - header(0,1,x+16) if -3≤x<0 - header(0,1,x) if 0≤x<13 - -Integers in the range [-3,12] are compactly represented using format A -because they are so frequently used. Other integers are represented -using format B. - -Format C *MUST NOT* be used for `SignedInteger`s. - -The function `intbytes(x)` gives the big-endian two's-complement -binary representation of `x`, taking exactly as many whole bytes as -needed to unambiguously identify the value and its sign, and `m = -|intbytes(x)|`. The most-significant bit in the first byte in -`intbytes(x)` is the sign bit.[^zero-intbytes] - - [^zero-intbytes]: The value 0 needs zero bytes to identify the - value, so `intbytes(0)` is the empty byte string. Non-zero values - need at least one byte. - -For example, - - [[ -257 ]] = 42 FE FF [[ -3 ]] = 1D [[ 128 ]] = 42 00 80 - [[ -256 ]] = 42 FF 00 [[ -2 ]] = 1E [[ 255 ]] = 42 00 FF - [[ -255 ]] = 42 FF 01 [[ -1 ]] = 1F [[ 256 ]] = 42 01 00 - [[ -254 ]] = 42 FF 02 [[ 0 ]] = 10 [[ 32767 ]] = 42 7F FF - [[ -129 ]] = 42 FF 7F [[ 1 ]] = 11 [[ 32768 ]] = 43 00 80 00 - [[ -128 ]] = 41 80 [[ 12 ]] = 1C [[ 65535 ]] = 43 00 FF FF - [[ -127 ]] = 41 81 [[ 13 ]] = 41 0D [[ 65536 ]] = 43 01 00 00 - [[ -4 ]] = 41 FC [[ 127 ]] = 41 7F [[ 131072 ]] = 43 02 00 00 - -### Strings, ByteStrings and Symbols - -Syntax for these three types varies only in the value of `n` supplied -to `header`, `open`, and `close`. In each case, the payload following -the header is a binary sequence; for `String` and `Symbol`, it is a -UTF-8 encoding of the `Value`'s code points, while for `ByteString` it -is the raw data contained within the `Value` unmodified. - -Format B (known length): - - [[ S ]] = header(1,n,m) ++ encode(S) - where m = |encode(S)| - and (n,encode(S)) = (1,utf8(S)) if S ∈ String - (2,S) if S ∈ ByteString - (3,utf8(S)) if S ∈ Symbol - -To stream a `String`, `ByteString` or `Symbol`, emit `open(1,n)` and -then a sequence of zero or more format B chunks, followed by -`close(1,n)`. Every chunk must be a `ByteString`. - -While the overall content of a streamed `String` or `Symbol` must be -valid UTF-8, individual chunks do not have to conform to UTF-8. - -### Fixed-length Atoms - -Fixed-length atoms all use format A, and do not have a length -representation. They repurpose the bits that format B `Repr`s use to -specify lengths. Applications *MUST NOT* use format C with -`open(0,n)` or `close(0,n)` for any `n`. - -#### Booleans - - [[ #false ]] = header(0,0,0) = [0x00] - [[ #true ]] = header(0,0,1) = [0x01] - -#### Floats and Doubles - - [[ F ]] when F ∈ Float = header(0,0,2) ++ binary32(F) - [[ D ]] when D ∈ Double = header(0,0,3) ++ binary64(D) - -The functions `binary32(F)` and `binary64(D)` yield big-endian 4- and -8-byte IEEE 754 binary representations of `F` and `D`, respectively. - -## Examples - -### Simple examples - - - - -For the following examples, imagine an application that maps `Record` -short form label number 0 to label `discard`, 1 to `capture`, and 2 to -`observe`. - -| Value | Encoded hexadecimal byte sequence | -|---------------------------------------------------|----------------------------------------------------------------------| -| `capture(discard())` | 91 80 | -| `observe(speak(discard(), capture(discard())))` | A1 B3 75 73 70 65 61 6B 80 91 80 | -| `[1 2 3 4]` (format B) | C4 11 12 13 14 | -| `[1 2 3 4]` (format C) | 2C 11 12 13 14 3C | -| `[-2 -1 0 1]` | C4 1E 1F 10 11 | -| `"hello"` (format B) | 55 68 65 6C 6C 6F | -| `"hello"` (format C, 2 chunks) | 25 62 68 65 63 6C 6C 6F 35 | -| `"hello"` (format C, 5 chunks) | 25 62 68 65 62 6C 6C 60 60 61 6F 35 | -| `["hello" there #"world" [] #set{} #true #false]` | C7 55 68 65 6C 6C 6F 75 74 68 65 72 65 65 77 6F 72 6C 64 C0 D0 01 00 | -| `-257` | 42 FE FF | -| `-1` | 1F | -| `0` | 10 | -| `1` | 11 | -| `255` | 42 00 FF | -| `1.0f` | 02 3F 80 00 00 | -| `1.0` | 03 3F F0 00 00 00 00 00 00 | -| `-1.202e300` | 03 FE 3C B7 B7 59 BF 04 26 | - -The next example uses a non-`Symbol` label for a record.[^extensibility2] The `Record` - - [titled person 2 thing 1](101, "Blackwell", date(1821 2 3), "Dr") - -encodes to - - B5 ;; Record, generic, 4+1 - C5 ;; Sequence, 5 - 76 74 69 74 6C 65 64 ;; Symbol, "titled" - 76 70 65 72 73 6F 6E ;; Symbol, "person" - 12 ;; SignedInteger, "2" - 75 74 68 69 6E 67 ;; Symbol, "thing" - 11 ;; SignedInteger, "1" - 41 65 ;; SignedInteger, "101" - 59 42 6C 61 63 6B 77 65 6C 6C ;; String, "Blackwell" - B4 ;; Record, generic, 3+1 - 74 64 61 74 65 ;; Symbol, "date" - 42 07 1D ;; SignedInteger, "1821" - 12 ;; SignedInteger, "2" - 13 ;; SignedInteger, "3" - 52 44 72 ;; String, "Dr" - - [^extensibility2]: It happens to line up with Racket's - representation of a record label for an inheritance hierarchy - where `titled` extends `person` extends `thing`: - - (struct date (year month day) #:prefab) - (struct thing (id) #:prefab) - (struct person thing (name date-of-birth) #:prefab) - (struct titled person (title) #:prefab) - - For more detail on Racket's representations of record labels, see - [the Racket documentation for `make-prefab-struct`](http://docs.racket-lang.org/reference/structutils.html#%28def._%28%28quote._~23~25kernel%29._make-prefab-struct%29%29). - ---- - -### JSON examples - -The examples from -[RFC 8259](https://tools.ietf.org/html/rfc8259#section-13) read as -valid Preserves, though the JSON literals `true`, `false` and `null` -read as `Symbol`s. The first example: - - { - "Image": { - "Width": 800, - "Height": 600, - "Title": "View from 15th Floor", - "Thumbnail": { - "Url": "http://www.example.com/image/481989943", - "Height": 125, - "Width": 100 - }, - "Animated" : false, - "IDs": [116, 943, 234, 38793] - } - } - -encodes to binary as follows: - - E2 - 55 "Image" - EC - 55 "Width" 42 03 20 - 55 "Title" 5F 14 "View from 15th Floor" - 58 "Animated" 75 "false" - 56 "Height" 42 02 58 - 59 "Thumbnail" - E6 - 55 "Width" 41 64 - 53 "Url" 5F 26 "http://www.example.com/image/481989943" - 56 "Height" 41 7D - 53 "IDs" C4 - 41 74 - 42 03 AF - 42 00 EA - 43 00 97 89 - -and the second example: - - [ - { - "precision": "zip", - "Latitude": 37.7668, - "Longitude": -122.3959, - "Address": "", - "City": "SAN FRANCISCO", - "State": "CA", - "Zip": "94107", - "Country": "US" - }, - { - "precision": "zip", - "Latitude": 37.371991, - "Longitude": -122.026020, - "Address": "", - "City": "SUNNYVALE", - "State": "CA", - "Zip": "94085", - "Country": "US" - } - ] - -encodes to binary as follows: - - C2 - EF 10 - 59 "precision" 53 "zip" - 58 "Latitude" 03 40 42 E2 26 80 9D 49 52 - 59 "Longitude" 03 C0 5E 99 56 6C F4 1F 21 - 57 "Address" 50 - 54 "City" 5D "SAN FRANCISCO" - 55 "State" 52 "CA" - 53 "Zip" 55 "94107" - 57 "Country" 52 "US" - EF 10 - 59 "precision" 53 "zip" - 58 "Latitude" 03 40 42 AF 9D 66 AD B4 03 - 59 "Longitude" 03 C0 5E 81 AA 4F CA 42 AF - 57 "Address" 50 - 54 "City" 59 "SUNNYVALE" - 55 "State" 52 "CA" - 53 "Zip" 55 "94085" - 57 "Country" 52 "US" - -## Conventions for Common Data Types - -The `Value` data type is essentially an S-Expression, able to -represent semi-structured data over `ByteString`, `String`, -`SignedInteger` atoms and so on.[^why-not-spki-sexps] - - [^why-not-spki-sexps]: Rivest's S-Expressions are in many ways - similar to Preserves. However, while they include binary data and - sequences, and an obvious equivalence for them exists, they lack - numbers *per se* as well as any kind of unordered structure such - as sets or maps. In addition, while "display hints" allow - labelling of binary data with an intended interpretation, they - cannot be attached to any other kind of structure, and the "hint" - itself can only be a binary blob. - -However, users need a wide variety of data types for representing -domain-specific values such as various kinds of encoded and normalized -text, calendrical values, machine words, and so on. - -Appropriately-labelled `Record`s denote these domain-specific data -types.[^why-dictionaries] - - [^why-dictionaries]: Given `Record`'s existence, it may seem odd - that `Dictionary`, `Set`, `Float`, etc. are given special - treatment. Preserves aims to offer a useful basic equivalence - predicate to programmers, and so if a data type demands a special - equivalence predicate, as `Dictionary`, `Set` and `Float` all do, - then the type should be included in the base language. Otherwise, - it can be represented as a `Record` and treated separately. Both - `Boolean` and `String` are seeming exceptions: they merit - inclusion because of their cultural importance. - -All of these conventions are optional. They form a layer atop the core -`Value` structure. Non-domain-specific tools do not in general need to -treat them specially. - -**Validity.** Many of the labels we will describe in this section come - with side-conditions on the contents of labelled `Record`s. It is - possible to construct an instance of `Value` that violates these - side-conditions without ceasing to be a `Value` or becoming - unrepresentable. However, we say that such a `Value` is *invalid* - because it fails to honour the necessary side-conditions. - Implementations *SHOULD* allow two modes of working: one which - treats all `Value`s identically, without regard for side-conditions, - and one which enforces validity (i.e. side-conditions) when reading, - writing, or constructing `Value`s. - -### MIME-type tagged binary data - -Many internet protocols use -[media types](https://tools.ietf.org/html/rfc6838) (a.k.a MIME types) -to indicate the format of some associated binary data. For this -purpose, we define `MIMEData` to be a record labelled `mime` with two -fields, the first being a `Symbol`, the media type, and the second -being a `ByteString`, the binary data. - -While each media type may define its own rules for comparing -documents, we define ordering among `MIMEData` *representations* of -such media types following the general rules for ordering of -`Record`s. - -**Examples.** - -| Value | Encoded hexadecimal byte sequence | -|--------------------------------------------|-------------------------------------------------------------------------------------------------------------------| -| `mime(application/octet-stream #"abcde")` | B3 74 6D 69 6D 65 7F 18 61 70 70 6C 69 63 61 74 69 6F 6E 2F 6F 63 74 65 74 2D 73 74 72 65 61 6D 65 61 62 63 64 65 | -| `mime(text/plain #"ABC")` | B3 74 6D 69 6D 65 7A 74 65 78 74 2F 70 6C 61 69 6E 63 41 42 43 | -| `mime(application/xml #"")` | B3 74 6D 69 6D 65 7F 0F 61 70 70 6C 69 63 61 74 69 6F 6E 2F 78 6D 6C 68 3C 78 68 74 6D 6C 2F 3E | -| `mime(text/csv #"123,234,345")` | B3 74 6D 69 6D 65 78 74 65 78 74 2F 63 73 76 6B 31 32 33 2C 32 33 34 2C 33 34 35 | - -Applications making heavy use of `mime` records may choose to use a -short form label number for the record type. For example, if short -form label number 1 were chosen, the second example above, -`mime(text/plain "ABC")`, would be encoded with "92" in place of "B3 -74 6D 69 6D 65". - -### Unicode normalization forms - -Unicode defines multiple -[normalization forms](http://unicode.org/reports/tr15/) for text. -While no particular normalization form is required for `String`s, -users may need to unambiguously signal or require a particular -normalization form. A `NormalizedString` is a `Record` labelled with -`unicode-normalization` and having two fields, the first of which is a -`Symbol` specifying the normalization form used (e.g. `nfc`, `nfd`, -`nfkc`, `nfkd`), and the second of which is a `String` whose -underlying code point representation *MUST* be normalized according to -the named normalization form. - -### IRIs (URIs, URLs, URNs, etc.) - -An `IRI` is a `Record` labelled with `iri` and having one field, a -`String` which is the IRI itself and which *MUST* be a valid absolute -or relative IRI. - -### Machine words - -The definition of `SignedInteger` captures all integers. However, in -certain circumstances it can be valuable to assert that a number -inhabits a particular range, such as a fixed-width machine word. - -A family of labels `i`*n* and `u`*n* for *n* ∈ {8,16,32,64} denote -*n*-bit-wide signed and unsigned range restrictions, respectively. -Records with these labels *MUST* have one field, a `SignedInteger`, -which *MUST* fall within the appropriate range. That is, to be valid, - - in `i8(`*x*`)`, -128 <= *x* <= 127. - - in `u8(`*x*`)`, 0 <= *x* <= 255. - - in `i16(`*x*`)`, -32768 <= *x* <= 32767. - - etc. - -### Anonymous Tuples and Unit - -A `Tuple` is a `Record` with label `tuple` and zero or more fields, -denoting an anonymous tuple of values. - -The 0-ary tuple, `tuple()`, denotes the empty tuple, sometimes called -"unit" or "void" (but *not* e.g. JavaScript's "undefined" value). - -### Null and Undefined - -Tony Hoare's -"[billion-dollar mistake](https://en.wikipedia.org/wiki/Tony_Hoare#Apologies_and_retractions)" -can be represented with the 0-ary `Record` `null()`. An "undefined" -value can be represented as `undefined()`. - -### Dates and Times - -Dates, times, moments, and timestamps can be represented with a -`Record` with label `rfc3339` having a single field, a `String`, which -*MUST* conform to one of the `full-date`, `partial-time`, `full-time`, -or `date-time` productions of -[section 5.6 of RFC 3339](https://tools.ietf.org/html/rfc3339#section-5.6). - -## Security Considerations - -**Empty chunks.** Streamed (format C) `String`s, `ByteString`s and -`Symbol`s may include chunks of zero length. This opens up a -possibility for denial-of-service: an attacker may begin streaming a -string, sending an endless sequence of zero length chunks, appearing -to make progress but not actually doing so. Implementations may place -optional reasonable restrictions on the number of consecutive empty -chunks that may appear in a stream, and may even supply an optional -mode that rejects empty chunks entirely. - -**Whitespace.** Similarly, the textual format for `Value`s allows -arbitrary whitespace in many positions. In streaming transfer -situations, consider optional restrictions on the amount of -consecutive whitespace and comments that may appear in a serialized -`Value`. - -**Canonical form for cryptographic hashing and signing.** As -specified, neither the textual nor the compact binary encoding rules -for `Value`s force canonical serializations. Two serializations of the -same `Value` may yield different binary `Repr`s. - -## Appendix. Table of lead byte values - - 00 - False - 01 - True - 02 - Float - 03 - Double - (0x) RESERVED 04-0F - 1x - Small integers 0..12,-3..-1 - 2x - Start Stream - 3x - End Stream - - 4x - SignedInteger - 5x - String - 6x - ByteString - 7x - Symbol - - 8x - short form Record label index 0 - 9x - short form Record label index 1 - Ax - short form Record label index 2 - Bx - Record - - Cx - Sequence - Dx - Set - Ex - Dictionary - (Fx) RESERVED F0-FF - -## Appendix. Bit fields within lead byte values - - tt nn mmmm contents - ---------- --------- - - 00 00 0000 False - 00 00 0001 True - 00 00 0010 Float, 32 bits big-endian binary - 00 00 0011 Double, 64 bits big-endian binary - - 00 01 xxxx Small integers 0..12,-3..-1 - - 00 10 ttnn Start Stream - When tt = 00 --> error - 01 --> each chunk is a ByteString - 1x --> each chunk is a single encoded Value - 00 11 ttnn End Stream (must match preceding Start Stream) - - 01 00 mmmm SignedInteger, big-endian binary - 01 01 mmmm String, UTF-8 binary - 01 10 mmmm ByteString - 01 11 mmmm Symbol, UTF-8 binary - - 10 00 mmmm application-specific Record - 10 01 mmmm application-specific Record - 10 10 mmmm application-specific Record - 10 11 mmmm Record - - 11 00 mmmm Sequence - 11 01 mmmm Set - 11 10 mmmm Dictionary - - If mmmm = 1111, a varint(m) follows, giving the length, before - the body; otherwise, m is the length of the body to follow. - -## Appendix. Representing Values in Programming Languages - -We have given a definition of `Value` and its semantics, and proposed -a concrete syntax for communicating and storing `Value`s. We now turn -to **suggested** representations of `Value`s as *programming-language -values* for various programming languages. - -When designing a language mapping, an important consideration is -roundtripping: serialization after deserialization, and vice versa, -should both be identities. - -### JavaScript - - - `Boolean` ↔ `Boolean` - - `Float` and `Double` ↔ numbers - - `SignedInteger` ↔ numbers or `BigInt` (see [here](https://developers.google.com/web/updates/2018/05/bigint) and [here](https://github.com/tc39/proposal-bigint)) - - `String` ↔ strings - - `ByteString` ↔ `Uint8Array` - - `Symbol` ↔ `Symbol.for(...)` - - `Record` ↔ `{ "_label": theLabel, "_fields": [field0, ..., fieldN] }`, plus convenience accessors - - `(undefined)` ↔ the undefined value - - `(rfc3339 F)` ↔ `Date`, if `F` matches the `date-time` RFC 3339 production - - `Sequence` ↔ `Array` - - `Set` ↔ `{ "_set": M }` where `M` is a `Map` from the elements of the set to `true` - - `Dictionary` ↔ a `Map` - -### Scheme/Racket - - - `Boolean` ↔ booleans - - `Float` and `Double` ↔ inexact numbers (Racket: single- and double-precision floats) - - `SignedInteger` ↔ exact numbers - - `String` ↔ strings - - `ByteString` ↔ byte vector (Racket: "Bytes") - - `Symbol` ↔ symbols - - `Record` ↔ structures (Racket: prefab struct) - - `Sequence` ↔ lists - - `Set` ↔ Racket: sets - - `Dictionary` ↔ Racket: hash-table - -### Java - - - `Boolean` ↔ `Boolean` - - `Float` and `Double` ↔ `Float` and `Double` - - `SignedInteger` ↔ `Integer`, `Long`, `BigInteger` - - `String` ↔ `String` - - `ByteString` ↔ `byte[]` - - `Symbol` ↔ a simple data class wrapping a `String` - - `Record` ↔ in a simple implementation, a generic `Record` class; else perhaps a bean mapping? - - `(mime T B)` ↔ an implementation of `javax.activation.DataSource`? - - `Sequence` ↔ an implementation of `java.util.List` - - `Set` ↔ an implementation of `java.util.Set` - - `Dictionary` ↔ an implementation of `java.util.Map` - -### Erlang - - - `Boolean` ↔ `true` and `false` - - `Float` and `Double` ↔ floats (unsure how Erlang deals with single-precision) - - `SignedInteger` ↔ integers - - `String` ↔ pair of `utf8` and a binary - - `ByteString` ↔ a binary - - `Symbol` ↔ pair of `atom` and a binary - - `Record` ↔ triple of `obj`, label, and field list - - `Sequence` ↔ a list - - `Set` ↔ a `sets` set - - `Dictionary` ↔ a [map][erlang-map] (new in Erlang/OTP R17) - -This is a somewhat unsatisfactory mapping because: (a) Erlang doesn't -garbage-collect its atoms, meaning that (a.1) representing `Symbol`s -as atoms could lead to denial-of-service and (a.2) representing -`Symbol`-labelled `Record`s as Erlang records must be rejected for the -same reason; (b) even if it did, Erlang's boolean values are atoms, -which would then clash with the `Symbol`s `true` and `false`; and (c) -Erlang has no distinct string type, making for a trilemma where -`String`s are in danger of clashing with `ByteString`s, `Sequence`s, -or `Record`s. - -### Python - - - `Boolean` ↔ `True` and `False` - - `Float` ↔ a `Float` wrapper-class for a double-precision value - - `Double` ↔ float - - `SignedInteger` ↔ int and long - - `String` ↔ `unicode` - - `ByteString` ↔ `bytes` - - `Symbol` ↔ a simple data class wrapping a `unicode` - - `Record` ↔ something like `namedtuple`, but that doesn't care about class identity? - - `Sequence` ↔ `tuple` (but accept `list` during encoding) - - `Set` ↔ `frozenset` (but accept `set` during encoding) - - `Dictionary` ↔ a hashable (immutable) dictionary-like thing (but accept `dict` during encoding) - -### Squeak Smalltalk - - - `Boolean` ↔ `true` and `false` - - `Float` ↔ perhaps a subclass of `Float`? - - `Double` ↔ `Float` - - `SignedInteger` ↔ `Integer` - - `String` ↔ `WideString` - - `ByteString` ↔ `ByteArray` - - `Symbol` ↔ `WideSymbol` - - `Record` ↔ a simple data class - - `Sequence` ↔ `ArrayedCollection` (usually `OrderedCollection`) - - `Set` ↔ `Set` - - `Dictionary` ↔ `Dictionary` - -## Appendix. Why not Just Use JSON? - - - -JSON offers *syntax* for numbers, strings, booleans, null, arrays and -string-keyed maps. However, it suffers from two major problems. First, -it offers no *semantics* for the syntax: it is left to each -implementation to determine how to treat each JSON term. This causes -[interoperability](http://seriot.ch/parsing_json.php) and even -[security](http://web.archive.org/web/20180906202559/http://docs.couchdb.org/en/stable/cve/2017-12635.html) -issues. Second, JSON's lack of support for type tags leads to awkward -and incompatible *encodings* of type information in terms of the fixed -suite of constructors on offer. - -There are other minor problems with JSON having to do with its syntax. -Examples include its relative verbosity and its lack of support for -binary data. - -### JSON syntax doesn't *mean* anything - -When are two JSON values the same? When are they different? - - -The specifications are largely silent on these questions. Different -JSON implementations give different answers. - -Specifically, JSON does not: - - - assign any meaning to numbers,[^meaning-ieee-double] - - determine how strings are to be compared,[^string-key-comparison] - - determine whether object key ordering is significant,[^json-member-ordering] or - - determine whether duplicate object keys are permitted, what it - would mean if they were, or how to determine a duplicate in the - first place.[^json-key-uniqueness] - -In short, JSON syntax doesn't *denote* anything.[^xml-infoset] [^other-formats] - - [^meaning-ieee-double]: - [Section 6 of RFC 8259](https://tools.ietf.org/html/rfc8259#section-6) - does go so far as to indicate “good interoperability can be - achieved” by imagining that parsers are able reliably to - understand the syntax of numbers as denoting an IEEE 754 - double-precision floating-point value. - - [^string-key-comparison]: - [Section 8.3 of RFC 8259](https://tools.ietf.org/html/rfc8259#section-8.3) - suggests that *if* an implementation compares strings used as - object keys “code unit by code unit”, then it will interoperate - with *other such implementations*, but neither requires this - behaviour nor discusses comparisons of strings used in other - contexts. - - [^json-member-ordering]: - [Section 4 of RFC 8259](https://tools.ietf.org/html/rfc8259#section-4) - remarks that “[implementations] differ as to whether or not they - make the ordering of object members visible to calling software.” - - [^json-key-uniqueness]: - [Section 4 of RFC 8259](https://tools.ietf.org/html/rfc8259#section-4) - is the only place in the specification that mentions the issue. It - explicitly sanctions implementations supporting duplicate keys, - noting only that “when the names within an object are not unique, - the behavior of software that receives such an object is - unpredictable.” Implementations are free to choose any behaviour - at all in this situation, including signalling an error, or - discarding all but one of a set of duplicates. - - [^xml-infoset]: The XML world has the concept of - [XML infoset](https://www.w3.org/TR/xml-infoset/). Loosely - speaking, XML infoset is the *denotation* of an XML document; the - *meaning* of the document. - - [^other-formats]: Most other recent data languages are like JSON in - specifying only a syntax with no associated semantics. While some - do make a sketch of a semantics, the result is often - underspecified (e.g. in terms of how strings are to be compared), - overly machine-oriented (e.g. treating 32-bit integers as - fundamentally distinct from 64-bit integers and from - floating-point numbers), overly fine (e.g. giving visibility to - the order in which map entries are written), or all three. - -Some examples: - - - are the JSON values `1`, `1.0`, and `1e0` the same or different? - - are the JSON values `1.0` and `1.0000000000000001` the same or different? - - are the JSON strings `"päron"` (UTF-8 `70c3a4726f6e`) and `"päron"` - (UTF-8 `7061cc88726f6e`) the same or different? - - are the JSON objects `{"a":1, "b":2}` and `{"b":2, "a":1}` the same - or different? - - which, if any, of `{"a":1, "a":2}`, `{"a":1}` and `{"a":2}` are the - same? Are all three legal? - - are `{"päron":1}` and `{"päron":1}` the same or different? - -### JSON can multiply nicely, but it can't add very well - -JSON includes a fixed set of types: numbers, strings, booleans, null, -arrays and string-keyed maps. Domain-specific data must be *encoded* -into these types. For example, dates and email addresses are often -represented as strings with an implicit internal structure. - -There is no convention for *labelling* a value as belonging to a -particular category. This makes it difficult to extract, say, all -email addresses, or all URLs, from an arbitrary JSON document. - -Instead, JSON-encoded data are often labelled in an ad-hoc way. -Multiple incompatible approaches exist. For example, a "money" -structure containing a `currency` field and an `amount` may be -represented in any number of ways: - - { "_type": "money", "currency": "EUR", "amount": 10 } - { "type": "money", "value": { "currency": "EUR", "amount": 10 } } - [ "money", { "currency": "EUR", "amount": 10 } ] - { "@money": { "currency": "EUR", "amount": 10 } } - -This causes particular problems when JSON is used to represent *sum* -or *union* types, such as "either a value or an error, but not both". -Again, multiple incompatible approaches exist. - -For example, imagine an API for depositing money in an account. The -response might be either a "success" response indicating the new -balance, or one of a set of possible errors. - -Sometimes, a *pair* of values is used, with `null` marking the option -not taken.[^interesting-failure-mode] - - { "ok": { "balance": 210 }, "error": null } - { "ok": null, "error": "Unauthorized" } - - [^interesting-failure-mode]: What is the meaning of a document where - both `ok` and `error` are non-null? What might happen when a - program is presented with such a document? - -The branch not chosen is sometimes present, sometimes omitted as if it -were an optional field: - - { "ok": { "balance": 210 } } - { "error": "Unauthorized" } - -Sometimes, an array of a label and a value is used: - - [ "ok", { "balance": 210 } ] - [ "error", "Unauthorized" ] - -Sometimes, the shape of the data is sufficient to distinguish among -the alternatives, and the label is left implicit: - - { "balance": 210 } - "Unauthorized" - -JSON itself does not offer any guidance for which of these options to -choose. In many real cases on the web, poor choices have led to -encodings that are irrecoverably ambiguous. - -# Open questions - -Q. Should "symbols" instead be URIs? Relative, usually; relative to -what? Some domain-specific base URI? - -Q. Literal small integers: are they pulling their weight? They're not -absolutely necessary. They mess up the connection between -value-ordering and repr-ordering! - -Q. Should we go for trying to make the data ordering line up with the -encoding ordering? We'd have to only use streaming forms, and avoid -the small integer encoding, and not store record arities, and sort -sets and dictionaries, and mask floats and doubles (perhaps -[like this](https://stackoverflow.com/questions/43299299/sorting-floating-point-values-using-their-byte-representation)), -and pick a specific `NaN`, and I don't know what to do about -SignedIntegers. Perhaps make them more like float formats, with the -byte count acting as a kind of exponent underneath the sign bit. - - - Perhaps define separate additional canonicalization restrictions? - Doesn't help the ordering, but does help the equivalence. - - - Canonicalization and early-bailout-equivalence-checking are in - tension with support for streaming values. - -Q. The postfix fields in the textual syntax come unannounced: "oh, and -another thing, what you just read is a label, and here are some -fields." This is a problem for interactive reading of textual syntax, -because after a complete term, it needs to see the next character to -tell whether it is an open-parenthesis or not! For this reason, I've -disallowed whitespace between a label `Value` and the open-parenthesis -of the fields. Is this reasonable?? - -Q. Should SOH-prefixed binary values embedded in a textual representation -be length-prefixed, too - byte strings, essentially? Also, why not -base64 embedded binary values? The length-prefixing might help with -being able to avoid having to care whether the embedded value is well- -formed or not; on the other hand, it means streaming-format embeddings -aren't possible. - -TODO. The SOH-prefixed embedded binary idea is probably incoherent. -Textual form is *text*, not binary, and since it's code-points, we -cannot rely on having access to a hypothetical underlying bytestream. -Remove it, and consider generalizing `#hexvalue{}` to include -`#base64value{}` or similar. - -## Notes diff --git a/syndicate/mc/preserve.py b/syndicate/mc/preserve.py deleted file mode 100644 index f399037..0000000 --- a/syndicate/mc/preserve.py +++ /dev/null @@ -1,385 +0,0 @@ -import sys -import numbers -import struct - -try: - basestring -except NameError: - basestring = str - -if isinstance(chr(123), bytes): - _ord = ord -else: - _ord = lambda x: x - -class Float(object): - def __init__(self, value): - self.value = value - - def __eq__(self, other): - if other.__class__ is self.__class__: - return self.value == other.value - - def __repr__(self): - return 'Float(' + repr(self.value) + ')' - - def __preserve_on__(self, encoder): - encoder.leadbyte(0, 0, 2) - encoder.buffer.extend(struct.pack('>f', self.value)) - -class Symbol(object): - def __init__(self, name): - self.name = name - - def __eq__(self, other): - return isinstance(other, Symbol) and self.name == other.name - - def __hash__(self): - return hash(self.name) - - def __repr__(self): - return '#' + self.name - - def __preserve_on__(self, encoder): - bs = self.name.encode('utf-8') - encoder.header(1, 3, len(bs)) - encoder.buffer.extend(bs) - -class Record(object): - def __init__(self, key, fields): - self.key = key - self.fields = tuple(fields) - self.__hash = None - - def __eq__(self, other): - return isinstance(other, Record) and (self.key, self.fields) == (other.key, other.fields) - - def __hash__(self): - if self.__hash is None: - self.__hash = hash((self.key, self.fields)) - return self.__hash - - def __repr__(self): - return str(self.key) + '(' + ', '.join((repr(f) for f in self.fields)) + ')' - - def __preserve_on__(self, encoder): - try: - index = encoder.shortForms.index(self.key) - except ValueError: - index = None - if index is None: - encoder.header(2, 3, len(self.fields) + 1) - encoder.append(self.key) - else: - encoder.header(2, index, len(self.fields)) - for f in self.fields: - encoder.append(f) - -# Blub blub blub -class ImmutableDict(dict): - def __init__(self, *args, **kwargs): - if hasattr(self, '__hash'): raise TypeError('Immutable') - super(ImmutableDict, self).__init__(*args, **kwargs) - self.__hash = None - - def __delitem__(self, key): raise TypeError('Immutable') - def __setitem__(self, key, val): raise TypeError('Immutable') - def clear(self): raise TypeError('Immutable') - def pop(self, k, d=None): raise TypeError('Immutable') - def popitem(self): raise TypeError('Immutable') - def setdefault(self, k, d=None): raise TypeError('Immutable') - def update(self, e, **f): raise TypeError('Immutable') - - def __hash__(self): - if self.__hash is None: - h = 0 - for k in self: - h = ((h << 5) ^ (hash(k) << 2) ^ hash(self[k])) & sys.maxsize - self.__hash = h - return self.__hash - - @staticmethod - def from_kvs(kvs): - i = iter(kvs) - result = ImmutableDict() - result_proxy = super(ImmutableDict, result) - try: - while True: - k = next(i) - v = next(i) - result_proxy.__setitem__(k, v) - except StopIteration: - pass - return result - -def dict_kvs(d): - for k in d: - yield k - yield d[k] - -class DecodeError(ValueError): pass -class EncodeError(ValueError): pass - -class Codec(object): - def __init__(self): - self.shortForms = [Symbol(u'discard'), Symbol(u'capture'), Symbol(u'observe')] - - def set_shortform(self, index, v): - if index >= 0 and index < 3: - self.shortForms[index] = v - else: - raise ValueError('Invalid short form index %r' % (index,)) - -class Stream(object): - def __init__(self, iterator): - self._iterator = iterator - - def __preserve_on__(self, encoder): - arg = (self.major << 2) | self.minor - encoder.leadbyte(0, 2, arg) - self._emit(encoder) - encoder.leadbyte(0, 3, arg) - - def _emit(self, encoder): - raise NotImplementedError('Should be implemented in subclasses') - -class ValueStream(Stream): - major = 3 - def _emit(self, encoder): - for v in self._iterator: - encoder.append(v) - -class SequenceStream(ValueStream): - minor = 0 - -class SetStream(ValueStream): - minor = 1 - -class DictStream(ValueStream): - minor = 2 - def _emit(self, encoder): - for (k, v) in self._iterator: - encoder.append(k) - encoder.append(v) - -class BinaryStream(Stream): - major = 1 - minor = 2 - def _emit(self, encoder): - for chunk in self._iterator: - if not isinstance(chunk, bytes): - raise EncodeError('Illegal chunk in BinaryStream %r' % (chunk,)) - encoder.append(chunk) - -class StringStream(BinaryStream): - minor = 1 - -class SymbolStream(BinaryStream): - minor = 3 - -class Decoder(Codec): - def __init__(self, packet): - super(Decoder, self).__init__() - self.packet = packet - self.index = 0 - - def peekbyte(self): - if self.index < len(self.packet): - return _ord(self.packet[self.index]) - else: - raise DecodeError('Short packet') - - def advance(self, count=1): - start = self.index - self.index = self.index + count - return start - - def nextbyte(self): - val = self.peekbyte() - self.advance() - return val - - def wirelength(self, arg): - if arg < 15: - return arg - return self.varint() - - def varint(self): - v = self.nextbyte() - if v < 128: - return v - else: - return self.varint() * 128 + (v - 128) - - def nextbytes(self, n): - start = self.advance(n) - return self.packet[start : self.index] - - def nextvalues(self, n): - result = [] - for i in range(n): - result.append(self.next()) - return result - - def peekop(self): - b = self.peekbyte() - major = b >> 6 - minor = (b >> 4) & 3 - arg = b & 15 - return (major, minor, arg) - - def nextop(self): - op = self.peekop() - self.advance() - return op - - def peekend(self, arg): - return self.peekop() == (0, 3, arg) - - def binarystream(self, arg, minor): - result = [] - while not self.peekend(arg): - chunk = self.next() - if isinstance(chunk, bytes): - result.append(chunk) - else: - raise DecodeError('Unexpected non-binary chunk') - return self.decodebinary(minor, b''.join(result)) - - def valuestream(self, arg, minor, decoder): - result = [] - while not self.peekend(arg): - result.append(self.next()) - return decoder(minor, result) - - def decodeint(self, bs): - if len(bs) == 0: return 0 - acc = _ord(bs[0]) - if acc & 0x80: acc = acc - 256 - for b in bs[1:]: - acc = (acc << 8) | _ord(b) - return acc - - def decodebinary(self, minor, bs): - if minor == 0: return self.decodeint(bs) - if minor == 1: return bs.decode('utf-8') - if minor == 2: return bs - if minor == 3: return Symbol(bs.decode('utf-8')) - - def decoderecord(self, minor, vs): - if minor == 3: - if not vs: raise DecodeError('Too few elements in encoded record') - return Record(vs[0], vs[1:]) - else: - return Record(self.shortForms[minor], vs) - - def decodecollection(self, minor, vs): - if minor == 0: return tuple(vs) - if minor == 1: return frozenset(vs) - if minor == 2: return ImmutableDict.from_kvs(vs) - if minor == 3: raise DecodeError('Invalid collection type') - - def next(self): - (major, minor, arg) = self.nextop() - if major == 0: - if minor == 0: - if arg == 0: return False - if arg == 1: return True - if arg == 2: return Float(struct.unpack('>f', self.nextbytes(4))[0]) - if arg == 3: return struct.unpack('>d', self.nextbytes(8))[0] - raise DecodeError('Invalid format A encoding') - elif minor == 1: - return arg - 16 if arg > 12 else arg - elif minor == 2: - t = arg >> 2 - n = arg & 3 - if t == 0: raise DecodeError('Invalid format C start byte') - if t == 1: return self.binarystream(arg, n) - if t == 2: return self.valuestream(arg, n, self.decoderecord) - if t == 3: return self.valuestream(arg, n, self.decodecollection) - else: # minor == 3 - raise DecodeError('Unexpected format C end byte') - elif major == 1: - return self.decodebinary(minor, self.nextbytes(self.wirelength(arg))) - elif major == 2: - return self.decoderecord(minor, self.nextvalues(self.wirelength(arg))) - else: # major == 3 - return self.decodecollection(minor, self.nextvalues(self.wirelength(arg))) - -class Encoder(Codec): - def __init__(self): - super(Encoder, self).__init__() - self.buffer = bytearray() - - def contents(self): - return bytes(self.buffer) - - def varint(self, v): - if v < 128: - self.buffer.append(v) - else: - self.buffer.append((v % 128) + 128) - self.varint(v // 128) - - def leadbyte(self, major, minor, arg): - self.buffer.append(((major & 3) << 6) | ((minor & 3) << 4) | (arg & 15)) - - def header(self, major, minor, wirelength): - if wirelength < 15: - self.leadbyte(major, minor, wirelength) - else: - self.leadbyte(major, minor, 15) - self.varint(wirelength) - - def encodeint(self, v): - bitcount = (~v if v < 0 else v).bit_length() + 1 - bytecount = (bitcount + 7) // 8 - self.header(1, 0, bytecount) - def enc(n,x): - if n > 0: - enc(n-1, x >> 8) - self.buffer.append(x & 255) - enc(bytecount, v) - - def encodecollection(self, minor, items): - self.header(3, minor, len(items)) - for i in items: self.append(i) - - def append(self, v): - if hasattr(v, '__preserve_on__'): - v.__preserve_on__(self) - elif v is False: - self.leadbyte(0, 0, 0) - elif v is True: - self.leadbyte(0, 0, 1) - elif isinstance(v, float): - self.leadbyte(0, 0, 3) - self.buffer.extend(struct.pack('>d', v)) - elif isinstance(v, numbers.Number): - if v >= -3 and v <= 12: - self.leadbyte(0, 1, v if v >= 0 else v + 16) - else: - self.encodeint(v) - elif isinstance(v, bytes): - self.header(1, 2, len(v)) - self.buffer.extend(v) - elif isinstance(v, basestring): - bs = v.encode('utf-8') - self.header(1, 1, len(bs)) - self.buffer.extend(bs) - elif isinstance(v, list): - self.encodecollection(0, v) - elif isinstance(v, tuple): - self.encodecollection(0, v) - elif isinstance(v, set): - self.encodecollection(1, v) - elif isinstance(v, frozenset): - self.encodecollection(1, v) - elif isinstance(v, dict): - self.encodecollection(2, list(dict_kvs(v))) - else: - try: - i = iter(v) - except TypeError: - raise EncodeError('Cannot encode %r' % (v,)) - self.encodestream(3, 0, i) diff --git a/syndicate/mc/preserve.rkt b/syndicate/mc/preserve.rkt deleted file mode 100644 index fed3b51..0000000 --- a/syndicate/mc/preserve.rkt +++ /dev/null @@ -1,856 +0,0 @@ -#lang racket/base -;; Preserve, as in Fruit Preserve, as in a remarkably weak pun on pickling/dehydration etc - -(provide (struct-out stream-of) - (struct-out record) - short-form-labels - read-preserve - string->preserve - encode - decode - wire-value) - -(require racket/bytes) -(require racket/dict) -(require racket/generator) -(require racket/match) -(require racket/set) -(require bitsyntax) -(require syndicate/support/struct) -(require (only-in syntax/readerr raise-read-error)) - -(require imperative-syndicate/assertions) -(require imperative-syndicate/pattern) - -(struct stream-of (kind generator) #:transparent) - -(struct record (label fields) #:transparent) - -(define short-form-labels - (make-parameter (vector 'discard 'capture 'observe))) - -(define (encode v) - (bit-string->bytes (bit-string (v :: (wire-value))))) - -(define (decode bs [on-fail (lambda () (error 'decode "Invalid encoding: ~v" bs))]) - (bit-string-case bs - ([ (v :: (wire-value)) ] v) - (else (on-fail)))) - -(define-syntax wire-value - (syntax-rules () - [(_ #t input ks kf) (decode-value input ks kf)] - [(_ #f v) (encode-value v)])) - -(define-syntax wire-length - (syntax-rules () - [(_ #t input ks kf) (decode-wire-length input ks kf)] - [(_ #f v) (encode-wire-length v)])) - -(define (encode-wire-length v) - (when (negative? v) (error 'encode-wire-length "Cannot encode negative wire-length ~v" v)) - (if (< v #b1111) - (bit-string (v :: bits 4)) - (bit-string (#b1111 :: bits 4) ((encode-varint v) :: binary)))) - -(define (encode-varint v) - (if (< v 128) - (bytes v) - (bit-string ((+ (modulo v 128) 128) :: bits 8) - ((encode-varint (quotient v 128)) :: binary)))) - -(define (encode-array-like major minor fields) - (bit-string (major :: bits 2) - (minor :: bits 2) - ((length fields) :: (wire-length)) - ((apply bit-string-append (map encode-value fields)) :: binary))) - -(define (encode-binary-like major minor bs) - (bit-string (major :: bits 2) - (minor :: bits 2) - ((bytes-length bs) :: (wire-length)) - (bs :: binary))) - -(define (encode-start-byte major minor) - (bit-string (#b0010 :: bits 4) (major :: bits 2) (minor :: bits 2))) - -(define (encode-end-byte major minor) - (bit-string (#b0011 :: bits 4) (major :: bits 2) (minor :: bits 2))) - -(define (encode-stream major minor chunk-ok? generator) - (bit-string-append (encode-start-byte major minor) - (let loop () - (match (generator) - [(? void?) #""] - [(? chunk-ok? v) (bit-string-append (encode-value v) (loop))] - [bad (error 'encode-stream "Cannot encode chunk: ~v" bad)])) - (encode-end-byte major minor))) - -(define (dict-keys-and-values d) - (reverse (for/fold [(acc '())] [((k v) (in-dict d))] (cons v (cons k acc))))) - -(define (short-form-for-label key) - (let ((labels (short-form-labels))) - (let loop ((i 0)) - (cond [(= i 3) #f] - [(equal? (vector-ref labels i) key) i] - [else (loop (+ i 1))])))) - -(define (encode-record key fields) - (define short (short-form-for-label key)) - (if short - (encode-array-like 2 short fields) - (encode-array-like 2 3 (cons key fields)))) - -(define (encode-value v) - (match v - [#f (bytes #b00000000)] - [#t (bytes #b00000001)] - [(? single-flonum?) (bit-string #b00000010 (v :: float bits 32))] - [(? double-flonum?) (bit-string #b00000011 (v :: float bits 64))] - [(? integer? x) #:when (<= -3 x 12) (bit-string (#b0001 :: bits 4) (x :: bits 4))] - [(stream-of 'string p) (encode-stream 1 1 bytes? p)] - [(stream-of 'byte-string p) (encode-stream 1 2 bytes? p)] - [(stream-of 'symbol p) (encode-stream 1 3 bytes? p)] - [(stream-of 'sequence p) (encode-stream 3 0 (lambda (x) #t) p)] - [(stream-of 'set p) (encode-stream 3 1 (lambda (x) #t) p)] - [(stream-of 'dictionary p) (encode-stream 3 2 (lambda (x) #t) p)] - - ;; [0 (bytes #b10000000)] - [(? integer?) - (define raw-bit-count (+ (integer-length v) 1)) ;; at least one sign bit - (define byte-count (quotient (+ raw-bit-count 7) 8)) - (bit-string (#b0100 :: bits 4) (byte-count :: (wire-length)) (v :: integer bytes byte-count))] - [(? string?) (encode-binary-like 1 1 (string->bytes/utf-8 v))] - [(? bytes?) (encode-binary-like 1 2 v)] - [(? symbol?) (encode-binary-like 1 3 (string->bytes/utf-8 (symbol->string v)))] - - [(record label fields) (encode-record label fields)] - [(? non-object-struct?) - (define key (prefab-struct-key v)) - (when (not key) (error 'encode-value "Cannot encode non-prefab struct ~v" v)) - (encode-record key (cdr (vector->list (struct->vector v))))] - - [(? list?) (encode-array-like 3 0 v)] - [(? set?) (encode-array-like 3 1 (set->list v))] - [(? dict?) (encode-array-like 3 2 (dict-keys-and-values v))] - - [_ (error 'encode-value "Cannot encode value ~v" v)])) - -;;--------------------------------------------------------------------------- - -(define (decode-wire-length bs ks kf) - (bit-string-case bs - ([ (= #b1111 :: bits 4) (rest :: binary) ] - (decode-varint rest - (lambda (v tail) - (if (< v #b1111) - (kf) - (ks v tail))) - kf)) - ([ (v :: bits 4) (rest :: binary) ] (ks v rest)) - (else (kf)))) - -(define (decode-varint bs ks kf) - (bit-string-case bs - ([ (= 1 :: bits 1) (v :: bits 7) (rest :: binary) ] - (decode-varint rest (lambda (acc tail) (ks (+ (* acc 128) v) tail)) kf)) - ([ (= 0 :: bits 1) (v :: bits 7) (rest :: binary) ] - (ks v rest)) - (else - (kf)))) - -(define (decode-values n acc-rev bs ks kf) - (if (zero? n) - (ks (reverse acc-rev) bs) - (bit-string-case bs - ([ (v :: (wire-value)) (rest :: binary) ] - (decode-values (- n 1) (cons v acc-rev) rest ks kf)) - (else (kf))))) - -(define (decode-binary minor bs rest ks kf) - (match minor - [0 (if (positive? (bit-string-length bs)) - (ks (bit-string->signed-integer bs #t) rest) - (ks 0 rest))] - [2 (ks bs rest)] - [(or 1 3) - ((with-handlers [(exn:fail:contract? (lambda (e) kf))] - (define s (bytes->string/utf-8 bs)) - (lambda () (ks (if (= minor 3) (string->symbol s) s) rest))))])) - -(define (build-record label fields) - (with-handlers [(exn:fail:contract? (lambda (e) (record label fields)))] - (apply make-prefab-struct label fields))) - -(define (decode-record minor fields rest ks kf) - (match* (minor fields) - [(3 (list* key fs)) (ks (build-record key fs) rest)] - [(3 '()) (kf)] - [(n fs) (ks (build-record (vector-ref (short-form-labels) n) fs) rest)])) - -(define (decode-collection minor vs rest ks kf) - (match minor - [0 (ks vs rest)] - [1 (ks (list->set vs) rest)] - [2 (if (even? (length vs)) - (ks (apply hash vs) rest) - (kf))] - [_ (kf)])) - -(define (decode-stream major minor chunk-ok? join-chunks decode rest ks kf) - (let loop ((acc-rev '()) (rest rest)) - (bit-string-case rest - ([ (= #b0011 :: bits 4) (emajor :: bits 2) (eminor :: bits 2) (rest :: binary) ] - (if (and (= major emajor) (= minor eminor)) - (decode minor (join-chunks (reverse acc-rev)) rest ks kf) - (kf))) - (else - (decode-value rest - (lambda (chunk rest) - (if (chunk-ok? chunk) - (loop (cons chunk acc-rev) rest) - (kf))) - kf))))) - -(define (decode-value bs ks kf) - (bit-string-case bs - ([ (= #b00000000 :: bits 8) (rest :: binary) ] (ks #f rest)) - ([ (= #b00000001 :: bits 8) (rest :: binary) ] (ks #t rest)) - ([ (= #b00000010 :: bits 8) (v :: float bits 32) (rest :: binary) ] (ks (real->single-flonum v) rest)) - ([ (= #b00000011 :: bits 8) (v :: float bits 64) (rest :: binary) ] (ks v rest)) - ([ (= #b0001 :: bits 4) (x :: bits 4) (rest :: binary) ] (ks (if (> x 12) (- x 16) x) rest)) - - ([ (= #b001001 :: bits 6) (minor :: bits 2) (rest :: binary) ] - (decode-stream 1 minor bytes? bytes-append* decode-binary rest ks kf)) - ([ (= #b001010 :: bits 6) (minor :: bits 2) (rest :: binary) ] - (decode-stream 2 minor (lambda (x) #t) values decode-record rest ks kf)) - ([ (= #b001011 :: bits 6) (minor :: bits 2) (rest :: binary) ] - (decode-stream 3 minor (lambda (x) #t) values decode-collection rest ks kf)) - - ([ (= #b01 :: bits 2) (minor :: bits 2) (byte-count :: (wire-length)) - (bits :: binary bytes byte-count) - (rest :: binary) ] - (decode-binary minor (bit-string->bytes bits) rest ks kf)) - - ([ (= #b10 :: bits 2) (minor :: bits 2) (field-count :: (wire-length)) (rest :: binary) ] - (decode-values field-count '() rest - (lambda (fields rest) (decode-record minor fields rest ks kf)) - kf)) - - ([ (= #b11 :: bits 2) (minor :: bits 2) (count :: (wire-length)) (rest :: binary) ] - (decode-values count '() rest - (lambda (vs rest) (decode-collection minor vs rest ks kf)) - kf)) - - (else (kf)))) - -;;--------------------------------------------------------------------------- - -(define (skip-whitespace* i) - (regexp-match? #px#"^(\\s|,)*" i) - (match (peek-char i) - [#\; (regexp-match? #px#"[^\r\n]*[\r\n]" i) (skip-whitespace* i)] - [_ #t])) - -(define (parse-error* i fmt . args) - (define-values [line column pos] (port-next-location i)) - (raise-read-error (format "read-preserve: ~a" (apply format fmt args)) - (object-name i) - line - column - pos - #f)) - -(define (read-preserve [i (current-input-port)]) - (local-require net/base64) - (local-require file/sha1) - - (define-match-expander px - (syntax-rules () - [(_ re pat) (app (lambda (v) (regexp-try-match re v)) pat)])) - - (define (parse-error fmt . args) - (apply parse-error* i fmt args)) - - (define (eof-guard ch) - (match ch - [(? eof-object?) (parse-error "Unexpected end of input")] - [ch ch])) - - (define (peek/no-eof) (eof-guard (peek-char i))) - (define (read/no-eof) (eof-guard (read-char i))) - - (define (skip-whitespace) (skip-whitespace* i)) - - (define (read-sequence terminator) - (sequence-fold '() (lambda (acc) (cons (read-value) acc)) reverse terminator)) - - (define (read-dictionary-or-set) - (sequence-fold #f - (lambda (acc) - (define k (read-value)) - (skip-whitespace) - (match (peek-char i) - [#\: (when (set? acc) (parse-error "Unexpected key/value separator in set")) - (read-char i) - (define v (read-value)) - (hash-set (or acc (hash)) k v)] - [_ (when (hash? acc) (parse-error "Missing expected key/value separator")) - (set-add (or acc (set)) k)])) - (lambda (acc) (or acc (hash))) - #\})) - - (define PIPE #\|) - - (define (read-raw-symbol acc) - (match (peek-char i) - [(or (? eof-object?) - (? char? (or #\( #\) #\{ #\} #\[ #\] - #\" #\; #\, #\# #\: (== PIPE) - (? char-whitespace?)))) - (string->symbol (list->string (reverse acc)))] - [_ (read-raw-symbol (cons (read-char i) acc))])) - - (define (read-base64-binary acc) - (skip-whitespace) - (define ch (read/no-eof)) - (cond [(eqv? ch #\}) - (base64-decode (string->bytes/latin-1 (list->string (reverse acc))))] - [(or (and (char>=? ch #\A) (char<=? ch #\Z)) - (and (char>=? ch #\a) (char<=? ch #\z)) - (and (char>=? ch #\0) (char<=? ch #\9)) - (memv ch '(#\+ #\/ #\- #\_ #\=))) - (read-base64-binary (cons ch acc))] - [else - (parse-error "Invalid base64 character")])) - - (define (hexdigit? ch) - (or (and (char>=? ch #\A) (char<=? ch #\F)) - (and (char>=? ch #\a) (char<=? ch #\f)) - (and (char>=? ch #\0) (char<=? ch #\9)))) - - (define (read-hex-binary acc) - (skip-whitespace) - (define ch (read/no-eof)) - (cond [(eqv? ch #\}) - (hex-string->bytes (list->string (reverse acc)))] - [(hexdigit? ch) - (define ch2 (read/no-eof)) - (when (not (hexdigit? ch2)) - (parse-error "Hex-encoded binary digits must come in pairs")) - (read-hex-binary (cons ch2 (cons ch acc)))] - [else - (parse-error "Invalid hex character")])) - - (define (read-stringlike xform-item finish terminator-char hexescape-char hexescape-proc) - (let loop ((acc '())) - (match (read/no-eof) - [(== terminator-char) (finish (reverse acc))] - [#\\ (match (read/no-eof) - [(== hexescape-char) (loop (cons (hexescape-proc) acc))] - [(and c (or (== terminator-char) #\\ #\/)) (loop (cons (xform-item c) acc))] - [#\b (loop (cons (xform-item #\u08) acc))] - [#\f (loop (cons (xform-item #\u0C) acc))] - [#\n (loop (cons (xform-item #\u0A) acc))] - [#\r (loop (cons (xform-item #\u0D) acc))] - [#\t (loop (cons (xform-item #\u09) acc))] - [c (parse-error "Invalid escape code \\~a" c)])] - [c (loop (cons (xform-item c) acc))]))) - - (define (read-string terminator-char) - (read-stringlike values - list->string - terminator-char - #\u - (lambda () - (integer->char - (match i - [(px #px#"^[a-fA-F0-9]{4}" (list hexdigits)) - (define n1 (string->number (bytes->string/utf-8 hexdigits) 16)) - (if (<= #xd800 n1 #xdfff) ;; surrogate pair first half - (match i - [(px #px#"^\\\\u([a-fA-F0-9]{4})" (list _ hexdigits2)) - (define n2 (string->number (bytes->string/utf-8 hexdigits2) 16)) - (if (<= #xdc00 n2 #xdfff) - (+ (arithmetic-shift (- n1 #xd800) 10) - (- n2 #xdc00) - #x10000) - (parse-error "Bad second half of surrogate pair"))] - [_ (parse-error "Missing second half of surrogate pair")]) - n1)] - [_ (parse-error "Bad string \\u escape")]))))) - - (define (read-literal-binary) - (read-stringlike (lambda (c) - (define b (char->integer c)) - (when (>= b 256) - (parse-error "Invalid code point ~a (~v) in literal binary" b c)) - b) - list->bytes - #\" - #\x - (lambda () - (match i - [(px #px#"^[a-fA-F0-9]{2}" (list hexdigits)) - (string->number (bytes->string/utf-8 hexdigits) 16)] - [_ (parse-error "Bad binary \\x escape")])))) - - (define (read-intpart acc-rev) - (match (peek-char i) - [#\0 (read-fracexp (cons (read-char i) acc-rev))] - [_ (read-digit+ acc-rev read-fracexp)])) - - (define (read-digit* acc-rev k) - (match (peek-char i) - [(? char? (? char-numeric?)) (read-digit* (cons (read-char i) acc-rev) k)] - [_ (k acc-rev)])) - - (define (read-digit+ acc-rev k) - (match (peek-char i) - [(? char? (? char-numeric?)) (read-digit* (cons (read-char i) acc-rev) k)] - [_ (parse-error "Incomplete number")])) - - (define (read-fracexp acc-rev) - (match (peek-char i) - [#\. (read-digit+ (cons (read-char i) acc-rev) read-exp)] - [_ (read-exp acc-rev)])) - - (define (read-exp acc-rev) - (match (peek-char i) - [(or #\e #\E) (read-sign-and-exp (cons (read-char i) acc-rev))] - [_ (finish-number acc-rev)])) - - (define (read-sign-and-exp acc-rev) - (match (peek-char i) - [(or #\+ #\-) (read-digit+ (cons (read-char i) acc-rev) finish-number)] - [_ (read-digit+ acc-rev finish-number)])) - - (define (finish-number acc-rev) - (define s (list->string (reverse acc-rev))) - (define n (string->number s)) - (when (not n) (parse-error "Invalid number: ~v" s)) - (if (flonum? n) - (match (peek-char i) - [(or #\f #\F) (read-char i) (real->single-flonum n)] - [_ n]) - n)) - - (define (read-number) - (match (peek/no-eof) - [#\- (read-intpart (list (read-char i)))] - [_ (read-intpart (list))])) - - (define (sequence-fold acc accumulate-one finish terminator-char) - (let loop ((acc acc)) - (skip-whitespace) - (match (peek/no-eof) - [(== terminator-char) (read-char i) (finish acc)] - [_ (loop (accumulate-one acc))]))) - - (define (collect-fields head) - (match (peek-char i) - [#\( - (read-char i) - (collect-fields (build-record head (read-sequence #\))))] - [_ - head])) - - (define (read-value) - (skip-whitespace) - (collect-fields - (match (peek-char i) - [(? eof-object? o) o] - [#\{ (read-char i) (read-dictionary-or-set)] - [#\[ (read-char i) (read-sequence #\])] - [(or #\- #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9) (read-number)] - [#\" (read-char i) (read-string #\")] - [(== PIPE) (read-char i) (string->symbol (read-string PIPE))] - [#\# (match i - [(px #px#"^#set\\{" (list _)) - (sequence-fold (set) (lambda (acc) (set-add acc (read-value))) values #\})] - [(px #px#"^#hexvalue\\{" (list _)) - (decode (read-hex-binary '()) (lambda () (parse-error "Invalid #hexvalue encoding")))] - [(px #px#"^#true" (list _)) - #t] - [(px #px#"^#false" (list _)) - #f] - [(px #px#"^#\"" (list _)) - (read-literal-binary)] - [(px #px#"^#hex\\{" (list _)) - (read-hex-binary '())] - [(px #px#"^#base64\\{" (list _)) - (read-base64-binary '())] - [_ - (parse-error "Invalid preserve value")])] - [#\: (parse-error "Unexpected key/value separator between items")] - [_ (read-raw-symbol '())]))) - - (read-value)) - -(define (string->preserve s) - (define p (open-input-string s)) - (define v (read-preserve p)) - (skip-whitespace* p) - (when (not (eof-object? (peek-char p))) - (parse-error* p "Unexpected text following preserve")) - v) - -;;--------------------------------------------------------------------------- - -(module+ test - (require rackunit) - (require (for-syntax racket syntax/srcloc)) - - (check-equal? (bit-string->bytes (encode-varint 0)) (bytes 0)) - (check-equal? (bit-string->bytes (encode-varint 1)) (bytes 1)) - (check-equal? (bit-string->bytes (encode-varint 127)) (bytes 127)) - (check-equal? (bit-string->bytes (encode-varint 128)) (bytes 128 1)) - (check-equal? (bit-string->bytes (encode-varint 255)) (bytes 255 1)) - (check-equal? (bit-string->bytes (encode-varint 256)) (bytes 128 2)) - (check-equal? (bit-string->bytes (encode-varint 300)) (bytes #b10101100 #b00000010)) - (check-equal? (bit-string->bytes (encode-varint 1000000000)) (bytes 128 148 235 220 3)) - - (define (ks* v rest) (list v (bit-string->bytes rest))) - (define (kf*) (void)) - - (check-equal? (decode-varint (bytes 0) ks* kf*) (list 0 (bytes))) - (check-equal? (decode-varint (bytes 0 99) ks* kf*) (list 0 (bytes 99))) - (check-equal? (decode-varint (bytes 1) ks* kf*) (list 1 (bytes))) - (check-equal? (decode-varint (bytes 127) ks* kf*) (list 127 (bytes))) - (check-equal? (decode-varint (bytes 128 1) ks* kf*) (list 128 (bytes))) - (check-equal? (decode-varint (bytes 128 1 99) ks* kf*) (list 128 (bytes 99))) - (check-equal? (decode-varint (bytes 255 1) ks* kf*) (list 255 (bytes))) - (check-equal? (decode-varint (bytes 128 2) ks* kf*) (list 256 (bytes))) - (check-equal? (decode-varint (bytes #b10101100 #b00000010) ks* kf*) (list 300 (bytes))) - (check-equal? (decode-varint (bytes 128 148 235 220 3) ks* kf*) (list 1000000000 (bytes))) - (check-equal? (decode-varint (bytes 128 148 235 220 3 99) ks* kf*) (list 1000000000 (bytes 99))) - - (check-equal? (bit-string->bytes (bit-string (0 :: bits 4) (0 :: (wire-length)))) (bytes 0)) - (check-equal? (bit-string->bytes (bit-string (0 :: bits 4) (3 :: (wire-length)))) (bytes 3)) - (check-equal? (bit-string->bytes (bit-string (0 :: bits 4) (14 :: (wire-length)))) (bytes 14)) - (check-equal? (bit-string->bytes (bit-string (0 :: bits 4) (15 :: (wire-length)))) (bytes 15 15)) - (check-equal? (bit-string->bytes (bit-string (0 :: bits 4) (100 :: (wire-length)))) - (bytes 15 100)) - (check-equal? (bit-string->bytes (bit-string (0 :: bits 4) (300 :: (wire-length)))) - (bytes 15 #b10101100 #b00000010)) - - (define (dwl bs) - (bit-string-case bs - ([ (= 0 :: bits 4) (w :: (wire-length)) ] w) - (else (void)))) - - (check-equal? (dwl (bytes 0)) 0) - (check-equal? (dwl (bytes 3)) 3) - (check-equal? (dwl (bytes 14)) 14) - (check-equal? (dwl (bytes 15)) (void)) - (check-equal? (dwl (bytes 15 9)) (void)) ;; not canonical - (check-equal? (dwl (bytes 15 15)) 15) - (check-equal? (dwl (bytes 15 100)) 100) - (check-equal? (dwl (bytes 15 #b10101100 #b00000010)) 300) - - (struct speak (who what) #:prefab) - - (define (expected . pieces) - (bit-string->bytes - (apply bit-string-append - (map (match-lambda - [(? byte? b) (bytes b)] - [(? bytes? bs) bs] - [(? string? s) (string->bytes/utf-8 s)]) - pieces)))) - - (define (d bs) (decode bs void)) - - (define-syntax (cross-check stx) - (syntax-case stx () - ((_ text v (b ...)) - #'(let ((val v)) (cross-check text v v (b ...)))) - ((_ text forward back (b ...)) - #`(let ((loc #,(source-location->string #'forward))) - (check-equal? (string->preserve text) back loc) - (check-equal? (d (encode forward)) back loc) - (check-equal? (d (encode back)) back loc) - (check-equal? (d (expected b ...)) back loc) - (check-equal? (encode forward) (expected b ...) loc) - )))) - - (define-syntax (cross-check/nondeterministic stx) - (syntax-case stx () - ((_ text v (b ...)) - #'(let ((val v)) (cross-check/nondeterministic text v v (b ...)))) - ((_ text forward back (b ...)) - #`(let ((loc #,(source-location->string #'forward))) - (check-equal? (string->preserve text) back loc) - (check-equal? (d (encode forward)) back loc) - (check-equal? (d (encode back)) back loc) - (check-equal? (d (expected b ...)) back loc) - )))) - - (cross-check "capture(discard())" (capture (discard)) (#x91 #x80)) - (cross-check "observe(speak(discard(), capture(discard())))" - (observe (speak (discard) (capture (discard)))) - (#xA1 #xB3 #x75 "speak" #x80 #x91 #x80)) - (cross-check "[1, 2, 3, 4]" '(1 2 3 4) (#xC4 #x11 #x12 #x13 #x14)) - (cross-check "[1 2 3 4]" - (stream-of 'sequence (sequence->generator '(1 2 3 4))) - '(1 2 3 4) - (#x2C #x11 #x12 #x13 #x14 #x3C)) - (cross-check " [ -2 -1 0 1 ] " '(-2 -1 0 1) (#xC4 #x1E #x1F #x10 #x11)) - (cross-check "\"hello\"" "hello" (#x55 "hello")) - (cross-check "\"hello\"" - (stream-of 'string (sequence->generator '(#"he" #"llo"))) - "hello" - (#x25 #x62 "he" #x63 "llo" #x35)) - (cross-check "\"hello\"" - (stream-of 'string (sequence->generator '(#"he" #"ll" #"" #"" #"o"))) - "hello" - (#x25 #x62 "he" #x62 "ll" #x60 #x60 #x61 "o" #x35)) - (cross-check "#\"hello\"" - (stream-of 'byte-string (sequence->generator '(#"he" #"ll" #"" #"" #"o"))) - #"hello" - (#x26 #x62 "he" #x62 "ll" #x60 #x60 #x61 "o" #x36)) - (cross-check "hello" - (stream-of 'symbol (sequence->generator '(#"he" #"ll" #"" #"" #"o"))) - 'hello - (#x27 #x62 "he" #x62 "ll" #x60 #x60 #x61 "o" #x37)) - (cross-check "[\"hello\" there #\"world\" [] #set{} #true #false]" - `("hello" there #"world" () ,(set) #t #f) - (#xC7 #x55 "hello" #x75 "there" #x65 "world" #xC0 #xD0 #x01 #x00)) - - (cross-check "#\"ABC\"" #"ABC" (#x63 #x41 #x42 #x43)) - (cross-check "#hex{414243}" #"ABC" (#x63 #x41 #x42 #x43)) - (cross-check "#hex{ 41 4A 4e }" #"AJN" (#x63 #x41 #x4A #x4E)) - (cross-check "#hex{ 41;re\n 42 43 }" #"ABC" (#x63 #x41 #x42 #x43)) - (check-exn exn? (lambda () (string->preserve "#hex{414 243}"))) ;; bytes must be 2-digits entire - (cross-check "#base64{Y29yeW1i}" #"corymb" (#x66 "corymb")) - (cross-check "#base64{Y29 yeW 1i}" #"corymb" (#x66 "corymb")) - (cross-check ";; a comment\n#base64{\n;x\nY29 yeW 1i}" #"corymb" (#x66 "corymb")) - (cross-check "#base64{SGk=}" #"Hi" (#x62 "Hi")) - (cross-check "#base64{SGk}" #"Hi" (#x62 "Hi")) - (cross-check "#base64{ S G k }" #"Hi" (#x62 "Hi")) - - (check-equal? (string->preserve "[]") '()) - (check-equal? (string->preserve "{}") (hash)) - (check-equal? (string->preserve "\"\"") "") - (check-equal? (string->preserve "||") (string->symbol "")) - (check-equal? (string->preserve "#set{}") (set)) - - (check-equal? (string->preserve "{1 2 3}") (set 1 2 3)) - (check-equal? (string->preserve "#set{1 2 3}") (set 1 2 3)) - - (cross-check "\"abc\\u6c34\\u6C34\\\\\\/\\\"\\b\\f\\n\\r\\txyz\"" - "abc\u6c34\u6c34\\/\"\b\f\n\r\txyz" - (#x5f #x14 - #x61 #x62 #x63 #xe6 #xb0 #xb4 #xe6 #xb0 - #xb4 #x5c #x2f #x22 #x08 #x0c #x0a #x0d - #x09 #x78 #x79 #x7a)) - - (cross-check "|abc\\u6c34\\u6C34\\\\\\/\\|\\b\\f\\n\\r\\txyz|" - (string->symbol "abc\u6c34\u6c34\\/|\b\f\n\r\txyz") - (#x7f #x14 - #x61 #x62 #x63 #xe6 #xb0 #xb4 #xe6 #xb0 - #xb4 #x5c #x2f #x7c #x08 #x0c #x0a #x0d - #x09 #x78 #x79 #x7a)) - - (check-exn #px"Invalid escape code \\\\u" (lambda () (string->preserve "#\"\\u6c34\""))) - - (cross-check "#\"abc\\x6c\\x34\\xf0\\\\\\/\\\"\\b\\f\\n\\r\\txyz\"" - #"abc\x6c\x34\xf0\\/\"\b\f\n\r\txyz" - (#x6f #x11 - #x61 #x62 #x63 #x6c #x34 #xf0 #x5c #x2f - #x22 #x08 #x0c #x0a #x0d #x09 #x78 #x79 #x7a)) - - (cross-check "\"\\uD834\\uDD1E\"" "\U0001D11E" (#x54 #xF0 #x9D #x84 #x9E)) - - (cross-check "-257" -257 (#x42 #xFE #xFF)) - (cross-check "-256" -256 (#x42 #xFF #x00)) - (cross-check "-255" -255 (#x42 #xFF #x01)) - (cross-check "-254" -254 (#x42 #xFF #x02)) - (cross-check "-129" -129 (#x42 #xFF #x7F)) - (cross-check "-128" -128 (#x41 #x80)) - (cross-check "-127" -127 (#x41 #x81)) - (cross-check "-4" -4 (#x41 #xFC)) - (cross-check "-3" -3 (#x1D)) - (cross-check "-2" -2 (#x1E)) - (cross-check "-1" -1 (#x1F)) - (cross-check "0" 0 (#x10)) - (cross-check "1" 1 (#x11)) - (cross-check "12" 12 (#x1C)) - (cross-check "13" 13 (#x41 #x0D)) - (cross-check "127" 127 (#x41 #x7F)) - (cross-check "128" 128 (#x42 #x00 #x80)) - (cross-check "255" 255 (#x42 #x00 #xFF)) - (cross-check "256" 256 (#x42 #x01 #x00)) - (cross-check "32767" 32767 (#x42 #x7F #xFF)) - (cross-check "32768" 32768 (#x43 #x00 #x80 #x00)) - (cross-check "65535" 65535 (#x43 #x00 #xFF #xFF)) - (cross-check "65536" 65536 (#x43 #x01 #x00 #x00)) - (cross-check "131072" 131072 (#x43 #x02 #x00 #x00)) - - (cross-check "1.0f" 1.0f0 (#b00000010 #b00111111 #b10000000 0 0)) - (cross-check "1.0" 1.0 (#b00000011 #b00111111 #b11110000 0 0 0 0 0 0)) - (cross-check "-1.202e300" -1.202e300 (#x03 #xFE #x3C #xB7 #xB7 #x59 #xBF #x04 #x26)) - - (check-equal? (d (expected #x25 #x51 "a" #x35)) (void)) ;; Bad chunk type: must be bytes - (check-equal? (d (expected #x25 #x71 "a" #x35)) (void)) ;; Bad chunk type: must be bytes - (check-equal? (d (expected #x26 #x51 "a" #x36)) (void)) ;; Bad chunk type: must be bytes - (check-equal? (d (expected #x26 #x71 "a" #x36)) (void)) ;; Bad chunk type: must be bytes - (check-equal? (d (expected #x27 #x51 "a" #x37)) (void)) ;; Bad chunk type: must be bytes - (check-equal? (d (expected #x27 #x71 "a" #x37)) (void)) ;; Bad chunk type: must be bytes - (check-equal? (d (expected #x25 #x61 "a" #x35)) "a") - (check-equal? (d (expected #x26 #x61 "a" #x36)) #"a") - (check-equal? (d (expected #x27 #x61 "a" #x37)) 'a) - - (struct date (year month day) #:prefab) - (struct thing (id) #:prefab) - (struct person thing (name date-of-birth) #:prefab) - (struct titled person (title) #:prefab) - - (cross-check - "[titled person 2 thing 1](101, \"Blackwell\", date(1821 2 3), \"Dr\")" - (titled 101 "Blackwell" (date 1821 2 3) "Dr") - (#xB5 ;; Record, generic, 4+1 - #xC5 ;; Sequence, 5 - #x76 #x74 #x69 #x74 #x6C #x65 #x64 ;; Symbol, "titled" - #x76 #x70 #x65 #x72 #x73 #x6F #x6E ;; Symbol, "person" - #x12 ;; SignedInteger, "2" - #x75 #x74 #x68 #x69 #x6E #x67 ;; Symbol, "thing" - #x11 ;; SignedInteger, "1" - #x41 #x65 ;; SignedInteger, "101" - #x59 #x42 #x6C #x61 #x63 #x6B #x77 #x65 #x6C #x6C ;; String, "Blackwell" - #xB4 ;; Record, generic, 3+1 - #x74 #x64 #x61 #x74 #x65 ;; Symbol, "date" - #x42 #x07 #x1D ;; SignedInteger, "1821" - #x12 ;; SignedInteger, "2" - #x13 ;; SignedInteger, "3" - #x52 #x44 #x72 ;; String, "Dr" - )) - - (cross-check "discard()" (record 'discard '()) (discard) (#x80)) - (cross-check "discard(surprise)" - (record 'discard '(surprise)) - '#s(discard surprise) - (#x81 #x78 "surprise")) - (cross-check "capture(x)" (record 'capture '(x)) (capture 'x) (#x91 #x71 "x")) - (cross-check "observe(x)" (record 'observe '(x)) (observe 'x) (#xA1 #x71 "x")) - (cross-check "observe(x y)" (record 'observe '(x y)) '#s(observe x y) (#xA2 #x71 "x" #x71 "y")) - (cross-check "other(x y)" - (record 'other '(x y)) - '#s(other x y) - (#xB3 #x75 "other" #x71 "x" #x71 "y")) - (cross-check "\"aString\"(3 4)" - (record "aString" '(3 4)) - (#xB3 #x57 "aString" #x13 #x14)) - (cross-check "discard()(3, 4)" - (record (discard) '(3 4)) - (#xB3 #x80 #x13 #x14)) - - (check-equal? (d (expected #x2C #x00 #x00)) (void)) ;; missing end byte - (check-equal? (d (expected #xC3 #x00 #x00)) (void)) ;; missing element - - (cross-check/nondeterministic - "{a: 1, \"b\": #true, [1 2 3]: #\"c\", {first-name:\"Elizabeth\"}:{surname:\"Blackwell\"}}" - (hash 'a 1 - "b" #t - '(1 2 3) #"c" - (hash 'first-name "Elizabeth") (hash 'surname "Blackwell")) - (#xE8 #x71 "a" #x11 - #x51 "b" #x01 - #xC3 #x11 #x12 #x13 #x61 "c" - #xE2 #x7A "first-name" #x59 "Elizabeth" - #xE2 #x77 "surname" #x59 "Blackwell" - )) - - (let () - (local-require json) - (define rfc8259-example1 (string->preserve #<preserve #<