Merge branch 'main' into comment-syntax-hash-space

This commit is contained in:
Tony Garnock-Jones 2023-10-31 21:15:41 +01:00
commit fb63ac24b0
25 changed files with 603 additions and 53 deletions

1
.gitignore vendored
View File

@ -1,4 +1,5 @@
_site/
preserves-expressions.pdf
preserves-binary.pdf
preserves-schema.pdf
preserves-text.pdf

View File

@ -1,6 +1,11 @@
__ignored__ := $(shell ./setup.sh)
PDFS=preserves.pdf preserves-text.pdf preserves-binary.pdf preserves-schema.pdf
PDFS=\
preserves.pdf \
preserves-text.pdf \
preserves-binary.pdf \
preserves-schema.pdf \
preserves-expressions.pdf
all: $(PDFS)

View File

@ -53,13 +53,17 @@ export class Bytes implements Preservable<any>, PreserveWritable<any> {
static fromHex(s: string): Bytes {
if (s.length & 1) throw new Error("Cannot decode odd-length hexadecimal string");
const result = new Bytes(s.length >> 1);
Bytes._raw_fromHexInto(s, result._view);
return result;
}
static _raw_fromHexInto(s: string, target: Uint8Array): void {
const len = s.length >> 1;
const result = new Bytes(len);
for (let i = 0; i < len; i++) {
result._view[i] =
target[i] =
(unhexDigit(s.charCodeAt(i << 1)) << 4) | unhexDigit(s.charCodeAt((i << 1) + 1));
}
return result;
}
static fromIO(io: string | BytesLike): string | Bytes {
@ -135,11 +139,11 @@ export class Bytes implements Preservable<any>, PreserveWritable<any> {
return Bytes.isBytes(v) ? v : void 0;
}
toHex(): string {
toHex(digit = hexDigit): string {
var nibbles = [];
for (let i = 0; i < this.length; i++) {
nibbles.push(hexDigit(this._view[i] >> 4));
nibbles.push(hexDigit(this._view[i] & 15));
nibbles.push(digit(this._view[i] >> 4));
nibbles.push(digit(this._view[i] & 15));
}
return nibbles.join('');
}

View File

@ -4,7 +4,7 @@ import { Tag } from "./constants";
import { Set, Dictionary } from "./dictionary";
import { DoubleFloat, SingleFloat } from "./float";
import { Record } from "./record";
import { Bytes, BytesLike, underlying } from "./bytes";
import { Bytes, BytesLike, underlying, hexDigit } from "./bytes";
import { Value } from "./values";
import { is } from "./is";
import { embed, GenericEmbedded, Embedded, EmbeddedTypeDecode } from "./embedded";
@ -34,7 +34,7 @@ export interface TypedDecoder<T> {
nextFloat(): SingleFloat | undefined;
nextDouble(): DoubleFloat | undefined;
nextEmbedded(): Embedded<T> | undefined;
nextSignedInteger(): number | undefined;
nextSignedInteger(): number | bigint | undefined;
nextString(): string | undefined;
nextByteString(): Bytes | undefined;
nextSymbol(): symbol | undefined;
@ -130,15 +130,42 @@ export class DecoderState {
return (this.nextbyte() === Tag.End) || (this.index--, false);
}
nextint(n: number): number {
// TODO: Bignums :-/
nextint(n: number): number | bigint {
const start = this.index;
if (n === 0) return 0;
if (n > 7) return this.nextbigint(n);
if (n === 7) {
const highByte = this.packet[this.index];
if ((highByte >= 0x20) && (highByte < 0xe0)) {
return this.nextbigint(n);
}
// if highByte is 0xe0, we still might have a value
// equal to (Number.MIN_SAFE_INTEGER-1).
}
let acc = this.nextbyte();
if (acc & 0x80) acc -= 256;
for (let i = 1; i < n; i++) acc = (acc * 256) + this.nextbyte();
if (!Number.isSafeInteger(acc)) {
this.index = start;
return this.nextbigint(n);
}
return acc;
}
/**
 * Reads the next `n` bytes and interprets them as a big-endian,
 * two's-complement integer of arbitrary size.
 *
 * @param n number of payload bytes to consume
 * @returns the decoded value; `BigInt(0)` when `n` is zero, in which case
 *   no bytes are consumed
 */
nextbigint(n: number): bigint {
    if (n === 0) return BigInt(0);
    const payload = Bytes.from(this.nextbytes(n));
    const signBitSet = payload.get(0) >= 128;
    if (!signBitSet) {
        // Top bit clear: the bytes are the value itself, read as hexadecimal.
        return BigInt('0x' + payload.toHex());
    }
    // Top bit set: render every nibble complemented (15 - nibble), parse
    // that as a non-negative number, then flip all bits back with `~` to
    // recover the negative value.
    const complementedHex = payload.toHex(d => hexDigit(15 - d));
    return ~BigInt('0x' + complementedHex);
}
wrap<T>(v: Value<T>): Value<T> {
return this.includeAnnotations ? new Annotated(v) : v;
}
@ -306,7 +333,7 @@ export class Decoder<T = never> implements TypedDecoder<T> {
});
}
nextSignedInteger(): number | undefined {
nextSignedInteger(): number | bigint | undefined {
return this.skipAnnotations((reset) => {
switch (this.state.nextbyte()) {
case Tag.SignedInteger: return this.state.nextint(this.state.varint());

View File

@ -1,5 +1,5 @@
import { Tag } from "./constants";
import { Bytes } from "./bytes";
import { Bytes, unhexDigit } from "./bytes";
import { Value } from "./values";
import { EncodeError } from "./codec";
import { Record, Tuple } from "./record";
@ -122,6 +122,13 @@ export class EncoderState {
this.index += bs.length;
}
/**
 * Reserves `count` bytes at the current write position and advances the
 * position past them.
 *
 * @param count number of bytes to reserve
 * @returns a `Uint8Array` of length `count` over `this.view.buffer`,
 *   starting at the position held before the call, for the caller to fill
 */
claimbytes(count: number) {
    // makeroom runs first; the view is built only afterwards, from whatever
    // `this.view.buffer` is at that point.
    this.makeroom(count);
    const start = this.index;
    this.index = start + count;
    // NOTE(review): `start` is used as an offset into the underlying buffer,
    // which presumes `this.view` begins at byte offset 0 of it — confirm.
    return new Uint8Array(this.view.buffer, start, count);
}
varint(v: number) {
while (v >= 128) {
this.emitbyte((v % 128) + 128);
@ -130,8 +137,9 @@ export class EncoderState {
this.emitbyte(v);
}
encodeint(v: number) {
// TODO: Bignums :-/
encodeint(v: number | bigint) {
if (typeof v === 'bigint') return this.encodebigint(v);
this.emitbyte(Tag.SignedInteger);
if (v === 0) {
@ -153,6 +161,37 @@ export class EncoderState {
enc(bytecount, v);
}
/**
 * Emits `v` as a SignedInteger: the tag byte, then a varint byte count,
 * then the big-endian two's-complement bytes of `v`. Zero is written as
 * the tag followed by the single byte 0.
 *
 * The payload is assembled as a hexadecimal string, padded to a whole
 * number of bytes whose leading digit agrees with the sign of `v`, and
 * then decoded straight into the output buffer.
 *
 * @param v the integer to encode
 */
encodebigint(v: bigint) {
    this.emitbyte(Tag.SignedInteger);

    if (!(v > 0 || v < 0)) {
        // Zero: a 0 is written where the byte count otherwise goes, and no
        // payload follows.
        this.emitbyte(0);
        return;
    }

    let hex: string;
    if (v > 0) {
        const magnitude = v.toString(16);
        if (magnitude.length & 1) {
            // Odd digit count: one '0' completes the top byte.
            hex = '0' + magnitude;
        } else if (unhexDigit(magnitude.charCodeAt(0)) >= 8) {
            // Leading digit has its high bit set, which would read back as
            // negative; prepend a whole zero byte.
            hex = '00' + magnitude;
        } else {
            hex = magnitude;
        }
    } else {
        // ~v is non-negative. Complementing each of its hex digits yields
        // the digits of v's two's-complement representation.
        const negatedHex = (~v).toString(16);
        let complemented = '';
        for (let pos = 0; pos < negatedHex.length; pos++) {
            complemented += 'fedcba9876543210'[unhexDigit(negatedHex.charCodeAt(pos))];
        }
        if (complemented.length & 1) {
            // Odd digit count: one 'f' completes the top byte.
            hex = 'f' + complemented;
        } else if (unhexDigit(complemented.charCodeAt(0)) < 8) {
            // Leading digit has its high bit clear, which would read back
            // as positive; prepend a whole 0xff byte.
            hex = 'ff' + complemented;
        } else {
            hex = complemented;
        }
    }

    const byteCount = hex.length >> 1;
    this.varint(byteCount);
    Bytes._raw_fromHexInto(hex, this.claimbytes(byteCount));
}
encodebytes(tag: Tag, bs: Uint8Array) {
this.emitbyte(tag);
this.varint(bs.length);
@ -219,7 +258,7 @@ export class Encoder<T = object> {
else if (typeof v === 'boolean') {
this.state.emitbyte(v ? Tag.True : Tag.False);
}
else if (typeof v === 'number') {
else if (typeof v === 'number' || typeof v === 'bigint') {
this.state.encodeint(v);
}
else if (typeof v === 'string') {

View File

@ -28,7 +28,7 @@ export interface FoldMethods<T, R> {
boolean(b: boolean): R;
single(f: number): R;
double(f: number): R;
integer(i: number): R;
integer(i: number | bigint): R;
string(s: string): R;
bytes(b: Bytes): R;
symbol(s: symbol): R;
@ -47,7 +47,7 @@ export class VoidFold<T> implements FoldMethods<T, void> {
boolean(b: boolean): void {}
single(f: number): void {}
double(f: number): void {}
integer(i: number): void {}
integer(i: number | bigint): void {}
string(s: string): void {}
bytes(b: Bytes): void {}
symbol(s: symbol): void {}
@ -79,7 +79,7 @@ export abstract class ValueFold<T, R = T> implements FoldMethods<T, Value<R>> {
double(f: number): Value<R> {
return Double(f);
}
integer(i: number): Value<R> {
integer(i: number | bigint): Value<R> {
return i;
}
string(s: string): Value<R> {
@ -138,6 +138,8 @@ export function valueClass<T>(v: Value<T>): ValueClass {
} else {
return ValueClass.SignedInteger;
}
case 'bigint':
return ValueClass.SignedInteger;
case 'string':
return ValueClass.String;
case 'symbol':
@ -181,6 +183,8 @@ export function fold<T, R>(v: Value<T>, o: FoldMethods<T, R>): R {
} else {
return o.integer(v);
}
case 'bigint':
return o.integer(v);
case 'string':
return o.string(v);
case 'symbol':

View File

@ -12,6 +12,7 @@ export function fromJS<T = GenericEmbedded>(x: any): Value<T> {
throw new TypeError("Refusing to autoconvert non-integer number to Single or Double");
}
// FALL THROUGH
case 'bigint':
case 'string':
case 'symbol':
case 'boolean':
@ -19,7 +20,6 @@ export function fromJS<T = GenericEmbedded>(x: any): Value<T> {
case 'undefined':
case 'function':
case 'bigint':
break;
case 'object':

View File

@ -12,7 +12,13 @@ export function is(a: any, b: any): boolean {
if (isAnnotated(a)) a = a.item;
if (isAnnotated(b)) b = b.item;
if (Object.is(a, b)) return true;
if (typeof a !== typeof b) return false;
if (typeof a !== typeof b) {
if ((typeof a === 'number' && typeof b === 'bigint') ||
(typeof a === 'bigint' && typeof b === 'number')) {
return a == b;
}
return false;
}
if (typeof a === 'object') {
if (a === null || b === null) return false;
if ('equals' in a && typeof a.equals === 'function') return a.equals(b, is);

View File

@ -7,6 +7,7 @@ import { Set, Dictionary } from "./dictionary";
import { Annotated } from "./annotated";
import { unannotate } from "./strip";
import { embed, isEmbedded, Embedded } from "./embedded";
import { isCompound } from "./compound";
export function merge<T>(
mergeEmbeddeds: (a: T, b: T) => T | undefined,
@ -18,7 +19,17 @@ export function merge<T>(
}
function walk(a: Value<T>, b: Value<T>): Value<T> {
if (a === b) return a;
if (a === b) {
// Shortcut for merges of trivially identical values.
return a;
}
if (!isCompound(a) && !isCompound(b)) {
// Don't do expensive recursive comparisons for compounds.
if (is(a, b)) {
// Shortcut for merges of marginally less trivially identical values.
return a;
}
}
return fold<T, Value<T>>(a, {
boolean: die,
single(_f: number) { return is(a, b) ? a : die(); },

View File

@ -21,9 +21,8 @@ export interface ReaderOptions<T> extends ReaderStateOptions {
embeddedDecode?: EmbeddedTypeDecode<T>;
}
type IntOrFloat = 'int' | 'float';
type Numeric = number | SingleFloat | DoubleFloat;
type IntContinuation = (kind: IntOrFloat, acc: string) => Numeric;
const MAX_SAFE_INTEGERn = BigInt(Number.MAX_SAFE_INTEGER);
const MIN_SAFE_INTEGERn = BigInt(Number.MIN_SAFE_INTEGER);
export const NUMBER_RE: RegExp = /^([-+]?\d+)(((\.\d+([eE][-+]?\d+)?)|([eE][-+]?\d+))([fF]?))?$/;
// Groups:
@ -174,9 +173,12 @@ export class ReaderState {
const m = NUMBER_RE.exec(acc);
if (m) {
if (m[2] === void 0) {
let v = parseInt(m[1]);
if (Object.is(v, -0)) v = 0;
return v;
let v = BigInt(m[1]);
if (v <= MIN_SAFE_INTEGERn || v >= MAX_SAFE_INTEGERn) {
return v;
} else {
return Number(v);
}
} else if (m[7] === '') {
return Double(parseFloat(m[1] + m[3]));
} else {

View File

@ -15,7 +15,7 @@ export type Atom =
| boolean
| SingleFloat
| DoubleFloat
| number
| number | bigint
| string
| Bytes
| symbol;

View File

@ -278,6 +278,7 @@ export class Writer<T> {
}
break;
}
case 'bigint':
case 'number':
this.state.pieces.push('' + v);
break;
@ -328,7 +329,9 @@ export class Writer<T> {
}
break;
default:
throw new Error(`Internal error: unhandled in Preserves Writer.push for ${v}`);
((_: never) => {
throw new Error(`Internal error: unhandled in Preserves Writer.push for ${v}`);
})(v);
}
return this; // for chaining
}

View File

@ -184,6 +184,71 @@ describe('encoding and decoding embeddeds', () => {
});
});
// Text-syntax integer parsing. Small values are expected to match plain
// numbers; the 31- and 32-digit literals are expected to match BigInt values
// of the same decimal text, in both signs.
describe('integer text parsing', () => {
it('should work for zero', () => {
expect(parse('0')).is(0);
});
it('should work for smallish positive integers', () => {
expect(parse('60000')).is(60000);
});
it('should work for smallish negative integers', () => {
expect(parse('-60000')).is(-60000);
});
// 31 decimal digits.
it('should work for largeish positive integers', () => {
expect(parse('1234567812345678123456781234567'))
.is(BigInt("1234567812345678123456781234567"));
});
it('should work for largeish negative integers', () => {
expect(parse('-1234567812345678123456781234567'))
.is(BigInt("-1234567812345678123456781234567"));
});
// 32 decimal digits.
it('should work for larger positive integers', () => {
expect(parse('12345678123456781234567812345678'))
.is(BigInt("12345678123456781234567812345678"));
});
it('should work for larger negative integers', () => {
expect(parse('-12345678123456781234567812345678'))
.is(BigInt("-12345678123456781234567812345678"));
});
});
// Binary encoding of integers. Every expected packet below starts with the
// byte 0xb0, then a length (a single byte in these vectors), then that many
// payload bytes holding the value in big-endian two's complement.
describe('integer binary encoding', () => {
it('should work for zero integers', () => {
expect(encode(0)).is(Bytes.fromHex('b000'));
});
// A bigint zero is expected to produce the same packet as a number zero.
it('should work for zero bigints', () => {
expect(encode(BigInt(0))).is(Bytes.fromHex('b000'));
});
it('should work for smallish positive integers', () => {
expect(encode(60000)).is(Bytes.fromHex('b00300ea60'));
});
it('should work for smallish negative integers', () => {
expect(encode(-60000)).is(Bytes.fromHex('b003ff15a0'));
});
// 13-byte payloads: the value's hex form has an odd number of digits, so
// the top byte is completed with a single 0 (positive) or f (negative).
it('should work for largeish positive integers', () => {
expect(encode(BigInt("1234567812345678123456781234567")))
.is(Bytes.fromHex('b00d0f951a8f2b4b049d518b923187'));
});
it('should work for largeish negative integers', () => {
expect(encode(BigInt("-1234567812345678123456781234567")))
.is(Bytes.fromHex('b00df06ae570d4b4fb62ae746dce79'));
});
// 14-byte payloads: the payload starts with a whole extra 00 (positive) or
// ff (negative) byte, so that the top bit carries the sign.
it('should work for larger positive integers', () => {
expect(encode(BigInt("12345678123456781234567812345678")))
.is(Bytes.fromHex('b00e009bd30997b0ee2e252f73b5ef4e'));
});
it('should work for larger negative integers', () => {
expect(encode(BigInt("-12345678123456781234567812345678")))
.is(Bytes.fromHex('b00eff642cf6684f11d1dad08c4a10b2'));
});
});
describe('common test suite', () => {
const samples_bin = fs.readFileSync(__dirname + '/../../../../../tests/samples.bin');
const samples = decodeWithAnnotations(samples_bin, { embeddedDecode: genericEmbeddedTypeDecode });

View File

@ -1,4 +1,4 @@
import { Single, Double, fromJS, Dictionary, IDENTITY_FOLD, fold, mapEmbeddeds, Value, embed } from '../src/index';
import { Single, Double, fromJS, Dictionary, IDENTITY_FOLD, fold, mapEmbeddeds, Value, embed, preserves } from '../src/index';
import './test-utils';
describe('Single', () => {
@ -41,4 +41,51 @@ describe('fromJS', () => {
it('should map integers to themselves', () => {
expect(fromJS(1)).toBe(1);
});
// fromJS is expected to hand a bigint back unchanged, so the result
// satisfies toBe against a bigint built from the same decimal text.
it('should map bigints to themselves', () => {
    expect(fromJS(BigInt("12345678123456781234567812345678")))
        .toBe(BigInt("12345678123456781234567812345678"));
});
});
// Equality via the `is` matcher (presumably registered by './test-utils' —
// confirm) across numbers, bigints, and mixtures of the two.
describe('is()', () => {
it('should compare small integers sensibly', () => {
expect(3).is(3);
expect(3).not.is(4);
});
it('should compare large integers sensibly', () => {
const a = BigInt("12345678123456781234567812345678");
const b = BigInt("12345678123456781234567812345679");
expect(a).is(a);
expect(a).is(BigInt("12345678123456781234567812345678"));
expect(a).not.is(b);
});
// Mixed comparisons: `toBe` is expected to tell the number 3 apart from
// the bigint 3, while `is` is expected to treat them as equal, in either
// argument order.
it('should compare mixed integers sensibly', () => {
const a = BigInt("12345678123456781234567812345678");
const b = BigInt("3");
const c = BigInt("4");
expect(3).not.is(a);
expect(a).not.is(3);
expect(3).not.toBe(b);
expect(3).is(b);
expect(b).not.toBe(3);
expect(b).is(3);
expect(3).not.toBe(c);
expect(3).not.is(c);
expect(c).not.toBe(3);
expect(c).not.is(3);
});
});
// The `preserves` tagged template is expected to render interpolated numbers
// and bigints as plain decimal digits, identically for equal values.
describe('`preserves` formatter', () => {
it('should format numbers', () => {
expect(preserves`>${3}<`).toBe('>3<');
});
it('should format small bigints', () => {
expect(preserves`>${BigInt("3")}<`).toBe('>3<');
});
it('should format big bigints', () => {
expect(preserves`>${BigInt("12345678123456781234567812345678")}<`)
.toBe('>12345678123456781234567812345678<');
});
});

View File

@ -118,6 +118,9 @@
float14: @"+qNaN" <Test #x"87047fc00111" #xf"7fc00111">
float15: @"-qNaN" <Test #x"8704ffc00001" #xf"ffc00001">
float16: @"-qNaN" <Test #x"8704ffc00111" #xf"ffc00111">
int-98765432109876543210987654321098765432109: <Test #x"b012feddc125aed4226c770369269596ce3f0ad3" -98765432109876543210987654321098765432109>
int-12345678123456781234567812345678: <Test #x"b00eff642cf6684f11d1dad08c4a10b2" -12345678123456781234567812345678>
int-1234567812345678123456781234567: <Test #x"b00df06ae570d4b4fb62ae746dce79" -1234567812345678123456781234567>
int-257: <Test #x"b002feff" -257>
int-256: <Test #x"b002ff00" -256>
int-255: <Test #x"b002ff01" -255>
@ -146,7 +149,10 @@
int65536: <Test #x"b003010000" 65536>
int131072: <Test #x"b003020000" 131072>
int2500000000: <Test #x"b005009502f900" 2500000000>
int1234567812345678123456781234567: <Test #x"b00d0f951a8f2b4b049d518b923187" 1234567812345678123456781234567>
int12345678123456781234567812345678: <Test #x"b00e009bd30997b0ee2e252f73b5ef4e" 12345678123456781234567812345678>
int87112285931760246646623899502532662132736: <Test #x"b012010000000000000000000000000000000000" 87112285931760246646623899502532662132736>
int98765432109876543210987654321098765432109: <Test #x"b01201223eda512bdd9388fc96d96a6931c0f52d" 98765432109876543210987654321098765432109>
list0: <Test #x"b584" []>
list4: <Test #x"b5b00101b00102b00103b0010484" [1 2 3 4]>
list4a: <Test #x"b5b00101b00102b00103b0010484" [1, 2, 3, 4]>

View File

@ -118,6 +118,9 @@
float14: @"+qNaN" <Test #x"87047fc00111" #xf"7fc00111">
float15: @"-qNaN" <Test #x"8704ffc00001" #xf"ffc00001">
float16: @"-qNaN" <Test #x"8704ffc00111" #xf"ffc00111">
int-98765432109876543210987654321098765432109: <Test #x"b012feddc125aed4226c770369269596ce3f0ad3" -98765432109876543210987654321098765432109>
int-12345678123456781234567812345678: <Test #x"b00eff642cf6684f11d1dad08c4a10b2" -12345678123456781234567812345678>
int-1234567812345678123456781234567: <Test #x"b00df06ae570d4b4fb62ae746dce79" -1234567812345678123456781234567>
int-257: <Test #x"b002feff" -257>
int-256: <Test #x"b002ff00" -256>
int-255: <Test #x"b002ff01" -255>
@ -146,7 +149,10 @@
int65536: <Test #x"b003010000" 65536>
int131072: <Test #x"b003020000" 131072>
int2500000000: <Test #x"b005009502f900" 2500000000>
int1234567812345678123456781234567: <Test #x"b00d0f951a8f2b4b049d518b923187" 1234567812345678123456781234567>
int12345678123456781234567812345678: <Test #x"b00e009bd30997b0ee2e252f73b5ef4e" 12345678123456781234567812345678>
int87112285931760246646623899502532662132736: <Test #x"b012010000000000000000000000000000000000" 87112285931760246646623899502532662132736>
int98765432109876543210987654321098765432109: <Test #x"b01201223eda512bdd9388fc96d96a6931c0f52d" 98765432109876543210987654321098765432109>
list0: <Test #x"b584" []>
list4: <Test #x"b5b00101b00102b00103b0010484" [1 2 3 4]>
list4a: <Test #x"b5b00101b00102b00103b0010484" [1, 2, 3, 4]>

View File

@ -1,6 +1,6 @@
[package]
name = "preserves"
version = "3.990.3"
version = "3.990.4"
authors = ["Tony Garnock-Jones <tonyg@leastfixedpoint.com>"]
edition = "2018"
description = "Implementation of the Preserves serialization format via serde."

View File

@ -289,7 +289,7 @@ impl Writer for BinaryOrderWriter {
macro_rules! fits_in_bytes {
($v:ident, $limit:literal) => {{
let bits = $limit * 8 - 1;
$v >= -(2 << bits) && $v < (2 << bits)
$v >= -(1 << bits) && $v < (1 << bits)
}};
}

291
preserves-expressions.md Normal file
View File

@ -0,0 +1,291 @@
---
title: "P-expressions"
---
Tony Garnock-Jones <tonyg@leastfixedpoint.com>
October 2023. Version 0.1.1.
This document defines a grammar called *Preserves Expressions*
(*P-expressions*, *pexprs*) that includes [ordinary Preserves text
syntax](preserves-text.html) but offers extensions sufficient to support
a Lisp- or Haskell-like programming notation.
**Motivation.** The [text syntax](preserves-text.html) for Preserves
works well for writing `Value`s, i.e. data. However, in some contexts,
Preserves applications need a broader grammar that allows interleaving
of *expressions* with data. Two examples are the [Preserves Schema
language](preserves-schema.html) and the [Synit configuration scripting
language](https://synit.org/book/operation/scripting.html), both of
which (ab)use Preserves text syntax as a kind of programming notation.
## Preliminaries
The P-expression grammar takes the text syntax grammar as its base and
modifies it.
<a id="whitespace"></a>
**Whitespace.** Whitespace is redefined as any number of spaces, tabs,
carriage returns, or line feeds. Commas are *not* considered whitespace
in P-expressions.
ws = *(%x20 / %x09 / CR / LF)
<a id="delimiters"></a>
**Delimiters.** Because commas are no longer included in class `ws`,
class `delimiter` is widened to include them explicitly.
delimiter = ws / ","
/ "<" / ">" / "[" / "]" / "{" / "}"
/ "#" / ":" / DQUOTE / "|" / "@" / ";"
## Grammar
P-expressions add comma, semicolon, and sequences of one or more colons
to the syntax class `Value`.
Value =/ Comma / Semicolon / Colons
Comma = ","
Semicolon = ";"
Colons = 1*":"
Now that colon is in `Value`, the syntax for `Dictionary` is replaced
with `Block` everywhere it is mentioned.
Block = "{" *Value ws "}"
New syntax for explicit uninterpreted grouping of sequences of values is
introduced, and added to class `Value`.
Value =/ ws Group
Group = "(" *Value ws ")"
Finally, class `Document` is replaced in order to allow standalone
documents to directly comprise a sequence of multiple values.
Document = *Value ws
No changes to [the Preserves semantic model](preserves.html) are made.
Every Preserves text-syntax term is a valid P-expression, but in general
P-expressions must be rewritten or otherwise interpreted before a
meaningful Preserves value can be arrived at ([see
below](#reading-preserves)).
## <a id="annotations"></a>Annotations and Comments
Annotations and comments attach to the term following them, just as in
the ordinary text syntax. However, it is common in programming notations
to allow comments at the end of a file or other sequential construct:
{
key: value
# example of a comment at the end of a dictionary
}
# example of a comment at the end of the input file
While the ordinary text syntax forbids comments in these positions,
P-expressions allow them:
Document =/ *Value Trailer ws
Record =/ "<" Value *Value Trailer ws ">"
Sequence =/ "[" *Value Trailer ws "]"
Set =/ "#{" *Value Trailer ws "}"
Block =/ "{" *Value Trailer ws "}"
Trailer = 1*Annotation
## <a id="encoding-pexprs"></a>Encoding P-expressions as Preserves
We write ⌜*p*⌝ for the encoding into Preserves of P-expression *p*.
{:.pseudocode.equations}
| ⌜·⌝ : **P-expression** | ⟶ | **Preserves** |
Aside from members of the special classes `Group`, `Block`, `Comma`, `Semicolon`,
`Colons`, and `Trailer`, P-expressions are encoded directly as Preserves
data.
{:.pseudocode.equations}
| ⌜`[`*p* ...`]`⌝ | = | `[`⌜*p*⌝ ...`]` |
| ⌜`<`*p* ...`>`⌝ | = | `<`⌜*p*⌝ ...`>` |
| ⌜`#{`*p* ...`}`⌝ | = | `#{`⌜*p*⌝ ...`}` |
| ⌜`#!`*p*⌝ | = | `#!`⌜*p*⌝ |
| ⌜`@`*p* *q*⌝ | = | `@`⌜*p*⌝ ⌜*q*⌝ |
| ⌜*p*⌝ | = | *p* when *p* ∈ **Atom** |
All members of the special classes are encoded as Preserves
dictionaries[^encoding-rationale].
[^encoding-rationale]: In principle, it would be nice to use *records*
for this purpose, but if we did so we would have to also encode
usages of records!
{:.pseudocode.equations}
| ⌜`(`*p* ...`)`⌝ | = | `{g:[`⌜*p*⌝ ...`]}` |
| ⌜`{`*p* ...`}`⌝ | = | `{b:[`⌜*p*⌝ ...`]}` |
| ⌜`,`⌝ | = | `{s:|,|}` |
| ⌜`;`⌝ | = | `{s:|;|}` |
| ⌜`:` ...⌝ | = | `{s:|:` ...`|}` |
| ⌜*t*⌝ | = | ⌜*a*⌝ ... `{}`, where *a* ... are the annotations in *t* and *t* ∈ **Trailer** |
The empty dictionary `{}` acts as an anchor for the annotations in a
`Trailer`.
We overload the ⌜·⌝ notation for encoding whole `Document`s into
sequences of Preserves values.
{:.pseudocode.equations}
| ⌜·⌝ : **P-expression Document** | ⟶ | **Preserves Sequence** |
| ⌜*p* ...⌝ | = | `[`⌜*p*⌝ ...`]` |
## <a id="reading-preserves"></a>Interpreting P-expressions as Preserves
The [previous section](#encoding-pexprs) discussed ways of representing
P-expressions using Preserves. Here, we discuss *interpreting*
P-expressions *as* Preserves, so that (1) a Preserves datum (2) written
using Preserves text syntax and then (3) read as a P-expression can be
(4) interpreted from that P-expression to yield the original datum.
A reader for P-expressions can be adapted to yield a reader for
Preserves terms by processing (subterms of) each P-expression that the
reader produces. The only subterms that need processing are the special
classes mentioned above.
1. Every `Group` or `Semicolon` that appears is an error.
2. Every `Colons` with two or more colons in it is an error.
3. Every `Comma` that appears is discarded.
4. Every `Trailer` that appears is an error.[^discard-trailers-instead-of-error]
5. Every `Block` must contain triplets of `Value`, `Colons` (with a
single colon), `Value`. Any `Block` not following this pattern is an
error. Each `Block` following the pattern is translated to a
`Dictionary` containing a key/value pair for each triplet.
[^discard-trailers-instead-of-error]: **Implementation note.** When
implementing parsing of P-expressions into Preserves, consider
offering an optional mode where trailing annotations `Trailer` are
*discarded* instead of causing an error to be signalled.
## Appendix: Examples
Examples are given as pairs of P-expressions and their Preserves
text-syntax encodings.
### Individual P-expression `Value`s
```preserves
⌜<date 1821 (lookup-month "February") 3>⌝
= <date 1821 {g:[lookup-month "February"]} 3>
```
```preserves
⌜(begin (println! (+ 1 2)) (+ 3 4))⌝
= {g:[begin {g:[println! {g:[+ 1 2]}]} {g:[+ 3 4]}]}
```
```preserves
⌜()⌝
= {g:[]}
⌜[() () ()]⌝
= [{g:[]}, {g:[]}, {g:[]}]
```
```preserves
⌜{
setUp();
# Now enter the loop
loop: {
greet("World");
}
tearDown();
}⌝
= {b:[
setUp {g:[]} {s:|;|}
# Now enter the loop
loop {s:|:|} {b:[
greet {g:["World"]} {s:|;|}
]}
tearDown {g:[]} {s:|;|}
]}
```
```preserves
⌜[1 + 2.0, print "Hello", predicate: #t, foo, #!remote, bar]⌝
= [1 + 2.0 {s:|,|} print "Hello" {s:|,|} predicate {s:|:|} #t {s:|,|}
foo {s:|,|} #!remote {s:|,|} bar]
```
```preserves
⌜{
optional name: string,
address: Address,
}⌝
= {b:[
optional name {s:|:|} string {s:|,|}
address {s:|:|} Address {s:|,|}
]}
```
### Whole `Document`s
```preserves
⌜{
key: value
# example of a comment at the end of a dictionary
}
# example of a comment at the end of the input file⌝
= [ {b:[
key {s:|:|} value
@"example of a comment at the end of a dictionary" {}
]}
@"example of a comment at the end of the input file"
{}
]
```
## Appendix: Reading vs. Parsing
Lisp systems first *read* streams of bytes into S-expressions and then
*parse* those S-expressions into more abstract structures denoting
various kinds of program syntax. [Separation of reading from parsing is
what gives Lisp its syntactic
flexibility.](http://calculist.org/blog/2012/04/17/homoiconicity-isnt-the-point/)
Similarly, the Apple programming language
[Dylan](https://en.wikipedia.org/wiki/Dylan_(programming_language))
included a reader-parser split, with the Dylan reader producing
*D-expressions* that are somewhat similar to P-expressions.
Finally, the Racket dialects
[Honu](https://docs.racket-lang.org/honu/index.html) and
[Something](https://github.com/tonyg/racket-something) use a
reader-parser-macro setup, where the reader produces Racket data, the
parser produces "syntax" and is user-extensible, and Racket's own
modular macro system rewrites this "syntax" down to core forms to be
compiled to machine code.
Similarly, when using P-expressions as the foundation for a language, a
generic P-expression reader can then feed into special-purpose
*parsers*. The reader captures the coarse syntactic structure of a
program, and the parser refines this.
Often, a parser will wish to extract structure from sequences of
P-expression `Value`s.
- A simple technique is repeated splitting of sequences; first by
`Semicolon`, then by `Comma`, then by increasingly high binding-power
operators.
- More refined is to use a Pratt parser or similar
([1](https://en.wikipedia.org/wiki/Operator-precedence_parser),
[2](https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html),
[3](https://github.com/tonyg/racket-something/blob/f6116bf3861b76970f5ce291a628476adef820b4/src/something/pratt.rkt))
to build a parse tree using an extensible specification of the pre-,
in-, and postfix operators involved.
- Finally, if you treat sequences of `Value`s as pre-lexed token
streams, almost any parsing formalism (such as [PEG
parsing](https://en.wikipedia.org/wiki/Parsing_expression_grammar),
[Ometa](https://en.wikipedia.org/wiki/OMeta), etc.) can be used to
extract further syntactic structure.
## Notes

View File

@ -55,7 +55,7 @@ Standalone documents may have trailing whitespace.
Any `Value` may be preceded by whitespace.
Value = ws (Record / Collection / Atom / Embedded)
Collection = Sequence / Dictionary / Set
Collection = Sequence / Set / Dictionary
Atom = Boolean / String / ByteString /
QuotedSymbol / SymbolOrNumber
@ -64,18 +64,18 @@ label-`Value` followed by its field-`Value`s.
Record = "<" Value *Value ws ">"
`Sequence`s are enclosed in square brackets. `Dictionary` values are
curly-brace-enclosed colon-separated pairs of values. `Set`s are
written as values enclosed by the tokens `#{` and
`}`.[^printing-collections] It is an error for a set to contain
`Sequence`s are enclosed in square brackets. `Set`s are written as
values enclosed by the tokens `#{` and `}`. `Dictionary` values are
curly-brace-enclosed colon-separated pairs of
values.[^printing-collections] It is an error for a set to contain
duplicate elements or for a dictionary to contain duplicate keys. When
printing sets and dictionaries, implementations *SHOULD* order
elements resp. keys with respect to the [total order over
printing sets and dictionaries, implementations *SHOULD* order elements
resp. keys with respect to the [total order over
`Value`s](preserves.html#total-order).[^rationale-print-ordering]
Sequence = "[" *Value ws "]"
Dictionary = "{" *(Value ws ":" Value) ws "}"
Set = "#{" *Value ws "}"
Sequence = "[" *Value ws "]"
Set = "#{" *Value ws "}"
Dictionary = "{" *(Value ws ":" Value) ws "}"
[^printing-collections]: **Implementation note.** When implementing
printing of `Value`s using the textual syntax, consider supporting
@ -273,7 +273,8 @@ value. Each annotation is, in turn, a `Value`, and may itself have
annotations. The ordering of annotations attached to a `Value` is
significant.
Value =/ ws "@" Value Value
Value =/ ws Annotation Value
Annotation = "@" Value
Each annotation is preceded by `@`; the underlying annotated value
follows its annotations. Here we extend only the syntactic nonterminal
@ -283,7 +284,7 @@ named “`Value`” without altering the semantic class of `Value`s.
interpreted as comments associated with that value. Comments are
sufficiently common that special syntax exists for them.
Value =/ ws ("#" [(%x20 / %x09) linecomment]) (CR / LF) Value
Annotation =/ "#" [(%x20 / %x09) linecomment] (CR / LF)
linecomment = *<any unicode scalar value except CR or LF>
When written this way, everything between the hash-space or hash-tab and

View File

@ -1,6 +1,7 @@
:root {
--sans-font: "Open Sans", -apple-system, BlinkMacSystemFont, avenir next, avenir, segoe ui, helvetica neue, helvetica, Cantarell, Ubuntu, roboto, noto, arial, sans-serif;
--serif-font: palatino, "Palatino Linotype", "Palatino LT STD", "URW Palladio L", "TeX Gyre Pagella", serif;
--blockquote-indent: 40px;
}
body {
font-family: var(--serif-font);
@ -230,6 +231,7 @@ table.postcard-grammar {
blockquote {
padding: 0.5rem 1rem;
border-left: solid #4f81bd 2px;
margin-left: var(--blockquote-indent);
margin-right: 0;
}
blockquote :first-child {
@ -243,6 +245,10 @@ blockquote :last-child {
background-color: #e9f0f9;
}
table.equations { width: auto; margin-left: var(--blockquote-indent); }
table.equations tr > *:nth-child(1) { text-align: right; }
table.equations tr > *:nth-child(2) { text-align: center; }
blockquote.pseudocode {
border-left: none;
padding: 0;

View File

@ -104,8 +104,8 @@ the `totalOrder` predicate defined in section 5.10 of [IEEE Std
A `Record` is a *labelled* tuple of `Value`s, the record's *fields*. A
label can be any `Value`, but is usually a `Symbol`.[^extensibility]
[^iri-labels] `Record`s are compared lexicographically: first by
label, then by field sequence.
[^iri-labels] `Record`s are ordered first by label, then
lexicographically[^lexicographical-sequences] by field sequence.
[^extensibility]: The [Racket](https://racket-lang.org/) programming
language defines
@ -123,10 +123,25 @@ label, then by field sequence.
it cannot be read as an IRI at all, and so the label simply stands
for itself—for its own `Value`.
[^lexicographical-sequences]: When comparing sequences of values for
the total order, [lexicographical
ordering](https://en.wikipedia.org/wiki/Lexicographic_order) is
used. Elements are drawn pairwise from the two sequences to be
compared. If one is smaller than the other according to the total
order, the sequence it was drawn from is the smaller of the
sequences. If the end of one sequence is reached, while the other
sequence has elements remaining, the shorter sequence is considered
smaller. Otherwise, all the elements compared equal and neither was
longer than the other, so they compare equal. For example,
- `[#f]` is ordered before `[foo]` because `Boolean` appears before `Symbol` in the kind ordering;
- `[x]` before `[x y]` because there is no element remaining to compare against `y`;
- `[a b]` before `[x]` because `a` is smaller than `x`; and
- `[x y]` before `[x z]` because `y` is ordered before `z` according to the ordering rules for `Symbol`.
### Sequences.
A `Sequence` is a sequence of `Value`s. `Sequence`s are compared
lexicographically.
lexicographically.[^lexicographical-sequences]
### Sets.
@ -134,15 +149,16 @@ A `Set` is an unordered finite set of `Value`s. It contains no
duplicate values, following the [equivalence relation](#equivalence)
induced by the total order on `Value`s. Two `Set`s are compared by
sorting their elements ascending using the [total order](#total-order)
and comparing the resulting `Sequence`s.
and comparing the resulting `Sequence`s.[^lexicographical-sequences]
### Dictionaries.
A `Dictionary` is an unordered finite collection of pairs of `Value`s.
Each pair comprises a *key* and a *value*. Keys in a `Dictionary` are
pairwise distinct. Instances of `Dictionary` are compared by
lexicographic comparison of the sequences resulting from ordering each
`Dictionary`'s pairs in ascending order by key.
lexicographic[^lexicographical-sequences] comparison of the sequences
resulting from ordering each `Dictionary`'s pairs in ascending order by
key.
### Embeddeds.
@ -194,8 +210,12 @@ sequences use [the Preserves binary encoding](preserves-binary.html).
The total ordering specified [above](#total-order) means that the following statements are true:
"bzz" < "c" < "caa" < #!"a"
#t < 3.0f < 3.0 < 3 < "3" < |3| < [] < #!#t
- `"bzz"` &lt; `"c"` &lt; `"caa"` &lt; `#!"a"`
- `#t` &lt; `3.0f` &lt; `3.0` &lt; `3` &lt; `"3"` &lt; `|3|` &lt; `[]` &lt; `#!#t`
- `[#f]` &lt; `[foo]`, because `Boolean` appears before `Symbol` in the kind ordering
- `[x]` &lt; `[x y]`, because there is no element remaining to compare against `y`
- `[a b]` &lt; `[x]`, because `a` is smaller than `x`
- `[x y]` &lt; `[x z]`, because `y` is ordered before `z`
### Simple examples.

Binary file not shown.

View File

@ -118,6 +118,9 @@
float14: @"+qNaN" <Test #x"87047fc00111" #xf"7fc00111">
float15: @"-qNaN" <Test #x"8704ffc00001" #xf"ffc00001">
float16: @"-qNaN" <Test #x"8704ffc00111" #xf"ffc00111">
int-98765432109876543210987654321098765432109: <Test #x"b012feddc125aed4226c770369269596ce3f0ad3" -98765432109876543210987654321098765432109>
int-12345678123456781234567812345678: <Test #x"b00eff642cf6684f11d1dad08c4a10b2" -12345678123456781234567812345678>
int-1234567812345678123456781234567: <Test #x"b00df06ae570d4b4fb62ae746dce79" -1234567812345678123456781234567>
int-257: <Test #x"b002feff" -257>
int-256: <Test #x"b002ff00" -256>
int-255: <Test #x"b002ff01" -255>
@ -146,7 +149,10 @@
int65536: <Test #x"b003010000" 65536>
int131072: <Test #x"b003020000" 131072>
int2500000000: <Test #x"b005009502f900" 2500000000>
int1234567812345678123456781234567: <Test #x"b00d0f951a8f2b4b049d518b923187" 1234567812345678123456781234567>
int12345678123456781234567812345678: <Test #x"b00e009bd30997b0ee2e252f73b5ef4e" 12345678123456781234567812345678>
int87112285931760246646623899502532662132736: <Test #x"b012010000000000000000000000000000000000" 87112285931760246646623899502532662132736>
int98765432109876543210987654321098765432109: <Test #x"b01201223eda512bdd9388fc96d96a6931c0f52d" 98765432109876543210987654321098765432109>
list0: <Test #x"b584" []>
list4: <Test #x"b5b00101b00102b00103b0010484" [1 2 3 4]>
list4a: <Test #x"b5b00101b00102b00103b0010484" [1, 2, 3, 4]>