Merge branch 'main' into comment-syntax-hash-space

2023-10-31 21:15:41 +01:00 · 2023-10-31 21:15:41 +01:00 · fb63ac24b0
parent c053102d07 ec03bdb45f
commit fb63ac24b0
25 changed files with 603 additions and 53 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,4 +1,5 @@
 _site/
+preserves-expressions.pdf
 preserves-binary.pdf
 preserves-schema.pdf
 preserves-text.pdf
--- a/7
+++ b/7
@ -1,6 +1,11 @@
 __ignored__ := $(shell ./setup.sh)

-PDFS=preserves.pdf preserves-text.pdf preserves-binary.pdf preserves-schema.pdf
+PDFS=\
+	preserves.pdf \
+	preserves-text.pdf \
+	preserves-binary.pdf \
+	preserves-schema.pdf \
+	preserves-expressions.pdf

 all: $(PDFS)

--- a/implementations/javascript/packages/core/src/bytes.ts
+++ b/implementations/javascript/packages/core/src/bytes.ts
@ -53,13 +53,17 @@ export class Bytes implements Preservable<any>, PreserveWritable<any> {

    static fromHex(s: string): Bytes {
        if (s.length & 1) throw new Error("Cannot decode odd-length hexadecimal string");
+        const result = new Bytes(s.length >> 1);
+        Bytes._raw_fromHexInto(s, result._view);
+        return result;
+    }
+
+    static _raw_fromHexInto(s: string, target: Uint8Array): void {
        const len = s.length >> 1;
-        const result = new Bytes(len);
        for (let i = 0; i < len; i++) {
-            result._view[i] =
+            target[i] =
                (unhexDigit(s.charCodeAt(i << 1)) << 4) | unhexDigit(s.charCodeAt((i << 1) + 1));
        }
-        return result;
    }

    static fromIO(io: string | BytesLike): string | Bytes {
@ -135,11 +139,11 @@ export class Bytes implements Preservable<any>, PreserveWritable<any> {
        return Bytes.isBytes(v) ? v : void 0;
    }

-    toHex(): string {
+    toHex(digit = hexDigit): string {
        var nibbles = [];
        for (let i = 0; i < this.length; i++) {
-            nibbles.push(hexDigit(this._view[i] >> 4));
-            nibbles.push(hexDigit(this._view[i] & 15));
+            nibbles.push(digit(this._view[i] >> 4));
+            nibbles.push(digit(this._view[i] & 15));
        }
        return nibbles.join('');
    }
--- a/implementations/javascript/packages/core/src/decoder.ts
+++ b/implementations/javascript/packages/core/src/decoder.ts
@ -4,7 +4,7 @@ import { Tag } from "./constants";
 import { Set, Dictionary } from "./dictionary";
 import { DoubleFloat, SingleFloat } from "./float";
 import { Record } from "./record";
-import { Bytes, BytesLike, underlying } from "./bytes";
+import { Bytes, BytesLike, underlying, hexDigit } from "./bytes";
 import { Value } from "./values";
 import { is } from "./is";
 import { embed, GenericEmbedded, Embedded, EmbeddedTypeDecode } from "./embedded";
@ -34,7 +34,7 @@ export interface TypedDecoder<T> {
    nextFloat(): SingleFloat | undefined;
    nextDouble(): DoubleFloat | undefined;
    nextEmbedded(): Embedded<T> | undefined;
-    nextSignedInteger(): number | undefined;
+    nextSignedInteger(): number | bigint | undefined;
    nextString(): string | undefined;
    nextByteString(): Bytes | undefined;
    nextSymbol(): symbol | undefined;
@ -130,15 +130,42 @@ export class DecoderState {
        return (this.nextbyte() === Tag.End) || (this.index--, false);
    }

-    nextint(n: number): number {
-        // TODO: Bignums :-/
+    nextint(n: number): number | bigint {
+        const start = this.index;
        if (n === 0) return 0;
+        if (n > 7) return this.nextbigint(n);
+        if (n === 7) {
+            const highByte = this.packet[this.index];
+            if ((highByte >= 0x20) && (highByte < 0xe0)) {
+                return this.nextbigint(n);
+            }
+            // if highByte is 0xe0, we still might have a value
+            // equal to (Number.MIN_SAFE_INTEGER-1).
+        }
        let acc = this.nextbyte();
        if (acc & 0x80) acc -= 256;
        for (let i = 1; i < n; i++) acc = (acc * 256) + this.nextbyte();
+        if (!Number.isSafeInteger(acc)) {
+            this.index = start;
+            return this.nextbigint(n);
+        }
        return acc;
    }

+    nextbigint(n: number): bigint {
+        if (n === 0) return BigInt(0);
+        const bs = Bytes.from(this.nextbytes(n));
+        if (bs.get(0) >= 128) {
+            // negative
+            const hex = bs.toHex(d => hexDigit(15 - d));
+            return ~BigInt('0x' + hex);
+        } else {
+            // (strictly) positive
+            const hex = bs.toHex();
+            return BigInt('0x' + hex);
+        }
+    }
+
    wrap<T>(v: Value<T>): Value<T> {
        return this.includeAnnotations ? new Annotated(v) : v;
    }
@ -306,7 +333,7 @@ export class Decoder<T = never> implements TypedDecoder<T> {
        });
    }

-    nextSignedInteger(): number | undefined {
+    nextSignedInteger(): number | bigint | undefined {
        return this.skipAnnotations((reset) => {
            switch (this.state.nextbyte()) {
                case Tag.SignedInteger: return this.state.nextint(this.state.varint());
--- a/implementations/javascript/packages/core/src/encoder.ts
+++ b/implementations/javascript/packages/core/src/encoder.ts
@ -1,5 +1,5 @@
 import { Tag } from "./constants";
-import { Bytes } from "./bytes";
+import { Bytes, unhexDigit } from "./bytes";
 import { Value } from "./values";
 import { EncodeError } from "./codec";
 import { Record, Tuple } from "./record";
@ -122,6 +122,13 @@ export class EncoderState {
        this.index += bs.length;
    }

+    claimbytes(count: number) {
+        this.makeroom(count);
+        const view = new Uint8Array(this.view.buffer, this.index, count);
+        this.index += count;
+        return view;
+    }
+
    varint(v: number) {
        while (v >= 128) {
            this.emitbyte((v % 128) + 128);
@ -130,8 +137,9 @@ export class EncoderState {
        this.emitbyte(v);
    }

-    encodeint(v: number) {
-        // TODO: Bignums :-/
+    encodeint(v: number | bigint) {
+        if (typeof v === 'bigint') return this.encodebigint(v);
+
        this.emitbyte(Tag.SignedInteger);

        if (v === 0) {
@ -153,6 +161,37 @@ export class EncoderState {
        enc(bytecount, v);
    }

+    encodebigint(v: bigint) {
+        this.emitbyte(Tag.SignedInteger);
+
+        let hex: string;
+        if (v > 0) {
+            hex = v.toString(16);
+            if (hex.length & 1) {
+                hex = '0' + hex;
+            } else if (unhexDigit(hex.charCodeAt(0)) >= 8) {
+                hex = '00' + hex;
+            }
+        } else if (v < 0) {
+            const negatedHex = (~v).toString(16);
+            hex = '';
+            for (let i = 0; i < negatedHex.length; i++) {
+                hex = hex + 'fedcba9876543210'[unhexDigit(negatedHex.charCodeAt(i))];
+            }
+            if (hex.length & 1) {
+                hex = 'f' + hex;
+            } else if (unhexDigit(hex.charCodeAt(0)) < 8) {
+                hex = 'ff' + hex;
+            }
+        } else {
+            this.emitbyte(0);
+            return;
+        }
+
+        this.varint(hex.length >> 1);
+        Bytes._raw_fromHexInto(hex, this.claimbytes(hex.length >> 1));
+    }
+
    encodebytes(tag: Tag, bs: Uint8Array) {
        this.emitbyte(tag);
        this.varint(bs.length);
@ -219,7 +258,7 @@ export class Encoder<T = object> {
        else if (typeof v === 'boolean') {
            this.state.emitbyte(v ? Tag.True : Tag.False);
        }
-        else if (typeof v === 'number') {
+        else if (typeof v === 'number' || typeof v === 'bigint') {
            this.state.encodeint(v);
        }
        else if (typeof v === 'string') {
--- a/implementations/javascript/packages/core/src/fold.ts
+++ b/implementations/javascript/packages/core/src/fold.ts
@ -28,7 +28,7 @@ export interface FoldMethods<T, R> {
    boolean(b: boolean): R;
    single(f: number): R;
    double(f: number): R;
-    integer(i: number): R;
+    integer(i: number | bigint): R;
    string(s: string): R;
    bytes(b: Bytes): R;
    symbol(s: symbol): R;
@ -47,7 +47,7 @@ export class VoidFold<T> implements FoldMethods<T, void> {
    boolean(b: boolean): void {}
    single(f: number): void {}
    double(f: number): void {}
-    integer(i: number): void {}
+    integer(i: number | bigint): void {}
    string(s: string): void {}
    bytes(b: Bytes): void {}
    symbol(s: symbol): void {}
@ -79,7 +79,7 @@ export abstract class ValueFold<T, R = T> implements FoldMethods<T, Value<R>> {
    double(f: number): Value<R> {
        return Double(f);
    }
-    integer(i: number): Value<R> {
+    integer(i: number | bigint): Value<R> {
        return i;
    }
    string(s: string): Value<R> {
@ -138,6 +138,8 @@ export function valueClass<T>(v: Value<T>): ValueClass {
            } else {
                return ValueClass.SignedInteger;
            }
+        case 'bigint':
+            return ValueClass.SignedInteger;
        case 'string':
            return ValueClass.String;
        case 'symbol':
@ -181,6 +183,8 @@ export function fold<T, R>(v: Value<T>, o: FoldMethods<T, R>): R {
                } else {
                    return o.integer(v);
                }
+            case 'bigint':
+                return o.integer(v);
            case 'string':
                return o.string(v);
            case 'symbol':
--- a/implementations/javascript/packages/core/src/fromjs.ts
+++ b/implementations/javascript/packages/core/src/fromjs.ts
@ -12,6 +12,7 @@ export function fromJS<T = GenericEmbedded>(x: any): Value<T> {
                throw new TypeError("Refusing to autoconvert non-integer number to Single or Double");
            }
            // FALL THROUGH
+        case 'bigint':
        case 'string':
        case 'symbol':
        case 'boolean':
@ -19,7 +20,6 @@ export function fromJS<T = GenericEmbedded>(x: any): Value<T> {

        case 'undefined':
        case 'function':
-        case 'bigint':
            break;

        case 'object':
--- a/implementations/javascript/packages/core/src/is.ts
+++ b/implementations/javascript/packages/core/src/is.ts
@ -12,7 +12,13 @@ export function is(a: any, b: any): boolean {
    if (isAnnotated(a)) a = a.item;
    if (isAnnotated(b)) b = b.item;
    if (Object.is(a, b)) return true;
-    if (typeof a !== typeof b) return false;
+    if (typeof a !== typeof b) {
+        if ((typeof a === 'number' && typeof b === 'bigint') ||
+            (typeof a === 'bigint' && typeof b === 'number')) {
+            return a == b;
+        }
+        return false;
+    }
    if (typeof a === 'object') {
        if (a === null || b === null) return false;
        if ('equals' in a && typeof a.equals === 'function') return a.equals(b, is);
--- a/implementations/javascript/packages/core/src/merge.ts
+++ b/implementations/javascript/packages/core/src/merge.ts
@ -7,6 +7,7 @@ import { Set, Dictionary } from "./dictionary";
 import { Annotated } from "./annotated";
 import { unannotate } from "./strip";
 import { embed, isEmbedded, Embedded } from "./embedded";
+import { isCompound } from "./compound";

 export function merge<T>(
    mergeEmbeddeds: (a: T, b: T) => T | undefined,
@ -18,7 +19,17 @@ export function merge<T>(
    }

    function walk(a: Value<T>, b: Value<T>): Value<T> {
-        if (a === b) return a;
+        if (a === b) {
+            // Shortcut for merges of trivially identical values.
+            return a;
+        }
+        if (!isCompound(a) && !isCompound(b)) {
+            // Don't do expensive recursive comparisons for compounds.
+            if (is(a, b)) {
+                // Shortcut for merges of marginally less trivially identical values.
+                return a;
+            }
+        }
        return fold<T, Value<T>>(a, {
            boolean: die,
            single(_f: number) { return is(a, b) ? a : die(); },
--- a/implementations/javascript/packages/core/src/reader.ts
+++ b/implementations/javascript/packages/core/src/reader.ts
@ -21,9 +21,8 @@ export interface ReaderOptions<T> extends ReaderStateOptions {
    embeddedDecode?: EmbeddedTypeDecode<T>;
 }

-type IntOrFloat = 'int' | 'float';
-type Numeric = number | SingleFloat | DoubleFloat;
-type IntContinuation = (kind: IntOrFloat, acc: string) => Numeric;
+const MAX_SAFE_INTEGERn = BigInt(Number.MAX_SAFE_INTEGER);
+const MIN_SAFE_INTEGERn = BigInt(Number.MIN_SAFE_INTEGER);

 export const NUMBER_RE: RegExp = /^([-+]?\d+)(((\.\d+([eE][-+]?\d+)?)|([eE][-+]?\d+))([fF]?))?$/;
 // Groups:
@ -174,9 +173,12 @@ export class ReaderState {
        const m = NUMBER_RE.exec(acc);
        if (m) {
            if (m[2] === void 0) {
-                let v = parseInt(m[1]);
-                if (Object.is(v, -0)) v = 0;
-                return v;
+                let v = BigInt(m[1]);
+                if (v <= MIN_SAFE_INTEGERn || v >= MAX_SAFE_INTEGERn) {
+                    return v;
+                } else {
+                    return Number(v);
+                }
            } else if (m[7] === '') {
                return Double(parseFloat(m[1] + m[3]));
            } else {
--- a/implementations/javascript/packages/core/src/values.ts
+++ b/implementations/javascript/packages/core/src/values.ts
@ -15,7 +15,7 @@ export type Atom =
    | boolean
    | SingleFloat
    | DoubleFloat
-    | number
+    | number | bigint
    | string
    | Bytes
    | symbol;
--- a/implementations/javascript/packages/core/src/writer.ts
+++ b/implementations/javascript/packages/core/src/writer.ts
@ -278,6 +278,7 @@ export class Writer<T> {
                }
                break;
            }
+            case 'bigint':
            case 'number':
                this.state.pieces.push('' + v);
                break;
@ -328,7 +329,9 @@ export class Writer<T> {
                }
                break;
            default:
-                throw new Error(`Internal error: unhandled in Preserves Writer.push for ${v}`);
+                ((_: never) => {
+                    throw new Error(`Internal error: unhandled in Preserves Writer.push for ${v}`);
+                })(v);
        }
        return this; // for chaining
    }
--- a/implementations/javascript/packages/core/test/codec.test.ts
+++ b/implementations/javascript/packages/core/test/codec.test.ts
@ -184,6 +184,71 @@ describe('encoding and decoding embeddeds', () => {
    });
 });

+describe('integer text parsing', () => {
+    it('should work for zero', () => {
+        expect(parse('0')).is(0);
+    });
+
+    it('should work for smallish positive integers', () => {
+        expect(parse('60000')).is(60000);
+    });
+    it('should work for smallish negative integers', () => {
+        expect(parse('-60000')).is(-60000);
+    });
+
+    it('should work for largeish positive integers', () => {
+        expect(parse('1234567812345678123456781234567'))
+            .is(BigInt("1234567812345678123456781234567"));
+    });
+    it('should work for largeish negative integers', () => {
+        expect(parse('-1234567812345678123456781234567'))
+            .is(BigInt("-1234567812345678123456781234567"));
+    });
+
+    it('should work for larger positive integers', () => {
+        expect(parse('12345678123456781234567812345678'))
+            .is(BigInt("12345678123456781234567812345678"));
+    });
+    it('should work for larger negative integers', () => {
+        expect(parse('-12345678123456781234567812345678'))
+            .is(BigInt("-12345678123456781234567812345678"));
+    });
+});
+
+describe('integer binary encoding', () => {
+    it('should work for zero integers', () => {
+        expect(encode(0)).is(Bytes.fromHex('b000'));
+    });
+    it('should work for zero bigints', () => {
+        expect(encode(BigInt(0))).is(Bytes.fromHex('b000'));
+    });
+
+    it('should work for smallish positive integers', () => {
+        expect(encode(60000)).is(Bytes.fromHex('b00300ea60'));
+    });
+    it('should work for smallish negative integers', () => {
+        expect(encode(-60000)).is(Bytes.fromHex('b003ff15a0'));
+    });
+
+    it('should work for largeish positive integers', () => {
+        expect(encode(BigInt("1234567812345678123456781234567")))
+            .is(Bytes.fromHex('b00d0f951a8f2b4b049d518b923187'));
+    });
+    it('should work for largeish negative integers', () => {
+        expect(encode(BigInt("-1234567812345678123456781234567")))
+            .is(Bytes.fromHex('b00df06ae570d4b4fb62ae746dce79'));
+    });
+
+    it('should work for larger positive integers', () => {
+        expect(encode(BigInt("12345678123456781234567812345678")))
+            .is(Bytes.fromHex('b00e009bd30997b0ee2e252f73b5ef4e'));
+    });
+    it('should work for larger negative integers', () => {
+        expect(encode(BigInt("-12345678123456781234567812345678")))
+            .is(Bytes.fromHex('b00eff642cf6684f11d1dad08c4a10b2'));
+    });
+});
+
 describe('common test suite', () => {
    const samples_bin = fs.readFileSync(__dirname + '/../../../../../tests/samples.bin');
    const samples = decodeWithAnnotations(samples_bin, { embeddedDecode: genericEmbeddedTypeDecode });
--- a/implementations/javascript/packages/core/test/values.test.ts
+++ b/implementations/javascript/packages/core/test/values.test.ts
@ -1,4 +1,4 @@
-import { Single, Double, fromJS, Dictionary, IDENTITY_FOLD, fold, mapEmbeddeds, Value, embed } from '../src/index';
+import { Single, Double, fromJS, Dictionary, IDENTITY_FOLD, fold, mapEmbeddeds, Value, embed, preserves } from '../src/index';
 import './test-utils';

 describe('Single', () => {
@ -41,4 +41,51 @@ describe('fromJS', () => {
    it('should map integers to themselves', () => {
        expect(fromJS(1)).toBe(1);
    });
+
+    it('should map bigints to themselves', () => {
+        expect(fromJS(BigInt("12345678123456781234567812345678")))
+            .toBe(BigInt("12345678123456781234567812345678"));;
+    });
+});
+
+describe('is()', () => {
+    it('should compare small integers sensibly', () => {
+        expect(3).is(3);
+        expect(3).not.is(4);
+    });
+    it('should compare large integers sensibly', () => {
+        const a = BigInt("12345678123456781234567812345678");
+        const b = BigInt("12345678123456781234567812345679");
+        expect(a).is(a);
+        expect(a).is(BigInt("12345678123456781234567812345678"));
+        expect(a).not.is(b);
+    });
+    it('should compare mixed integers sensibly', () => {
+        const a = BigInt("12345678123456781234567812345678");
+        const b = BigInt("3");
+        const c = BigInt("4");
+        expect(3).not.is(a);
+        expect(a).not.is(3);
+        expect(3).not.toBe(b);
+        expect(3).is(b);
+        expect(b).not.toBe(3);
+        expect(b).is(3);
+        expect(3).not.toBe(c);
+        expect(3).not.is(c);
+        expect(c).not.toBe(3);
+        expect(c).not.is(3);
+    });
+});
+
+describe('`preserves` formatter', () => {
+    it('should format numbers', () => {
+        expect(preserves`>${3}<`).toBe('>3<');
+    });
+    it('should format small bigints', () => {
+        expect(preserves`>${BigInt("3")}<`).toBe('>3<');
+    });
+    it('should format big bigints', () => {
+        expect(preserves`>${BigInt("12345678123456781234567812345678")}<`)
+            .toBe('>12345678123456781234567812345678<');
+    });
 });
--- a/implementations/python/tests/samples.bin
+++ b/implementations/python/tests/samples.bin
--- a/implementations/python/tests/samples.pr
+++ b/implementations/python/tests/samples.pr
@ -118,6 +118,9 @@
  float14: @"+qNaN" <Test #x"87047fc00111" #xf"7fc00111">
  float15: @"-qNaN" <Test #x"8704ffc00001" #xf"ffc00001">
  float16: @"-qNaN" <Test #x"8704ffc00111" #xf"ffc00111">
+  int-98765432109876543210987654321098765432109: <Test #x"b012feddc125aed4226c770369269596ce3f0ad3" -98765432109876543210987654321098765432109>
+  int-12345678123456781234567812345678: <Test #x"b00eff642cf6684f11d1dad08c4a10b2" -12345678123456781234567812345678>
+  int-1234567812345678123456781234567: <Test #x"b00df06ae570d4b4fb62ae746dce79" -1234567812345678123456781234567>
  int-257: <Test #x"b002feff" -257>
  int-256: <Test #x"b002ff00" -256>
  int-255: <Test #x"b002ff01" -255>
@ -146,7 +149,10 @@
  int65536: <Test #x"b003010000" 65536>
  int131072: <Test #x"b003020000" 131072>
  int2500000000: <Test #x"b005009502f900" 2500000000>
+  int1234567812345678123456781234567: <Test #x"b00d0f951a8f2b4b049d518b923187" 1234567812345678123456781234567>
+  int12345678123456781234567812345678: <Test #x"b00e009bd30997b0ee2e252f73b5ef4e" 12345678123456781234567812345678>
  int87112285931760246646623899502532662132736: <Test #x"b012010000000000000000000000000000000000" 87112285931760246646623899502532662132736>
+  int98765432109876543210987654321098765432109: <Test #x"b01201223eda512bdd9388fc96d96a6931c0f52d" 98765432109876543210987654321098765432109>
  list0: <Test #x"b584" []>
  list4: <Test #x"b5b00101b00102b00103b0010484" [1 2 3 4]>
  list4a: <Test #x"b5b00101b00102b00103b0010484" [1, 2, 3, 4]>
--- a/implementations/racket/preserves/preserves/tests/samples.pr
+++ b/implementations/racket/preserves/preserves/tests/samples.pr
@ -118,6 +118,9 @@
  float14: @"+qNaN" <Test #x"87047fc00111" #xf"7fc00111">
  float15: @"-qNaN" <Test #x"8704ffc00001" #xf"ffc00001">
  float16: @"-qNaN" <Test #x"8704ffc00111" #xf"ffc00111">
+  int-98765432109876543210987654321098765432109: <Test #x"b012feddc125aed4226c770369269596ce3f0ad3" -98765432109876543210987654321098765432109>
+  int-12345678123456781234567812345678: <Test #x"b00eff642cf6684f11d1dad08c4a10b2" -12345678123456781234567812345678>
+  int-1234567812345678123456781234567: <Test #x"b00df06ae570d4b4fb62ae746dce79" -1234567812345678123456781234567>
  int-257: <Test #x"b002feff" -257>
  int-256: <Test #x"b002ff00" -256>
  int-255: <Test #x"b002ff01" -255>
@ -146,7 +149,10 @@
  int65536: <Test #x"b003010000" 65536>
  int131072: <Test #x"b003020000" 131072>
  int2500000000: <Test #x"b005009502f900" 2500000000>
+  int1234567812345678123456781234567: <Test #x"b00d0f951a8f2b4b049d518b923187" 1234567812345678123456781234567>
+  int12345678123456781234567812345678: <Test #x"b00e009bd30997b0ee2e252f73b5ef4e" 12345678123456781234567812345678>
  int87112285931760246646623899502532662132736: <Test #x"b012010000000000000000000000000000000000" 87112285931760246646623899502532662132736>
+  int98765432109876543210987654321098765432109: <Test #x"b01201223eda512bdd9388fc96d96a6931c0f52d" 98765432109876543210987654321098765432109>
  list0: <Test #x"b584" []>
  list4: <Test #x"b5b00101b00102b00103b0010484" [1 2 3 4]>
  list4a: <Test #x"b5b00101b00102b00103b0010484" [1, 2, 3, 4]>
--- a/implementations/rust/preserves/Cargo.toml
+++ b/implementations/rust/preserves/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "preserves"
-version = "3.990.3"
+version = "3.990.4"
 authors = ["Tony Garnock-Jones <tonyg@leastfixedpoint.com>"]
 edition = "2018"
 description = "Implementation of the Preserves serialization format via serde."
--- a/implementations/rust/preserves/src/value/packed/writer.rs
+++ b/implementations/rust/preserves/src/value/packed/writer.rs
@ -289,7 +289,7 @@ impl Writer for BinaryOrderWriter {
 macro_rules! fits_in_bytes {
    ($v:ident, $limit:literal) => {{
        let bits = $limit * 8 - 1;
-        $v >= -(2 << bits) && $v < (2 << bits)
+        $v >= -(1 << bits) && $v < (1 << bits)
    }};
 }

--- a/preserves-expressions.md
+++ b/preserves-expressions.md
@ -0,0 +1,291 @@
+---
+title: "P-expressions"
+---
+
+Tony Garnock-Jones <tonyg@leastfixedpoint.com>  
+October 2023. Version 0.1.1.
+
+This document defines a grammar called *Preserves Expressions*
+(*P-expressions*, *pexprs*) that includes [ordinary Preserves text
+syntax](preserves-text.html) but offers extensions sufficient to support
+a Lisp- or Haskell-like programming notation.
+
+**Motivation.** The [text syntax](preserves-text.html) for Preserves
+works well for writing `Value`s, i.e. data. However, in some contexts,
+Preserves applications need a broader grammar that allows interleaving
+of *expressions* with data. Two examples are the [Preserves Schema
+language](preserves-schema.html) and the [Synit configuration scripting
+language](https://synit.org/book/operation/scripting.html), both of
+which (ab)use Preserves text syntax as a kind of programming notation.
+
+## Preliminaries
+
+The P-expression grammar takes the text syntax grammar as its base and
+modifies it.
+
+<a id="whitespace"></a>
+**Whitespace.** Whitespace is redefined as any number of spaces, tabs,
+carriage returns, or line feeds. Commas are *not* considered whitespace
+in P-expressions.
+
+                ws = *(%x20 / %x09 / CR / LF)
+
+<a id="delimiters"></a>
+**Delimiters.** Because commas are no longer included in class `ws`,
+class `delimiter` is widened to include them explicitly.
+
+         delimiter = ws / ","
+                   / "<" / ">" / "[" / "]" / "{" / "}"
+                   / "#" / ":" / DQUOTE / "|" / "@" / ";"
+
+## Grammar
+
+P-expressions add comma, semicolon, and sequences of one or more colons
+to the syntax class `Value`.
+
+            Value =/ Comma / Semicolon / Colons
+             Comma = ","
+         Semicolon = ";"
+            Colons = 1*":"
+
+Now that colon is in `Value`, the syntax for `Dictionary` is replaced
+with `Block` everywhere it is mentioned.
+
+             Block = "{" *Value ws "}"
+
+New syntax for explicit uninterpreted grouping of sequences of values is
+introduced, and added to class `Value`.
+
+            Value =/ ws Group
+             Group = "(" *Value ws ")"
+
+Finally, class `Document` is replaced in order to allow standalone
+documents to directly comprise a sequence of multiple values.
+
+          Document = *Value ws
+
+No changes to [the Preserves semantic model](preserves.html) are made.
+Every Preserves text-syntax term is a valid P-expression, but in general
+P-expressions must be rewritten or otherwise interpreted before a
+meaningful Preserves value can be arrived at ([see
+below](#reading-preserves)).
+
+## <a id="annotations"></a>Annotations and Comments
+
+Annotations and comments attach to the term following them, just as in
+the ordinary text syntax. However, it is common in programming notations
+to allow comments at the end of a file or other sequential construct:
+
+    {
+        key: value
+        # example of a comment at the end of a dictionary
+    }
+    # example of a comment at the end of the input file
+
+While the ordinary text syntax forbids comments in these positions,
+P-expressions allow them:
+
+         Document =/ *Value Trailer ws
+           Record =/ "<" Value *Value Trailer ws ">"
+         Sequence =/  "[" *Value Trailer ws "]"
+              Set =/ "#{" *Value Trailer ws "}"
+            Block =/  "{" *Value Trailer ws "}"
+
+           Trailer = 1*Annotation
+
+## <a id="encoding-pexprs"></a>Encoding P-expressions as Preserves
+
+We write ⌜*p*⌝ for the encoding into Preserves of P-expression *p*.
+
+{:.pseudocode.equations}
+| ⌜·⌝ : **P-expression** | ⟶ | **Preserves** |
+
+Aside from the special classes `Group`, `Block`, `Comma`, `Semicolon`,
+`Colons`, and `Trailer`, P-expressions are encoded directly as Preserves
+data.
+
+{:.pseudocode.equations}
+| ⌜`[`*p* ...`]`⌝  | = | `[`⌜*p*⌝ ...`]`             |
+| ⌜`<`*p* ...`>`⌝  | = | `<`⌜*p*⌝ ...`>`             |
+| ⌜`#{`*p* ...`}`⌝ | = | `#{`⌜*p*⌝ ...`}`            |
+| ⌜`#!`*p*⌝        | = | `#!`⌜*p*⌝                   |
+| ⌜`@`*p* *q*⌝     | = | `@`⌜*p*⌝ ⌜*q*⌝              |
+| ⌜*p*⌝            | = | *p* when *p* ∈ **Atom** |
+
+All members of the special classes are encoded as Preserves
+dictionaries[^encoding-rationale].
+
+[^encoding-rationale]: In principle, it would be nice to use *records*
+    for this purpose, but if we did so we would have to also encode
+    usages of records!
+
+{:.pseudocode.equations}
+| ⌜`(`*p* ...`)`⌝ | = | `{g:[`⌜*p*⌝ ...`]}` |
+| ⌜`{`*p* ...`}`⌝ | = | `{b:[`⌜*p*⌝ ...`]}` |
+| ⌜`,`⌝           | = | `{s:|,|}`           |
+| ⌜`;`⌝           | = | `{s:|;|}`           |
+| ⌜`:` ...⌝       | = | `{s:|:` ...`|}`     |
+| ⌜*t*⌝           | = | ⌜*a*⌝ ... `{}`, where *a* ... are the annotations in *t* and *t* ∈ **Trailer** |
+
+The empty dictionary `{}` acts as an anchor for the annotations in a
+`Trailer`.
+
+We overload the ⌜·⌝ notation for encoding whole `Document`s into
+sequences of Preserves values.
+
+{:.pseudocode.equations}
+| ⌜·⌝ : **P-expression Document** | ⟶ | **Preserves Sequence** |
+| ⌜*p* ...⌝                       | = | `[`⌜*p*⌝ ...`]`        |
+
+## <a id="reading-preserves"></a>Interpreting P-expressions as Preserves
+
+The [previous section](#encoding-pexprs) discussed ways of representing
+P-expressions using Preserves. Here, we discuss *interpreting*
+P-expressions *as* Preserves, so that (1) a Preserves datum (2) written
+using Preserves text syntax and then (3) read as a P-expression can be
+(4) interpreted from that P-expression to yield the original datum.
+
+A reader for P-expressions can be adapted to yield a reader for
+Preserves terms by processing (subterms of) each P-expression that the
+reader produces. The only subterms that need processing are the special
+classes mentioned above.
+
+ 1. Every `Group` or `Semicolon` that appears is an error.
+ 2. Every `Colons` with two or more colons in it is an error.
+ 3. Every `Comma` that appears is discarded.
+ 4. Every `Trailer` that appears is an error.[^discard-trailers-instead-of-error]
+ 5. Every `Block` must contain triplets of `Value`, `Colons` (with a
+    single colon), `Value`. Any `Block` not following this pattern is an
+    error. Each `Block` following the pattern is translated to a
+    `Dictionary` containing a key/value pair for each triplet.
+
+[^discard-trailers-instead-of-error]: **Implementation note.** When
+    implementing parsing of P-expressions into Preserves, consider
+    offering an optional mode where trailing annotations `Trailer` are
+    *discarded* instead of causing an error to be signalled.
+
+## Appendix: Examples
+
+Examples are given as pairs of P-expressions and their Preserves
+text-syntax encodings.
+
+### Individual P-expression `Value`s
+
+```preserves
+ ⌜<date 1821 (lookup-month "February") 3>⌝
+= <date 1821 {g:[lookup-month "February"]} 3>
+```
+
+```preserves
+ ⌜(begin (println! (+ 1 2)) (+ 3 4))⌝
+= {g:[begin {g:[println! {g:[+ 1 2]}]} {g:[+ 3 4]}]}
+```
+
+```preserves
+ ⌜()⌝
+= {g:[]}
+
+ ⌜[() () ()]⌝
+= [{g:[]}, {g:[]}, {g:[]}]
+```
+
+```preserves
+ ⌜{
+      setUp();
+      # Now enter the loop
+      loop: {
+          greet("World");
+      }
+      tearDown();
+  }⌝
+= {b:[
+      setUp {g:[]} {s:|;|}
+      # Now enter the loop
+      loop {s:|:|} {b:[
+          greet {g:["World"]} {s:|;|}
+      ]}
+      tearDown {g:[]} {s:|;|}
+  ]}
+```
+
+```preserves
+ ⌜[1 + 2.0, print "Hello", predicate: #t, foo, #!remote, bar]⌝
+= [1 + 2.0 {s:|,|} print "Hello" {s:|,|} predicate {s:|:|} #t {s:|,|}
+   foo {s:|,|} #!remote {s:|,|} bar]
+```
+
+```preserves
+ ⌜{
+      optional name: string,
+      address: Address,
+  }⌝
+= {b:[
+      optional name {s:|:|} string {s:|,|}
+      address {s:|:|} Address {s:|,|}
+  ]}
+```
+
+### Whole `Document`s
+
+```preserves
+ ⌜{
+      key: value
+      # example of a comment at the end of a dictionary
+  }
+  # example of a comment at the end of the input file⌝
+= [ {b:[
+        key {s:|:|} value
+        @"example of a comment at the end of a dictionary" {}
+    ]}
+    @"example of a comment at the end of the input file"
+    {}
+  ]
+```
+
+## Appendix: Reading vs. Parsing
+
+Lisp systems first *read* streams of bytes into S-expressions and then
+*parse* those S-expressions into more abstract structures denoting
+various kinds of program syntax. [Separation of reading from parsing is
+what gives Lisp its syntactic
+flexibility.](http://calculist.org/blog/2012/04/17/homoiconicity-isnt-the-point/)
+
+Similarly, the Apple programming language
+[Dylan](https://en.wikipedia.org/wiki/Dylan_(programming_language))
+included a reader-parser split, with the Dylan reader producing
+*D-expressions* that are somewhat similar to P-expressions.
+
+Finally, the Racket dialects
+[Honu](https://docs.racket-lang.org/honu/index.html) and
+[Something](https://github.com/tonyg/racket-something) use a
+reader-parser-macro setup, where the reader produces Racket data, the
+parser produces "syntax" and is user-extensible, and Racket's own
+modular macro system rewrites this "syntax" down to core forms to be
+compiled to machine code.
+
+Similarly, when using P-expressions as the foundation for a language, a
+generic P-expression reader can then feed into special-purpose
+*parsers*. The reader captures the coarse syntactic structure of a
+program, and the parser refines this.
+
+Often, a parser will wish to extract structure from sequences of
+P-expression `Value`s.
+
+ - A simple technique is repeated splitting of sequences: first by
+   `Semicolon`, then by `Comma`, then by increasingly high binding-power
+   operators.
+
+ - More refined is to use a Pratt parser or similar
+   ([1](https://en.wikipedia.org/wiki/Operator-precedence_parser),
+   [2](https://matklad.github.io/2020/04/13/simple-but-powerful-pratt-parsing.html),
+   [3](https://github.com/tonyg/racket-something/blob/f6116bf3861b76970f5ce291a628476adef820b4/src/something/pratt.rkt))
+   to build a parse tree using an extensible specification of the pre-,
+   in-, and postfix operators involved.
+
+ - Finally, if you treat sequences of `Value`s as pre-lexed token
+   streams, almost any parsing formalism (such as [PEG
+   parsing](https://en.wikipedia.org/wiki/Parsing_expression_grammar),
+   [Ometa](https://en.wikipedia.org/wiki/OMeta), etc.) can be used to
+   extract further syntactic structure.
+
+## Notes
--- a/preserves-text.md
+++ b/preserves-text.md
@ -55,7 +55,7 @@ Standalone documents may have trailing whitespace.
 Any `Value` may be preceded by whitespace.

             Value = ws (Record / Collection / Atom / Embedded)
-        Collection = Sequence / Dictionary / Set
+        Collection = Sequence / Set / Dictionary
              Atom = Boolean / String / ByteString /
                     QuotedSymbol / SymbolOrNumber

@ -64,18 +64,18 @@ label-`Value` followed by its field-`Value`s.

            Record = "<" Value *Value ws ">"

-`Sequence`s are enclosed in square brackets. `Dictionary` values are
-curly-brace-enclosed colon-separated pairs of values. `Set`s are
-written as values enclosed by the tokens `#{` and
-`}`.[^printing-collections] It is an error for a set to contain
+`Sequence`s are enclosed in square brackets. `Set`s are written as
+values enclosed by the tokens `#{` and `}`. `Dictionary` values are
+curly-brace-enclosed colon-separated pairs of
+values.[^printing-collections] It is an error for a set to contain
 duplicate elements or for a dictionary to contain duplicate keys. When
-printing sets and dictionaries, implementations *SHOULD* order
-elements resp. keys with respect to the [total order over
+printing sets and dictionaries, implementations *SHOULD* order elements
+resp. keys with respect to the [total order over
 `Value`s](preserves.html#total-order).[^rationale-print-ordering]

-          Sequence = "[" *Value ws "]"
-        Dictionary = "{" *(Value ws ":" Value) ws "}"
-               Set = "#{" *Value ws "}"
+          Sequence =  "["  *Value               ws "]"
+               Set = "#{"  *Value               ws "}"
+        Dictionary =  "{" *(Value ws ":" Value) ws "}"

  [^printing-collections]: **Implementation note.** When implementing
    printing of `Value`s using the textual syntax, consider supporting
@ -273,7 +273,8 @@ value. Each annotation is, in turn, a `Value`, and may itself have
 annotations. The ordering of annotations attached to a `Value` is
 significant.

-            Value =/ ws "@" Value Value
+            Value =/ ws Annotation Value
+        Annotation = "@" Value

 Each annotation is preceded by `@`; the underlying annotated value
 follows its annotations. Here we extend only the syntactic nonterminal
@ -283,7 +284,7 @@ named “`Value`” without altering the semantic class of `Value`s.
 interpreted as comments associated with that value. Comments are
 sufficiently common that special syntax exists for them.

-            Value =/ ws ("#" [(%x20 / %x09) linecomment]) (CR / LF) Value
+       Annotation =/ "#" [(%x20 / %x09) linecomment] (CR / LF)
       linecomment = *<any unicode scalar value except CR or LF>

 When written this way, everything between the hash-space or hash-tab and
--- a/preserves.css
+++ b/preserves.css
@ -1,6 +1,7 @@
 :root {
    --sans-font: "Open Sans", -apple-system, BlinkMacSystemFont, avenir next, avenir, segoe ui, helvetica neue, helvetica, Cantarell, Ubuntu, roboto, noto, arial, sans-serif;
    --serif-font: palatino, "Palatino Linotype", "Palatino LT STD", "URW Palladio L", "TeX Gyre Pagella", serif;
+    --blockquote-indent: 40px;
 }
 body {
    font-family: var(--serif-font);
@ -230,6 +231,7 @@ table.postcard-grammar {
 blockquote {
    padding: 0.5rem 1rem;
    border-left: solid #4f81bd 2px;
+    margin-left: var(--blockquote-indent);
    margin-right: 0;
 }
 blockquote :first-child {
@ -243,6 +245,10 @@ blockquote :last-child {
    background-color: #e9f0f9;
 }

+table.equations { width: auto; margin-left: var(--blockquote-indent); }
+table.equations tr > *:nth-child(1) { text-align: right; }
+table.equations tr > *:nth-child(2) { text-align: center; }
+
 blockquote.pseudocode {
    border-left: none;
    padding: 0;
--- a/preserves.md
+++ b/preserves.md
@ -104,8 +104,8 @@ the `totalOrder` predicate defined in section 5.10 of [IEEE Std

 A `Record` is a *labelled* tuple of `Value`s, the record's *fields*. A
 label can be any `Value`, but is usually a `Symbol`.[^extensibility]
-[^iri-labels] `Record`s are compared lexicographically: first by
-label, then by field sequence.
+[^iri-labels] `Record`s are ordered first by label, then
+lexicographically[^lexicographical-sequences] by field sequence.

  [^extensibility]: The [Racket](https://racket-lang.org/) programming
    language defines
@ -123,10 +123,25 @@ label, then by field sequence.
    it cannot be read as an IRI at all, and so the label simply stands
    for itself—for its own `Value`.

+  [^lexicographical-sequences]: When comparing sequences of values for
+    the total order, [lexicographical
+    ordering](https://en.wikipedia.org/wiki/Lexicographic_order) is
+    used. Elements are drawn pairwise from the two sequences to be
+    compared. If one is smaller than the other according to the total
+    order, the sequence it was drawn from is the smaller of the
+    sequences. If the end of one sequence is reached, while the other
+    sequence has elements remaining, the shorter sequence is considered
+    smaller. Otherwise, all elements compared equal and neither sequence
+    was longer than the other, so the sequences are equal. For example,
+      - `[#f]` is ordered before `[foo]` because `Boolean` appears before `Symbol` in the kind ordering;
+      - `[x]` before `[x y]` because there is no element remaining to compare against `y`;
+      - `[a b]` before `[x]` because `a` is smaller than `x`; and
+      - `[x y]` before `[x z]` because `y` is ordered before `z` according to the ordering rules for `Symbol`.
+
 ### Sequences.

 A `Sequence` is a sequence of `Value`s. `Sequence`s are compared
-lexicographically.
+lexicographically.[^lexicographical-sequences]

 ### Sets.

@ -134,15 +149,16 @@ A `Set` is an unordered finite set of `Value`s. It contains no
 duplicate values, following the [equivalence relation](#equivalence)
 induced by the total order on `Value`s. Two `Set`s are compared by
 sorting their elements ascending using the [total order](#total-order)
-and comparing the resulting `Sequence`s.
+and comparing the resulting `Sequence`s.[^lexicographical-sequences]

 ### Dictionaries.

 A `Dictionary` is an unordered finite collection of pairs of `Value`s.
 Each pair comprises a *key* and a *value*. Keys in a `Dictionary` are
 pairwise distinct. Instances of `Dictionary` are compared by
-lexicographic comparison of the sequences resulting from ordering each
-`Dictionary`'s pairs in ascending order by key.
+lexicographic[^lexicographical-sequences] comparison of the sequences
+resulting from ordering each `Dictionary`'s pairs in ascending order by
+key.

 ### Embeddeds.

@ -194,8 +210,12 @@ sequences use [the Preserves binary encoding](preserves-binary.html).

 The total ordering specified [above](#total-order) means that the following statements are true:

-    "bzz" < "c" < "caa" < #!"a"
-    #t < 3.0f < 3.0 < 3 < "3" < |3| < [] < #!#t
+ - `"bzz"` &lt; `"c"` &lt; `"caa"` &lt; `#!"a"`
+ - `#t` &lt; `3.0f` &lt; `3.0` &lt; `3` &lt; `"3"` &lt; `|3|` &lt; `[]` &lt; `#!#t`
+ - `[#f]` &lt; `[foo]`, because `Boolean` appears before `Symbol` in the kind ordering
+ - `[x]` &lt; `[x y]`, because there is no element remaining to compare against `y`
+ - `[a b]` &lt; `[x]`, because `a` is smaller than `x`
+ - `[x y]` &lt; `[x z]`, because `y` is ordered before `z`

 ### Simple examples.

--- a/tests/samples.bin
+++ b/tests/samples.bin
--- a/tests/samples.pr
+++ b/tests/samples.pr
@ -118,6 +118,9 @@
  float14: @"+qNaN" <Test #x"87047fc00111" #xf"7fc00111">
  float15: @"-qNaN" <Test #x"8704ffc00001" #xf"ffc00001">
  float16: @"-qNaN" <Test #x"8704ffc00111" #xf"ffc00111">
+  int-98765432109876543210987654321098765432109: <Test #x"b012feddc125aed4226c770369269596ce3f0ad3" -98765432109876543210987654321098765432109>
+  int-12345678123456781234567812345678: <Test #x"b00eff642cf6684f11d1dad08c4a10b2" -12345678123456781234567812345678>
+  int-1234567812345678123456781234567: <Test #x"b00df06ae570d4b4fb62ae746dce79" -1234567812345678123456781234567>
  int-257: <Test #x"b002feff" -257>
  int-256: <Test #x"b002ff00" -256>
  int-255: <Test #x"b002ff01" -255>
@ -146,7 +149,10 @@
  int65536: <Test #x"b003010000" 65536>
  int131072: <Test #x"b003020000" 131072>
  int2500000000: <Test #x"b005009502f900" 2500000000>
+  int1234567812345678123456781234567: <Test #x"b00d0f951a8f2b4b049d518b923187" 1234567812345678123456781234567>
+  int12345678123456781234567812345678: <Test #x"b00e009bd30997b0ee2e252f73b5ef4e" 12345678123456781234567812345678>
  int87112285931760246646623899502532662132736: <Test #x"b012010000000000000000000000000000000000" 87112285931760246646623899502532662132736>
+  int98765432109876543210987654321098765432109: <Test #x"b01201223eda512bdd9388fc96d96a6931c0f52d" 98765432109876543210987654321098765432109>
  list0: <Test #x"b584" []>
  list4: <Test #x"b5b00101b00102b00103b0010484" [1 2 3 4]>
  list4a: <Test #x"b5b00101b00102b00103b0010484" [1, 2, 3, 4]>