preserves/implementations/javascript/packages/core/src/reader.ts

444 lines
15 KiB
TypeScript

// Text syntax reader.
import type { Value } from './values';
import { DecodeError, ShortPacket } from './codec';
import { Dictionary, Set } from './dictionary';
import { strip } from './strip';
import { Bytes, underlying, unhexDigit } from './bytes';
import { Decoder, DecoderState, neverEmbeddedTypeDecode } from './decoder';
import { Record } from './record';
import { Annotated, newPosition, Position, updatePosition } from './annotated';
import { Double, DoubleFloat, FloatType, Single, SingleFloat } from './float';
import { stringify } from './text';
import { embed, GenericEmbedded, EmbeddedTypeDecode } from './embedded';
/** Options accepted by the `ReaderState` constructor. */
export interface ReaderStateOptions {
// When true, values produced by `Reader` are wrapped in `Annotated` and
// annotations/comments are attached to them; when false or absent
// (treated as false), no wrapping or attaching is done.
includeAnnotations?: boolean;
// Where position tracking starts: a string is handed to `newPosition`,
// a `Position` object is shallow-copied, and when absent a default
// `newPosition()` is used.
name?: string | Position;
}
/** Options accepted by the string-buffer form of the `Reader` constructor. */
export interface ReaderOptions<T> extends ReaderStateOptions {
// Decoder whose `fromValue` is used to build embedded (`#!`) values.
// When absent, `Reader` falls back to `neverEmbeddedTypeDecode`.
embeddedDecode?: EmbeddedTypeDecode<T>;
}
// NOTE(review): none of the three aliases below is referenced elsewhere in
// the visible part of this file — confirm whether they are still needed.
// Tag naming the two kinds of numeric text: integer or floating point.
type IntOrFloat = 'int' | 'float';
// Union of the numeric representations: a plain number or a wrapped
// single-/double-precision float.
type Numeric = number | SingleFloat | DoubleFloat;
// Function taking a numeric kind and accumulated text, yielding a `Numeric`.
type IntContinuation = (kind: IntOrFloat, acc: string) => Numeric;
// Matches a whole numeric token: an optionally signed run of decimal digits,
// optionally followed by a fraction and/or exponent and then an optional
// `f`/`F` marker. `ReaderState.readRawSymbolOrNumber` uses it to decide
// whether a bare token is a number or a symbol.
export const NUMBER_RE: RegExp = /^([-+]?\d+)(((\.\d+([eE][-+]?\d+)?)|([eE][-+]?\d+))([fF]?))?$/;
// Capture groups:
// 1 - sign and integer part
// 2 - fraction, exponent and Float marker together (undefined for a plain integer)
// 3 - fraction and/or exponent, without the Float marker
// 4 - fraction together with its optional exponent
// 5 - exponent that follows a fraction
// 6 - exponent with no fraction before it
// 7 - Float marker (`f`/`F`); empty string when group 2 matched without a marker
/**
 * Cursor over a buffer of text-syntax input.
 *
 * Tracks a raw offset into the buffer alongside a `Position` that is updated
 * for every consumed character, and provides the character-level readers
 * (hex, base64, string-like literals, bare symbols/numbers) that `Reader`
 * builds on. Reading past the end of the buffer throws `ShortPacket`.
 */
export class ReaderState {
    /** Input text; `index` is an offset into this string. */
    buffer: string;
    /** Position of the next unread character; mutated by `advance()`. */
    pos: Position;
    /** Offset of the next unread character within `buffer`. */
    index: number;
    /** Running total of consumed characters dropped from `buffer` by `write()`. */
    discarded = 0;
    /** Options supplied at construction time. */
    options: ReaderStateOptions;

    /**
     * @param buffer  initial input text
     * @param options reader options; `options.name` selects the starting
     *                position (absent: default position, string: passed to
     *                `newPosition`, object: shallow-copied)
     */
    constructor(buffer: string, options: ReaderStateOptions) {
        this.buffer = buffer;
        switch (typeof options.name) {
            case 'undefined':
                this.pos = newPosition();
                break;
            case 'string':
                this.pos = newPosition(options.name);
                break;
            case 'object':
                this.pos = { ... options.name };
                break;
        }
        this.index = 0;
        this.options = options;
    }

    /**
     * Throws a `DecodeError` carrying `message` and a copy of `pos`.
     * Never returns.
     */
    error(message: string, pos: Position): never {
        const where = { ... pos };
        throw new DecodeError(message, where);
    }

    /** Whether annotations are requested; an absent option counts as false. */
    get includeAnnotations(): boolean {
        const { includeAnnotations } = this.options;
        return includeAnnotations ?? false;
    }

    /** Returns a shallow copy of the current position. */
    copyPos(): Position {
        const snapshot = { ... this.pos };
        return snapshot;
    }

    /**
     * Supplies more input. Already-consumed text is dropped, any unread
     * remainder is kept in front of `data`, and the count of dropped
     * characters is added to `discarded`.
     */
    write(data: string) {
        const consumed = this.index;
        this.buffer = this.atEnd() ? data : this.buffer.substring(consumed) + data;
        this.discarded += consumed;
        this.index = 0;
    }

    /** True when no unread characters remain in the buffer. */
    atEnd(): boolean {
        return this.buffer.length <= this.index;
    }

    /**
     * Returns the next character without consuming it.
     * @throws ShortPacket when the buffer is exhausted
     */
    peek(): string {
        if (!this.atEnd()) return this.buffer[this.index];
        throw new ShortPacket("Short term", this.pos);
    }

    /**
     * Consumes one character, updating `pos` for it, and returns the offset
     * the character was read from. Does not check for end of input.
     */
    advance(): number {
        const consumedAt = this.index;
        this.index = consumedAt + 1;
        updatePosition(this.pos, this.buffer[consumedAt]);
        return consumedAt;
    }

    /**
     * Consumes and returns the next character.
     * @throws ShortPacket when the buffer is exhausted
     */
    nextchar(): string {
        if (this.atEnd()) throw new ShortPacket("Short term", this.pos);
        const at = this.advance();
        return this.buffer[at];
    }

    /**
     * Consumes the next character and returns its UTF-16 code unit.
     * @throws ShortPacket when the buffer is exhausted
     */
    nextcharcode(): number {
        if (this.atEnd()) throw new ShortPacket("Short term", this.pos);
        const at = this.advance();
        return this.buffer.charCodeAt(at);
    }

    /** Consumes characters for as long as `isSpace` accepts them. */
    skipws() {
        while (!this.atEnd() && isSpace(this.peek())) {
            this.advance();
        }
    }

    /** Reads two hex digits (via `unhexDigit`) and combines them into one byte value. */
    readHex2(): number {
        const high = unhexDigit(this.nextcharcode());
        const low = unhexDigit(this.nextcharcode());
        return (high << 4) | low;
    }

    /** Reads four hex digits (via `unhexDigit`), most significant first, into one 16-bit value. */
    readHex4(): number {
        let value = 0;
        for (let digit = 0; digit < 4; digit++) {
            // Shifting the accumulator is bitwise-equivalent to OR-ing the
            // four individually shifted digits.
            value = (value << 4) | unhexDigit(this.nextcharcode());
        }
        return value;
    }

    /**
     * Reads whitespace-separated hex byte pairs up to and including the
     * closing double quote, returning them as `Bytes`.
     */
    readHexBinary(): Bytes {
        const octets: number[] = [];
        this.skipws();
        while (this.peek() !== '"') {
            octets.push(this.readHex2());
            this.skipws();
        }
        this.advance(); // consume the closing quote
        return Bytes.from(octets);
    }

    /**
     * Reads a double-quoted hex-encoded float of the given precision.
     * Reports an error at the starting position when the opening quote is
     * missing or the byte count is not 4 (Single) / 8 (Double).
     */
    readHexFloat(precision: FloatType): SingleFloat | DoubleFloat {
        const start = this.copyPos();
        const opener = this.nextchar();
        if (opener !== '"') {
            this.error("Missing open-double-quote in hex-encoded floating-point number", start);
        }
        const raw = this.readHexBinary();
        switch (precision) {
            case 'Single':
                if (raw.length !== 4) {
                    this.error("Incorrect number of bytes in hex-encoded Float", start);
                }
                return SingleFloat.fromBytes(raw);
            case 'Double':
                if (raw.length !== 8) {
                    this.error("Incorrect number of bytes in hex-encoded Double", start);
                }
                return DoubleFloat.fromBytes(raw);
        }
    }

    /**
     * Collects characters (skipping whitespace) up to the closing `]` and
     * decodes the collected text with `decodeBase64`.
     */
    readBase64Binary(): Bytes {
        let encoded = '';
        while (true) {
            this.skipws();
            const c = this.nextchar();
            if (c === ']') return decodeBase64(encoded);
            encoded += c;
        }
    }

    /**
     * Extends `acc` with characters up to the next delimiter, whitespace or
     * end of buffer, then interprets the token: text matching `NUMBER_RE`
     * becomes a number, `Double` or `Single`; anything else becomes a symbol.
     */
    readRawSymbolOrNumber<T>(acc: string): Value<T> {
        const delimiters = '(){}[]<>";,@#:|';
        while (!this.atEnd()) {
            const ch = this.peek();
            if (isSpace(ch) || delimiters.indexOf(ch) !== -1) break;
            this.advance();
            acc += ch;
        }

        const m = NUMBER_RE.exec(acc);
        if (m === null) return Symbol.for(acc);

        if (m[2] === void 0) {
            // Plain integer; normalise negative zero to zero.
            const n = parseInt(m[1]);
            return Object.is(n, -0) ? 0 : n;
        }

        const parsed = parseFloat(m[1] + m[3]);
        // An empty Float-marker group means double precision.
        return m[7] === '' ? Double(parsed) : Single(parsed);
    }

    /**
     * Generic reader for quoted, backslash-escaped literals.
     *
     * @param xform      converts one literal character into an element
     * @param finish     converts the collected elements into the result
     * @param terminator character that ends the literal
     * @param hexescape  escape letter that introduces a hex escape
     * @param hex        reads the body of a hex escape and yields an element
     */
    readStringlike<E, R>(xform: (ch: string) => E,
                         finish: (acc: E[]) => R,
                         terminator: string,
                         hexescape: string,
                         hex: () => E): R
    {
        const collected: E[] = [];
        while (true) {
            const ch = this.nextchar();
            if (ch === terminator) return finish(collected);
            if (ch !== '\\') {
                collected.push(xform(ch));
                continue;
            }

            const esc = this.nextchar();
            if (esc === hexescape) {
                collected.push(hex());
                continue;
            }
            switch (esc) {
                case terminator:
                case '\\':
                case '/':
                    collected.push(xform(esc));
                    break;
                case 'b': collected.push(xform('\x08')); break;
                case 'f': collected.push(xform('\x0c')); break;
                case 'n': collected.push(xform('\x0a')); break;
                case 'r': collected.push(xform('\x0d')); break;
                case 't': collected.push(xform('\x09')); break;
                default:
                    this.error(`Invalid escape code \\${esc}`, this.pos);
            }
        }
    }

    /**
     * Reads the remainder of a string literal ending at `terminator`.
     * `\uXXXX` escapes are supported; an escape in the surrogate range must
     * be a high surrogate immediately followed by a `\u` low surrogate.
     */
    readString(terminator: string): string {
        const readUnicodeEscape = (): string => {
            const first = this.readHex4();
            const inSurrogateRange = (first >= 0xd800) && (first <= 0xdfff);
            if (!inSurrogateRange) return String.fromCharCode(first);

            if ((this.nextchar() === '\\') && (this.nextchar() === 'u')) {
                const second = this.readHex4();
                const validPair =
                    (second >= 0xdc00) && (second <= 0xdfff) && (first <= 0xdbff);
                if (validPair) return String.fromCharCode(first, second);
            }
            return this.error('Invalid surrogate pair', this.pos);
        };
        return this.readStringlike(
            (ch: string) => ch,
            (pieces: string[]) => pieces.join(''),
            terminator,
            'u',
            readUnicodeEscape);
    }

    /**
     * Reads the remainder of a double-quoted literal binary. Each literal
     * character must have a code below 256; `\xHH` escapes give raw bytes.
     */
    readLiteralBinary(): Bytes {
        const toByte = (ch: string): number => {
            const code = ch.charCodeAt(0);
            if (code >= 256) {
                this.error(`Invalid code point ${code} in literal binary`, this.pos);
            }
            return code;
        };
        return this.readStringlike(toByte, Bytes.from, '"', 'x', () => this.readHex2());
    }
}
/**
 * Embedded-type decoder that represents embedded values as `GenericEmbedded`
 * wrappers around ordinary values.
 */
export const genericEmbeddedTypeDecode: EmbeddedTypeDecode<GenericEmbedded> = {
    /** Decodes the next value from `s`, using this same decoder for nested embedded values. */
    decode(s: DecoderState): GenericEmbedded {
        const inner = new Decoder(s, this).next();
        return new GenericEmbedded(inner);
    },

    /** Wraps `v`, stripping it first unless `options.includeAnnotations` is truthy. */
    fromValue(v: Value<GenericEmbedded>, options: ReaderStateOptions): GenericEmbedded {
        const payload = options.includeAnnotations ? v : strip(v);
        return new GenericEmbedded(payload);
    },
};
/**
 * Reader for the text syntax, producing `Value<T>` terms from a
 * `ReaderState`. Embedded (`#!`) values are built with `embeddedType`.
 */
export class Reader<T> {
    /** Underlying character cursor. */
    state: ReaderState;
    /** Decoder used to build embedded values. */
    embeddedType: EmbeddedTypeDecode<T>;

    constructor(state: ReaderState, embeddedType: EmbeddedTypeDecode<T>);
    constructor(buffer: string, options?: ReaderOptions<T>);
    /**
     * Either wraps an existing `ReaderState` with the given embedded-type
     * decoder, or creates a fresh state over a string buffer (default `''`)
     * using `options.embeddedDecode`, falling back to
     * `neverEmbeddedTypeDecode`.
     */
    constructor(
        state_or_buffer: (ReaderState | string) = '',
        embeddedType_or_options?: (EmbeddedTypeDecode<T> | ReaderOptions<T>))
    {
        if (state_or_buffer instanceof ReaderState) {
            this.state = state_or_buffer;
            this.embeddedType = embeddedType_or_options as EmbeddedTypeDecode<T>;
            return;
        }
        const options = (embeddedType_or_options as ReaderOptions<T>) ?? {};
        this.state = new ReaderState(state_or_buffer, options);
        this.embeddedType = options.embeddedDecode ?? neverEmbeddedTypeDecode;
    }

    /** Supplies more input text to the underlying state. */
    write(data: string) {
        this.state.write(data);
    }

    /**
     * Reads the rest of a comment line, up to and including the first `\n`
     * or `\r`, and returns its text (wrapped when annotations are enabled).
     */
    readCommentLine(): Value<T> {
        const startPos = this.state.copyPos();
        let text = '';
        for (let c = this.state.nextchar();
             c !== '\n' && c !== '\r';
             c = this.state.nextchar())
        {
            text += c;
        }
        return this.wrap(text, startPos);
    }

    /**
     * When annotations are enabled and `v` is not already annotated, wraps
     * it in an `Annotated` carrying `pos`; otherwise returns `v` unchanged.
     */
    wrap(v: Value<T>, pos: Position): Value<T> {
        if (!this.state.includeAnnotations) return v;
        if (Annotated.isAnnotated(v)) return v;
        return new Annotated(v, pos);
    }

    /**
     * Reads the next term and, when annotations are enabled, prepends `v`
     * to its annotations.
     * @throws DecodeError when only whitespace remains in the buffer
     */
    annotateNextWith(v: Value<T>): Value<T> {
        this.state.skipws();
        if (this.state.atEnd()) {
            throw new DecodeError("Trailing annotations and comments are not permitted",
                                  this.state.pos);
        }
        const target = this.next();
        if (this.state.includeAnnotations) {
            (target as Annotated<T>).annotations.unshift(v);
        }
        return target;
    }

    /** Reads terms until only whitespace (or nothing) remains in the buffer. */
    readToEnd(): Array<Value<T>> {
        const terms: Array<Value<T>> = [];
        this.state.skipws();
        while (!this.state.atEnd()) {
            terms.push(this.next());
            this.state.skipws();
        }
        return terms;
    }

    /**
     * Reads one term, dispatching on its first non-whitespace character,
     * and wraps the result with the position where the term started.
     */
    next(): Value<T> {
        this.state.skipws();
        const startPos = this.state.copyPos();

        // Handles everything introduced by `#`.
        const readHashForm = (): Value<T> => {
            const tag = this.state.nextchar();
            switch (tag) {
                case 'f': return false;
                case 't': return true;
                case '{': return this.seq(new Set<T>(), (v, s) => s.add(v), '}');
                case '"': return this.state.readLiteralBinary();
                case 'x': {
                    const kind = this.state.nextchar();
                    if (kind === '"') return this.state.readHexBinary();
                    if (kind === 'f') return this.state.readHexFloat('Single');
                    if (kind === 'd') return this.state.readHexFloat('Double');
                    return this.state.error('Invalid #x syntax', startPos);
                }
                case '[': return this.state.readBase64Binary();
                case '!':
                    // The embedded term is read generically from the shared
                    // state, then converted by this reader's embedded type.
                    return embed(this.embeddedType.fromValue(
                        new Reader<GenericEmbedded>(this.state, genericEmbeddedTypeDecode).next(),
                        this.state.options));
                default:
                    return this.state.error(`Invalid # syntax: ${tag}`, startPos);
            }
        };

        // Reads the term itself, before position/annotation wrapping.
        const readUnwrapped = (): Value<T> => {
            const c = this.state.nextchar();
            switch (c) {
                case '"':
                    return this.state.readString('"');
                case '|':
                    return Symbol.for(this.state.readString('|'));
                case ';':
                    return this.annotateNextWith(this.readCommentLine());
                case '@':
                    return this.annotateNextWith(this.next());
                case '#':
                    return readHashForm();
                case '<': {
                    const label = this.next();
                    const fields = this.readSequence('>');
                    return Record(label, fields);
                }
                case '[':
                    return this.readSequence(']');
                case '{':
                    return this.readDictionary();
                case ':':
                    return this.state.error(
                        'Unexpected key/value separator between items', startPos);
                case '>':
                    return this.state.error('Unexpected >', startPos);
                case ']':
                    return this.state.error('Unexpected ]', startPos);
                case '}':
                    return this.state.error('Unexpected }', startPos);
                default:
                    return this.state.readRawSymbolOrNumber(c);
            }
        };

        return this.wrap(readUnwrapped(), startPos);
    }

    /**
     * Reads terms until the closing character `ch` (which is consumed),
     * feeding each term to `update` together with `acc`; returns `acc`.
     */
    seq<S>(acc: S, update: (v: Value<T>, acc: S) => void, ch: string): S {
        this.state.skipws();
        while (this.state.peek() !== ch) {
            update(this.next(), acc);
            this.state.skipws();
        }
        this.state.advance(); // consume the closer
        return acc;
    }

    /** Reads terms into an array until the closing character `ch`. */
    readSequence(ch: string): Array<Value<T>> {
        const items: Array<Value<T>> = [];
        return this.seq(items, (v, sink) => { sink.push(v); }, ch);
    }

    /**
     * Reads `key: value` entries until the closing `}`. Reports an error
     * for a missing `:` separator or a key that is already present.
     */
    readDictionary(): Dictionary<T> {
        const addEntry = (k: Value<T>, dict: Dictionary<T>): void => {
            this.state.skipws();
            if (this.state.peek() !== ':') {
                this.state.error('Missing key/value separator', this.state.pos);
            }
            if (dict.has(k)) {
                this.state.error(`Duplicate key: ${stringify(k)}`, this.state.pos);
            }
            this.state.advance(); // consume ':'
            dict.set(k, this.next());
        };
        return this.seq(new Dictionary<T>(), addEntry, '}');
    }
}
// Lookup table from base64 digit to its 6-bit value. Both the standard
// (`+`, `/`) and URL-safe (`-`, `_`) digits for 62 and 63 are accepted.
const BASE64: {[key: string]: number} = {};
Array.from('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789')
    .forEach((digit, value) => { BASE64[digit] = value; });
BASE64['-'] = 62;
BASE64['+'] = 62;
BASE64['_'] = 63;
BASE64['/'] = 63;
/**
 * Decodes base64 text into `Bytes`.
 *
 * Digits are looked up in the `BASE64` table, so both the standard and the
 * URL-safe alphabets are accepted. Decoding is lenient: it stops at the
 * first character that is not a base64 digit (for example `=` padding), and
 * padding is optional.
 *
 * A group needs at least two digits to encode one byte. A dangling single
 * digit, or a group that starts with a non-digit, therefore contributes no
 * output (previously such input could yield a spurious extra byte).
 *
 * @param s base64 text, without whitespace
 * @returns the decoded bytes
 */
export function decodeBase64(s: string): Bytes {
    const bs = new Uint8Array(Math.floor(s.length * 3/4));
    let i = 0;
    let j = 0;
    while (i < s.length) {
        const v1 = BASE64[s[i++]];
        const v2 = BASE64[s[i++]];
        // Fewer than two valid digits cannot form a byte: stop without
        // emitting anything for this group.
        if (v1 === void 0 || v2 === void 0) break;
        const v3 = BASE64[s[i++]];
        const v4 = BASE64[s[i++]];
        // Missing digits are `undefined`, which the bitwise operators treat as 0.
        const v = (v1 << 18) | (v2 << 12) | (v3 << 6) | v4;
        bs[j++] = (v >> 16) & 255;
        if (v3 === void 0) break;
        bs[j++] = (v >> 8) & 255;
        if (v4 === void 0) break;
        bs[j++] = v & 255;
    }
    return Bytes.from(bs.subarray(0, j));
}
/**
 * Reports whether `s` occurs within the whitespace set: space, tab, line
 * feed, carriage return and comma (commas are treated as whitespace).
 */
function isSpace(s: string): boolean {
    const whitespace = ' \t\n\r,';
    return whitespace.includes(s);
}