diff --git a/implementations/javascript/packages/core/src/bytes.ts b/implementations/javascript/packages/core/src/bytes.ts index 40cae0b..c53c8e4 100644 --- a/implementations/javascript/packages/core/src/bytes.ts +++ b/implementations/javascript/packages/core/src/bytes.ts @@ -35,6 +35,10 @@ export class Bytes implements Preservable, PreserveWritable { } } + dataview(): DataView { + return new DataView(this._view.buffer, this._view.byteOffset, this._view.byteLength); + } + get length(): number { return this._view.length; } @@ -179,6 +183,10 @@ export function underlying(b: Bytes | Uint8Array): Uint8Array { return (b instanceof Uint8Array) ? b : b._view; } +export function dataview(b: Bytes | DataView): DataView { + return (b instanceof DataView) ? b : b.dataview(); +} + // Uint8Array / TypedArray methods export interface Bytes { diff --git a/implementations/javascript/packages/core/src/decoder.ts b/implementations/javascript/packages/core/src/decoder.ts index f5316c1..25b35e8 100644 --- a/implementations/javascript/packages/core/src/decoder.ts +++ b/implementations/javascript/packages/core/src/decoder.ts @@ -216,8 +216,8 @@ export class Decoder implements TypedDecoder { switch (tag) { case Tag.False: return this.state.wrap(false); case Tag.True: return this.state.wrap(true); - case Tag.Float: return this.state.wrap(new SingleFloat(this.state.nextbytes(4).getFloat32(0, false))); - case Tag.Double: return this.state.wrap(new DoubleFloat(this.state.nextbytes(8).getFloat64(0, false))); + case Tag.Float: return this.state.wrap(SingleFloat.fromBytes(this.state.nextbytes(4))); + case Tag.Double: return this.state.wrap(DoubleFloat.fromBytes(this.state.nextbytes(8))); case Tag.End: throw new DecodeError("Unexpected Compound end marker"); case Tag.Annotation: { const a = this.next(); @@ -294,7 +294,7 @@ export class Decoder implements TypedDecoder { nextFloat(): SingleFloat | undefined { this.skipAnnotations(); switch (this.state.nextbyte()) { - case Tag.Float: return new 
SingleFloat(this.state.nextbytes(4).getFloat32(0, false)); + case Tag.Float: return SingleFloat.fromBytes(this.state.nextbytes(4)); default: return void 0; } } @@ -302,7 +302,7 @@ export class Decoder implements TypedDecoder { nextDouble(): DoubleFloat | undefined { this.skipAnnotations(); switch (this.state.nextbyte()) { - case Tag.Double: return new DoubleFloat(this.state.nextbytes(8).getFloat64(0, false)); + case Tag.Double: return DoubleFloat.fromBytes(this.state.nextbytes(8)); default: return void 0; } } diff --git a/implementations/javascript/packages/core/src/float.ts b/implementations/javascript/packages/core/src/float.ts index a831a44..a95e943 100644 --- a/implementations/javascript/packages/core/src/float.ts +++ b/implementations/javascript/packages/core/src/float.ts @@ -4,6 +4,7 @@ import { Value } from "./values"; import type { GenericEmbedded } from "./embedded"; import type { Encoder, Preservable } from "./encoder"; import type { Writer, PreserveWritable } from "./writer"; +import { Bytes, dataview, underlying } from "./bytes"; export type FloatType = 'Single' | 'Double'; export const FloatType = Symbol.for('FloatType'); @@ -19,8 +20,15 @@ export abstract class Float { return stringify(this); } + abstract toBytes(): Bytes; + equals(other: any): boolean { - return Object.is(other.constructor, this.constructor) && (other.value === this.value); + if (!Object.is(other.constructor, this.constructor)) return false; + if (Number.isNaN(this.value) && Number.isNaN(other.value)) { + return other.toBytes().equals(this.toBytes()); + } else { + return Object.is(other.value, this.value); + } } hashCode(): number { @@ -44,24 +52,72 @@ export function floatValue(f: any): number { } } +export function floatlikeString(f: number): string { + if (Object.is(f, -0)) return '-0.0'; + const s = '' + f; + if (s.includes('.') || s.includes('e') || s.includes('E')) return s; + return s + '.0'; +} + export class SingleFloat extends Float implements Preservable, PreserveWritable 
{ __as_preserve__(): Value { return this; } + static fromBytes(bs: Bytes | DataView): SingleFloat { + const view = dataview(bs); + const vf = view.getInt32(0, false); + if ((vf & 0x7f800000) === 0x7f800000) { + // NaN or inf. Preserve quiet/signalling bit by manually expanding to double-precision. + const sign = vf >> 31; + const payload = vf & 0x007fffff; + const dbs = new Bytes(8); + const dview = dataview(dbs); + dview.setInt16(0, (sign << 15) | 0x7ff0 | (payload >> 19), false); + dview.setInt32(2, (payload & 0x7ffff) << 13, false); + return new SingleFloat(dview.getFloat64(0, false)); + } else { + return new SingleFloat(dataview(bs).getFloat32(0, false)); + } + } + static __from_preserve__(v: Value): undefined | SingleFloat { return Float.isSingle(v) ? v : void 0; } + __w(v: DataView, offset: number) { + if (Number.isNaN(this.value)) { + const dbs = new Bytes(8); + const dview = dataview(dbs); + dview.setFloat64(0, this.value, false); + const sign = dview.getInt8(0) >> 7; + const payload = (dview.getInt32(1, false) >> 5) & 0x007fffff; + const vf = (sign << 31) | 0x7f800000 | payload; + v.setInt32(offset, vf, false); + } else { + v.setFloat32(offset, this.value, false); + } + } + __preserve_on__(encoder: Encoder) { encoder.state.emitbyte(Tag.Float); encoder.state.makeroom(4); - encoder.state.view.setFloat32(encoder.state.index, this.value, false); + this.__w(encoder.state.view, encoder.state.index); encoder.state.index += 4; } + toBytes(): Bytes { + const bs = new Bytes(4); + this.__w(bs.dataview(), 0); + return bs; + } + __preserve_text_on__(w: Writer) { - w.state.pieces.push('' + this.value + 'f'); + if (Number.isFinite(this.value)) { + w.state.pieces.push(floatlikeString(this.value) + 'f'); + } else { + w.state.pieces.push('#xf"', this.toBytes().toHex(), '"'); + } } get [FloatType](): 'Single' { @@ -78,6 +134,10 @@ export class DoubleFloat extends Float implements Preservable, PreserveWrit return this; } + static fromBytes(bs: Bytes | DataView): DoubleFloat { 
+ return new DoubleFloat(dataview(bs).getFloat64(0, false)); + } + static __from_preserve__(v: Value): undefined | DoubleFloat { return Float.isDouble(v) ? v : void 0; } @@ -89,8 +149,18 @@ export class DoubleFloat extends Float implements Preservable, PreserveWrit encoder.state.index += 8; } + toBytes(): Bytes { + const bs = new Bytes(8); + bs.dataview().setFloat64(0, this.value, false); + return bs; + } + __preserve_text_on__(w: Writer) { - w.state.pieces.push('' + this.value); + if (Number.isFinite(this.value)) { + w.state.pieces.push(floatlikeString(this.value)); + } else { + w.state.pieces.push('#xd"', this.toBytes().toHex(), '"'); + } } get [FloatType](): 'Double' { diff --git a/implementations/javascript/packages/core/src/reader.ts b/implementations/javascript/packages/core/src/reader.ts index 220f1f7..4c914d7 100644 --- a/implementations/javascript/packages/core/src/reader.ts +++ b/implementations/javascript/packages/core/src/reader.ts @@ -3,12 +3,12 @@ import type { Value } from './values'; import { DecodeError, ShortPacket } from './codec'; import { Dictionary, Set } from './dictionary'; -import { strip, unannotate } from './strip'; -import { Bytes, unhexDigit } from './bytes'; -import { decode, Decoder, DecoderState, neverEmbeddedTypeDecode } from './decoder'; +import { strip } from './strip'; +import { Bytes, underlying, unhexDigit } from './bytes'; +import { Decoder, DecoderState, neverEmbeddedTypeDecode } from './decoder'; import { Record } from './record'; import { Annotated, newPosition, Position, updatePosition } from './annotated'; -import { Double, DoubleFloat, Single, SingleFloat } from './float'; +import { Double, DoubleFloat, FloatType, Single, SingleFloat } from './float'; import { stringify } from './text'; import { embed, GenericEmbedded, EmbeddedTypeDecode } from './embedded'; @@ -25,6 +25,13 @@ type IntOrFloat = 'int' | 'float'; type Numeric = number | SingleFloat | DoubleFloat; type IntContinuation = (kind: IntOrFloat, acc: string) => 
Numeric; +export const NUMBER_RE: RegExp = /^([-+]?\d+)(((\.\d+([eE][-+]?\d+)?)|([eE][-+]?\d+))([fF]?))?$/; +// Groups: +// 1 - integer part and sign +// 2 - decimal part, exponent and Float marker +// 3 - decimal part and exponent +// 7 - Float marker + export class ReaderState { buffer: string; pos: Position; @@ -124,6 +131,22 @@ export class ReaderState { } } + readHexFloat(precision: FloatType): SingleFloat | DoubleFloat { + const pos = this.copyPos(); + if (this.nextchar() !== '"') { + this.error("Missing open-double-quote in hex-encoded floating-point number", pos); + } + const bs = this.readHexBinary(); + switch (precision) { + case 'Single': + if (bs.length !== 4) this.error("Incorrect number of bytes in hex-encoded Float", pos); + return SingleFloat.fromBytes(bs); + case 'Double': + if (bs.length !== 8) this.error("Incorrect number of bytes in hex-encoded Double", pos); + return DoubleFloat.fromBytes(bs); + } + } + readBase64Binary(): Bytes { let acc = ''; while (true) { @@ -135,67 +158,7 @@ export class ReaderState { return decodeBase64(acc); } - readIntpart(acc: string, ch: string): Numeric { - if (ch === '0') return this.readFracexp('int', acc + ch); - return this.readDigit1('int', acc, (kind, acc) => this.readFracexp(kind, acc), ch); - } - - readDigit1(kind: IntOrFloat, acc: string, k: IntContinuation, ch?: string): Numeric { - if (ch === void 0) ch = this.nextchar(); - if (ch >= '0' && ch <= '9') return this.readDigit0(kind, acc + ch, k); - this.error('Incomplete number', this.pos); - } - - readDigit0(kind: IntOrFloat, acc: string, k: IntContinuation): Numeric { - while (true) { - if (this.atEnd()) break; - const ch = this.peek(); - if (!(ch >= '0' && ch <= '9')) break; - this.advance(); - acc = acc + ch; - } - return k(kind, acc); - } - - readFracexp(kind: IntOrFloat, acc: string): Numeric { - if (!this.atEnd() && this.peek() === '.') { - this.advance(); - return this.readDigit1('float', acc + '.', (kind, acc) => this.readExp(kind, acc)); - } - 
return this.readExp(kind, acc); - } - - readExp(kind: IntOrFloat, acc: string): Numeric { - const ch = this.atEnd() ? '' : this.peek(); - if (ch === 'e' || ch === 'E') { - this.advance(); - return this.readSignAndExp(acc + ch); - } - return this.finishNumber(kind, acc); - } - - readSignAndExp(acc: string): Numeric { - const ch = this.peek(); - if (ch === '+' || ch === '-') { - this.advance(); - return this.readDigit1('float', acc + ch, (kind, acc) => this.finishNumber(kind, acc)); - } - return this.readDigit1('float', acc, (kind, acc) => this.finishNumber(kind, acc)); - } - - finishNumber(kind: IntOrFloat, acc: string): Numeric { - const i = parseFloat(acc); - if (kind === 'int') return i; - const ch = this.atEnd() ? '' : this.peek(); - if (ch === 'f' || ch === 'F') { - this.advance(); - return Single(i); - } else { - return Double(i); - } - } - - readRawSymbol(acc: string): Value { + readRawSymbolOrNumber(acc: string): Value { while (true) { if (this.atEnd()) break; const ch = this.peek(); @@ -203,7 +166,20 @@ export class ReaderState { this.advance(); acc = acc + ch; } - return Symbol.for(acc); + const m = NUMBER_RE.exec(acc); + if (m) { + if (m[2] === void 0) { + let v = parseInt(m[1]); + if (Object.is(v, -0)) v = 0; + return v; + } else if (m[7] === '') { + return Double(parseFloat(m[1] + m[3])); + } else { + return Single(parseFloat(m[1] + m[3])); + } + } else { + return Symbol.for(acc); + } } readStringlike(xform: (ch: string) => E, @@ -355,11 +331,6 @@ export class Reader { const unwrapped = ((): Value => { const c = this.state.nextchar(); switch (c) { - case '-': - return this.state.readIntpart('-', this.state.nextchar()); - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - return this.state.readIntpart('', c); case '"': return this.state.readString('"'); case '|': @@ -377,22 +348,13 @@ export class Reader { case 't': return true; case '{': return this.seq(new Set(), (v, s) => s.add(v), '}'); case '"': 
return this.state.readLiteralBinary(); - case 'x': - if (this.state.nextchar() !== '"') { - this.state.error('Expected open-quote at start of hex ByteString', - startPos); - } - return this.state.readHexBinary(); - case '[': return this.state.readBase64Binary(); - case '=': { - const bs = unannotate(this.next()); - if (!Bytes.isBytes(bs)) this.state.error('ByteString must follow #=', - startPos); - return decode(bs, { - embeddedDecode: this.embeddedType, - includeAnnotations: this.state.options.includeAnnotations, - }); + case 'x': switch (this.state.nextchar()) { + case '"': return this.state.readHexBinary(); + case 'f': return this.state.readHexFloat('Single'); + case 'd': return this.state.readHexFloat('Double'); + default: this.state.error('Invalid #x syntax', startPos); } + case '[': return this.state.readBase64Binary(); case '!': return embed(this.embeddedType.fromValue( new Reader(this.state, genericEmbeddedTypeDecode).next(), this.state.options)); @@ -411,7 +373,7 @@ export class Reader { case ']': this.state.error('Unexpected ]', startPos); case '}': this.state.error('Unexpected }', startPos); default: - return this.state.readRawSymbol(c); + return this.state.readRawSymbolOrNumber(c); } })(); return this.wrap(unwrapped, startPos); diff --git a/implementations/javascript/packages/core/src/text.ts b/implementations/javascript/packages/core/src/text.ts index 2fb8146..9cf2cdd 100644 --- a/implementations/javascript/packages/core/src/text.ts +++ b/implementations/javascript/packages/core/src/text.ts @@ -4,7 +4,7 @@ import type { Value } from './values'; import { Annotated } from './annotated'; import { Bytes } from './bytes'; import { KeyedDictionary, KeyedSet } from './dictionary'; -import { Writer, Writable, WriterOptions, EmbeddedWriter, WriterState } from './writer'; +import { Writer, WriterOptions, EmbeddedWriter, WriterState } from './writer'; import { fromJS } from './fromjs'; export const stringifyEmbeddedWrite: EmbeddedWriter = { diff --git 
a/implementations/javascript/packages/core/src/writer.ts b/implementations/javascript/packages/core/src/writer.ts index be55d34..8409e5c 100644 --- a/implementations/javascript/packages/core/src/writer.ts +++ b/implementations/javascript/packages/core/src/writer.ts @@ -3,6 +3,7 @@ import { Record, Tuple } from "./record"; import type { GenericEmbedded, Embedded, EmbeddedTypeEncode } from "./embedded"; import { Encoder, EncoderState } from "./encoder"; import type { Value } from "./values"; +import { NUMBER_RE } from './reader'; export type Writable = Value | PreserveWritable | Iterable> | ArrayBufferView; @@ -270,8 +271,7 @@ export class Writer { case 'symbol': { const s = v.description!; // FIXME: This regular expression is conservatively correct, but Anglo-chauvinistic. - const m = /^[a-zA-Z~!$%^&*?_=+/.][-a-zA-Z~!$%^&*?_=+/.0-9]*$/.exec(s); - if (m) { + if (/^[-a-zA-Z0-9~!$%^&*?_=+/.]+$/.exec(s) && !NUMBER_RE.exec(s)) { this.state.pieces.push(s); } else { this.state.pieces.push(this.state.escapeStringlike(s, '|')); diff --git a/implementations/python/preserves/binary.py b/implementations/python/preserves/binary.py index 78ec1dd..8b246f4 100644 --- a/implementations/python/preserves/binary.py +++ b/implementations/python/preserves/binary.py @@ -72,7 +72,7 @@ class Decoder(BinaryCodec): tag = self.nextbyte() if tag == 0x80: return self.wrap(False) if tag == 0x81: return self.wrap(True) - if tag == 0x82: return self.wrap(Float(struct.unpack('>f', self.nextbytes(4))[0])) + if tag == 0x82: return self.wrap(Float.from_bytes(self.nextbytes(4))) if tag == 0x83: return self.wrap(struct.unpack('>d', self.nextbytes(8))[0]) if tag == 0x84: raise DecodeError('Unexpected end-of-stream marker') if tag == 0x85: diff --git a/implementations/python/preserves/compare.py b/implementations/python/preserves/compare.py index 87df7ff..1572426 100644 --- a/implementations/python/preserves/compare.py +++ b/implementations/python/preserves/compare.py @@ -2,7 +2,7 @@ import numbers from 
enum import Enum from functools import cmp_to_key -from .values import preserve, Float, Embedded, Record, Symbol +from .values import preserve, Float, Embedded, Record, Symbol, cmp_floats, _unwrap from .compat import basestring_ class TypeNumber(Enum): @@ -19,7 +19,7 @@ class TypeNumber(Enum): SET = 9 DICTIONARY = 10 - EMBEDDED = 10 + EMBEDDED = 11 def type_number(v): if hasattr(v, '__preserve__'): @@ -84,12 +84,17 @@ def _item_key(item): return item[0] def _eq(a, b): + a = _unwrap(a) + b = _unwrap(b) ta = type_number(a) tb = type_number(b) if ta != tb: return False + if ta == TypeNumber.DOUBLE: + return cmp_floats(a, b) == 0 + if ta == TypeNumber.EMBEDDED: - return ta.embeddedValue == tb.embeddedValue + return _eq(a.embeddedValue, b.embeddedValue) if ta == TypeNumber.RECORD: return _eq(a.key, b.key) and _eq_sequences(a.fields, b.fields) @@ -118,13 +123,18 @@ def _cmp_sequences(aa, bb): return len(aa) - len(bb) def _cmp(a, b): + a = _unwrap(a) + b = _unwrap(b) ta = type_number(a) tb = type_number(b) if ta.value < tb.value: return -1 if tb.value < ta.value: return 1 + if ta == TypeNumber.DOUBLE: + return cmp_floats(a, b) + if ta == TypeNumber.EMBEDDED: - return _simplecmp(ta.embeddedValue, tb.embeddedValue) + return _cmp(a.embeddedValue, b.embeddedValue) if ta == TypeNumber.RECORD: v = _cmp(a.key, b.key) diff --git a/implementations/python/preserves/text.py b/implementations/python/preserves/text.py index 424441b..321f8b3 100644 --- a/implementations/python/preserves/text.py +++ b/implementations/python/preserves/text.py @@ -1,6 +1,7 @@ import numbers import struct import base64 +import math from .values import * from .error import * @@ -9,6 +10,8 @@ from .binary import Decoder class TextCodec(object): pass +NUMBER_RE = re.compile(r'^([-+]?\d+)(((\.\d+([eE][-+]?\d+)?)|([eE][-+]?\d+))([fF]?))?$') + class Parser(TextCodec): def __init__(self, input_buffer=u'', include_annotations=False, parse_embedded=lambda x: x): super(Parser, self).__init__() @@ -66,50 +69,6 @@ 
class Parser(TextCodec): return self.wrap(u''.join(s)) s.append(c) - def read_intpart(self, acc, c): - if c == '0': - acc.append(c) - else: - self.read_digit1(acc, c) - return self.read_fracexp(acc) - - def read_fracexp(self, acc): - is_float = False - if self.peek() == '.': - is_float = True - acc.append(self.nextchar()) - self.read_digit1(acc, self.nextchar()) - if self.peek() in 'eE': - acc.append(self.nextchar()) - return self.read_sign_and_exp(acc) - else: - return self.finish_number(acc, is_float) - - def read_sign_and_exp(self, acc): - if self.peek() in '+-': - acc.append(self.nextchar()) - self.read_digit1(acc, self.nextchar()) - return self.finish_number(acc, True) - - def finish_number(self, acc, is_float): - if is_float: - if self.peek() in 'fF': - self.skip() - return Float(float(u''.join(acc))) - else: - return float(u''.join(acc)) - else: - return int(u''.join(acc)) - - def read_digit1(self, acc, c): - if not c.isdigit(): - raise DecodeError('Incomplete number') - acc.append(c) - while not self._atend(): - if not self.peek().isdigit(): - break - acc.append(self.nextchar()) - def read_stringlike(self, terminator, hexescape, hexescaper): acc = [] while True: @@ -186,6 +145,16 @@ class Parser(TextCodec): if c == '=': continue acc.append(c) + def read_hex_float(self, bytecount): + if self.nextchar() != '"': + raise DecodeError('Missing open-double-quote in hex-encoded floating-point number') + bs = self.read_hex_binary() + if len(bs) != bytecount: + raise DecodeError('Incorrect number of bytes in hex-encoded floating-point number') + if bytecount == 4: return Float.from_bytes(bs) + if bytecount == 8: return struct.unpack('>d', bs)[0] + raise DecodeError('Unsupported byte count in hex-encoded floating-point number') + def upto(self, delimiter): vs = [] while True: @@ -208,14 +177,24 @@ class Parser(TextCodec): raise DecodeError('Missing expected key/value separator') acc.append(self.next()) - def read_raw_symbol(self, acc): + def 
read_raw_symbol_or_number(self, acc): while not self._atend(): c = self.peek() if c.isspace() or c in '(){}[]<>";,@#:|': break self.skip() acc.append(c) - return Symbol(u''.join(acc)) + acc = u''.join(acc) + m = NUMBER_RE.match(acc) + if m: + if m[2] is None: + return int(m[1]) + elif m[7] == '': + return float(m[1] + m[3]) + else: + return Float(float(m[1] + m[3])) + else: + return Symbol(acc) def wrap(self, v): return Annotated(v) if self.include_annotations else v @@ -223,12 +202,6 @@ class Parser(TextCodec): def next(self): self.skip_whitespace() c = self.peek() - if c == '-': - self.skip() - return self.wrap(self.read_intpart(['-'], self.nextchar())) - if c.isdigit(): - self.skip() - return self.wrap(self.read_intpart([], c)) if c == '"': self.skip() return self.wrap(self.read_string('"')) @@ -251,9 +224,11 @@ class Parser(TextCodec): if c == '{': return self.wrap(frozenset(self.upto('}'))) if c == '"': return self.wrap(self.read_literal_binary()) if c == 'x': - if self.nextchar() != '"': - raise DecodeError('Expected open-quote at start of hex ByteString') - return self.wrap(self.read_hex_binary()) + c = self.nextchar() + if c == '"': return self.wrap(self.read_hex_binary()) + if c == 'f': return self.wrap(self.read_hex_float(4)) + if c == 'd': return self.wrap(self.read_hex_float(8)) + raise DecodeError('Invalid #x syntax') if c == '[': return self.wrap(self.read_base64_binary()) if c == '=': old_ann = self.include_annotations @@ -286,7 +261,7 @@ class Parser(TextCodec): if c in '>]}': raise DecodeError('Unexpected ' + c) self.skip() - return self.wrap(self.read_raw_symbol([c])) + return self.wrap(self.read_raw_symbol_or_number([c])) def try_next(self): start = self.index @@ -385,7 +360,10 @@ class Formatter(TextCodec): elif v is True: self.chunks.append('#t') elif isinstance(v, float): - self.chunks.append(repr(v)) + if math.isnan(v) or math.isinf(v): + self.chunks.append('#xd"' + struct.pack('>d', v).hex() + '"') + else: + self.chunks.append(repr(v)) elif 
isinstance(v, numbers.Number): self.chunks.append('%d' % (v,)) elif isinstance(v, bytes): diff --git a/implementations/python/preserves/values.py b/implementations/python/preserves/values.py index b96bba3..cdb2fc0 100644 --- a/implementations/python/preserves/values.py +++ b/implementations/python/preserves/values.py @@ -1,6 +1,7 @@ import re import sys import struct +import math from .error import DecodeError @@ -9,6 +10,16 @@ def preserve(v): v = v.__preserve__() return v +def float_to_int(v): + return struct.unpack('>Q', struct.pack('>d', v))[0] + +def cmp_floats(a, b): + a = float_to_int(a) + b = float_to_int(b) + if a & 0x8000000000000000: a = a ^ 0x7fffffffffffffff + if b & 0x8000000000000000: b = b ^ 0x7fffffffffffffff + return a - b + class Float(object): def __init__(self, value): self.value = value @@ -16,7 +27,12 @@ class Float(object): def __eq__(self, other): other = _unwrap(other) if other.__class__ is self.__class__: - return self.value == other.value + return cmp_floats(self.value, other.value) == 0 + + def __lt__(self, other): + other = _unwrap(other) + if other.__class__ is self.__class__: + return cmp_floats(self.value, other.value) < 0 def __ne__(self, other): return not self.__eq__(other) @@ -27,15 +43,41 @@ class Float(object): def __repr__(self): return 'Float(' + repr(self.value) + ')' + def _to_bytes(self): + if math.isnan(self.value) or math.isinf(self.value): + dbs = struct.pack('>d', self.value) + vd = struct.unpack('>Q', dbs)[0] + sign = vd >> 63 + payload = (vd >> 29) & 0x007fffff + vf = (sign << 31) | 0x7f800000 | payload + return struct.pack('>I', vf) + else: + return struct.pack('>f', self.value) + def __preserve_write_binary__(self, encoder): encoder.buffer.append(0x82) - encoder.buffer.extend(struct.pack('>f', self.value)) + encoder.buffer.extend(self._to_bytes()) def __preserve_write_text__(self, formatter): - formatter.chunks.append(repr(self.value) + 'f') + if math.isnan(self.value) or math.isinf(self.value): + 
formatter.chunks.append('#xf"' + self._to_bytes().hex() + '"') + else: + formatter.chunks.append(repr(self.value) + 'f') + + @staticmethod + def from_bytes(bs): + vf = struct.unpack('>I', bs)[0] + if (vf & 0x7f800000) == 0x7f800000: + # NaN or inf. Preserve quiet/signalling bit by manually expanding to double-precision. + sign = vf >> 31 + payload = vf & 0x007fffff + dbs = struct.pack('>Q', (sign << 63) | 0x7ff0000000000000 | (payload << 29)) + return Float(struct.unpack('>d', dbs)[0]) + else: + return Float(struct.unpack('>f', bs)[0]) # FIXME: This regular expression is conservatively correct, but Anglo-chauvinistic. -RAW_SYMBOL_RE = re.compile(r'^[a-zA-Z~!$%^&*?_=+/.][-a-zA-Z~!$%^&*?_=+/.0-9]*$') +RAW_SYMBOL_RE = re.compile(r'^[-a-zA-Z0-9~!$%^&*?_=+/.]+$') class Symbol(object): def __init__(self, name): diff --git a/implementations/python/tests/samples.bin b/implementations/python/tests/samples.bin index 5d5732a..0d2ad15 100644 Binary files a/implementations/python/tests/samples.bin and b/implementations/python/tests/samples.bin differ diff --git a/implementations/python/tests/samples.pr b/implementations/python/tests/samples.pr index 9450d49..a29deb4 100644 --- a/implementations/python/tests/samples.pr +++ b/implementations/python/tests/samples.pr @@ -74,9 +74,45 @@ dict3: @"Duplicate key" dict4: @"Unexpected close brace" dict5: @"Missing value" + double0: + double+0: + double-0: double1: double2: + double3: + double4: @"Fewer than 16 digits" + double5: @"More than 16 digits" + double6: @"Invalid chars" + double7: @"Positive infinity" + double8: @"Negative infinity" + double9: @"-qNaN" + double10: @"-qNaN" + double11: @"+qNaN" + double12: @"+qNaN" + double13: @"Bad spacing" + double14: @"-sNaN" + double15: @"-sNaN" + double16: @"+sNaN" + double17: @"+sNaN" + float0: + float+0: + float-0: float1: + float2: + float3: @"Fewer than 8 digits" + float4: @"More than 8 digits" + float5: @"Invalid chars" + float6: @"Positive infinity" + float7: @"Negative infinity" + 
float8: @"+sNaN" + float9: @"+sNaN" + float10: @"-sNaN" + float11: @"-sNaN" + float12: @"Bad spacing" + float13: @"+qNaN" + float14: @"+qNaN" + float15: @"-qNaN" + float16: @"-qNaN" int-257: int-256: int-255: @@ -89,10 +125,13 @@ int-2: int-1: int0: + int+0: + int-0: int1: int12: int13: int127: + int+127: int128: int255: int256: @@ -112,6 +151,8 @@ list8: @"Missing close bracket" list9: @"Unexpected close bracket" list10: @"Missing end byte" + list11: + list12: noinput0: @"No input at all" embed0: embed1: @@ -138,17 +179,22 @@ string5: symbol0: symbol2: + symbol3: + symbol4: + symbol5: + symbol6: + symbol7: + symbol8: + symbol9: + symbol10: + symbol11: + symbol12: + symbol13: tag0: @"Unexpected end tag" tag1: @"Invalid tag" tag2: @"Invalid tag" whitespace0: @"Leading spaces have to eventually yield something" whitespace1: @"No input at all" - value1: - value2: - value3: - value4: - value5: - value6: longlist14: ').exec([ Record(Symbol('hi'), [1]), - Record(Symbol('no'), [2]), - Record(Symbol('hi'), [3]) ]), + self.assertPreservesEqual(parse('').exec([ Record(Symbol('hi'), [1]), + Record(Symbol('no'), [2]), + Record(Symbol('hi'), [3]) ]), (2,)) - self.assertEqual(parse('/ ').exec([ Record(Symbol('hi'), [1]), - Record(Symbol('no'), [2]), - Record(Symbol('hi'), [3]) ]), + self.assertPreservesEqual(parse('/ ').exec([ Record(Symbol('hi'), [1]), + Record(Symbol('no'), [2]), + Record(Symbol('hi'), [3]) ]), (1, 0, 1)) diff --git a/implementations/python/tests/test_preserves.py b/implementations/python/tests/test_preserves.py index 9dc2f9c..ecd2b6c 100644 --- a/implementations/python/tests/test_preserves.py +++ b/implementations/python/tests/test_preserves.py @@ -1,11 +1,12 @@ import numbers import os import sys -import unittest # Make `preserves` available for imports sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from utils import PreservesTestCase + from preserves import * from preserves.compat import basestring_, ord_ from 
preserves.values import _unwrap @@ -49,33 +50,33 @@ def _e(v): def _R(k, *args): return Record(Symbol(k), args) -class BinaryCodecTests(unittest.TestCase): +class BinaryCodecTests(PreservesTestCase): def _roundtrip(self, forward, expected, back=None, nondeterministic=False): if back is None: back = forward - self.assertEqual(_d(_e(forward)), back) - self.assertEqual(_d(_e(back)), back) - self.assertEqual(_d(expected), back) + self.assertPreservesEqual(_d(_e(forward)), back) + self.assertPreservesEqual(_d(_e(back)), back) + self.assertPreservesEqual(_d(expected), back) if not nondeterministic: actual = _e(forward) - self.assertEqual(actual, expected, '%s != %s' % (_hex(actual), _hex(expected))) + self.assertPreservesEqual(actual, expected, '%s != %s' % (_hex(actual), _hex(expected))) def test_decode_varint(self): with self.assertRaises(DecodeError): Decoder(_buf()).varint() - self.assertEqual(Decoder(_buf(0)).varint(), 0) - self.assertEqual(Decoder(_buf(10)).varint(), 10) - self.assertEqual(Decoder(_buf(100)).varint(), 100) - self.assertEqual(Decoder(_buf(200, 1)).varint(), 200) - self.assertEqual(Decoder(_buf(0b10101100, 0b00000010)).varint(), 300) - self.assertEqual(Decoder(_buf(128, 148, 235, 220, 3)).varint(), 1000000000) + self.assertPreservesEqual(Decoder(_buf(0)).varint(), 0) + self.assertPreservesEqual(Decoder(_buf(10)).varint(), 10) + self.assertPreservesEqual(Decoder(_buf(100)).varint(), 100) + self.assertPreservesEqual(Decoder(_buf(200, 1)).varint(), 200) + self.assertPreservesEqual(Decoder(_buf(0b10101100, 0b00000010)).varint(), 300) + self.assertPreservesEqual(Decoder(_buf(128, 148, 235, 220, 3)).varint(), 1000000000) def test_encode_varint(self): - self.assertEqual(_varint(0), _buf(0)) - self.assertEqual(_varint(10), _buf(10)) - self.assertEqual(_varint(100), _buf(100)) - self.assertEqual(_varint(200), _buf(200, 1)) - self.assertEqual(_varint(300), _buf(0b10101100, 0b00000010)) - self.assertEqual(_varint(1000000000), _buf(128, 148, 235, 220, 3)) + 
self.assertPreservesEqual(_varint(0), _buf(0)) + self.assertPreservesEqual(_varint(10), _buf(10)) + self.assertPreservesEqual(_varint(100), _buf(100)) + self.assertPreservesEqual(_varint(200), _buf(200, 1)) + self.assertPreservesEqual(_varint(300), _buf(0b10101100, 0b00000010)) + self.assertPreservesEqual(_varint(1000000000), _buf(128, 148, 235, 220, 3)) def test_simple_seq(self): self._roundtrip([1,2,3,4], _buf(0xb5, 0x91, 0x92, 0x93, 0x94, 0x84), back=(1,2,3,4)) @@ -157,7 +158,7 @@ class BinaryCodecTests(unittest.TestCase): # python 3 bs = _e(d.items()) self.assertRegex(_hex(bs), r) - self.assertEqual(sorted(_d(bs)), [(u'a', 1), (u'b', 2), (u'c', 3)]) + self.assertPreservesEqual(sorted(_d(bs)), [(u'a', 1), (u'b', 2), (u'c', 3)]) def test_long_sequence(self): self._roundtrip((False,) * 14, _buf(0xb5, b'\x80' * 14, 0x84)) @@ -172,9 +173,9 @@ class BinaryCodecTests(unittest.TestCase): a1 = Embedded(A(1)) a2 = Embedded(A(1)) self.assertNotEqual(encode(a1, encode_embedded=id), encode(a2, encode_embedded=id)) - self.assertEqual(encode(a1, encode_embedded=id), encode(a1, encode_embedded=id)) - self.assertEqual(ord_(encode(a1, encode_embedded=id)[0]), 0x86) - self.assertEqual(ord_(encode(a2, encode_embedded=id)[0]), 0x86) + self.assertPreservesEqual(encode(a1, encode_embedded=id), encode(a1, encode_embedded=id)) + self.assertPreservesEqual(ord_(encode(a1, encode_embedded=id)[0]), 0x86) + self.assertPreservesEqual(ord_(encode(a2, encode_embedded=id)[0]), 0x86) def test_decode_embedded_absent(self): with self.assertRaises(DecodeError): @@ -185,15 +186,15 @@ class BinaryCodecTests(unittest.TestCase): def enc(p): objects.append(p) return len(objects) - 1 - self.assertEqual(encode([Embedded(object()), Embedded(object())], encode_embedded = enc), - b'\xb5\x86\x90\x86\x91\x84') + self.assertPreservesEqual(encode([Embedded(object()), Embedded(object())], encode_embedded = enc), + b'\xb5\x86\x90\x86\x91\x84') def test_decode_embedded(self): objects = [123, 234] def dec(v): return 
objects[v] - self.assertEqual(decode(b'\xb5\x86\x90\x86\x91\x84', decode_embedded = dec), - (Embedded(123), Embedded(234))) + self.assertPreservesEqual(decode(b'\xb5\x86\x90\x86\x91\x84', decode_embedded = dec), + (Embedded(123), Embedded(234))) def load_binary_samples(): with open(os.path.join(os.path.dirname(__file__), 'samples.bin'), 'rb') as f: @@ -203,16 +204,16 @@ def load_text_samples(): with open(os.path.join(os.path.dirname(__file__), 'samples.pr'), 'rt') as f: return Parser(f.read(), include_annotations=True, parse_embedded=lambda x: x).next() -class TextCodecTests(unittest.TestCase): +class TextCodecTests(PreservesTestCase): def test_samples_bin_eq_txt(self): b = load_binary_samples() t = load_text_samples() - self.assertEqual(b, t) + self.assertPreservesEqual(b, t) def test_txt_roundtrip(self): b = load_binary_samples() s = stringify(b, format_embedded=lambda x: x) - self.assertEqual(parse(s, include_annotations=True, parse_embedded=lambda x: x), b) + self.assertPreservesEqual(parse(s, include_annotations=True, parse_embedded=lambda x: x), b) def add_method(d, tName, fn): if hasattr(fn, 'func_name'): @@ -254,14 +255,14 @@ def install_test(d, variant, tName, binaryForm, annotatedTextForm): entry = get_expected_values(tName, textForm) forward = entry['forward'] back = entry['back'] - def test_match_expected(self): self.assertEqual(textForm, back) - def test_roundtrip(self): self.assertEqual(self.DS(self.E(textForm)), back) - def test_forward(self): self.assertEqual(self.DS(self.E(forward)), back) - def test_back(self): self.assertEqual(self.DS(binaryForm), back) - def test_back_ann(self): self.assertEqual(self.D(self.E(annotatedTextForm)), annotatedTextForm) - def test_encode(self): self.assertEqual(self.E(forward), binaryForm) - def test_encode_canonical(self): self.assertEqual(self.EC(annotatedTextForm), binaryForm) - def test_encode_ann(self): self.assertEqual(self.E(annotatedTextForm), binaryForm) + def test_match_expected(self): 
self.assertPreservesEqual(textForm, back) + def test_roundtrip(self): self.assertPreservesEqual(self.DS(self.E(textForm)), back) + def test_forward(self): self.assertPreservesEqual(self.DS(self.E(forward)), back) + def test_back(self): self.assertPreservesEqual(self.DS(binaryForm), back) + def test_back_ann(self): self.assertPreservesEqual(self.D(self.E(annotatedTextForm)), annotatedTextForm) + def test_encode(self): self.assertPreservesEqual(self.E(forward), binaryForm) + def test_encode_canonical(self): self.assertPreservesEqual(self.EC(annotatedTextForm), binaryForm) + def test_encode_ann(self): self.assertPreservesEqual(self.E(annotatedTextForm), binaryForm) add_method(d, tName, test_match_expected) add_method(d, tName, test_roundtrip) add_method(d, tName, test_forward) @@ -284,7 +285,7 @@ def install_exn_test(d, tName, bs, check_proc): self.fail('did not fail as expected') add_method(d, tName, test_exn) -class CommonTestSuite(unittest.TestCase): +class CommonTestSuite(PreservesTestCase): TestCases = Record.makeConstructor('TestCases', 'cases') samples = load_binary_samples() @@ -325,7 +326,7 @@ class CommonTestSuite(unittest.TestCase): def EC(self, v): return encode(v, encode_embedded=lambda x: x, canonicalize=True) -class RecordTests(unittest.TestCase): +class RecordTests(PreservesTestCase): def test_getters(self): T = Record.makeConstructor('t', 'x y z') T2 = Record.makeConstructor('t', 'x y z') @@ -334,8 +335,8 @@ class RecordTests(unittest.TestCase): self.assertTrue(T.isClassOf(t)) self.assertTrue(T2.isClassOf(t)) self.assertFalse(U.isClassOf(t)) - self.assertEqual(T._x(t), 1) - self.assertEqual(T2._y(t), 2) - self.assertEqual(T._z(t), 3) + self.assertPreservesEqual(T._x(t), 1) + self.assertPreservesEqual(T2._y(t), 2) + self.assertPreservesEqual(T._z(t), 3) with self.assertRaises(TypeError): U._x(t) diff --git a/implementations/python/tests/test_schema.py b/implementations/python/tests/test_schema.py index 69e8688..115da71 100644 --- 
a/implementations/python/tests/test_schema.py +++ b/implementations/python/tests/test_schema.py @@ -1,4 +1,4 @@ -import unittest +from utils import PreservesTestCase from preserves import * from preserves.schema import meta, Compiler @@ -8,7 +8,7 @@ def literal_schema(modname, s): c.load_schema((Symbol(modname),), preserve(s)) return c.root -class BasicSchemaTests(unittest.TestCase): +class BasicSchemaTests(PreservesTestCase): def test_dictionary_literal(self): m = literal_schema( 's', @@ -22,7 +22,7 @@ class BasicSchemaTests(unittest.TestCase): }> ''')) self.assertEqual(m.s.C.decode({'core': Symbol('true')}), m.s.C()) - self.assertEqual(preserve(m.s.C()), {'core': Symbol('true')}) + self.assertPreservesEqual(preserve(m.s.C()), {'core': Symbol('true')}) def test_alternation_of_dictionary_literal(self): m = literal_schema( @@ -40,6 +40,6 @@ class BasicSchemaTests(unittest.TestCase): }> ''')) self.assertEqual(m.s.C.decode({'core': Symbol('true')}), m.s.C.core()) - self.assertEqual(preserve(m.s.C.core()), {'core': Symbol('true')}) + self.assertPreservesEqual(preserve(m.s.C.core()), {'core': Symbol('true')}) self.assertEqual(m.s.C.decode({'notcore': Symbol('true')}), m.s.C.notcore()) - self.assertEqual(preserve(m.s.C.notcore()), {'notcore': Symbol('true')}) + self.assertPreservesEqual(preserve(m.s.C.notcore()), {'notcore': Symbol('true')}) diff --git a/implementations/python/tests/utils.py b/implementations/python/tests/utils.py new file mode 100644 index 0000000..2487c6f --- /dev/null +++ b/implementations/python/tests/utils.py @@ -0,0 +1,9 @@ +import unittest + +from preserves import cmp + +class PreservesTestCase(unittest.TestCase): + def assertPreservesEqual(self, a, b, msg=None): + if msg is None: + msg = 'Expected %s to be Preserves-equal to %s' % (a, b) + self.assertTrue(cmp(a, b) == 0, msg) diff --git a/implementations/racket/preserves/preserves/float-bytes.rkt b/implementations/racket/preserves/preserves/float-bytes.rkt new file mode 100644 index 
0000000..46b0fa5 --- /dev/null +++ b/implementations/racket/preserves/preserves/float-bytes.rkt @@ -0,0 +1,101 @@ +#lang racket/base +;; Conversion between binary32 and binary64 big-endian external format (byte-vectors) and +;; internal double-precision floating-point numbers, with special attention paid to +;; preservation of the quiet/signaling bit of NaNs, which otherwise is frequently disturbed by +;; hardware-level conversion between single and double precision. + +(provide bytes->float + float->bytes + bytes->double + double->bytes) + +(require "float.rkt") +(require (only-in racket/math nan? infinite?)) + +(module binary racket/base + (provide (all-defined-out)) + + (define (binary32-nan-or-inf? bs) + (and (= (bitwise-bit-field (bytes-ref bs 0) 0 7) #x7f) + (bitwise-bit-set? (bytes-ref bs 1) 7))) + + (define (binary64-nan-or-inf? bs) + (and (= (bitwise-bit-field (bytes-ref bs 0) 0 7) #x7f) + (= (bitwise-bit-field (bytes-ref bs 1) 4 8) #x0f))) + + (define (sign-bit-set? bs) + (bitwise-bit-set? (bytes-ref bs 0) 0))) + +(require (submod "." binary)) + +(define (bytes->float bs) + (if (binary32-nan-or-inf? bs) + (let* ((vf (integer-bytes->integer bs #f #t)) + (signexp (bitwise-bit-field vf 23 32)) + (payload (bitwise-bit-field vf 0 23)) + (vd (bitwise-ior (arithmetic-shift signexp 55) + #x0070000000000000 + (arithmetic-shift payload 29))) + (dbs (integer->integer-bytes vd 8 #f #t))) + (float (floating-point-bytes->real dbs #t 0 8))) + (float (floating-point-bytes->real bs #t 0 4)))) + +(define (float->bytes v) + (let ((v (float-value v))) + (if (or (nan? v) (infinite? 
v)) + (let* ((dbs (real->floating-point-bytes v 8 #t)) + (vd (integer-bytes->integer dbs #f #t)) + (signexp (bitwise-bit-field vd 55 64)) + (payload (bitwise-bit-field vd 29 52)) + (vf (bitwise-ior (arithmetic-shift signexp 23) + payload)) + (bs (integer->integer-bytes vf 4 #f #t))) + bs) + (real->floating-point-bytes v 4 #t)))) + +(define (bytes->double bs) + (floating-point-bytes->real bs #t 0 8)) + +(define (double->bytes v) + (real->floating-point-bytes v 8 #t)) + +(module+ test + (require rackunit) + (require file/sha1) + + (define (check-roundtrip-double hex) + (check-equal? (bytes->hex-string (double->bytes (bytes->double (hex-string->bytes hex)))) + hex)) + + (define (check-roundtrip-float hex) + (check-equal? (bytes->hex-string (float->bytes (bytes->float (hex-string->bytes hex)))) + hex)) + + (check-roundtrip-double "0123456789abcdef") + (check-roundtrip-double "7ff0000000000321") + (check-roundtrip-double "7ff0000000000001") + (check-roundtrip-double "7ff0000000000000") + (check-roundtrip-double "fff0000000000321") + (check-roundtrip-double "fff0000000000001") + (check-roundtrip-double "fff0000000000000") + (check-roundtrip-double "7ff8000000000321") + (check-roundtrip-double "7ff8000000000001") + (check-roundtrip-double "7ff8000000000000") + (check-roundtrip-double "fff8000000000321") + (check-roundtrip-double "fff8000000000001") + (check-roundtrip-double "fff8000000000000") + + (check-roundtrip-float "01234567") + (check-roundtrip-float "7f800321") + (check-roundtrip-float "7f800001") + (check-roundtrip-float "7f800000") + (check-roundtrip-float "ff800321") + (check-roundtrip-float "ff800001") + (check-roundtrip-float "ff800000") + (check-roundtrip-float "7fc00321") + (check-roundtrip-float "7fc00001") + (check-roundtrip-float "7fc00000") + (check-roundtrip-float "ffc00321") + (check-roundtrip-float "ffc00001") + (check-roundtrip-float "ffc00000") + ) diff --git a/implementations/racket/preserves/preserves/jelly.rkt 
b/implementations/racket/preserves/preserves/jelly.rkt index 0895429..6ab8ef1 100644 --- a/implementations/racket/preserves/preserves/jelly.rkt +++ b/implementations/racket/preserves/preserves/jelly.rkt @@ -8,8 +8,8 @@ ;;--------------------------------------------------------------------------- ;; Representing values +(require "float.rkt" "float-bytes.rkt") (struct record (label fields) #:transparent) -(struct float (value) #:transparent) ;; a marker for single-precision I/O (struct annotated (annotations item) #:transparent) (struct embedded (value) #:transparent) @@ -23,8 +23,8 @@ (match (next-byte) [#x80 #f] [#x81 #t] - [#x82 (float (floating-point-bytes->real (next-bytes 4) #t 0 4))] - [#x83 (floating-point-bytes->real (next-bytes 8) #t 0 8)] + [#x82 (bytes->float (next-bytes 4))] + [#x83 (bytes->double (next-bytes 8))] [#x84 '#:end] [#x85 (let ((a (next))) (match (next) @@ -80,8 +80,8 @@ (match v [#f (write-byte #x80 out-port)] [#t (write-byte #x81 out-port)] - [(float v) (write-byte #x82 out-port) (output-bytes (real->floating-point-bytes v 4 #t))] - [(? flonum?) (write-byte #x83 out-port) (output-bytes (real->floating-point-bytes v 8 #t))] + [(float _) (write-byte #x82 out-port) (output-bytes (float->bytes v))] + [(? flonum?) 
(write-byte #x83 out-port) (output-bytes (double->bytes v))] [(annotated as v) (for [(a (in-list as))] (write-byte #x85 out-port) (output a)) diff --git a/implementations/racket/preserves/preserves/read-binary.rkt b/implementations/racket/preserves/preserves/read-binary.rkt index 0ec1f28..9459adb 100644 --- a/implementations/racket/preserves/preserves/read-binary.rkt +++ b/implementations/racket/preserves/preserves/read-binary.rkt @@ -7,6 +7,7 @@ (require "record.rkt") (require "embedded.rkt") (require "float.rkt") +(require "float-bytes.rkt") (require "annotation.rkt") (require "varint.rkt") (require racket/set) @@ -70,8 +71,8 @@ (match lead-byte [#x80 #f] [#x81 #t] - [#x82 (float (floating-point-bytes->real (next-bytes 4) #t 0 4))] - [#x83 (floating-point-bytes->real (next-bytes 8) #t 0 8)] + [#x82 (bytes->float (next-bytes 4))] + [#x83 (bytes->double (next-bytes 8))] [#x84 '#:end] [#x85 (let ((a (next))) (if read-annotations? diff --git a/implementations/racket/preserves/preserves/read-text.rkt b/implementations/racket/preserves/preserves/read-text.rkt index 9090789..b3e20c2 100644 --- a/implementations/racket/preserves/preserves/read-text.rkt +++ b/implementations/racket/preserves/preserves/read-text.rkt @@ -10,6 +10,7 @@ (require "read-binary.rkt") (require "record.rkt") (require "float.rkt") +(require "float-bytes.rkt") (require syntax/readerr) (require (only-in file/sha1 hex-string->bytes)) (require (only-in net/base64 base64-decode)) @@ -67,8 +68,6 @@ (define (next*) (skip-whitespace) (match (next-char) - [#\- (read-intpart (list #\-) (next-char))] - [(and c (or #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)) (read-intpart '() c)] [#\" (read-string #\")] [(== PIPE) (string->symbol (read-string PIPE))] @@ -82,21 +81,12 @@ [#\t #t] [#\{ (sequence-fold (set) set-add* values #\})] [#\" (read-literal-binary)] - [#\x (if (eqv? 
(next-char) #\") - (read-hex-binary '()) - (parse-error "Expected open-quote at start of hex ByteString"))] + [#\x (match (next-char) + [#\" (read-hex-binary '())] + [#\f (read-hex-float 'float)] + [#\d (read-hex-float 'double)] + [c (parse-error "Invalid #x syntax: ~v" c)])] [#\[ (read-base64-binary '())] - [#\= (define bs (read-preserve/text in-port #:read-syntax? #t #:source source)) - (when (not (bytes? (annotated-item bs))) - (parse-error "ByteString must follow #=")) - (when (not (null? (annotated-annotations bs))) - (parse-error "Annotations not permitted after #=")) - (bytes->preserve - (annotated-item bs) - (lambda (message . args) - (apply parse-error (string-append "Inline binary value: " message) args)) - #:read-syntax? read-syntax? - #:on-short (lambda () (parse-error "Incomplete inline binary value")))] [#\! (embedded (decode-embedded (next)))] [c (parse-error "Invalid # syntax: ~v" c)])] @@ -110,7 +100,7 @@ [#\] (parse-error "Unexpected ]")] [#\} (parse-error "Unexpected }")] - [c (read-raw-symbol (list c))])) + [c (read-raw-symbol-or-number (list c))])) (define (set-add* s e) (when (set-member? s e) (parse-error "Duplicate set element: ~v" e)) @@ -159,49 +149,6 @@ (annotated '() loc v)))) (lambda (pos0 v) v))) - ;;--------------------------------------------------------------------------- - ;; Numbers - - (define (read-intpart acc-rev ch) - (match ch - [#\0 (read-fracexp (cons ch acc-rev))] - [_ (read-digit+ acc-rev read-fracexp ch)])) - - (define (read-digit* acc-rev k) - (match (peek-char in-port) - [(? char? (? char-numeric?)) (read-digit* (cons (read-char in-port) acc-rev) k)] - [_ (k acc-rev)])) - - (define (read-digit+ acc-rev k [ch (read-char in-port)]) - (match ch - [(? char? (? char-numeric?)) (read-digit* (cons ch acc-rev) k)] - [_ (parse-error "Incomplete number")])) - - (define (read-fracexp acc-rev) - (match (peek-char in-port) - [#\. 
(read-digit+ (cons (read-char in-port) acc-rev) read-exp)] - [_ (read-exp acc-rev)])) - - (define (read-exp acc-rev) - (match (peek-char in-port) - [(or #\e #\E) (read-sign-and-exp (cons (read-char in-port) acc-rev))] - [_ (finish-number acc-rev)])) - - (define (read-sign-and-exp acc-rev) - (match (peek-char in-port) - [(or #\+ #\-) (read-digit+ (cons (read-char in-port) acc-rev) finish-number)] - [_ (read-digit+ acc-rev finish-number)])) - - (define (finish-number acc-rev) - (define s (list->string (reverse acc-rev))) - (define n (string->number s 10)) - (when (not n) (parse-error "Invalid number: ~v" s)) - (if (flonum? n) - (match (peek-char in-port) - [(or #\f #\F) (read-char in-port) (float n)] - [_ n]) - n)) - ;;--------------------------------------------------------------------------- ;; String-like things @@ -279,6 +226,19 @@ [else (parse-error "Invalid hex character")])) + ;;--------------------------------------------------------------------------- + ;; Hex-encoded floating point numbers + + (define (read-hex-float precision) + (unless (eqv? (next-char) #\") + (parse-error "Missing open-double-quote in hex-encoded floating-point number")) + (define bs (read-hex-binary '())) + (unless (= (bytes-length bs) (match precision ['float 4] ['double 8])) + (parse-error "Incorrect number of bytes in hex-encoded floating-point number")) + (match precision + ['float (bytes->float bs)] + ['double (bytes->double bs)])) + ;;--------------------------------------------------------------------------- ;; Base64-encoded ByteStrings @@ -334,16 +294,56 @@ #\})) ;;--------------------------------------------------------------------------- - ;; "Raw" symbols + ;; "Raw" symbols and numbers - (define (read-raw-symbol acc) + (define (read-raw-symbol-or-number acc) (match (peek-char in-port) [(or (? eof-object?) (? char? (or #\( #\) #\{ #\} #\[ #\] #\< #\> #\" #\; #\, #\@ #\# #\: (== PIPE) (? 
char-whitespace?)))) - (string->symbol (list->string (reverse acc)))] - [_ (read-raw-symbol (cons (read-char in-port) acc))])) + (let ((input (reverse acc))) + (or (analyze-number input) + (string->symbol (list->string input))))] + [_ (read-raw-symbol-or-number (cons (read-char in-port) acc))])) + + (define (analyze-number input) + (match input + [(cons (and sign (or #\+ #\-)) input) (read-digit+ (list sign) read-fracexp input)] + [_ (read-digit+ (list) read-fracexp input)])) + + (define (read-digit* acc-rev k input) + (match input + [(cons (? char? (? char-numeric? d)) input) (read-digit* (cons d acc-rev) k input)] + [_ (k acc-rev input)])) + + (define (read-digit+ acc-rev k input) + (match input + [(cons (? char? (? char-numeric? d)) input) (read-digit* (cons d acc-rev) k input)] + [_ #f])) + + (define (read-fracexp acc-rev input) + (match input + [(cons #\. input) (read-digit+ (cons #\. acc-rev) read-exp input)] + [_ (read-exp acc-rev input)])) + + (define (read-exp acc-rev input) + (match input + [(cons (and e (or #\e #\E)) input) (read-sign-and-exp (cons e acc-rev) input)] + [_ (finish-number acc-rev input)])) + + (define (read-sign-and-exp acc-rev input) + (match input + [(cons (and sign (or #\+ #\-)) input) (read-digit+ (cons sign acc-rev) finish-number input)] + [_ (read-digit+ acc-rev finish-number input)])) + + (define (finish-number acc-rev input) + (define s (list->string (reverse acc-rev))) + (define n (string->number s 10)) + (cond [(not n) #f] + [(and (flonum? n) (member input '((#\f) (#\F)))) (float n)] + [(equal? 
input '()) n] + [else #f])) ;;--------------------------------------------------------------------------- ;; Main entry point to parser diff --git a/implementations/racket/preserves/preserves/tests/samples.pr b/implementations/racket/preserves/preserves/tests/samples.pr index 9450d49..a29deb4 100644 --- a/implementations/racket/preserves/preserves/tests/samples.pr +++ b/implementations/racket/preserves/preserves/tests/samples.pr @@ -74,9 +74,45 @@ dict3: @"Duplicate key" dict4: @"Unexpected close brace" dict5: @"Missing value" + double0: + double+0: + double-0: double1: double2: + double3: + double4: @"Fewer than 16 digits" + double5: @"More than 16 digits" + double6: @"Invalid chars" + double7: @"Positive infinity" + double8: @"Negative infinity" + double9: @"-qNaN" + double10: @"-qNaN" + double11: @"+qNaN" + double12: @"+qNaN" + double13: @"Bad spacing" + double14: @"-sNaN" + double15: @"-sNaN" + double16: @"+sNaN" + double17: @"+sNaN" + float0: + float+0: + float-0: float1: + float2: + float3: @"Fewer than 8 digits" + float4: @"More than 8 digits" + float5: @"Invalid chars" + float6: @"Positive infinity" + float7: @"Negative infinity" + float8: @"+sNaN" + float9: @"+sNaN" + float10: @"-sNaN" + float11: @"-sNaN" + float12: @"Bad spacing" + float13: @"+qNaN" + float14: @"+qNaN" + float15: @"-qNaN" + float16: @"-qNaN" int-257: int-256: int-255: @@ -89,10 +125,13 @@ int-2: int-1: int0: + int+0: + int-0: int1: int12: int13: int127: + int+127: int128: int255: int256: @@ -112,6 +151,8 @@ list8: @"Missing close bracket" list9: @"Unexpected close bracket" list10: @"Missing end byte" + list11: + list12: noinput0: @"No input at all" embed0: embed1: @@ -138,17 +179,22 @@ string5: symbol0: symbol2: + symbol3: + symbol4: + symbol5: + symbol6: + symbol7: + symbol8: + symbol9: + symbol10: + symbol11: + symbol12: + symbol13: tag0: @"Unexpected end tag" tag1: @"Invalid tag" tag2: @"Invalid tag" whitespace0: @"Leading spaces have to eventually yield something" whitespace1: @"No 
input at all" - value1: - value2: - value3: - value4: - value5: - value6: longlist14: floating-point-bytes v 4 #t))] + (output-bytes (float->bytes v))] [(? flonum?) (output-byte #x83) - (output-bytes (real->floating-point-bytes v 8 #t))] + (output-bytes (double->bytes v))] [(annotated as _ v) (when write-annotations? diff --git a/implementations/racket/preserves/preserves/write-text.rkt b/implementations/racket/preserves/preserves/write-text.rkt index 3bfb75d..84c3aa1 100644 --- a/implementations/racket/preserves/preserves/write-text.rkt +++ b/implementations/racket/preserves/preserves/write-text.rkt @@ -12,11 +12,14 @@ (require "embedded.rkt") (require "annotation.rkt") (require "float.rkt") +(require "float-bytes.rkt") (require "record.rkt") (require "object-id.rkt") (require racket/dict) (require racket/set) (require (only-in racket/port with-output-to-string)) +(require (only-in racket/math nan? infinite?)) +(require (only-in file/sha1 bytes->hex-string)) (define PIPE #\|) @@ -132,6 +135,15 @@ (write-binary-stringlike v) (write-binary-base64 outer-distance v))))) + (define (write-float v precision) + (if (or (nan? v) (infinite? v)) + (! "#x~a\"~a\"" + (match precision ['float "f"] ['double "d"]) + (bytes->hex-string (match precision + ['float (float->bytes (float v))] + ['double (double->bytes v)]))) + (! "~v~a" v (match precision ['float "f"] ['double ""])))) + (define (write-value distance v) (match v [(annotated annotations _ item) @@ -143,8 +155,8 @@ (write-value distance item)] [#f (! "#f")] [#t (! "#t")] - [(float v) (! "~vf" v)] - [(? flonum?) (! "~v" v)] + [(float v) (write-float v 'float)] + [(? flonum?) (write-float v 'double)] [(? integer? x) (! "~v" v)] [(? string?) (! 
"\"") diff --git a/implementations/rust/preserves/Cargo.toml b/implementations/rust/preserves/Cargo.toml index 0d9bd3d..fc7bb85 100644 --- a/implementations/rust/preserves/Cargo.toml +++ b/implementations/rust/preserves/Cargo.toml @@ -15,6 +15,7 @@ gitlab = { repository = "preserves/preserves" } base64 = "0.13" dtoa = "0.4" num = "0.4" +lazy_static = "1.4.0" regex = "1.5" serde = { version = "1.0", features = ["derive"] } serde_bytes = "0.11" diff --git a/implementations/rust/preserves/src/value/text/reader.rs b/implementations/rust/preserves/src/value/text/reader.rs index 446d3e8..8673ce3 100644 --- a/implementations/rust/preserves/src/value/text/reader.rs +++ b/implementations/rust/preserves/src/value/text/reader.rs @@ -26,8 +26,11 @@ use crate::value::reader::BinarySource; use crate::value::reader::ReaderResult; use crate::value::repr::Annotations; +use lazy_static::lazy_static; + use num::bigint::BigInt; +use std::convert::TryInto; use std::io; use std::iter::FromIterator; use std::marker::PhantomData; @@ -137,86 +140,21 @@ impl<'de, 'src, D: Embeddable, Dec: DomainParse, S: BinarySource<'de>> } } - fn read_intpart(&mut self, mut bs: Vec, c: u8) -> io::Result { - match c { - b'0' => { - bs.push(c); - self.read_fracexp(bs) - } - _ => { - self.read_digit1(&mut bs, c)?; - self.read_fracexp(bs) - } + fn read_hex_float(&mut self, bytecount: usize) -> io::Result { + if self.next_byte()? 
!= b'"' { + return Err(io_syntax_error("Missing open-double-quote in hex-encoded floating-point number")); } - } - - fn read_fracexp(&mut self, mut bs: Vec) -> io::Result { - let mut is_float = false; - match self.peek() { - Ok(b'.') => { - is_float = true; - bs.push(self.next_byte()?); - let c = self.next_byte()?; - self.read_digit1(&mut bs, c)?; - } - _ => () + let bs = self.read_hex_binary()?; + if bs.len() != bytecount { + return Err(io_syntax_error("Incorrect number of bytes in hex-encoded floating-point number")); } - match self.peek() { - Ok(b'e') | Ok(b'E') => { - bs.push(self.next_byte()?); - self.read_sign_and_exp(bs) - } - _ => self.finish_number(bs, is_float) + match bytecount { + 4 => Ok(Value::from(f32::from_bits(u32::from_be_bytes(bs.try_into().unwrap()))).wrap()), + 8 => Ok(Value::from(f64::from_bits(u64::from_be_bytes(bs.try_into().unwrap()))).wrap()), + _ => Err(io_syntax_error("Unsupported byte count in hex-encoded floating-point number")), } } - fn read_sign_and_exp(&mut self, mut bs: Vec) -> io::Result { - match self.peek()? 
{ - b'+' | b'-' => bs.push(self.next_byte()?), - _ => (), - } - let c = self.next_byte()?; - self.read_digit1(&mut bs, c)?; - self.finish_number(bs, true) - } - - fn finish_number(&mut self, bs: Vec, is_float: bool) -> io::Result { - let s = decode_utf8(bs)?; - if is_float { - match self.peek() { - Ok(b'f') | Ok(b'F') => { - self.skip()?; - Ok(N::new(s.parse::().map_err( - |_| io_syntax_error(&format!( - "Invalid single-precision number: {:?}", s)))?)) - } - _ => - Ok(N::new(s.parse::().map_err( - |_| io_syntax_error(&format!( - "Invalid double-precision number: {:?}", s)))?)) - } - } else { - Ok(N::new(s.parse::().map_err( - |_| io_syntax_error(&format!( - "Invalid signed-integer number: {:?}", s)))?)) - } - } - - fn read_digit1(&mut self, bs: &mut Vec, c: u8) -> io::Result<()> - { - if !(c as char).is_digit(10) { - return Err(io_syntax_error("Incomplete number")); - } - bs.push(c); - while let Ok(c) = self.peek() { - if !(c as char).is_digit(10) { - break; - } - bs.push(self.next_byte()?); - } - Ok(()) - } - fn read_stringlike( &mut self, mut seed: R, @@ -299,14 +237,13 @@ impl<'de, 'src, D: Embeddable, Dec: DomainParse, S: BinarySource<'de>> |bs, r| Ok(bs.push(r.hexnum(2)? as u8)))?[..])) } - fn read_hex_binary(&mut self) -> io::Result { + fn read_hex_binary(&mut self) -> io::Result> { let mut s = String::new(); loop { self.skip_whitespace(); let c1 = self.next_byte()? as char; if c1 == '"' { - let bs = hex::HexParser::Strict.decode(&s).unwrap(); - return Ok(N::new(&bs[..])); + return Ok(hex::HexParser::Strict.decode(&s).unwrap()); } let c2 = self.next_byte()? as char; if !(c1.is_digit(16) && c2.is_digit(16)) { @@ -364,7 +301,11 @@ impl<'de, 'src, D: Embeddable, Dec: DomainParse, S: BinarySource<'de>> } } - fn read_raw_symbol(&mut self, mut bs: Vec) -> io::Result { + fn read_raw_symbol_or_number(&mut self, mut bs: Vec) -> io::Result { + lazy_static! 
{ + static ref NUMBER_RE: regex::Regex = regex::Regex::new( + r"^([-+]?\d+)(((\.\d+([eE][-+]?\d+)?)|([eE][-+]?\d+))([fF]?))?$").unwrap(); + } loop { let c = match self.peek() { Err(e) if is_eof_io_error(&e) => b' ', @@ -374,8 +315,33 @@ impl<'de, 'src, D: Embeddable, Dec: DomainParse, S: BinarySource<'de>> }; match c { b'(' | b')' | b'{' | b'}' | b'[' | b']' | b'<' | b'>' | - b'"' | b';' | b',' | b'@' | b'#' | b':' | b'|' | b' ' => - return Ok(N::symbol(&decode_utf8(bs)?)), + b'"' | b';' | b',' | b'@' | b'#' | b':' | b'|' | b' ' => { + let s = decode_utf8(bs)?; + return match NUMBER_RE.captures(&s) { + None => Ok(N::symbol(&s)), + Some(m) => match m.get(2) { + None => Ok(N::new(s.parse::().map_err( + |_| io_syntax_error(&format!( + "Invalid signed-integer number: {:?}", s)))?)), + Some(_) => { + if let Some(maybe_f) = m.get(7) { + let s = m[1].to_owned() + &m[3]; + if maybe_f.range().is_empty() { + Ok(N::new(s.parse::().map_err( + |_| io_syntax_error(&format!( + "Invalid double-precision number: {:?}", s)))?)) + } else { + Ok(N::new(s.parse::().map_err( + |_| io_syntax_error(&format!( + "Invalid single-precision number: {:?}", s)))?)) + } + } else { + panic!("Internal error: cannot analyze number {:?}", s) + } + } + } + } + } c => { self.skip()?; bs.push(c) @@ -396,15 +362,6 @@ impl<'de, 'src, N: NestedValue, Dec: DomainParse, S: BinarySource<' Err(e) => return Err(e.into()), }; Ok(Some(match c { - b'-' => { - self.skip()?; - let c1 = self.next_byte()?; - self.read_intpart(vec![b'-'], c1)? - } - b'0' | b'1' | b'2' | b'3' | b'4' | b'5' | b'6' | b'7' | b'8' | b'9' => { - self.skip()?; - self.read_intpart(Vec::new(), c)? - } b'"' => { self.skip()?; N::new(self.read_string(b'"')?) @@ -435,26 +392,13 @@ impl<'de, 'src, N: NestedValue, Dec: DomainParse, S: BinarySource<' b't' => N::new(true), b'{' => N::new(Set::from_iter(self.upto(b'}', read_annotations)?.into_iter())), b'"' => self.read_literal_binary()?, - b'x' => if self.next_byte()? 
== b'"' { - self.read_hex_binary()? - } else { - return Err(io_syntax_error("Expected open-quote at start of hex ByteString")); + b'x' => match self.next_byte()? { + b'"' => N::new(&self.read_hex_binary()?[..]), + b'f' => self.read_hex_float(4)?, + b'd' => self.read_hex_float(8)?, + _ => return Err(io_syntax_error("Invalid #x syntax")), }, b'[' => self.read_base64_binary()?, - b'=' => { - let bs_val: N = self.demand_next(true)?; - if bs_val.annotations().slice().len() > 0 { - return Err(io_syntax_error("Annotations not permitted after #=")); - } - match bs_val.value().as_bytestring() { - None => - return Err(io_syntax_error("ByteString must follow #=")), - Some(bs) => - crate::value::BytesBinarySource::new(bs) - .packed(ViaCodec::new(&mut self.dec)) - .demand_next(read_annotations)? - } - } b'!' => { let v = self.next_iovalue(read_annotations)?; Value::Embedded(self.dec.parse_embedded(&v)?).wrap() @@ -483,7 +427,7 @@ impl<'de, 'src, N: NestedValue, Dec: DomainParse, S: BinarySource<' b'}' => return Err(io_syntax_error("Unexpected }")), other => { self.skip()?; - self.read_raw_symbol(vec![other])? + self.read_raw_symbol_or_number(vec![other])? 
} })) } diff --git a/implementations/rust/preserves/src/value/text/writer.rs b/implementations/rust/preserves/src/value/text/writer.rs index 3eb8446..8589d10 100644 --- a/implementations/rust/preserves/src/value/text/writer.rs +++ b/implementations/rust/preserves/src/value/text/writer.rs @@ -1,3 +1,4 @@ +use crate::hex::HexFormatter; use crate::value::DomainEncode; use crate::value::IOValue; use crate::value::IOValueDomainCodec; @@ -6,6 +7,8 @@ use crate::value::Writer; use crate::value::suspendable::Suspendable; use crate::value::writer::CompoundWriter; +use lazy_static::lazy_static; + use num::bigint::BigInt; use std::io; @@ -231,13 +234,23 @@ impl Writer for TextWriter { } fn write_f32(&mut self, v: f32) -> io::Result<()> { - dtoa::write(&mut *self.w, v)?; - write!(self.w, "f") + if v.is_nan() || v.is_infinite() { + write!(self.w, "#xf\"{}\"", + HexFormatter::Packed.encode(&u32::to_be_bytes(f32::to_bits(v)))) + } else { + dtoa::write(&mut *self.w, v)?; + write!(self.w, "f") + } } fn write_f64(&mut self, v: f64) -> io::Result<()> { - dtoa::write(&mut *self.w, v)?; - Ok(()) + if v.is_nan() || v.is_infinite() { + write!(self.w, "#xd\"{}\"", + HexFormatter::Packed.encode(&u64::to_be_bytes(f64::to_bits(v)))) + } else { + dtoa::write(&mut *self.w, v)?; + Ok(()) + } } simple_writer_method!(write_i8, i8); @@ -269,9 +282,12 @@ impl Writer for TextWriter { } fn write_symbol(&mut self, v: &str) -> io::Result<()> { - // FIXME: This regular expression is conservatively correct, but Anglo-chauvinistic. - let re = regex::Regex::new("^[a-zA-Z~!$%^&*?_=+/.][-a-zA-Z~!$%^&*?_=+/.0-9]*$").unwrap(); - if re.is_match(v) { + lazy_static! { + // FIXME: This regular expression is conservatively correct, but Anglo-chauvinistic. 
+ static ref RE: regex::Regex = + regex::Regex::new("^[-a-zA-Z0-9~!$%^&*?_=+/.]+$").unwrap(); + } + if RE.is_match(v) { write!(self.w, "{}", v) } else { write!(self.w, "|")?; diff --git a/preserves-text.md b/preserves-text.md index e6a77c0..a6e9ca4 100644 --- a/preserves-text.md +++ b/preserves-text.md @@ -40,10 +40,10 @@ Standalone documents may have trailing whitespace. Any `Value` may be preceded by whitespace. - Value = ws (Record / Collection / Atom / Embedded / Machine) + Value = ws (Record / Collection / Atom / Embedded) Collection = Sequence / Dictionary / Set - Atom = Boolean / Float / Double / SignedInteger / - String / ByteString / Symbol + Atom = Boolean / String / ByteString / + QuotedSymbol / SymbolOrNumber Each `Record` is an angle-bracket enclosed grouping of its label-`Value` followed by its field-`Value`s. @@ -73,55 +73,6 @@ false, respectively. Boolean = %s"#t" / %s"#f" -Numeric data follow the -[JSON grammar](https://tools.ietf.org/html/rfc8259#section-6), with -the addition of a trailing “f” distinguishing `Float` from `Double` -values. `Float`s and `Double`s always have either a fractional part or -an exponent part, where `SignedInteger`s never have -either.[^reading-and-writing-floats-accurately] -[^arbitrary-precision-signedinteger] - - Float = flt %i"f" - Double = flt - SignedInteger = int - - digit1-9 = %x31-39 - nat = %x30 / ( digit1-9 *DIGIT ) - int = ["-"] nat - frac = "." 1*DIGIT - exp = %i"e" ["-"/"+"] 1*DIGIT - flt = int (frac exp / frac / exp) - - [^reading-and-writing-floats-accurately]: **Implementation note.** - Your language's standard library likely has a good routine for - converting between decimal notation and IEEE 754 floating-point. - However, if not, or if you are interested in the challenges of - accurately reading and writing floating point numbers, see the - excellent matched pair of 1990 papers by Clinger and Steele & - White, and a recent follow-up by Jaffer: - - Clinger, William D. 
‘How to Read Floating Point Numbers - Accurately’. In Proc. PLDI. White Plains, New York, 1990. - . - - Steele, Guy L., Jr., and Jon L. White. ‘How to Print - Floating-Point Numbers Accurately’. In Proc. PLDI. White Plains, - New York, 1990. . - - Jaffer, Aubrey. ‘Easy Accurate Reading and Writing of - Floating-Point Numbers’. ArXiv:1310.8121 [Cs], 27 October 2013. - . - - [^arbitrary-precision-signedinteger]: **Implementation note.** Be - aware when implementing reading and writing of `SignedInteger`s - that the data model *requires* arbitrary-precision integers. Your - implementation may (but, ideally, should not) truncate precision - when reading or writing a `SignedInteger`; however, if it does so, - it should (a) signal its client that truncation has occurred, and - (b) make it clear to the client that comparing such truncated - values for equality or ordering will not yield results that match - the expected semantics of the data model. - `String`s are, [as in JSON](https://tools.ietf.org/html/rfc8259#section-7), possibly escaped text surrounded by double quotes. The escaping rules are the @@ -177,62 +128,109 @@ Base64 characters are allowed. ByteString =/ "#[" *(ws / base64char) ws "]" base64char = %x41-5A / %x61-7A / %x30-39 / "+" / "/" / "-" / "_" / "=" -A `Symbol` may be written in a “bare” form[^cf-sexp-token] so long as -it conforms to certain restrictions on the characters appearing in the -symbol. Alternatively, it may be written in a quoted form. The quoted -form is much the same as the syntax for `String`s, including embedded -escape syntax, except using a bar or pipe character (`|`) instead of a -double quote mark. +A `Symbol` may be written in either of two forms. - Symbol = symstart *symcont / "|" *symchar "|" - symstart = ALPHA / sympunct / symustart - symcont = ALPHA / sympunct / symustart / symucont / DIGIT / "-" - sympunct = "~" / "!" / "$" / "%" / "^" / "&" / "*" / - "?" / "_" / "=" / "+" / "/" / "." 
+The first is a quoted form, much the same as the syntax for `String`s, +including embedded escape syntax, except using a bar or pipe character +(`|`) instead of a double quote mark. + + QuotedSymbol = "|" *symchar "|" symchar = unescaped / %x22 / escape (escaped / %x7C / %s"u" 4HEXDIG) - symustart = - symucont = + +Alternatively, a `Symbol` may be written in a “bare” form[^cf-sexp-token]. +The grammar for numeric data is a subset of the grammar for bare `Symbol`s, +so if a `SymbolOrNumber` also matches the grammar for `Float`, `Double` or +`SignedInteger`, then it must be interpreted as one of those, and otherwise +it must be interpreted as a bare `Symbol`. + + SymbolOrNumber = 1*baresymchar + baresymchar = ALPHA / DIGIT / sympunct / symuchar + sympunct = "~" / "!" / "$" / "%" / "^" / "&" / "*" / + "?" / "_" / "=" / "+" / "-" / "/" / "." + symuchar = [^cf-sexp-token]: Compare with the [SPKI S-expression][sexp.txt] definition of “token representation”, and with the [R6RS definition of identifiers](http://www.r6rs.org/final/html/r6rs/r6rs-Z-H-7.html#node_sec_4.2.4). -An `Embedded` is written as a `Value` chosen to represent the denoted -object, prefixed with `#!`. +Numeric data follow the [JSON +grammar](https://tools.ietf.org/html/rfc8259#section-6) except that leading +zeros are permitted and an optional leading `+` sign is allowed. The +addition of a trailing “f” distinguishes a `Float` from a `Double` value. +`Float`s and `Double`s always have either a fractional part or an exponent +part, where `SignedInteger`s never have +either.[^reading-and-writing-floats-accurately] +[^arbitrary-precision-signedinteger] + + Float = flt %i"f" + Double = flt + SignedInteger = int + + nat = 1*DIGIT + int = ["-"/"+"] nat + frac = "." 
1*DIGIT + exp = %i"e" ["-"/"+"] 1*DIGIT + flt = int (frac exp / frac / exp) + + [^reading-and-writing-floats-accurately]: **Implementation note.** + Your language's standard library likely has a good routine for + converting between decimal notation and IEEE 754 floating-point. + However, if not, or if you are interested in the challenges of + accurately reading and writing floating point numbers, see the + excellent matched pair of 1990 papers by Clinger and Steele & + White, and a recent follow-up by Jaffer: + + Clinger, William D. ‘How to Read Floating Point Numbers + Accurately’. In Proc. PLDI. White Plains, New York, 1990. + . + + Steele, Guy L., Jr., and Jon L. White. ‘How to Print + Floating-Point Numbers Accurately’. In Proc. PLDI. White Plains, + New York, 1990. . + + Jaffer, Aubrey. ‘Easy Accurate Reading and Writing of + Floating-Point Numbers’. ArXiv:1310.8121 [Cs], 27 October 2013. + . + + [^arbitrary-precision-signedinteger]: **Implementation note.** Be + aware when implementing reading and writing of `SignedInteger`s + that the data model *requires* arbitrary-precision integers. Your + implementation may (but, ideally, should not) truncate precision + when reading or writing a `SignedInteger`; however, if it does so, + it should (a) signal its client that truncation has occurred, and + (b) make it clear to the client that comparing such truncated + values for equality or ordering will not yield results that match + the expected semantics of the data model. + +Some valid IEEE 754 `Float`s and `Double`s are not covered by the grammar +above, namely, the several million NaNs and the two infinities. These are +represented as raw hexadecimal strings similar to hexadecimal +`ByteString`s. 
Implementations are free to use hexadecimal floating-point +syntax wherever convenient, even for values representable using the +grammar above.[^rationale-no-general-machine-syntax] + + Value =/ HexFloat / HexDouble + HexFloat = "#xf" %x22 4(ws 2HEXDIG) ws %x22 + HexDouble = "#xd" %x22 8(ws 2HEXDIG) ws %x22 + + [^rationale-no-general-machine-syntax]: **Rationale.** Previous versions + of this specification included an escape to the [machine-oriented + binary syntax](preserves-binary.html) by prefixing a `ByteString` + containing the binary representation of a `Value` with `#=`. The only + true need for this feature was to represent otherwise-unrepresentable + floating-point values. Instead, this specification allows such + floating-point values to be written directly. Removing the `#=` syntax + simplifies implementations (there is no longer any need to support the + machine-oriented syntax) and avoids complications around treatment of + annotations potentially contained within machine-encoded values. + +Finally, an `Embedded` is written as a `Value` chosen to represent the +denoted object, prefixed with `#!`. Embedded = "#!" Value -Finally, any `Value` may be represented by escaping from the textual -syntax to the [machine-oriented binary syntax](preserves-binary.html) -by prefixing a `ByteString` containing the binary representation of the -`Value` with `#=`.[^rationale-switch-to-binary] -[^no-literal-binary-in-text] [^machine-value-annotations] - - Machine = "#=" ws ByteString - - [^rationale-switch-to-binary]: **Rationale.** The textual syntax - cannot express every `Value`: specifically, it cannot express the - several million floating-point NaNs, or the two floating-point - Infinities. Since the machine-oriented binary format for `Value`s - expresses each `Value` with precision, embedding binary `Value`s - solves the problem.
- - [^no-literal-binary-in-text]: Every text is ultimately physically - stored as bytes; therefore, it might seem possible to escape to the - raw form of binary encoding from within a piece of textual syntax. - However, while bytes must be involved in any *representation* of - text, the text *itself* is logically a sequence of *code points* and - is not *intrinsically* a binary structure at all. It would be - incoherent to expect to be able to access the representation of the - text from within the text itself. - - [^machine-value-annotations]: Any text-syntax annotations preceding - the `#` are prepended to any binary-syntax annotations yielded by - decoding the `ByteString`. - ## Annotations When written down, a `Value` may have an associated sequence of @@ -293,5 +291,22 @@ The text syntax for `Boolean`s, `Symbol`s, and `ByteString`s is directly inspired by [Racket](https://racket-lang.org/)'s lexical syntax. +## Appendix. Regular expressions for bare symbols and numbers + +When parsing, if a token matches both `SymbolOrNumber` and `Number`, it's a +number; use `Float`, `Double` and `SignedInteger` to disambiguate. If it +matches `SymbolOrNumber` but not `Number`, it's a "bare" `Symbol`. + + SymbolOrNumber: ^[-a-zA-Z0-9~!$%^&*?_=+/.]+$ + Number: ^([-+]?\d+)(((\.\d+([eE][-+]?\d+)?)|([eE][-+]?\d+))([fF]?))?$ + Float: ^([-+]?\d+)(((\.\d+([eE][-+]?\d+)?)|([eE][-+]?\d+))[fF])$ + Double: ^([-+]?\d+)(((\.\d+([eE][-+]?\d+)?)|([eE][-+]?\d+)))$ + SignedInteger: ^([-+]?\d+)$ + +When printing, if a symbol matches both `SymbolOrNumber` and `Number` or +neither `SymbolOrNumber` nor `Number`, it must be quoted (`|...|`). If it +matches `SymbolOrNumber` but not `Number`, it may be printed as a "bare" +`Symbol`. 
+ ## Notes diff --git a/preserves.md b/preserves.md index 411d0b7..8e66ab7 100644 --- a/preserves.md +++ b/preserves.md @@ -220,21 +220,23 @@ The total ordering specified [above](#total-order) means that the following stat -| Value | Encoded byte sequence | -|-----------------------------|---------------------------------------------------------------------------------| -| `<capture <discard>>` | B4 B3 07 'c' 'a' 'p' 't' 'u' 'r' 'e' B4 B3 07 'd' 'i' 's' 'c' 'a' 'r' 'd' 84 84 | -| `[1 2 3 4]` | B5 91 92 93 94 84 | -| `[-2 -1 0 1]` | B5 9E 9F 90 91 84 | -| `"hello"` (format B) | B1 05 'h' 'e' 'l' 'l' 'o' | -| `["a" b #"c" [] #{} #t #f]` | B5 B1 01 'a' B3 01 'b' B2 01 'c' B5 84 B6 84 81 80 84 | -| `-257` | A1 FE FF | -| `-1` | 9F | -| `0` | 90 | -| `1` | 91 | -| `255` | A1 00 FF | -| `1.0f` | 82 3F 80 00 00 | -| `1.0` | 83 3F F0 00 00 00 00 00 00 | -| `-1.202e300` | 83 FE 3C B7 B7 59 BF 04 26 | +| Value | Encoded byte sequence | +|-----------------------------------------------------|---------------------------------------------------------------------------------| +| `<capture <discard>>` | B4 B3 07 'c' 'a' 'p' 't' 'u' 'r' 'e' B4 B3 07 'd' 'i' 's' 'c' 'a' 'r' 'd' 84 84 | +| `[1 2 3 4]` | B5 91 92 93 94 84 | +| `[-2 -1 0 1]` | B5 9E 9F 90 91 84 | +| `"hello"` (format B) | B1 05 'h' 'e' 'l' 'l' 'o' | +| `["a" b #"c" [] #{} #t #f]` | B5 B1 01 'a' B3 01 'b' B2 01 'c' B5 84 B6 84 81 80 84 | +| `-257` | A1 FE FF | +| `-1` | 9F | +| `0` | 90 | +| `1` | 91 | +| `255` | A1 00 FF | +| `1.0f` | 82 3F 80 00 00 | +| `1.0` | 83 3F F0 00 00 00 00 00 00 | +| `-1.202e300` | 83 FE 3C B7 B7 59 BF 04 26 | +| `#xf"7f800000"`, positive `Float` infinity | 82 7F 80 00 00 | +| `#xd"fff0000000000000"`, negative `Double` infinity | 83 FF F0 00 00 00 00 00 00 | The next example uses a non-`Symbol` label for a record.[^extensibility2] The `Record` diff --git a/tests/samples.bin b/tests/samples.bin index 5d5732a..0d2ad15 100644 Binary files a/tests/samples.bin and b/tests/samples.bin differ diff --git a/tests/samples.pr
b/tests/samples.pr index 9450d49..a29deb4 100644 --- a/tests/samples.pr +++ b/tests/samples.pr @@ -74,9 +74,45 @@ dict3: @"Duplicate key" dict4: @"Unexpected close brace" dict5: @"Missing value" + double0: + double+0: + double-0: double1: double2: + double3: + double4: @"Fewer than 16 digits" + double5: @"More than 16 digits" + double6: @"Invalid chars" + double7: @"Positive infinity" + double8: @"Negative infinity" + double9: @"-qNaN" + double10: @"-qNaN" + double11: @"+qNaN" + double12: @"+qNaN" + double13: @"Bad spacing" + double14: @"-sNaN" + double15: @"-sNaN" + double16: @"+sNaN" + double17: @"+sNaN" + float0: + float+0: + float-0: float1: + float2: + float3: @"Fewer than 8 digits" + float4: @"More than 8 digits" + float5: @"Invalid chars" + float6: @"Positive infinity" + float7: @"Negative infinity" + float8: @"+sNaN" + float9: @"+sNaN" + float10: @"-sNaN" + float11: @"-sNaN" + float12: @"Bad spacing" + float13: @"+qNaN" + float14: @"+qNaN" + float15: @"-qNaN" + float16: @"-qNaN" int-257: int-256: int-255: @@ -89,10 +125,13 @@ int-2: int-1: int0: + int+0: + int-0: int1: int12: int13: int127: + int+127: int128: int255: int256: @@ -112,6 +151,8 @@ list8: @"Missing close bracket" list9: @"Unexpected close bracket" list10: @"Missing end byte" + list11: + list12: noinput0: @"No input at all" embed0: embed1: @@ -138,17 +179,22 @@ string5: symbol0: symbol2: + symbol3: + symbol4: + symbol5: + symbol6: + symbol7: + symbol8: + symbol9: + symbol10: + symbol11: + symbol12: + symbol13: tag0: @"Unexpected end tag" tag1: @"Invalid tag" tag2: @"Invalid tag" whitespace0: @"Leading spaces have to eventually yield something" whitespace1: @"No input at all" - value1: - value2: - value3: - value4: - value5: - value6: longlist14: