From 99b0ddbb13499de08365f0d557ffcb134b8589a3 Mon Sep 17 00:00:00 2001 From: Emery Hemingway Date: Fri, 16 Jul 2021 19:11:19 +0200 Subject: [PATCH] Textual parser --- .gitignore | 3 +- README.md | 2 +- src/preserves.nim | 78 +++++++++++++++++++------------- src/preserves/parse.nim | 98 +++++++++++++++++++++++++++++++++++++++++ src/preserves/pegs.nim | 72 ++++++++++++++++++++++++++++++ tests/test_integers.nim | 4 +- tests/test_parser.nim | 39 ++++++++++++++++ tests/test_rfc8259.nim | 2 +- 8 files changed, 263 insertions(+), 35 deletions(-) create mode 100644 src/preserves/parse.nim create mode 100644 src/preserves/pegs.nim create mode 100644 tests/test_parser.nim diff --git a/.gitignore b/.gitignore index b03728b..49b0d54 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ -tests/test_rfc8259 tests/test_integers +tests/test_parser +tests/test_rfc8259 diff --git a/README.md b/README.md index 5449276..aa7c538 100644 --- a/README.md +++ b/README.md @@ -2,5 +2,5 @@ Nim implementation of the [Preserves data language](https://preserves.gitlab.io/ Missing features: * embedded values -* parsing from human-readable encoding * ordering of compound values +* schemas diff --git a/src/preserves.nim b/src/preserves.nim index a2a46d6..d63c8e3 100644 --- a/src/preserves.nim +++ b/src/preserves.nim @@ -1,7 +1,7 @@ # SPDX-License-Identifier: ISC import bigints -import std/[base64, endians, hashes, macros, sets, streams, tables, typetraits] +import std/[base64, endians, hashes, macros, sets, streams, strutils, tables, typetraits] import json except `%`, `%*` @@ -151,9 +151,7 @@ proc `==`*(x, y: Preserve): bool = of pkSymbol: result = x.symbol == y.symbol of pkRecord: - for i, val in x.record: - if y.record[i] != val: return false - result = true + result = x.record == y.record of pkSequence: for i, val in x.sequence: if y.sequence[i] != val: return false @@ -171,26 +169,32 @@ proc `==`*(x, y: Preserve): bool = of pkEmbedded: result = x.embedded == y.embedded -proc `$`*(prs: 
Preserve): string =
+proc concat(result: var string; prs: Preserve) = # appends the textual form of `prs` to `result`
   case prs.kind:
   of pkBoolean:
     case prs.bool
-    of false: result = "#f"
-    of true: result = "#t"
+    of false: result.add "#f"
+    of true: result.add "#t"
   of pkFloat:
-    result = $prs.float & "f"
+    result.add($prs.float & "f")
   of pkDouble:
-    result = $prs.double
+    result.add $prs.double
   of pkSignedInteger:
-    result = $prs.int
+    result.add $prs.int
   of pkBigInteger:
-    result = $prs.bigint
+    result.add $prs.bigint
   of pkString:
-    result = escapeJson(prs.string)
+    result.add escapeJson(prs.string)
   of pkByteString:
-    result.add("#[")
-    result.add(base64.encode(prs.bytes))
-    result.add(']')
+    for b in prs.bytes:
+      if b.char notin {'\x20'..'\x21', '#'..'[', ']'..'~'}: # hex escapes; '\20' would mean decimal 20
+        result.add("#[") # a byte outside the plain set switches the whole value to base64
+        result.add(base64.encode(prs.bytes))
+        result.add(']')
+        return
+    result.add("#\"") # every byte is plain: emit the quoted form verbatim
+    result.add(cast[string](prs.bytes))
+    result.add('"')
   of pkSymbol:
     result.add(escapeJsonUnquoted(prs.symbol))
   of pkRecord:
@@ -199,36 +203,38 @@ proc `$`*(prs: Preserve): string =
     result.add($prs.record[prs.record.high])
     for i in 0..<prs.record.high:
       result.add(' ')
-      result.add($prs.record[i])
+      result.concat(prs.record[i])
     result.add('>')
   of pkSequence:
     result.add('[')
     for i, val in prs.sequence:
       if i > 0:
         result.add(' ')
-      result.add($val)
+      result.concat(val)
     result.add(']')
   of pkSet:
     result.add("#{")
     for val in prs.set.items:
-      result.add($val)
+      result.concat(val)
       result.add(' ')
-    if result.len > 2:
+    if prs.set.len > 0: # each element appended a trailing space; drop the last one
       result.setLen(result.high)
     result.add('}')
   of pkDictionary:
     result.add('{')
     for (key, value) in prs.dict.pairs:
-      result.add($key)
+      result.concat(key)
       result.add(": ")
-      result.add($value)
+      result.concat(value)
       result.add(' ')
-    if result.len > 1:
+    if prs.dict.len > 0: # each entry appended a trailing space; drop the last one
       result.setLen(result.high)
     result.add('}')
   of pkEmbedded:
     result.add(prs.embedded.repr)
 
+proc `$`*(prs: Preserve): string = concat(result, prs) # builds the text in the implicit result
+
 iterator items*(prs: Preserve): Preserve =
   case prs.kind
   of pkRecord:
@@ -353,7 +359,7 @@ proc write*(str: Stream; prs: Preserve) =
   of pkByteString:
     str.write(0xb2'u8)
     str.writeVarint(prs.bytes.len)
-    
str.write(prs.bytes) + str.write(cast[string](prs.bytes)) of pkSymbol: str.write(0xb3'u8) str.writeVarint(prs.symbol.len) @@ -385,7 +391,13 @@ proc write*(str: Stream; prs: Preserve) = str.write(0x86'u8) raiseAssert("binary representation of embedded values is undefined") -proc parsePreserve*(s: Stream): Preserve = +proc encode*(prs: Preserve): string = + let s = newStringStream() + s.write prs + s.setPosition 0 + result = s.readAll + +proc decodePreserves*(s: Stream): Preserve = proc assertStream(check: bool) = if not check: raise newException(ValueError, "invalid Preserves stream") @@ -423,26 +435,26 @@ proc parsePreserve*(s: Stream): Preserve = result = symbol(s.readStr(len)) of 0xb4: result = Preserve(kind: pkRecord) - var label = s.parsePreserve() + var label = s.decodePreserves() while s.peekUint8() != endMarker: - result.record.add(s.parsePreserve()) + result.record.add(s.decodePreserves()) result.record.add(label) discard s.readUint8() of 0xb5: result = Preserve(kind: pkSequence) while s.peekUint8() != endMarker: - result.sequence.add(s.parsePreserve()) + result.sequence.add(s.decodePreserves()) discard s.readUint8() of 0xb6: result = Preserve(kind: pkSet) while s.peekUint8() != endMarker: - result.set.incl(s.parsePreserve()) + result.set.incl(s.decodePreserves()) discard s.readUint8() of 0xb7: result = Preserve(kind: pkDictionary) while s.peekUint8() != endMarker: - let key = s.parsePreserve() - let val = s.parsePreserve() + let key = s.decodePreserves() + let val = s.decodePreserves() result.dict[key] = val discard s.readUint8() of 0xb0: @@ -470,6 +482,12 @@ proc parsePreserve*(s: Stream): Preserve = else: assertStream(false) +proc decodePreserves*(s: string): Preserve = + s.newStringStream.decodePreserves + +proc decodePreserves*(s: seq[byte]): Preserve = + cast[string](s).newStringStream.decodePreserves + proc initDictionary*(): Preserve = Preserve(kind: pkDictionary) proc `%`*(b: bool): Preserve = diff --git a/src/preserves/parse.nim 
b/src/preserves/parse.nim
new file mode 100644
index 0000000..0bb6f6d
--- /dev/null
+++ b/src/preserves/parse.nim
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: ISC
+
+import std/[base64, parseutils, sets, strutils, tables]
+import npeg
+import ../preserves, ./pegs
+
+type
+  Frame = tuple[value: Preserve, pos: int] # a parsed value and the text offset where its match began
+  Stack = seq[Frame]
+
+proc shrink(stack: var Stack; n: int) = stack.setLen(stack.len - n) # drops the top `n` frames
+
+template pushStack(v: Preserve) = stack.add((v, capture[0].si)) # tags `v` with the start of the current match
+
+const pegParser = peg("Document", stack: Stack):
+  # Override rules from pegs.nim
+
+  Document <- Preserves.Document
+
+  Preserves.Record <- Preserves.Record:
+    var
+      record: seq[Preserve]
+      labelOff: int
+    while stack[labelOff].pos < capture[0].si: # skip frames that began before this record
+      inc labelOff
+    for i in labelOff.succ..stack.high:
+      record.add(move stack[i].value)
+    record.add(move stack[labelOff].value) # the label is kept as the last element
+    stack.shrink record.len
+    pushStack Preserve(kind: pkRecord, record: move record)
+
+  Preserves.Sequence <- Preserves.Sequence:
+    var sequence: seq[Preserve]
+    for frame in stack.mitems:
+      if frame.pos > capture[0].si: # frames that began inside this sequence are its elements
+        sequence.add(move frame.value)
+    stack.shrink sequence.len
+    pushStack Preserve(kind: pkSequence, sequence: move sequence)
+
+  Preserves.Dictionary <- Preserves.Dictionary:
+    var dict: Table[Preserve, Preserve]
+    var first = stack.len # becomes the index of this dictionary's first key frame
+    while first >= 2 and stack[first - 2].pos > capture[0].si: dec(first, 2)
+    for i in countDown(stack.high.pred, first, 2): dict[move stack[i].value] = move stack[i.succ].value
+    stack.setLen(first) # drop every consumed frame, even when duplicate keys collapsed in `dict`
+    pushStack Preserve(kind: pkDictionary, dict: move dict)
+
+  Preserves.Set <- Preserves.Set:
+    var set: HashSet[Preserve]
+    var first = stack.len # becomes the index of this set's first element frame
+    while first > 0 and stack[first.pred].pos > capture[0].si: dec first
+    for i in first..stack.high: set.incl(move stack[i].value)
+    stack.setLen(first) # drop every consumed frame, even when duplicate elements collapsed in `set`
+    pushStack Preserve(kind: pkSet, set: move set)
+
+  Preserves.Boolean <- Preserves.Boolean:
+    case $0
+    of "#f": pushStack Preserve(kind: pkBoolean)
+    of "#t": pushStack Preserve(kind: pkBoolean, bool: true)
+    else: discard
+
+  Preserves.Float <- Preserves.Float:
+    pushStack Preserve(kind: pkFloat, float: 
parseFloat($1)) + + Preserves.Double <- Preserves.Double: + pushStack Preserve(kind: pkDouble) + let i = stack.high + discard parseBiggestFloat($0, stack[i].value.double) + + Preserves.SignedInteger <- Preserves.SignedInteger: + pushStack Preserve(kind: pkSignedInteger, int: parseInt($0)) + + Preserves.String <- Preserves.String: + pushStack Preserve(kind: pkString, string: unescape($0)) + + Preserves.charByteString <- Preserves.charByteString: + let s = unescape($1) + pushStack Preserve(kind: pkByteString, bytes: cast[seq[byte]](s)) + + Preserves.hexByteString <- Preserves.hexByteString: + pushStack Preserve(kind: pkByteString, bytes: cast[seq[byte]](parseHexStr($1))) + + Preserves.b64ByteString <- Preserves.b64ByteString: + pushStack Preserve(kind: pkByteString, bytes: cast[seq[byte]](base64.decode($1))) + + Preserves.Symbol <- Preserves.Symbol: + pushStack Preserve(kind: pkSymbol, symbol: $0) + + Preserves.Compact <- Preserves.Compact: + pushStack decodePreserves(stack.pop.value.bytes) + +proc parsePreserves*(text: string): Preserve {.gcsafe.} = + var stack: Stack + let match = pegParser.match(text, stack) + if not match.ok: + raise newException(ValueError, "failed to parse Preserves:\n" & text[match.matchMax..text.high]) + assert(stack.len == 1) + stack.pop.value diff --git a/src/preserves/pegs.nim b/src/preserves/pegs.nim new file mode 100644 index 0000000..249009c --- /dev/null +++ b/src/preserves/pegs.nim @@ -0,0 +1,72 @@ +# SPDX-FileCopyrightText: ☭ 2021 Emery Hemingway +# SPDX-License-Identifier: ISC + +import npeg, npeg/lib/utf8 + +when defined(nimHasUsed): {.used.} + +grammar "Preserves": + + Document <- Value * ws * !1 + + Value <- + (ws * (Record | Collection | Atom | Embedded | Compact)) | + (ws * '@' * Value * Value) | + (ws * ';' * @'\n' * Value) + + Collection <- Sequence | Dictionary | Set + + Atom <- Boolean | Float | Double | SignedInteger | String | ByteString | Symbol + + Record <- '<' * Value * *Value * ws * '>' + + Sequence <- '[' * ws * 
*(Value * ws) * ']'
+
+  Dictionary <- '{' * ws * *(Value * ws * ':' * ws * Value * ws) * '}'
+
+  Set <- "#{" * ws * *(Value * ws) * '}'
+
+  Boolean <- "#f" | "#t"
+
+  Float <- >flt * 'f'
+  Double <- flt
+  SignedInteger <- int
+
+  nat <- '0' | (Digit-'0') * *Digit
+  int <- ?'-' * nat
+  frac <- '.' * +Digit
+  exp <- 'e' * ?('-'|'+') * +Digit
+  flt <- int * ((frac * exp) | frac | exp)
+
+  stringBody <- ?escape * *( +( {'\x20'..'\xff'} - {'"'} - {'\\'}) * *escape)
+  String <- '"' * stringBody * '"'
+
+  ByteString <- charByteString | hexByteString | b64ByteString
+  charByteString <- '#' * >('"' * >(*binchar) * '"')
+  hexByteString <- "#x\"" * ws * >(*(Xdigit[2] * ws)) * '"'
+  b64ByteString <- "#[" * ws * >(*(base64char * ws)) * ']'
+
+  binchar <- binunescaped | (escape * (escaped | '"' | ('x' * Xdigit[2])))
+  binunescaped <- {'\x20'..'\x21', '#'..'[', ']'..'~'} # hex escapes; '\20' would mean decimal 20
+  base64char <- {'A'..'Z', 'a'..'z', '0'..'9', '+', '/', '-', '_', '='}
+
+  Symbol <- (symstart * *symcont) | ('|' * *symchar * '|')
+
+  symstart <- Alpha | sympunct | symustart
+  symcont <- Alpha | sympunct | symustart | symucont | Digit | '-'
+  sympunct <- {'~', '!', '$', '%', '^', '&', '*', '?', '_', '=', '+', '/', '.'}
+  symchar <- unescaped | '"' | (escape * (escaped | '|' | ('u' * Xdigit[4]))) # \u takes four hex digits, as in unicodeEscaped
+  symustart <- utf8.any - {0..127}
+  symucont <- utf8.any - {0..127}
+  # TODO: exclude some unicode ranges
+
+  Embedded <- "#!" 
* Value
+
+  Compact <- "#=" * ws * ByteString
+
+  unescaped <- utf8.any - escaped
+  unicodeEscaped <- 'u' * Xdigit[4]
+  escaped <- '\\' * ({'{', '"', '|', '\\', 'b', 'f', 'n', 'r', 't'} | unicodeEscaped)
+  escape <- '\\'
+
+  ws <- *(' ' | '\t' | '\r' | '\n' | ',')
diff --git a/tests/test_integers.nim b/tests/test_integers.nim
index 0950a1b..84383ba 100644
--- a/tests/test_integers.nim
+++ b/tests/test_integers.nim
@@ -43,7 +43,7 @@ suite "native":
         check(b == a)
       block:
         stream.setPosition(0)
-        let y = stream.parsePreserve()
+        let y = stream.decodePreserves()
         let a = num
         let b = y.int
         check(b == a)
@@ -67,7 +67,7 @@ suite "big":
         check(b == a)
       block:
         stream.setPosition(0)
-        let y = stream.parsePreserve()
+        let y = stream.decodePreserves()
         let a = big
         let b = y.bigint
         check(b == a)
diff --git a/tests/test_parser.nim b/tests/test_parser.nim
new file mode 100644
index 0000000..9721002
--- /dev/null
+++ b/tests/test_parser.nim
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: ISC
+
+import std/[strutils, unittest]
+import preserves, preserves/parse
+
+const examples = [
+("""<capture <discard>>""", "\xB4\xB3\x07capture\xB4\xB3\x07discard\x84\x84"), # text must match the nested record in the binary form
+("""[1 2 3 4]""", "\xB5\x91\x92\x93\x94\x84"),
+("""[-2 -1 0 1]""", "\xB5\x9E\x9F\x90\x91\x84"),
+(""""hello"""", "\xB1\x05hello"),
+("""["a" b #"c" [] #{} #t #f]""", "\xB5\xB1\x01a\xB3\x01b\xB2\x01c\xB5\x84\xB6\x84\x81\x80\x84"),
+("""-257""", "\xA1\xFE\xFF"),
+("""-1""", "\x9F"),
+("""0""", "\x90"),
+("""1""", "\x91"),
+("""255""", "\xA1\x00\xFF"),
+("""1.0f""", "\x82\x3F\x80\x00\x00"),
+("""1.0""", "\x83\x3F\xF0\x00\x00\x00\x00\x00\x00"),
+("""-1.202e300""", "\x83\xFE\x3C\xB7\xB7\x59\xBF\x04\x26"),
+("""#=#x"B4B30763617074757265B4B307646973636172648484"""", "\xB4\xB3\x07capture\xB4\xB3\x07discard\x84\x84"),
+("""#f""", "\x80")
+]
+
+suite "parse":
+  for (txt, bin) in examples:
+    test txt:
+      checkpoint(txt)
+      let test = parsePreserves(txt)
+      checkpoint($test)
+      block: # the parsed text must equal the decoded binary form
+        let
+          a= test
+          b = decodePreserves(bin)
+        check(a == b)
+      block: # re-encoding the parsed value must reproduce the binary form
+        let
+          a = 
encode test + b = bin + check(a.toHex == b.toHex) diff --git a/tests/test_rfc8259.nim b/tests/test_rfc8259.nim index e40ae68..1696633 100644 --- a/tests/test_rfc8259.nim +++ b/tests/test_rfc8259.nim @@ -62,7 +62,7 @@ for i, jsText in testVectors: stream.write(x) stream.setPosition(0) let - y = stream.parsePreserve() + y = stream.decodePreserves() test = y.toJson check(y == x) check(test == control)