Textual parser

This commit is contained in:
Emery Hemingway 2021-07-16 19:11:19 +02:00
parent 30bfaa8c00
commit 99b0ddbb13
8 changed files with 263 additions and 35 deletions

3
.gitignore vendored
View File

@ -1,2 +1,3 @@
tests/test_rfc8259
tests/test_integers
tests/test_parser
tests/test_rfc8259

View File

@ -2,5 +2,5 @@ Nim implementation of the [Preserves data language](https://preserves.gitlab.io/
Missing features:
* embedded values
* parsing from human-readable encoding
* ordering of compound values
* schemas

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: ISC
import bigints
import std/[base64, endians, hashes, macros, sets, streams, tables, typetraits]
import std/[base64, endians, hashes, macros, sets, streams, strutils, tables, typetraits]
import json except `%`, `%*`
@ -151,9 +151,7 @@ proc `==`*(x, y: Preserve): bool =
of pkSymbol:
result = x.symbol == y.symbol
of pkRecord:
for i, val in x.record:
if y.record[i] != val: return false
result = true
result = x.record == y.record
of pkSequence:
for i, val in x.sequence:
if y.sequence[i] != val: return false
@ -171,26 +169,32 @@ proc `==`*(x, y: Preserve): bool =
of pkEmbedded:
result = x.embedded == y.embedded
proc `$`*(prs: Preserve): string =
proc concat(result: var string; prs: Preserve) =
case prs.kind:
of pkBoolean:
case prs.bool
of false: result = "#f"
of true: result = "#t"
of false: result.add "#f"
of true: result.add "#t"
of pkFloat:
result = $prs.float & "f"
result.add($prs.float & "f")
of pkDouble:
result = $prs.double
result.add $prs.double
of pkSignedInteger:
result = $prs.int
result.add $prs.int
of pkBigInteger:
result = $prs.bigint
result.add $prs.bigint
of pkString:
result = escapeJson(prs.string)
result.add escapeJson(prs.string)
of pkByteString:
result.add("#[")
result.add(base64.encode(prs.bytes))
result.add(']')
for b in prs.bytes:
if b.char notin {'\20'..'\21', '#'..'[', ']'..'~'}:
result.add("#[")
result.add(base64.encode(prs.bytes))
result.add(']')
return
result.add("#\"")
result.add(cast[string](prs.bytes))
result.add('"')
of pkSymbol:
result.add(escapeJsonUnquoted(prs.symbol))
of pkRecord:
@ -199,36 +203,38 @@ proc `$`*(prs: Preserve): string =
result.add($prs.record[prs.record.high])
for i in 0..<prs.record.high:
result.add(' ')
result.add($prs.record[i])
result.concat(prs.record[i])
result.add('>')
of pkSequence:
result.add('[')
for i, val in prs.sequence:
if i > 0:
result.add(' ')
result.add($val)
result.concat(val)
result.add(']')
of pkSet:
result.add("#{")
for val in prs.set.items:
result.add($val)
result.concat(val)
result.add(' ')
if result.len > 2:
if prs.set.len > 1:
result.setLen(result.high)
result.add('}')
of pkDictionary:
result.add('{')
for (key, value) in prs.dict.pairs:
result.add($key)
result.concat(key)
result.add(": ")
result.add($value)
result.concat(value)
result.add(' ')
if result.len > 1:
if prs.dict.len > 1:
result.setLen(result.high)
result.add('}')
of pkEmbedded:
result.add(prs.embedded.repr)
proc `$`*(prs: Preserve): string =
  ## Render `prs` as text by appending its representation
  ## to the (initially empty) result string via `concat`.
  result.concat(prs)
iterator items*(prs: Preserve): Preserve =
case prs.kind
of pkRecord:
@ -353,7 +359,7 @@ proc write*(str: Stream; prs: Preserve) =
of pkByteString:
str.write(0xb2'u8)
str.writeVarint(prs.bytes.len)
str.write(prs.bytes)
str.write(cast[string](prs.bytes))
of pkSymbol:
str.write(0xb3'u8)
str.writeVarint(prs.symbol.len)
@ -385,7 +391,13 @@ proc write*(str: Stream; prs: Preserve) =
str.write(0x86'u8)
raiseAssert("binary representation of embedded values is undefined")
proc parsePreserve*(s: Stream): Preserve =
proc encode*(prs: Preserve): string =
  ## Serialise `prs` with the stream `write` proc into an in-memory
  ## string stream and return everything that was written.
  let buffer = newStringStream()
  write(buffer, prs)
  # Rewind so that `readAll` starts from the first byte written.
  setPosition(buffer, 0)
  result = readAll(buffer)
proc decodePreserves*(s: Stream): Preserve =
proc assertStream(check: bool) =
if not check:
raise newException(ValueError, "invalid Preserves stream")
@ -423,26 +435,26 @@ proc parsePreserve*(s: Stream): Preserve =
result = symbol(s.readStr(len))
of 0xb4:
result = Preserve(kind: pkRecord)
var label = s.parsePreserve()
var label = s.decodePreserves()
while s.peekUint8() != endMarker:
result.record.add(s.parsePreserve())
result.record.add(s.decodePreserves())
result.record.add(label)
discard s.readUint8()
of 0xb5:
result = Preserve(kind: pkSequence)
while s.peekUint8() != endMarker:
result.sequence.add(s.parsePreserve())
result.sequence.add(s.decodePreserves())
discard s.readUint8()
of 0xb6:
result = Preserve(kind: pkSet)
while s.peekUint8() != endMarker:
result.set.incl(s.parsePreserve())
result.set.incl(s.decodePreserves())
discard s.readUint8()
of 0xb7:
result = Preserve(kind: pkDictionary)
while s.peekUint8() != endMarker:
let key = s.parsePreserve()
let val = s.parsePreserve()
let key = s.decodePreserves()
let val = s.decodePreserves()
result.dict[key] = val
discard s.readUint8()
of 0xb0:
@ -470,6 +482,12 @@ proc parsePreserve*(s: Stream): Preserve =
else:
assertStream(false)
proc decodePreserves*(s: string): Preserve =
  ## Decode a value from the encoded data held in the string `s` by
  ## wrapping it in a string stream and using the `Stream` overload.
  let stream = newStringStream(s)
  result = decodePreserves(stream)
proc decodePreserves*(s: seq[byte]): Preserve =
  ## Decode a value from the encoded data held in the byte sequence `s`.
  # The bytes are reinterpreted as a string (no new binding is created for
  # the cast value) and handed to a string stream for the `Stream` overload.
  result = decodePreserves(newStringStream(cast[string](s)))
proc initDictionary*(): Preserve =
  ## Create a `Preserve` value of kind `pkDictionary`
  ## with no other fields set.
  result = Preserve(kind: pkDictionary)
proc `%`*(b: bool): Preserve =

98
src/preserves/parse.nim Normal file
View File

@ -0,0 +1,98 @@
# SPDX-License-Identifier: ISC
import std/[base64, parseutils, sets, strutils, tables]
import npeg
import ../preserves, ./pegs
type
  Frame = tuple
    ## A parsed value paired with the input offset recorded
    ## when it was pushed.
    value: Preserve
    pos: int
  Stack = seq[Frame]
    ## Parser state threaded through the PEG actions.
proc shrink(stack: var Stack; n: int) =
  ## Drop the last `n` frames from `stack`.
  setLen(stack, len(stack) - n)
template pushStack(v: Preserve) =
  # Append `v` to `stack`, tagged with the start index of the current
  # match. Both `stack` and `capture` are taken from the expansion site.
  add(stack, (value: v, pos: capture[0].si))
const pegParser = peg("Document", stack: Stack):
  # Override rules from pegs.nim
  #
  # Every overridden rule re-uses the grammar rule of the same name and
  # attaches an action that pushes the parsed value onto `stack`, tagged
  # with the offset at which the rule started matching. Compound rules
  # collect the frames pushed after their own start offset, remove them
  # from the stack and push the assembled value in their place.

  Document <- Preserves.Document

  Preserves.Record <- Preserves.Record:
    var
      record: seq[Preserve]
      labelOff: int
    # Skip frames that were pushed before this record started.
    while stack[labelOff].pos < capture[0].si:
      inc labelOff
    # Fields first, label last.
    for i in labelOff.succ..stack.high:
      record.add(move stack[i].value)
    record.add(move stack[labelOff].value)
    stack.shrink record.len
    pushStack Preserve(kind: pkRecord, record: move record)

  Preserves.Sequence <- Preserves.Sequence:
    var sequence: seq[Preserve]
    for frame in stack.mitems:
      if frame.pos > capture[0].si:
        sequence.add(move frame.value)
    stack.shrink sequence.len
    pushStack Preserve(kind: pkSequence, sequence: move sequence)

  Preserves.Dictionary <- Preserves.Dictionary:
    var
      dict: Table[Preserve, Preserve]
      consumed = 0
    # Walk key/value pairs from the top of the stack downwards.
    for i in countDown(stack.high.pred, 0, 2):
      if stack[i].pos < capture[0].si: break
      dict[move stack[i].value] = move stack[i.succ].value
      inc consumed, 2
    # Count the frames actually consumed: repeated keys collapse into one
    # table entry, so `2*dict.len` can be smaller than the number of
    # frames that must be removed.
    stack.shrink consumed
    pushStack Preserve(kind: pkDictionary, dict: move dict)

  Preserves.Set <- Preserves.Set:
    var
      set: HashSet[Preserve]
      consumed = 0
    for frame in stack.mitems:
      if frame.pos > capture[0].si:
        set.incl(move frame.value)
        inc consumed
    # Repeated members collapse inside the HashSet, so `set.len` is not a
    # reliable count of the frames to remove.
    stack.shrink consumed
    pushStack Preserve(kind: pkSet, set: move set)

  Preserves.Boolean <- Preserves.Boolean:
    case $0
    of "#f": pushStack Preserve(kind: pkBoolean)
    of "#t": pushStack Preserve(kind: pkBoolean, bool: true)
    else: discard

  Preserves.Float <- Preserves.Float:
    # Capture 1 is the numeric text without the trailing 'f'.
    pushStack Preserve(kind: pkFloat, float: parseFloat($1))

  Preserves.Double <- Preserves.Double:
    pushStack Preserve(kind: pkDouble)
    let i = stack.high
    # Parse straight into the frame that was just pushed.
    discard parseBiggestFloat($0, stack[i].value.double)

  Preserves.SignedInteger <- Preserves.SignedInteger:
    pushStack Preserve(kind: pkSignedInteger, int: parseInt($0))

  Preserves.String <- Preserves.String:
    pushStack Preserve(kind: pkString, string: unescape($0))

  Preserves.charByteString <- Preserves.charByteString:
    let s = unescape($1)
    pushStack Preserve(kind: pkByteString, bytes: cast[seq[byte]](s))

  Preserves.hexByteString <- Preserves.hexByteString:
    pushStack Preserve(kind: pkByteString, bytes: cast[seq[byte]](parseHexStr($1)))

  Preserves.b64ByteString <- Preserves.b64ByteString:
    pushStack Preserve(kind: pkByteString, bytes: cast[seq[byte]](base64.decode($1)))

  Preserves.Symbol <- Preserves.Symbol:
    pushStack Preserve(kind: pkSymbol, symbol: $0)

  Preserves.Compact <- Preserves.Compact:
    # The byte string parsed just before is replaced by the value it decodes to.
    pushStack decodePreserves(stack.pop.value.bytes)
proc parsePreserves*(text: string): Preserve {.gcsafe.} =
  ## Parse `text` with `pegParser` and return the single value it yields.
  ##
  ## Raises `ValueError` when the match fails; the message carries the
  ## input from the furthest offset the matcher reached up to the end.
  var frames: Stack
  let outcome = pegParser.match(text, frames)
  if outcome.ok:
    # A successful parse is expected to leave exactly one frame.
    assert(frames.len == 1)
    result = frames.pop.value
  else:
    raise newException(ValueError, "failed to parse Preserves:\n" & text[outcome.matchMax..text.high])

72
src/preserves/pegs.nim Normal file
View File

@ -0,0 +1,72 @@
# SPDX-FileCopyrightText: ☭ 2021 Emery Hemingway
# SPDX-License-Identifier: ISC
import npeg, npeg/lib/utf8
when defined(nimHasUsed): {.used.}
grammar "Preserves":
  # PEG rules for the textual syntax. Rules are referenced from other
  # modules as `Preserves.<Rule>`.

  # One value, optional trailing whitespace, then end of input.
  Document <- Value * ws * !1

  # NOTE(review): the second and third alternatives look like annotations
  # ('@' followed by two values) and ';' comments running to end of line —
  # confirm against the textual syntax specification.
  Value <-
      (ws * (Record | Collection | Atom | Embedded | Compact)) |
      (ws * '@' * Value * Value) |
      (ws * ';' * @'\n' * Value)

  Collection <- Sequence | Dictionary | Set

  Atom <- Boolean | Float | Double | SignedInteger | String | ByteString | Symbol

  Record <- '<' * Value * *Value * ws * '>'

  Sequence <- '[' * ws * *(Value * ws) * ']'

  Dictionary <- '{' * ws * *(Value * ws * ':' * ws * Value * ws) * '}'

  Set <- "#{" * ws * *(Value * ws) * '}'

  Boolean <- "#f" | "#t"

  # The numeric part is captured so that actions can read it without the 'f'.
  Float <- >flt * 'f'
  Double <- flt
  SignedInteger <- int

  nat <- '0' | (Digit-'0') * *Digit
  int <- ?'-' * nat
  frac <- '.' * +Digit
  exp <- 'e' * ?('-'|'+') * +Digit
  flt <- int * ((frac * exp) | frac | exp)

  stringBody <- ?escape * *( +( {'\x20'..'\xff'} - {'"'} - {'\\'}) * *escape)
  String <- '"' * stringBody * '"'

  ByteString <- charByteString | hexByteString | b64ByteString
  charByteString <- '#' * >('"' * >(*binchar) * '"')
  hexByteString <- "#x\"" * ws * >(*(Xdigit[2] * ws)) * '"'
  b64ByteString <- "#[" * ws * >(*(base64char * ws)) * ']'

  binchar <- binunescaped | (escape * (escaped | '"' | ('x' * Xdigit[2])))
  # Printable ASCII except '"' and '\\'. The first range is written with
  # literal characters: in Nim a backslash followed by digits is a DECIMAL
  # escape, so '\20'..'\21' would denote control characters 20 and 21
  # rather than space (0x20) and '!' (0x21).
  binunescaped <- {' '..'!', '#'..'[', ']'..'~'}

  base64char <- {'A'..'Z', 'a'..'z', '0'..'9', '+', '/', '-', '_', '='}

  Symbol <- (symstart * *symcont) | ('|' * *symchar * '|')

  symstart <- Alpha | sympunct | symustart
  symcont <- Alpha | sympunct | symustart | symucont | Digit | '-'
  sympunct <- {'~', '!', '$', '%', '^', '&', '*', '?', '_', '=', '+', '/', '.'}
  symchar <- unescaped | '"' | (escape * (escaped | '|' | ('u' * Xdigit)))
  symustart <- utf8.any - {0..127}
  symucont <- utf8.any - {0..127}
  # TODO: exclude some unicode ranges

  Embedded <- "#!" * Value

  Compact <- "#=" * ws * ByteString

  unescaped <- utf8.any - escaped
  unicodeEscaped <- 'u' * Xdigit[4]
  # NOTE(review): `escaped` already starts with a backslash, while `binchar`
  # and `symchar` put `escape` in front of it — verify that requiring two
  # backslashes there is intended.
  escaped <- '\\' * ({'{', '"', '|', '\\', 'b', 'f', 'n', 'r', 't'} | unicodeEscaped)
  escape <- '\\'

  # Commas count as whitespace.
  ws <- *(' ' | '\t' | '\r' | '\n' | ',')

View File

@ -43,7 +43,7 @@ suite "native":
check(b == a)
block:
stream.setPosition(0)
let y = stream.parsePreserve()
let y = stream.decodePreserves()
let a = num
let b = y.int
check(b == a)
@ -67,7 +67,7 @@ suite "big":
check(b == a)
block:
stream.setPosition(0)
let y = stream.parsePreserve()
let y = stream.decodePreserves()
let a = big
let b = y.bigint
check(b == a)

39
tests/test_parser.nim Normal file
View File

@ -0,0 +1,39 @@
# SPDX-License-Identifier: ISC
import std/[strutils, unittest]
import preserves, preserves/parse
# Test vectors: each entry pairs a textual value (first element) with the
# byte string it is compared against in the "parse" suite (second element).
const examples = [
("""<capture <discard>>""", "\xB4\xB3\x07capture\xB4\xB3\x07discard\x84\x84"),
("""[1 2 3 4]""", "\xB5\x91\x92\x93\x94\x84"),
("""[-2 -1 0 1]""", "\xB5\x9E\x9F\x90\x91\x84"),
(""""hello"""", "\xB1\x05hello"),
("""["a" b #"c" [] #{} #t #f]""", "\xB5\xB1\x01a\xB3\x01b\xB2\x01c\xB5\x84\xB6\x84\x81\x80\x84"),
("""-257""", "\xA1\xFE\xFF"),
("""-1""", "\x9F"),
("""0""", "\x90"),
("""1""", "\x91"),
("""255""", "\xA1\x00\xFF"),
("""1.0f""", "\x82\x3F\x80\x00\x00"),
("""1.0""", "\x83\x3F\xF0\x00\x00\x00\x00\x00\x00"),
("""-1.202e300""", "\x83\xFE\x3C\xB7\xB7\x59\xBF\x04\x26"),
# A "#=" form wrapping a hex byte string; expected bytes match the first entry.
("""#=#x"B4B30763617074757265B4B307646973636172648484"""", "\xB4\xB3\x07capture\xB4\xB3\x07discard\x84\x84"),
("""#f""", "\x80")
]
suite "parse":
  # One test per example, named after its textual form.
  for (text, binary) in examples:
    test text:
      checkpoint(text)
      let parsed = parsePreserves(text)
      checkpoint($parsed)
      # Parsing the text must give the same value as decoding the bytes.
      check(parsed == decodePreserves(binary))
      # Re-encoding the parsed value must reproduce the bytes; compared
      # as hex so that a failure prints readable operands.
      check(encode(parsed).toHex == binary.toHex)

View File

@ -62,7 +62,7 @@ for i, jsText in testVectors:
stream.write(x)
stream.setPosition(0)
let
y = stream.parsePreserve()
y = stream.decodePreserves()
test = y.toJson
check(y == x)
check(test == control)