preserves-nim/src/preserves/private/parsing.nim

290 lines
9.2 KiB
Nim

# SPDX-FileCopyrightText: ☭ Emery Hemingway
# SPDX-License-Identifier: Unlicense
import std/[base64, options, parseutils, strutils, unicode]
from std/sequtils import insert
import bigints, npeg
import ../pegs
import ./decoding, ./values
type
Frame = tuple[value: Value, pos: int]
Stack = seq[Frame]
proc shrink(stack: var Stack; n: int) = stack.setLen(stack.len - n)
template pushStack(v: Value) = stack.add((v, capture[0].si))
proc joinWhitespace(s: string): string =
result = newStringOfCap(s.len)
for token, isSep in tokenize(s, Whitespace + {','}):
if not isSep: add(result, token)
template unescape*(buf: var string; capture: string) =
var i: int
while i < len(capture):
if capture[i] == '\\':
inc(i)
case capture[i]
of '\\': add(buf, char 0x5c)
of '/': add(buf, char 0x2f)
of 'b': add(buf, char 0x08)
of 'f': add(buf, char 0x0c)
of 'n': add(buf, char 0x0a)
of 'r': add(buf, char 0x0d)
of 't': add(buf, char 0x09)
of '"': add(buf, char 0x22)
of 'u':
var short: uint16
inc(i)
discard parseHex(capture, short, i, 4)
inc(i, 3)
if (short shr 15) == 0:
add(buf, Rune(short).toUtf8)
elif (short shr 10) == 0b110110:
if i+6 >= capture.len:
raise newException(ValueError, "Invalid UTF-16 surrogate pair")
var rune = uint32(short shl 10) + 0x10000
validate(capture[i+1] == '\\')
validate(capture[i+2] == 'u')
inc(i, 3)
discard parseHex(capture, short, i, 4)
if (short shr 10) != 0b110111:
raise newException(ValueError, "Invalid UTF-16 surrogate pair")
inc(i, 3)
rune = rune or (short and 0b1111111111)
#add(buf, Rune(rune).toUTF8)
let j = buf.len
buf.setLen(buf.len+4)
rune.Rune.fastToUTF8Copy(buf, j, false)
else:
raise newException(ValueError, "Invalid UTF-16 escape sequence " & capture)
else:
validate(false)
else:
add(buf, capture[i])
inc(i)
template unescape(buf: var seq[byte]; capture: string) =
var i: int
while i < len(capture):
if capture[i] == '\\':
inc(i)
case capture[i]
of '\\': add(buf, 0x5c'u8)
of '/': add(buf, 0x2f'u8)
of 'b': add(buf, 0x08'u8)
of 'f': add(buf, 0x0c'u8)
of 'n': add(buf, 0x0a'u8)
of 'r': add(buf, 0x0d'u8)
of 't': add(buf, 0x09'u8)
of '"': add(buf, 0x22'u8)
of 'x':
var b: byte
inc(i)
discard parseHex(capture, b, i, 2)
inc(i)
add(buf, b)
else:
validate(false)
else:
add(buf, byte capture[i])
inc(i)
proc pushHexNibble[T](result: var T; c: char) =
var n = case c
of '0'..'9': T(ord(c) - ord('0'))
of 'a'..'f': T(ord(c) - ord('a') + 10)
of 'A'..'F': T(ord(c) - ord('A') + 10)
else: return
result = (result shl 4) or n
proc parsePreserves*(text: string): Value =
## Parse a text-encoded Preserves `string` to a Preserves `Value`.
let pegParser = peg("Document", stack: Stack):
# Override rules from pegs.nim
Document <- Preserves.Document
Preserves.Record <- Preserves.Record:
var
record: seq[Value]
labelOff: int
while stack[labelOff].pos < capture[0].si:
inc labelOff
for i in labelOff.succ..stack.high:
record.add(move stack[i].value)
record.add(move stack[labelOff].value)
stack.shrink record.len
pushStack Value(kind: pkRecord, record: move record)
Preserves.Sequence <- Preserves.Sequence:
var sequence: seq[Value]
for frame in stack.mitems:
if frame.pos > capture[0].si:
sequence.add(move frame.value)
stack.shrink sequence.len
pushStack Value(kind: pkSequence, sequence: move sequence)
Preserves.Dictionary <- Preserves.Dictionary:
var prs = Value(kind: pkDictionary)
for i in countDown(stack.high.pred, 0, 2):
if stack[i].pos < capture[0].si: break
var
val = stack.pop.value
key = stack.pop.value
for j in 0..prs.dict.high:
validate(prs.dict[j].key != key)
prs[key] = val
pushStack prs
Preserves.Set <- Preserves.Set:
var prs = Value(kind: pkSet)
for frame in stack.mitems:
if frame.pos > capture[0].si:
for e in prs.set: validate(e != frame.value)
prs.incl(move frame.value)
stack.shrink prs.set.len
pushStack prs
Preserves.Boolean <- Preserves.Boolean:
case $0
of "#f": pushStack Value(kind: pkBoolean)
of "#t": pushStack Value(kind: pkBoolean, bool: true)
else: discard
Preserves.Float <- Preserves.Float:
pushStack Value(kind: pkFloat, float: parseFloat($1))
Preserves.Double <- Preserves.Double:
pushStack Value(kind: pkDouble)
let i = stack.high
discard parseBiggestFloat($0, stack[i].value.double)
Preserves.FloatRaw <- Preserves.FloatRaw:
var reg: uint32
for c in $1: pushHexNibble(reg, c)
pushStack Value(kind: pkFloat, float: cast[float32](reg))
Preserves.DoubleRaw <- Preserves.DoubleRaw:
var reg: uint64
for c in $1: pushHexNibble(reg, c)
pushStack Value(kind: pkDouble, double: cast[float64](reg))
Preserves.SignedInteger <- Preserves.SignedInteger:
var
big = initBigInt($0)
small = toInt[int](big)
if small.isSome:
pushStack Value(kind: pkRegister, register: small.get)
else:
pushStack Value(kind: pkBigInt, bigint: big)
Preserves.String <- Preserves.String:
var v = Value(kind: pkString, string: newStringOfCap(len($1)))
unescape(v.string, $1)
if validateUtf8(v.string) != -1:
raise newException(ValueError, "Preserves text contains an invalid UTF-8 sequence")
pushStack v
Preserves.charByteString <- Preserves.charByteString:
var v = Value(kind: pkByteString, bytes: newSeqOfCap[byte](len($1)))
unescape(v.bytes, $1)
pushStack v
Preserves.hexByteString <- Preserves.hexByteString:
pushStack Value(kind: pkByteString, bytes: cast[seq[byte]](parseHexStr(joinWhitespace($1))))
Preserves.b64ByteString <- Preserves.b64ByteString:
pushStack Value(kind: pkByteString, bytes: cast[seq[byte]](base64.decode(joinWhitespace($1))))
Preserves.Symbol <- Preserves.Symbol:
var buf = newStringOfCap(len($1))
unescape(buf, $1)
pushStack Value(kind: pkSymbol, symbol: Symbol buf)
Preserves.Embedded <- Preserves.Embedded:
var v = stack.pop.value
v.embedded = true
pushStack v
Preserves.Annotation <- Preserves.Annotation:
var val = stack.pop.value
discard stack.pop.value
pushStack val
Preserves.Compact <- Preserves.Compact:
pushStack decodePreserves(stack.pop.value.bytes)
var stack: Stack
let match = pegParser.match(text, stack)
if not match.ok:
raise newException(ValueError, "failed to parse Preserves:\n" & text[match.matchMax..text.high])
assert(stack.len == 1)
stack.pop.value
proc parsePreservesAtom*(text: string): Atom =
## Parse a text-encoded Preserves `string` to a Preserves `Atom`.
let pegParser = peg("Atom", a: Atom):
# Override rules from pegs.nim
Atom <- ?"#!" * Preserves.Atom
Preserves.Boolean <- Preserves.Boolean:
case $0
of "#f": a = Atom(kind: pkBoolean)
of "#t": a = Atom(kind: pkBoolean, bool: true)
else: discard
Preserves.Float <- Preserves.Float:
a = Atom(kind: pkFloat, float: parseFloat($1))
Preserves.Double <- Preserves.Double:
a = Atom(kind: pkDouble)
discard parseBiggestFloat($0, a.double)
Preserves.FloatRaw <- Preserves.FloatRaw:
var reg: uint32
for c in $1: pushHexNibble(reg, c)
a = Atom(kind: pkFloat, float: cast[float32](reg))
Preserves.DoubleRaw <- Preserves.DoubleRaw:
var reg: uint64
for c in $1: pushHexNibble(reg, c)
a = Atom(kind: pkDouble, double: cast[float64](reg))
Preserves.SignedInteger <- Preserves.SignedInteger:
var
big = initBigInt($0)
small = toInt[int](big)
if small.isSome:
a = Atom(kind: pkRegister, register: small.get)
else:
a = Atom(kind: pkBigInt, bigint: big)
Preserves.String <- Preserves.String:
a = Atom(kind: pkString, string: newStringOfCap(len($1)))
unescape(a.string, $1)
if validateUtf8(a.string) != -1:
raise newException(ValueError, "Preserves text contains an invalid UTF-8 sequence")
Preserves.charByteString <- Preserves.charByteString:
a = Atom(kind: pkByteString, bytes: newSeqOfCap[byte](len($1)))
unescape(a.bytes, $1)
Preserves.hexByteString <- Preserves.hexByteString:
a = Atom(kind: pkByteString, bytes: cast[seq[byte]](parseHexStr(joinWhitespace($1))))
Preserves.b64ByteString <- Preserves.b64ByteString:
a = Atom(kind: pkByteString, bytes: cast[seq[byte]](base64.decode(joinWhitespace($1))))
Preserves.Symbol <- Preserves.Symbol:
var buf = newStringOfCap(len($1))
unescape(buf, $1)
a = Atom(kind: pkSymbol, symbol: Symbol buf)
if not pegParser.match(text, result).ok:
raise newException(ValueError, "failed to parse Preserves atom: " & text)