preserves-nim/src/preserves/private/parsing.nim

# SPDX-FileCopyrightText: ☭ Emery Hemingway
# SPDX-License-Identifier: Unlicense

import std/[base64, options, parseutils, strutils, unicode]
from std/sequtils import insert

import bigints, npeg

import ../pegs
import ./decoding, ./values

type
  # A value produced by the parser together with the position recorded for
  # it when it was pushed (see `pushStack`).
  Frame = tuple
    value: Value
    pos: int
  # Working stack that the PEG code blocks push to and pop from.
  Stack = seq[Frame]

proc shrink(stack: var Stack; n: int) =
  ## Drop the last `n` frames from `stack`.
  let remaining = stack.len - n
  setLen(stack, remaining)

template pushStack(v: Value) =
  ## Append `v` to the `stack` visible at the expansion site, tagged with
  ## `capture[0].si` (presumably the input offset where the current match
  ## begins -- confirm against npeg).
  add(stack, (v, capture[0].si))

proc joinWhitespace(s: string): string =
  ## Return `s` with every whitespace character and every comma removed.
  const separators = Whitespace + {','}
  result = newStringOfCap(s.len)
  for ch in s:
    if ch notin separators:
      result.add(ch)

template unescape*(buf: var string; capture: string) =
  ## Append `capture` to `buf` with its backslash escapes decoded.
  ##
  ## Recognised escapes are `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t`, `\"`
  ## and `\uXXXX`. A `\uXXXX` escape naming a UTF-16 high surrogate must be
  ## followed directly by a second `\uXXXX` escape naming a low surrogate;
  ## the pair is decoded to a single code point. Decoded code points are
  ## appended as UTF-8.
  ##
  ## Raises `ValueError` for a `\u` escape without four hex digits and for
  ## unpaired or malformed surrogates. Any other unknown escape is handed
  ## to `validate(false)`; `validate` must be available where the template
  ## is expanded.
  var i: int
  while i < len(capture):
    if capture[i] == '\\':
      inc(i)
      case capture[i]
      of '\\': add(buf, char 0x5c)
      of '/': add(buf, char 0x2f)
      of 'b': add(buf, char 0x08)
      of 'f': add(buf, char 0x0c)
      of 'n': add(buf, char 0x0a)
      of 'r': add(buf, char 0x0d)
      of 't': add(buf, char 0x09)
      of '"': add(buf, char 0x22)
      of 'u':
        var short: uint16
        inc(i)
        if parseHex(capture, short, i, 4) != 4:
          raise newException(ValueError, "Invalid UTF-16 escape sequence " & capture)
        # Leave `i` on the last hex digit; the `inc(i)` at the bottom of the
        # loop steps past it.
        inc(i, 3)
        if short < 0xD800'u16 or short > 0xDFFF'u16:
          # Every non-surrogate UTF-16 unit is a complete code point,
          # including those in 0x8000..0xD7FF and 0xE000..0xFFFF.
          add(buf, Rune(short).toUTF8)
        elif short <= 0xDBFF'u16:
          # High surrogate: a `\uXXXX` low surrogate (six characters) must
          # follow immediately.
          if i+6 >= capture.len:
            raise newException(ValueError, "Invalid UTF-16 surrogate pair")
          # Widen to 32 bits before shifting so none of the ten payload
          # bits of the high surrogate are truncated.
          var rune = ((uint32(short) and 0x3ff) shl 10) + 0x10000
          validate(capture[i+1] == '\\')
          validate(capture[i+2] == 'u')
          inc(i, 3)
          if parseHex(capture, short, i, 4) != 4 or (short shr 10) != 0b110111:
            raise newException(ValueError, "Invalid UTF-16 surrogate pair")
          inc(i, 3)
          rune = rune or uint32(short and 0b1111111111)
          # A combined surrogate pair is always >= 0x10000, i.e. four
          # UTF-8 bytes.
          let j = buf.len
          buf.setLen(buf.len+4)
          rune.Rune.fastToUTF8Copy(buf, j, false)
        else:
          # A low surrogate with no preceding high surrogate.
          raise newException(ValueError, "Invalid UTF-16 escape sequence " & capture)
      else:
        validate(false)
    else:
      add(buf, capture[i])
    inc(i)

template unescape(buf: var seq[byte]; capture: string) =
  ## Append the characters of `capture` to `buf` as bytes, decoding
  ## backslash escapes on the way.
  ##
  ## Besides the single-character escapes, `\xHH` yields the byte whose
  ## hex value is `HH`. Any other escape is handed to `validate(false)`;
  ## `validate` must be available where the template is expanded.
  var idx = 0
  while idx < capture.len:
    let ch = capture[idx]
    if ch != '\\':
      buf.add(byte ch)
    else:
      idx.inc
      case capture[idx]
      of '\\': buf.add(0x5c'u8)
      of '/': buf.add(0x2f'u8)
      of 'b': buf.add(0x08'u8)
      of 'f': buf.add(0x0c'u8)
      of 'n': buf.add(0x0a'u8)
      of 'r': buf.add(0x0d'u8)
      of 't': buf.add(0x09'u8)
      of '"': buf.add(0x22'u8)
      of 'x':
        # Two hex digits follow the `x`; finish with `idx` on the second
        # one so the increment at the loop tail steps past it.
        var decoded: byte
        discard parseHex(capture, decoded, idx + 1, 2)
        idx.inc(2)
        buf.add(decoded)
      else:
        validate(false)
    idx.inc

proc pushHexNibble[T](result: var T; c: char) =
  ## Shift `result` left by four bits and put the value of the hex digit
  ## `c` into the vacated low nibble. A `c` that is not a hex digit leaves
  ## `result` untouched.
  const hexChars = {'0'..'9', 'a'..'f', 'A'..'F'}
  if c notin hexChars:
    return
  # Within `hexChars` the three ranges are ordered '0'..'9' < 'A'..'F' <
  # 'a'..'f', so two comparisons are enough to tell them apart.
  let digit =
    if c <= '9': ord(c) - ord('0')
    elif c >= 'a': ord(c) - ord('a') + 10
    else: ord(c) - ord('A') + 10
  result = (result shl 4) or T(digit)

proc parsePreserves*(text: string): Value =
  ## Parse a text-encoded Preserves `string` to a Preserves `Value`.
  ##
  ## Raises `ValueError` when `text` fails to match the grammar or when a
  ## string literal decodes to invalid UTF-8. The helpers called from the
  ## rule bodies may raise as well.
  let pegParser = peg("Document", stack: Stack):
    # Override rules from pegs.nim
    #
    # Each override re-uses the grammar rule of the same name and attaches
    # a code block that builds a `Value` and pushes it onto `stack`.
    # Compound rules collect the frames that their children pushed, using
    # the position stored in each frame to tell which frames are theirs.

    Document <- Preserves.Document

    Preserves.Record <- Preserves.Record:
      var
        record: seq[Value]
        labelOff: int
      # Skip the frames pushed before this record began; the first frame at
      # or after the record's start is taken as the label.
      while stack[labelOff].pos < capture[0].si:
        inc labelOff
      # Fields first, label last: that is the order stored in `record`.
      for i in labelOff.succ..stack.high:
        record.add(move stack[i].value)
      record.add(move stack[labelOff].value)
      stack.shrink record.len
      pushStack Value(kind: pkRecord, record: move record)

    Preserves.Sequence <- Preserves.Sequence:
      var sequence: seq[Value]
      # Take every frame positioned after the start of this sequence.
      for frame in stack.mitems:
        if frame.pos > capture[0].si:
          sequence.add(move frame.value)
      stack.shrink sequence.len
      pushStack Value(kind: pkSequence, sequence: move sequence)

    Preserves.Dictionary <- Preserves.Dictionary:
      var prs = Value(kind: pkDictionary)
      # Walk backwards two frames at a time (key, then value) and stop at
      # the first key frame that was pushed before this dictionary began.
      for i in countDown(stack.high.pred, 0, 2):
        if stack[i].pos < capture[0].si: break
        var
          val = stack.pop.value
          key = stack.pop.value
        # Check that no entry collected so far carries the same key.
        for j in 0..prs.dict.high:
          validate(prs.dict[j].key != key)
        prs[key] = val
      pushStack prs

    Preserves.Set <- Preserves.Set:
      var prs = Value(kind: pkSet)
      for frame in stack.mitems:
        if frame.pos > capture[0].si:
          # Check that the element is not already a member.
          for e in prs.set: validate(e != frame.value)
          prs.incl(move frame.value)
      stack.shrink prs.set.len
      pushStack prs

    Preserves.Boolean <- Preserves.Boolean:
      case $0
      # `bool` is left at its default, false.
      of "#f": pushStack Value(kind: pkBoolean)
      of "#t": pushStack Value(kind: pkBoolean, bool: true)
      else: discard

    Preserves.Float <- Preserves.Float:
      pushStack Value(kind: pkFloat, float: parseFloat($1))

    Preserves.Double <- Preserves.Double:
      # Push a zero double first, then let `parseBiggestFloat` write the
      # parsed number straight into the frame; its return value is ignored.
      pushStack Value(kind: pkDouble)
      let i = stack.high
      discard parseBiggestFloat($0, stack[i].value.double)

    Preserves.FloatRaw <- Preserves.FloatRaw:
      # Assemble the hex digits of capture 1 into a 32-bit pattern and
      # reinterpret that pattern as a float32.
      var reg: uint32
      for c in $1: pushHexNibble(reg, c)
      pushStack Value(kind: pkFloat, float: cast[float32](reg))

    Preserves.DoubleRaw <- Preserves.DoubleRaw:
      # Same as FloatRaw, with a 64-bit pattern reinterpreted as float64.
      var reg: uint64
      for c in $1: pushHexNibble(reg, c)
      pushStack Value(kind: pkDouble, double: cast[float64](reg))

    Preserves.SignedInteger <- Preserves.SignedInteger:
      # Parse as a big integer, then store it as a plain `int` register
      # whenever the conversion succeeds.
      var
        big = initBigInt($0)
        small = toInt[int](big)
      if small.isSome:
        pushStack Value(kind: pkRegister, register: small.get)
      else:
        pushStack Value(kind: pkBigInt, bigint: big)

    Preserves.String <- Preserves.String:
      var v = Value(kind: pkString, string: newStringOfCap(len($1)))
      unescape(v.string, $1)
      # `validateUtf8` returns -1 when the whole string is valid.
      if validateUtf8(v.string) != -1:
        raise newException(ValueError, "Preserves text contains an invalid UTF-8 sequence")
      pushStack v

    Preserves.charByteString <- Preserves.charByteString:
      var v = Value(kind: pkByteString, bytes: newSeqOfCap[byte](len($1)))
      unescape(v.bytes, $1)
      pushStack v

    Preserves.hexByteString <- Preserves.hexByteString:
      # Strip whitespace and commas before decoding the hex digits.
      pushStack Value(kind: pkByteString, bytes: cast[seq[byte]](parseHexStr(joinWhitespace($1))))

    Preserves.b64ByteString <- Preserves.b64ByteString:
      # Strip whitespace and commas before decoding the base64 text.
      pushStack Value(kind: pkByteString, bytes: cast[seq[byte]](base64.decode(joinWhitespace($1))))

    Preserves.Symbol <- Preserves.Symbol:
      var buf = newStringOfCap(len($1))
      unescape(buf, $1)
      pushStack Value(kind: pkSymbol, symbol: Symbol buf)

    Preserves.Embedded <- Preserves.Embedded:
      # Re-push the most recent value with its `embedded` flag set.
      var v = stack.pop.value
      v.embedded = true
      pushStack v

    Preserves.Annotation <- Preserves.Annotation:
      # The top frame is kept and the frame beneath it is thrown away, so
      # only one of the two values survives.
      var val = stack.pop.value
      discard stack.pop.value
      pushStack val

    Preserves.Compact <- Preserves.Compact:
      # Replace the byte string on top of the stack with the value that
      # `decodePreserves` produces from its bytes.
      pushStack decodePreserves(stack.pop.value.bytes)

  var stack: Stack
  let match = pegParser.match(text, stack)
  if not match.ok:
    # The message carries the input from `match.matchMax` to the end.
    raise newException(ValueError, "failed to parse Preserves:\n" & text[match.matchMax..text.high])
  # After a successful match the stack is expected to hold exactly one
  # frame: the document's value.
  assert(stack.len == 1)
  stack.pop.value

proc parsePreservesAtom*(text: string): Atom =
  ## Parse a text-encoded Preserves `string` to a Preserves `Atom`.
  ##
  ## Raises `ValueError` when `text` fails to match the `Atom` rule or when
  ## a string literal decodes to invalid UTF-8. The helpers called from the
  ## rule bodies may raise as well.
  let pegParser = peg("Atom", a: Atom):
    # Override rules from pegs.nim
    #
    # Each override re-uses the grammar rule of the same name and stores
    # the atom it builds in `a`, which the `match` call below binds to
    # `result`.

    # An optional leading "#!" is accepted in front of the atom.
    Atom <- ?"#!" * Preserves.Atom

    Preserves.Boolean <- Preserves.Boolean:
      case $0
      # `bool` is left at its default, false.
      of "#f": a = Atom(kind: pkBoolean)
      of "#t": a = Atom(kind: pkBoolean, bool: true)
      else: discard

    Preserves.Float <- Preserves.Float:
      a = Atom(kind: pkFloat, float: parseFloat($1))

    Preserves.Double <- Preserves.Double:
      # Start from a zero double and let `parseBiggestFloat` write the
      # parsed number into it; its return value is ignored.
      a = Atom(kind: pkDouble)
      discard parseBiggestFloat($0, a.double)

    Preserves.FloatRaw <- Preserves.FloatRaw:
      # Assemble the hex digits of capture 1 into a 32-bit pattern and
      # reinterpret that pattern as a float32.
      var reg: uint32
      for c in $1: pushHexNibble(reg, c)
      a = Atom(kind: pkFloat, float: cast[float32](reg))

    Preserves.DoubleRaw <- Preserves.DoubleRaw:
      # Same as FloatRaw, with a 64-bit pattern reinterpreted as float64.
      var reg: uint64
      for c in $1: pushHexNibble(reg, c)
      a = Atom(kind: pkDouble, double: cast[float64](reg))

    Preserves.SignedInteger <- Preserves.SignedInteger:
      # Parse as a big integer, then store it as a plain `int` register
      # whenever the conversion succeeds.
      var
        big = initBigInt($0)
        small = toInt[int](big)
      if small.isSome:
        a = Atom(kind: pkRegister, register: small.get)
      else:
        a = Atom(kind: pkBigInt, bigint: big)

    Preserves.String <- Preserves.String:
      a = Atom(kind: pkString, string: newStringOfCap(len($1)))
      unescape(a.string, $1)
      # `validateUtf8` returns -1 when the whole string is valid.
      if validateUtf8(a.string) != -1:
        raise newException(ValueError, "Preserves text contains an invalid UTF-8 sequence")

    Preserves.charByteString <- Preserves.charByteString:
      a = Atom(kind: pkByteString, bytes: newSeqOfCap[byte](len($1)))
      unescape(a.bytes, $1)

    Preserves.hexByteString <- Preserves.hexByteString:
      # Strip whitespace and commas before decoding the hex digits.
      a = Atom(kind: pkByteString, bytes: cast[seq[byte]](parseHexStr(joinWhitespace($1))))

    Preserves.b64ByteString <- Preserves.b64ByteString:
      # Strip whitespace and commas before decoding the base64 text.
      a = Atom(kind: pkByteString, bytes: cast[seq[byte]](base64.decode(joinWhitespace($1))))

    Preserves.Symbol <- Preserves.Symbol:
      var buf = newStringOfCap(len($1))
      unescape(buf, $1)
      a = Atom(kind: pkSymbol, symbol: Symbol buf)

  # `result` is handed to the parser as its `a` argument, so a successful
  # match leaves the parsed atom in `result`.
  if not pegParser.match(text, result).ok:
    raise newException(ValueError, "failed to parse Preserves atom: " & text)