Fix string escaping

2022-10-29 18:34:01 -05:00 · 2022-10-29 18:34:01 -05:00 · 489d6b31d5
parent 201cb7c68e
commit 489d6b31d5
3 changed files with 66 additions and 33 deletions
--- a/preserves.nimble
+++ b/preserves.nimble
@ -1,6 +1,6 @@
 # Package
-version = "20221027"
+version = "20221030"
 author        = "Emery Hemingway"
 description   = "data model and serialization format"
 license       = "Unlicense"
--- a/src/preserves/pegs.nim
+++ b/src/preserves/pegs.nim
@ -40,7 +40,8 @@ grammar "Preserves":
  exp <- 'e' * ?('-'|'+') * +Digit
  flt <- int * ((frac * exp) | frac | exp)
-  String <- '"' * *(escape * (escaped | unicodeEscaped) | (utf8.any - '"')) * '"'
+  char <- unescaped | '|' | (escape * (escaped | '"' | ('u' * Xdigit[4])))
  String <- '"' * >(*char) * '"'
  ByteString <- charByteString | hexByteString | b64ByteString
  charByteString <- "#\"" * >(*binchar) * '"'
@ -48,7 +49,7 @@ grammar "Preserves":
  b64ByteString <- "#[" * ws * >(*(base64char * ws)) * ']'
  binchar <- binunescaped | (escape * (escaped | '"' | ('x' * Xdigit[2])))
-  binunescaped <- {'\20'..'\21', '#'..'[', ']'..'~'}
+  binunescaped <- {' '..'!', '#'..'[', ']'..'~'}
  base64char <- {'A'..'Z', 'a'..'z', '0'..'9', '+', '/', '-', '_', '='}
  Symbol <- (symstart * *symcont) | ('|' * *symchar * '|')
@ -65,9 +66,9 @@ grammar "Preserves":
  Compact <- "#=" * ws * ByteString
-  unescaped <- utf8.any - escaped
+  unescaped <- utf8.any - { '\x00'..'\x19', '"', '\\', '|' }
  unicodeEscaped <- 'u' * Xdigit[4]
-  escaped <- {'{', '"', '|', '\\', 'b', 'f', 'n', 'r', 't'}
+  escaped <- {'\\', '/', 'b', 'f', 'n', 'r', 't'}
  escape <- '\\'
  ws <- *(' ' | '\t' | '\r' | '\n' | ',')
--- a/src/preserves/private/parse.nim
+++ b/src/preserves/private/parse.nim
@ -3,7 +3,9 @@
 # this module is included in ../../preserves.nim
-import std/[parseutils, strutils]
+import std/[parseutils, unicode]
 from std/sequtils import insert
 from std/strutils import Whitespace, parseFloat, parseHexStr, parseInt, tokenize
 import npeg
 import ../pegs
@ -21,6 +23,58 @@ proc joinWhitespace(s: string): string =
  for token, isSep in tokenize(s, Whitespace + {','}):
    if not isSep: add(result, token)
 template unescape(buf: var string; capture: string) =
  ## Decode the backslash escapes in `capture` and append the text to `buf`.
  ##
  ## Handled escapes: `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t`, `\"` and
  ## `\uXXXX`. A `\uXXXX` high surrogate that is immediately followed by a
  ## `\uXXXX` low surrogate is merged into one code point (JSON-style
  ## surrogate pair) so that the appended bytes are well-formed UTF-8.
  ## Any other character after a backslash triggers `validate(false)`.
  ##
  ## `capture` is expected to be text already matched by the `String`
  ## grammar rule: a backslash is never the final character and `\u` is
  ## always followed by four hex digits.
  var i: int
  while i < len(capture):
    if capture[i] == '\\':
      inc(i)
      case capture[i]
      of '\\': add(buf, char 0x5c)
      of '/': add(buf, char 0x2f)
      of 'b': add(buf, char 0x08)
      of 'f': add(buf, char 0x0c)
      of 'n': add(buf, char 0x0a)
      of 'r': add(buf, char 0x0d)
      of 't': add(buf, char 0x09)
      of '"': add(buf, char 0x22)
      of 'u':
        var r: int32
        inc(i)
        discard parseHex(capture, r, i, 4)
        # Leave `i` on the last hex digit; the shared `inc(i)` at the end of
        # the loop body steps past it.
        inc(i, 3)
        # A high surrogate only denotes a character together with the low
        # surrogate escape that follows it at i+1 .. i+6 (`\`, `u`, 4 digits).
        if r >= 0xd800 and r <= 0xdbff and i + 6 < len(capture) and
            capture[i + 1] == '\\' and capture[i + 2] == 'u':
          var lo: int32
          discard parseHex(capture, lo, i + 3, 4)
          if lo >= 0xdc00 and lo <= 0xdfff:
            r = 0x10000 + ((r - 0xd800) shl 10) + (lo - 0xdc00)
            # Consume the second escape as well.
            inc(i, 6)
        # Unpaired surrogates are still appended as-is, as before.
        add(buf, Rune r)
      else:
        validate(false)
    else:
      add(buf, capture[i])
    inc(i)
 template unescape(buf: var seq[byte]; capture: string) =
  ## Decode the backslash escapes in `capture` and append the resulting
  ## bytes to `buf`.
  ##
  ## Handled escapes: `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t`, `\"` and
  ## `\xHH` (one byte from two hex digits). Any other character after a
  ## backslash triggers `validate(false)`. Characters outside an escape are
  ## appended unchanged as single bytes.
  var pos: int
  while pos < len(capture):
    if capture[pos] != '\\':
      add(buf, byte capture[pos])
    else:
      # Step onto the character that selects the escape.
      inc(pos)
      case capture[pos]
      of '\\', '/', '"':
        # These escapes stand for the character itself.
        add(buf, byte capture[pos])
      of 'b': add(buf, 0x08'u8)
      of 't': add(buf, 0x09'u8)
      of 'n': add(buf, 0x0a'u8)
      of 'f': add(buf, 0x0c'u8)
      of 'r': add(buf, 0x0d'u8)
      of 'x':
        var octet: byte
        discard parseHex(capture, octet, pos + 1, 2)
        # Land on the second hex digit; the trailing `inc` moves past it.
        inc(pos, 2)
        add(buf, octet)
      else:
        validate(false)
    inc(pos)
 proc parsePreserves*(text: string): Preserve[void] {.gcsafe.} =
  ## Parse a text-encoded Preserves `string` to a `Preserve` value.
  runnableExamples:
@ -89,35 +143,13 @@ proc parsePreserves*(text: string): Preserve[void] {.gcsafe.} =
      pushStack Value(kind: pkSignedInteger, int: parseInt($0))
    Preserves.String <- Preserves.String:
-      pushStack Value(kind: pkString, string: unescape($0).replace("\\n", "\n"))
+      var v = Value(kind: pkString, string: newStringOfCap(len($1)))
      unescape(v.string, $1)
      pushStack v
    Preserves.charByteString <- Preserves.charByteString:
-      let chars = $1
+      var v = Value(kind: pkByteString, bytes: newSeqOfCap[byte](len($1)))
-      var
+      unescape(v.bytes, $1)
        v = Value(kind: pkByteString, bytes: newSeqOfCap[byte](chars.len))
        i: int
      while i < len(chars):
        if chars[i] == '\\':
          inc(i)
          case chars[i]
          of '\\': add(v.bytes, 0x5c'u8)
          of '/': add(v.bytes, 0x2f'u8)
          of 'b': add(v.bytes, 0x08'u8)
          of 'f': add(v.bytes, 0x0c'u8)
          of 'n': add(v.bytes, 0x0a'u8)
          of 'r': add(v.bytes, 0x0d'u8)
          of 't': add(v.bytes, 0x09'u8)
          of '"': add(v.bytes, 0x22'u8)
          of 'x':
            var b: byte
            inc(i)
            discard parseHex(chars, b, i, 2)
            inc(i)
            add(v.bytes, b)
          else: discard
        else:
          add(v.bytes, byte chars[i])
        inc(i)
      pushStack v
    Preserves.hexByteString <- Preserves.hexByteString: