From 489d6b31d5b00745e9bf554daeea457c4045c9e7 Mon Sep 17 00:00:00 2001
From: Emery Hemingway <ehmry@posteo.net>
Date: Sat, 29 Oct 2022 18:34:01 -0500
Subject: [PATCH] Fix string escaping

---
 preserves.nimble                |  2 +-
 src/preserves/pegs.nim          |  9 ++--
 src/preserves/private/parse.nim | 88 ++++++++++++++++++++++-----------
 3 files changed, 66 insertions(+), 33 deletions(-)

diff --git a/preserves.nimble b/preserves.nimble
index 9e42c43..6382aff 100644
--- a/preserves.nimble
+++ b/preserves.nimble
@@ -1,6 +1,6 @@
 # Package
 
-version = "20221027"
+version = "20221030"
 author        = "Emery Hemingway"
 description   = "data model and serialization format"
 license       = "Unlicense"
diff --git a/src/preserves/pegs.nim b/src/preserves/pegs.nim
index 9c63fb7..b64b5d4 100644
--- a/src/preserves/pegs.nim
+++ b/src/preserves/pegs.nim
@@ -40,7 +40,8 @@ grammar "Preserves":
   exp <- 'e' * ?('-'|'+') * +Digit
   flt <- int * ((frac * exp) | frac | exp)
 
-  String <- '"' * *(escape * (escaped | unicodeEscaped) | (utf8.any - '"')) * '"'
+  char <- unescaped | '|' | (escape * (escaped | '"' | ('u' * Xdigit[4])))
+  String <- '"' * >(*char) * '"'
 
   ByteString <- charByteString | hexByteString | b64ByteString
   charByteString <- "#\"" * >(*binchar) * '"'
@@ -48,7 +49,7 @@ grammar "Preserves":
   b64ByteString <- "#[" * ws * >(*(base64char * ws)) * ']'
 
   binchar <- binunescaped | (escape * (escaped | '"' | ('x' * Xdigit[2])))
-  binunescaped <- {'\20'..'\21', '#'..'[', ']'..'~'}
+  binunescaped <- {' '..'!', '#'..'[', ']'..'~'}
   base64char <- {'A'..'Z', 'a'..'z', '0'..'9', '+', '/', '-', '_', '='}
 
   Symbol <- (symstart * *symcont) | ('|' * *symchar * '|')
@@ -65,9 +66,9 @@ grammar "Preserves":
 
   Compact <- "#=" * ws * ByteString
 
-  unescaped <- utf8.any - escaped
+  unescaped <- utf8.any - { '\x00'..'\x19', '"', '\\', '|' }
   unicodeEscaped <- 'u' * Xdigit[4]
-  escaped <- {'{', '"', '|', '\\', 'b', 'f', 'n', 'r', 't'}
+  escaped <- {'\\', '/', 'b', 'f', 'n', 'r', 't'}
   escape <- '\\'
 
   ws <- *(' ' | '\t' | '\r' | '\n' | ',')
diff --git a/src/preserves/private/parse.nim b/src/preserves/private/parse.nim
index e00e3cb..6c8f8fb 100644
--- a/src/preserves/private/parse.nim
+++ b/src/preserves/private/parse.nim
@@ -3,7 +3,9 @@
 
 # this module is included in ../../preserves.nim
 
-import std/[parseutils, strutils]
+import std/[parseutils, unicode]
+from std/sequtils import insert
+from std/strutils import Whitespace, parseFloat, parseHexStr, parseInt, tokenize
 import npeg
 import ../pegs
 
@@ -21,6 +23,58 @@ proc joinWhitespace(s: string): string =
   for token, isSep in tokenize(s, Whitespace + {','}):
     if not isSep: add(result, token)
 
+template unescape(buf: var string; capture: string) = # append capture to buf, decoding backslash escapes
+  var i: int # read position in capture, zero-initialised
+  while i < len(capture):
+    if capture[i] == '\\':
+      inc(i) # step onto the character that selects the escape
+      case capture[i]
+      of '\\': add(buf, char 0x5c)
+      of '/': add(buf, char 0x2f)
+      of 'b': add(buf, char 0x08) # backspace
+      of 'f': add(buf, char 0x0c) # form feed
+      of 'n': add(buf, char 0x0a) # line feed
+      of 'r': add(buf, char 0x0d) # carriage return
+      of 't': add(buf, char 0x09) # horizontal tab
+      of '"': add(buf, char 0x22)
+      of 'u':
+        var r: int32 # receives the parsed hex value
+        inc(i) # move onto the first hex digit
+        discard parseHex(capture, r, i, 4) # read up to 4 hex digits from i; digit count ignored
+        inc(i, 3) # together with the final inc below this skips 4 digits
+        add(buf, Rune r) # NOTE(review): each escape is added as its own Rune, surrogate pairs are not combined - confirm intended
+      else:
+        validate(false) # unrecognised escape; validate is not defined in this block - presumably the npeg match check, TODO confirm
+    else:
+      add(buf, capture[i]) # ordinary char, copied unchanged
+    inc(i) # advance past the plain char or the last char of the escape
+
+template unescape(buf: var seq[byte]; capture: string) = # append capture to buf as bytes, decoding backslash escapes
+  var i: int # read position in capture, zero-initialised
+  while i < len(capture):
+    if capture[i] == '\\':
+      inc(i) # step onto the character that selects the escape
+      case capture[i]
+      of '\\': add(buf, 0x5c'u8)
+      of '/': add(buf, 0x2f'u8)
+      of 'b': add(buf, 0x08'u8)
+      of 'f': add(buf, 0x0c'u8)
+      of 'n': add(buf, 0x0a'u8)
+      of 'r': add(buf, 0x0d'u8)
+      of 't': add(buf, 0x09'u8)
+      of '"': add(buf, 0x22'u8)
+      of 'x':
+        var b: byte # receives the parsed hex value
+        inc(i) # move onto the first hex digit
+        discard parseHex(capture, b, i, 2) # read up to 2 hex digits from i; digit count ignored
+        inc(i) # together with the final inc below this skips 2 digits
+        add(buf, b)
+      else:
+        validate(false) # unrecognised escape; validate is not defined in this block - presumably the npeg match check, TODO confirm
+    else:
+      add(buf, byte capture[i]) # ordinary char, appended as its byte value
+    inc(i) # advance past the plain char or the last char of the escape
+
 proc parsePreserves*(text: string): Preserve[void] {.gcsafe.} =
   ## Parse a text-encoded Preserves `string` to a `Preserve` value.
   runnableExamples:
@@ -89,35 +143,13 @@ proc parsePreserves*(text: string): Preserve[void] {.gcsafe.} =
       pushStack Value(kind: pkSignedInteger, int: parseInt($0))
 
     Preserves.String <- Preserves.String:
-      pushStack Value(kind: pkString, string: unescape($0).replace("\\n", "\n"))
+      var v = Value(kind: pkString, string: newStringOfCap(len($1)))
+      unescape(v.string, $1)
+      pushStack v
 
     Preserves.charByteString <- Preserves.charByteString:
-      let chars = $1
-      var
-        v = Value(kind: pkByteString, bytes: newSeqOfCap[byte](chars.len))
-        i: int
-      while i < len(chars):
-        if chars[i] == '\\':
-          inc(i)
-          case chars[i]
-          of '\\': add(v.bytes, 0x5c'u8)
-          of '/': add(v.bytes, 0x2f'u8)
-          of 'b': add(v.bytes, 0x08'u8)
-          of 'f': add(v.bytes, 0x0c'u8)
-          of 'n': add(v.bytes, 0x0a'u8)
-          of 'r': add(v.bytes, 0x0d'u8)
-          of 't': add(v.bytes, 0x09'u8)
-          of '"': add(v.bytes, 0x22'u8)
-          of 'x':
-            var b: byte
-            inc(i)
-            discard parseHex(chars, b, i, 2)
-            inc(i)
-            add(v.bytes, b)
-          else: discard
-        else:
-          add(v.bytes, byte chars[i])
-        inc(i)
+      var v = Value(kind: pkByteString, bytes: newSeqOfCap[byte](len($1)))
+      unescape(v.bytes, $1)
       pushStack v
 
     Preserves.hexByteString <- Preserves.hexByteString: