UTF-16 surrogate pair parsing

2023-12-24 01:10:21 +02:00 · 2023-12-24 01:10:21 +02:00 · d3a236bb92
parent 8a70cd0987
commit d3a236bb92
4 changed files with 41 additions and 38 deletions
--- a/preserves.nimble
+++ b/preserves.nimble
@@ -1,6 +1,6 @@
 # Package
-version = "20231222"
+version = "20231224"
 author        = "Emery Hemingway"
 description   = "data model and serialization format"
 license       = "Unlicense"
--- a/src/preserves/private/decoding.nim
+++ b/src/preserves/private/decoding.nim
@@ -76,12 +76,10 @@ proc decodePreserves*(s: Stream; E = void): Preserve[E] =
      else:
        result.bigint.fromBytes(buf, bigEndian)
  of 0xb1:
-    var data = newString(s.readVarint())
+    result = Preserve[E](kind: pkString, string: newString(s.readVarint()))
-    if data.len > 0:
+    if result.string.len > 0:
-      let n = s.readData(unsafeAddr data[0], data.len)
+      if s.readData(addr result.string[0], result.string.len) != result.string.len:
      if n != data.len:
        raise newException(IOError, "short read")
    result = Preserve[E](kind: pkString, string: data)
  of 0xb2:
    var data = newSeq[byte](s.readVarint())
    if data.len > 0:
--- a/src/preserves/private/parsing.nim
+++ b/src/preserves/private/parsing.nim
@@ -38,19 +38,30 @@ template unescape*(buf: var string; capture: string) =
      of 't': add(buf, char 0x09)
      of '"': add(buf, char 0x22)
      of 'u':
-        var
+        var short: uint16
          long: uint32
          short: uint16
        inc(i)
        discard parseHex(capture, short, i, 4)
        inc(i, 3)
-        long = uint32(short)
+        if (short shr 15) == 0:
-        if capture[i] == 'u':
+          add(buf, Rune(short).toUtf8)
-          inc(i)
+        elif (short shr 10) == 0b110110:
-          discard parseHex(capture, short, i, 4)
+          if i+6 >= capture.len:
            raise newException(ValueError, "Invalid UTF-16 surrogate pair")
          var rune = uint32(short shl 10) + 0x10000
          validate(capture[i+1] == '\\')
          validate(capture[i+2] == 'u')
          inc(i, 3)
-          long = (long shl 16) or uint32(short)
+          discard parseHex(capture, short, i, 4)
-        add(buf, long.Rune.toUtf8)
+          if (short shr 10) != 0b110111:
            raise newException(ValueError, "Invalid UTF-16 surrogate pair")
          inc(i, 3)
          rune = rune or (short and 0b1111111111)
          #add(buf, Rune(rune).toUTF8)
          let j = buf.len
          buf.setLen(buf.len+4)
          rune.Rune.fastToUTF8Copy(buf, j, false)
        else:
          raise newException(ValueError, "Invalid UTF-16 escape sequence " & capture)
      else:
        validate(false)
    else:
--- a/src/preserves/private/texts.nim
+++ b/src/preserves/private/texts.nim
@@ -13,32 +13,26 @@ template writeEscaped(stream: Stream; text: string; delim: char) =
  const escaped = { '"', '\\', '\b', '\f', '\n', '\r', '\t' }
  var
    i: int
    r: Rune
    c: char
  while i < text.len:
    c = text[i]
-    if (c.ord and 0x80) == 0x00:
+    case c
-      case c
+    of delim:
-      of delim:
+      write(stream, '\\')
-        write(stream, '\\')
+      write(stream, delim)
-        write(stream, delim)
+    of '\\': write(stream, "\\\\")
-      of '\\': write(stream, "\\\\")
+    of '\b': write(stream, "\\b")
-      of '\b': write(stream, "\\b")
+    of '\f': write(stream, "\\f")
-      of '\f': write(stream, "\\f")
+    of '\n': write(stream, "\\n")
-      of '\n': write(stream, "\\n")
+    of '\r': write(stream, "\\r")
-      of '\r': write(stream, "\\r")
+    of '\t': write(stream, "\\t")
-      of '\t': write(stream, "\\t")
+    of { '\x00'..'\x1f', '\x7f' } - escaped:
-      of { '\x00'..'\x1f', '\x7f' } - escaped:
+      # do not use \x__ notation because
-        # do not use \x__ notation because
+      # it is a subset of \u____.
-        # it is a subset of \u____.
+      write(stream, "\\u00")
-        write(stream, "\\u00")
+      write(stream, c.uint8.toHex(2))
-        write(stream, c.uint8.toHex(2))
+    else: write(stream, c)
-      else: write(stream, c)
+    inc i
      inc i
    else:
      fastRuneAt(text, i, r)
      write(stream, "\\u")
      write(stream, r.uint16.toHex(4))
 proc writeSymbol(stream: Stream; sym: string) =
  if sym.len > 0 and sym[0] in {'A'..'z'} and not sym.anyIt(char(it) in { '\x00'..'\x19', '"', '\\', '|' }):