UTF-16 surrogate pair parsing

This commit is contained in:
Emery Hemingway 2023-12-24 01:10:21 +02:00
parent 8a70cd0987
commit d3a236bb92
4 changed files with 41 additions and 38 deletions

View File

@ -1,6 +1,6 @@
# Package
version = "20231222"
version = "20231224"
author = "Emery Hemingway"
description = "data model and serialization format"
license = "Unlicense"

View File

@ -76,12 +76,10 @@ proc decodePreserves*(s: Stream; E = void): Preserve[E] =
else:
result.bigint.fromBytes(buf, bigEndian)
of 0xb1:
var data = newString(s.readVarint())
if data.len > 0:
let n = s.readData(unsafeAddr data[0], data.len)
if n != data.len:
result = Preserve[E](kind: pkString, string: newString(s.readVarint()))
if result.string.len > 0:
if s.readData(addr result.string[0], result.string.len) != result.string.len:
raise newException(IOError, "short read")
result = Preserve[E](kind: pkString, string: data)
of 0xb2:
var data = newSeq[byte](s.readVarint())
if data.len > 0:

View File

@ -38,19 +38,30 @@ template unescape*(buf: var string; capture: string) =
of 't': add(buf, char 0x09)
of '"': add(buf, char 0x22)
of 'u':
var
long: uint32
short: uint16
var short: uint16
inc(i)
discard parseHex(capture, short, i, 4)
inc(i, 3)
long = uint32(short)
if capture[i] == 'u':
inc(i)
discard parseHex(capture, short, i, 4)
if (short shr 15) == 0:
add(buf, Rune(short).toUtf8)
elif (short shr 10) == 0b110110:
if i+6 >= capture.len:
raise newException(ValueError, "Invalid UTF-16 surrogate pair")
var rune = uint32(short shl 10) + 0x10000
validate(capture[i+1] == '\\')
validate(capture[i+2] == 'u')
inc(i, 3)
long = (long shl 16) or uint32(short)
add(buf, long.Rune.toUtf8)
discard parseHex(capture, short, i, 4)
if (short shr 10) != 0b110111:
raise newException(ValueError, "Invalid UTF-16 surrogate pair")
inc(i, 3)
rune = rune or (short and 0b1111111111)
#add(buf, Rune(rune).toUTF8)
let j = buf.len
buf.setLen(buf.len+4)
rune.Rune.fastToUTF8Copy(buf, j, false)
else:
raise newException(ValueError, "Invalid UTF-16 escape sequence " & capture)
else:
validate(false)
else:

View File

@ -13,32 +13,26 @@ template writeEscaped(stream: Stream; text: string; delim: char) =
const escaped = { '"', '\\', '\b', '\f', '\n', '\r', '\t' }
var
i: int
r: Rune
c: char
while i < text.len:
c = text[i]
if (c.ord and 0x80) == 0x00:
case c
of delim:
write(stream, '\\')
write(stream, delim)
of '\\': write(stream, "\\\\")
of '\b': write(stream, "\\b")
of '\f': write(stream, "\\f")
of '\n': write(stream, "\\n")
of '\r': write(stream, "\\r")
of '\t': write(stream, "\\t")
of { '\x00'..'\x1f', '\x7f' } - escaped:
# do not use \x__ notation because
# it is a subset of \u____.
write(stream, "\\u00")
write(stream, c.uint8.toHex(2))
else: write(stream, c)
inc i
else:
fastRuneAt(text, i, r)
write(stream, "\\u")
write(stream, r.uint16.toHex(4))
case c
of delim:
write(stream, '\\')
write(stream, delim)
of '\\': write(stream, "\\\\")
of '\b': write(stream, "\\b")
of '\f': write(stream, "\\f")
of '\n': write(stream, "\\n")
of '\r': write(stream, "\\r")
of '\t': write(stream, "\\t")
of { '\x00'..'\x1f', '\x7f' } - escaped:
# do not use \x__ notation because
# it is a subset of \u____.
write(stream, "\\u00")
write(stream, c.uint8.toHex(2))
else: write(stream, c)
inc i
proc writeSymbol(stream: Stream; sym: string) =
if sym.len > 0 and sym[0] in {'A'..'z'} and not sym.anyIt(char(it) in { '\x00'..'\x19', '"', '\\', '|' }):