UTF-16 surrogate pair parsing

This commit is contained in:
Emery Hemingway 2023-12-24 01:10:21 +02:00
parent 8a70cd0987
commit d3a236bb92
4 changed files with 41 additions and 38 deletions

View File

@ -1,6 +1,6 @@
# Package # Package
version = "20231222" version = "20231224"
author = "Emery Hemingway" author = "Emery Hemingway"
description = "data model and serialization format" description = "data model and serialization format"
license = "Unlicense" license = "Unlicense"

View File

@ -76,12 +76,10 @@ proc decodePreserves*(s: Stream; E = void): Preserve[E] =
else: else:
result.bigint.fromBytes(buf, bigEndian) result.bigint.fromBytes(buf, bigEndian)
of 0xb1: of 0xb1:
var data = newString(s.readVarint()) result = Preserve[E](kind: pkString, string: newString(s.readVarint()))
if data.len > 0: if result.string.len > 0:
let n = s.readData(unsafeAddr data[0], data.len) if s.readData(addr result.string[0], result.string.len) != result.string.len:
if n != data.len:
raise newException(IOError, "short read") raise newException(IOError, "short read")
result = Preserve[E](kind: pkString, string: data)
of 0xb2: of 0xb2:
var data = newSeq[byte](s.readVarint()) var data = newSeq[byte](s.readVarint())
if data.len > 0: if data.len > 0:

View File

@ -38,19 +38,30 @@ template unescape*(buf: var string; capture: string) =
of 't': add(buf, char 0x09) of 't': add(buf, char 0x09)
of '"': add(buf, char 0x22) of '"': add(buf, char 0x22)
of 'u': of 'u':
var var short: uint16
long: uint32
short: uint16
inc(i) inc(i)
discard parseHex(capture, short, i, 4) discard parseHex(capture, short, i, 4)
inc(i, 3) inc(i, 3)
long = uint32(short) if (short shr 15) == 0:
if capture[i] == 'u': add(buf, Rune(short).toUtf8)
inc(i) elif (short shr 10) == 0b110110:
discard parseHex(capture, short, i, 4) if i+6 >= capture.len:
raise newException(ValueError, "Invalid UTF-16 surrogate pair")
var rune = uint32(short shl 10) + 0x10000
validate(capture[i+1] == '\\')
validate(capture[i+2] == 'u')
inc(i, 3) inc(i, 3)
long = (long shl 16) or uint32(short) discard parseHex(capture, short, i, 4)
add(buf, long.Rune.toUtf8) if (short shr 10) != 0b110111:
raise newException(ValueError, "Invalid UTF-16 surrogate pair")
inc(i, 3)
rune = rune or (short and 0b1111111111)
#add(buf, Rune(rune).toUTF8)
let j = buf.len
buf.setLen(buf.len+4)
rune.Rune.fastToUTF8Copy(buf, j, false)
else:
raise newException(ValueError, "Invalid UTF-16 escape sequence " & capture)
else: else:
validate(false) validate(false)
else: else:

View File

@ -13,32 +13,26 @@ template writeEscaped(stream: Stream; text: string; delim: char) =
const escaped = { '"', '\\', '\b', '\f', '\n', '\r', '\t' } const escaped = { '"', '\\', '\b', '\f', '\n', '\r', '\t' }
var var
i: int i: int
r: Rune
c: char c: char
while i < text.len: while i < text.len:
c = text[i] c = text[i]
if (c.ord and 0x80) == 0x00: case c
case c of delim:
of delim: write(stream, '\\')
write(stream, '\\') write(stream, delim)
write(stream, delim) of '\\': write(stream, "\\\\")
of '\\': write(stream, "\\\\") of '\b': write(stream, "\\b")
of '\b': write(stream, "\\b") of '\f': write(stream, "\\f")
of '\f': write(stream, "\\f") of '\n': write(stream, "\\n")
of '\n': write(stream, "\\n") of '\r': write(stream, "\\r")
of '\r': write(stream, "\\r") of '\t': write(stream, "\\t")
of '\t': write(stream, "\\t") of { '\x00'..'\x1f', '\x7f' } - escaped:
of { '\x00'..'\x1f', '\x7f' } - escaped: # do not use \x__ notation because
# do not use \x__ notation because # it is a subset of \u____.
# it is a subset of \u____. write(stream, "\\u00")
write(stream, "\\u00") write(stream, c.uint8.toHex(2))
write(stream, c.uint8.toHex(2)) else: write(stream, c)
else: write(stream, c) inc i
inc i
else:
fastRuneAt(text, i, r)
write(stream, "\\u")
write(stream, r.uint16.toHex(4))
proc writeSymbol(stream: Stream; sym: string) = proc writeSymbol(stream: Stream; sym: string) =
if sym.len > 0 and sym[0] in {'A'..'z'} and not sym.anyIt(char(it) in { '\x00'..'\x19', '"', '\\', '|' }): if sym.len > 0 and sym[0] in {'A'..'z'} and not sym.anyIt(char(it) in { '\x00'..'\x19', '"', '\\', '|' }):