UTF-16 surrogate pair parsing
This commit is contained in:
parent
8a70cd0987
commit
d3a236bb92
|
@ -1,6 +1,6 @@
|
|||
# Package
|
||||
|
||||
version = "20231222"
|
||||
version = "20231224"
|
||||
author = "Emery Hemingway"
|
||||
description = "data model and serialization format"
|
||||
license = "Unlicense"
|
||||
|
|
|
@ -76,12 +76,10 @@ proc decodePreserves*(s: Stream; E = void): Preserve[E] =
|
|||
else:
|
||||
result.bigint.fromBytes(buf, bigEndian)
|
||||
of 0xb1:
|
||||
var data = newString(s.readVarint())
|
||||
if data.len > 0:
|
||||
let n = s.readData(unsafeAddr data[0], data.len)
|
||||
if n != data.len:
|
||||
result = Preserve[E](kind: pkString, string: newString(s.readVarint()))
|
||||
if result.string.len > 0:
|
||||
if s.readData(addr result.string[0], result.string.len) != result.string.len:
|
||||
raise newException(IOError, "short read")
|
||||
result = Preserve[E](kind: pkString, string: data)
|
||||
of 0xb2:
|
||||
var data = newSeq[byte](s.readVarint())
|
||||
if data.len > 0:
|
||||
|
|
|
@ -38,19 +38,30 @@ template unescape*(buf: var string; capture: string) =
|
|||
of 't': add(buf, char 0x09)
|
||||
of '"': add(buf, char 0x22)
|
||||
of 'u':
|
||||
var
|
||||
long: uint32
|
||||
short: uint16
|
||||
var short: uint16
|
||||
inc(i)
|
||||
discard parseHex(capture, short, i, 4)
|
||||
inc(i, 3)
|
||||
long = uint32(short)
|
||||
if capture[i] == 'u':
|
||||
inc(i)
|
||||
discard parseHex(capture, short, i, 4)
|
||||
if (short shr 15) == 0:
|
||||
add(buf, Rune(short).toUtf8)
|
||||
elif (short shr 10) == 0b110110:
|
||||
if i+6 >= capture.len:
|
||||
raise newException(ValueError, "Invalid UTF-16 surrogate pair")
|
||||
var rune = uint32(short shl 10) + 0x10000
|
||||
validate(capture[i+1] == '\\')
|
||||
validate(capture[i+2] == 'u')
|
||||
inc(i, 3)
|
||||
long = (long shl 16) or uint32(short)
|
||||
add(buf, long.Rune.toUtf8)
|
||||
discard parseHex(capture, short, i, 4)
|
||||
if (short shr 10) != 0b110111:
|
||||
raise newException(ValueError, "Invalid UTF-16 surrogate pair")
|
||||
inc(i, 3)
|
||||
rune = rune or (short and 0b1111111111)
|
||||
#add(buf, Rune(rune).toUTF8)
|
||||
let j = buf.len
|
||||
buf.setLen(buf.len+4)
|
||||
rune.Rune.fastToUTF8Copy(buf, j, false)
|
||||
else:
|
||||
raise newException(ValueError, "Invalid UTF-16 escape sequence " & capture)
|
||||
else:
|
||||
validate(false)
|
||||
else:
|
||||
|
|
|
@ -13,32 +13,26 @@ template writeEscaped(stream: Stream; text: string; delim: char) =
|
|||
const escaped = { '"', '\\', '\b', '\f', '\n', '\r', '\t' }
|
||||
var
|
||||
i: int
|
||||
r: Rune
|
||||
c: char
|
||||
while i < text.len:
|
||||
c = text[i]
|
||||
if (c.ord and 0x80) == 0x00:
|
||||
case c
|
||||
of delim:
|
||||
write(stream, '\\')
|
||||
write(stream, delim)
|
||||
of '\\': write(stream, "\\\\")
|
||||
of '\b': write(stream, "\\b")
|
||||
of '\f': write(stream, "\\f")
|
||||
of '\n': write(stream, "\\n")
|
||||
of '\r': write(stream, "\\r")
|
||||
of '\t': write(stream, "\\t")
|
||||
of { '\x00'..'\x1f', '\x7f' } - escaped:
|
||||
# do not use \x__ notation because
|
||||
# it is a subset of \u____.
|
||||
write(stream, "\\u00")
|
||||
write(stream, c.uint8.toHex(2))
|
||||
else: write(stream, c)
|
||||
inc i
|
||||
else:
|
||||
fastRuneAt(text, i, r)
|
||||
write(stream, "\\u")
|
||||
write(stream, r.uint16.toHex(4))
|
||||
case c
|
||||
of delim:
|
||||
write(stream, '\\')
|
||||
write(stream, delim)
|
||||
of '\\': write(stream, "\\\\")
|
||||
of '\b': write(stream, "\\b")
|
||||
of '\f': write(stream, "\\f")
|
||||
of '\n': write(stream, "\\n")
|
||||
of '\r': write(stream, "\\r")
|
||||
of '\t': write(stream, "\\t")
|
||||
of { '\x00'..'\x1f', '\x7f' } - escaped:
|
||||
# do not use \x__ notation because
|
||||
# it is a subset of \u____.
|
||||
write(stream, "\\u00")
|
||||
write(stream, c.uint8.toHex(2))
|
||||
else: write(stream, c)
|
||||
inc i
|
||||
|
||||
proc writeSymbol(stream: Stream; sym: string) =
|
||||
if sym.len > 0 and sym[0] in {'A'..'z'} and not sym.anyIt(char(it) in { '\x00'..'\x19', '"', '\\', '|' }):
|
||||
|
|
Loading…
Reference in New Issue