UTF-16 surrogate pair parsing
This commit is contained in:
parent
8a70cd0987
commit
d3a236bb92
|
@ -1,6 +1,6 @@
|
||||||
# Package
|
# Package
|
||||||
|
|
||||||
version = "20231222"
|
version = "20231224"
|
||||||
author = "Emery Hemingway"
|
author = "Emery Hemingway"
|
||||||
description = "data model and serialization format"
|
description = "data model and serialization format"
|
||||||
license = "Unlicense"
|
license = "Unlicense"
|
||||||
|
|
|
@ -76,12 +76,10 @@ proc decodePreserves*(s: Stream; E = void): Preserve[E] =
|
||||||
else:
|
else:
|
||||||
result.bigint.fromBytes(buf, bigEndian)
|
result.bigint.fromBytes(buf, bigEndian)
|
||||||
of 0xb1:
|
of 0xb1:
|
||||||
var data = newString(s.readVarint())
|
result = Preserve[E](kind: pkString, string: newString(s.readVarint()))
|
||||||
if data.len > 0:
|
if result.string.len > 0:
|
||||||
let n = s.readData(unsafeAddr data[0], data.len)
|
if s.readData(addr result.string[0], result.string.len) != result.string.len:
|
||||||
if n != data.len:
|
|
||||||
raise newException(IOError, "short read")
|
raise newException(IOError, "short read")
|
||||||
result = Preserve[E](kind: pkString, string: data)
|
|
||||||
of 0xb2:
|
of 0xb2:
|
||||||
var data = newSeq[byte](s.readVarint())
|
var data = newSeq[byte](s.readVarint())
|
||||||
if data.len > 0:
|
if data.len > 0:
|
||||||
|
|
|
@ -38,19 +38,30 @@ template unescape*(buf: var string; capture: string) =
|
||||||
of 't': add(buf, char 0x09)
|
of 't': add(buf, char 0x09)
|
||||||
of '"': add(buf, char 0x22)
|
of '"': add(buf, char 0x22)
|
||||||
of 'u':
|
of 'u':
|
||||||
var
|
var short: uint16
|
||||||
long: uint32
|
|
||||||
short: uint16
|
|
||||||
inc(i)
|
inc(i)
|
||||||
discard parseHex(capture, short, i, 4)
|
discard parseHex(capture, short, i, 4)
|
||||||
inc(i, 3)
|
inc(i, 3)
|
||||||
long = uint32(short)
|
if (short shr 15) == 0:
|
||||||
if capture[i] == 'u':
|
add(buf, Rune(short).toUtf8)
|
||||||
inc(i)
|
elif (short shr 10) == 0b110110:
|
||||||
discard parseHex(capture, short, i, 4)
|
if i+6 >= capture.len:
|
||||||
|
raise newException(ValueError, "Invalid UTF-16 surrogate pair")
|
||||||
|
var rune = uint32(short shl 10) + 0x10000
|
||||||
|
validate(capture[i+1] == '\\')
|
||||||
|
validate(capture[i+2] == 'u')
|
||||||
inc(i, 3)
|
inc(i, 3)
|
||||||
long = (long shl 16) or uint32(short)
|
discard parseHex(capture, short, i, 4)
|
||||||
add(buf, long.Rune.toUtf8)
|
if (short shr 10) != 0b110111:
|
||||||
|
raise newException(ValueError, "Invalid UTF-16 surrogate pair")
|
||||||
|
inc(i, 3)
|
||||||
|
rune = rune or (short and 0b1111111111)
|
||||||
|
#add(buf, Rune(rune).toUTF8)
|
||||||
|
let j = buf.len
|
||||||
|
buf.setLen(buf.len+4)
|
||||||
|
rune.Rune.fastToUTF8Copy(buf, j, false)
|
||||||
|
else:
|
||||||
|
raise newException(ValueError, "Invalid UTF-16 escape sequence " & capture)
|
||||||
else:
|
else:
|
||||||
validate(false)
|
validate(false)
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -13,32 +13,26 @@ template writeEscaped(stream: Stream; text: string; delim: char) =
|
||||||
const escaped = { '"', '\\', '\b', '\f', '\n', '\r', '\t' }
|
const escaped = { '"', '\\', '\b', '\f', '\n', '\r', '\t' }
|
||||||
var
|
var
|
||||||
i: int
|
i: int
|
||||||
r: Rune
|
|
||||||
c: char
|
c: char
|
||||||
while i < text.len:
|
while i < text.len:
|
||||||
c = text[i]
|
c = text[i]
|
||||||
if (c.ord and 0x80) == 0x00:
|
case c
|
||||||
case c
|
of delim:
|
||||||
of delim:
|
write(stream, '\\')
|
||||||
write(stream, '\\')
|
write(stream, delim)
|
||||||
write(stream, delim)
|
of '\\': write(stream, "\\\\")
|
||||||
of '\\': write(stream, "\\\\")
|
of '\b': write(stream, "\\b")
|
||||||
of '\b': write(stream, "\\b")
|
of '\f': write(stream, "\\f")
|
||||||
of '\f': write(stream, "\\f")
|
of '\n': write(stream, "\\n")
|
||||||
of '\n': write(stream, "\\n")
|
of '\r': write(stream, "\\r")
|
||||||
of '\r': write(stream, "\\r")
|
of '\t': write(stream, "\\t")
|
||||||
of '\t': write(stream, "\\t")
|
of { '\x00'..'\x1f', '\x7f' } - escaped:
|
||||||
of { '\x00'..'\x1f', '\x7f' } - escaped:
|
# do not use \x__ notation because
|
||||||
# do not use \x__ notation because
|
# it is a subset of \u____.
|
||||||
# it is a subset of \u____.
|
write(stream, "\\u00")
|
||||||
write(stream, "\\u00")
|
write(stream, c.uint8.toHex(2))
|
||||||
write(stream, c.uint8.toHex(2))
|
else: write(stream, c)
|
||||||
else: write(stream, c)
|
inc i
|
||||||
inc i
|
|
||||||
else:
|
|
||||||
fastRuneAt(text, i, r)
|
|
||||||
write(stream, "\\u")
|
|
||||||
write(stream, r.uint16.toHex(4))
|
|
||||||
|
|
||||||
proc writeSymbol(stream: Stream; sym: string) =
|
proc writeSymbol(stream: Stream; sym: string) =
|
||||||
if sym.len > 0 and sym[0] in {'A'..'z'} and not sym.anyIt(char(it) in { '\x00'..'\x19', '"', '\\', '|' }):
|
if sym.len > 0 and sym[0] in {'A'..'z'} and not sym.anyIt(char(it) in { '\x00'..'\x19', '"', '\\', '|' }):
|
||||||
|
|
Loading…
Reference in New Issue