New tests and fixes for bad surrogate pair situations. Closes #43

This commit is contained in:
Tony Garnock-Jones 2023-10-29 21:30:54 +01:00
parent 9595872177
commit b8fb7abab1
8 changed files with 69 additions and 11 deletions

View File

@ -221,7 +221,9 @@ class Parser(TextCodec):
def read_string(self, delimiter):
def u16_escape(acc):
n1 = self.hexnum(4)
if n1 >= 0xd800 and n1 <= 0xdbff:
if n1 >= 0xd800 and n1 <= 0xdfff:
if n1 >= 0xdc00:
raise DecodeError('Bad first half of surrogate pair')
ok = True
ok = ok and self.nextchar() == '\\'
ok = ok and self.nextchar() == 'u'

View File

@ -184,6 +184,22 @@
string3: <Test #x"b10568656c6c6f" "hello">
string4: <Test #x"b1 14 616263e6b0b4e6b0b45c2f22080c0a0d0978797a" "abc\u6c34\u6C34\\/\"\b\f\n\r\txyz">
string5: <Test #x"b104f09d849e" "\uD834\uDD1E">
string6: @"Short unicode escape" <ParseError "\"\\u6c\"">
string7: @"Short unicode escape" <ParseError "\"\\u6c3\"">
surrogatepair0str: @"Unmatched high surrogate" <ParseError "\"blah\\uD834\"">
surrogatepair1str: @"Unmatched low surrogate" <ParseError "\"\\uDD1Eblah\"">
surrogatepair2str: @"Unmatched high surrogate" <ParseError "\"blah\\uD834blah\"">
surrogatepair3str: @"Unmatched low surrogate" <ParseError "\"blah\\uDD1Eblah\"">
surrogatepair4str: @"Swapped surrogates" <ParseError "\"blah\\uDD1E\\uD834blah\"">
surrogatepair5str: @"Two high surrogates" <ParseError "\"blah\\uD834\\uD834blah\"">
surrogatepair6str: @"Two low surrogates" <ParseError "\"blah\\uDD1E\\uDD1Eblah\"">
surrogatepair0sym: @"Unmatched high surrogate" <ParseError "|blah\\uD834|">
surrogatepair1sym: @"Unmatched low surrogate" <ParseError "|\\uDD1Eblah|">
surrogatepair2sym: @"Unmatched high surrogate" <ParseError "|blah\\uD834blah|">
surrogatepair3sym: @"Unmatched low surrogate" <ParseError "|blah\\uDD1Eblah|">
surrogatepair4sym: @"Swapped surrogates" <ParseError "|blah\\uDD1E\\uD834blah|">
surrogatepair5sym: @"Two high surrogates" <ParseError "|blah\\uD834\\uD834blah|">
surrogatepair6sym: @"Two low surrogates" <ParseError "|blah\\uDD1E\\uDD1Eblah|">
symbol0: <Test #x"b300" ||>
symbol2: <Test #x"b30568656c6c6f" hello>
symbol3: <Test #x"b305312d322d33" 1-2-3>
@ -197,6 +213,7 @@
symbol11: <Test #x"b3042d2d2d61" ---a>
symbol12: <Test #x"b3042d2d2d31" ---1>
symbol13: <Test #x"b3042b312e78" +1.x>
symbol14: <Test #x"b304f09d849e" |\uD834\uDD1E|>
tag0: @"Unexpected end tag" <DecodeError #x"84">
tag1: @"Invalid tag" <DecodeError #x"10">
tag2: @"Invalid tag" <DecodeError #x"61b10110">

View File

@ -177,16 +177,18 @@
(match in-port
[(px #px#"^[a-fA-F0-9]{4}" (list hexdigits))
(define n1 (string->number (bytes->string/utf-8 hexdigits) 16))
(if (<= #xd800 n1 #xdfff) ;; surrogate pair first half
(match in-port
[(px #px#"^\\\\u([a-fA-F0-9]{4})" (list _ hexdigits2))
(define n2 (string->number (bytes->string/utf-8 hexdigits2) 16))
(if (<= #xdc00 n2 #xdfff)
(+ (arithmetic-shift (- n1 #xd800) 10)
(- n2 #xdc00)
#x10000)
(parse-error "Bad second half of surrogate pair"))]
[_ (parse-error "Missing second half of surrogate pair")])
(if (<= #xd800 n1 #xdfff) ;; surrogate pair
(if (>= n1 #xdc00)
(parse-error "Bad first half of surrogate pair")
(match in-port
[(px #px#"^\\\\u([a-fA-F0-9]{4})" (list _ hexdigits2))
(define n2 (string->number (bytes->string/utf-8 hexdigits2) 16))
(if (<= #xdc00 n2 #xdfff)
(+ (arithmetic-shift (- n1 #xd800) 10)
(- n2 #xdc00)
#x10000)
(parse-error "Bad second half of surrogate pair"))]
[_ (parse-error "Missing second half of surrogate pair")]))
n1)]
[_ (parse-error "Bad string \\u escape")])))))

View File

@ -0,0 +1,3 @@
# `test` names an action, not a file: declare it phony so that a file or
# directory called `test` next to this Makefile can never make the target
# look up to date and cause the tests to be skipped.
.PHONY: test

# Run the `update-test-data` target of the Makefile two directories up,
# then run test-main.rkt with `raco test`.
# (Recipe lines must begin with a tab character.)
test:
	$(MAKE) -C ../.. update-test-data
	raco test test-main.rkt

View File

@ -184,6 +184,22 @@
string3: <Test #x"b10568656c6c6f" "hello">
string4: <Test #x"b1 14 616263e6b0b4e6b0b45c2f22080c0a0d0978797a" "abc\u6c34\u6C34\\/\"\b\f\n\r\txyz">
string5: <Test #x"b104f09d849e" "\uD834\uDD1E">
string6: @"Short unicode escape" <ParseError "\"\\u6c\"">
string7: @"Short unicode escape" <ParseError "\"\\u6c3\"">
surrogatepair0str: @"Unmatched high surrogate" <ParseError "\"blah\\uD834\"">
surrogatepair1str: @"Unmatched low surrogate" <ParseError "\"\\uDD1Eblah\"">
surrogatepair2str: @"Unmatched high surrogate" <ParseError "\"blah\\uD834blah\"">
surrogatepair3str: @"Unmatched low surrogate" <ParseError "\"blah\\uDD1Eblah\"">
surrogatepair4str: @"Swapped surrogates" <ParseError "\"blah\\uDD1E\\uD834blah\"">
surrogatepair5str: @"Two high surrogates" <ParseError "\"blah\\uD834\\uD834blah\"">
surrogatepair6str: @"Two low surrogates" <ParseError "\"blah\\uDD1E\\uDD1Eblah\"">
surrogatepair0sym: @"Unmatched high surrogate" <ParseError "|blah\\uD834|">
surrogatepair1sym: @"Unmatched low surrogate" <ParseError "|\\uDD1Eblah|">
surrogatepair2sym: @"Unmatched high surrogate" <ParseError "|blah\\uD834blah|">
surrogatepair3sym: @"Unmatched low surrogate" <ParseError "|blah\\uDD1Eblah|">
surrogatepair4sym: @"Swapped surrogates" <ParseError "|blah\\uDD1E\\uD834blah|">
surrogatepair5sym: @"Two high surrogates" <ParseError "|blah\\uD834\\uD834blah|">
surrogatepair6sym: @"Two low surrogates" <ParseError "|blah\\uDD1E\\uDD1Eblah|">
symbol0: <Test #x"b300" ||>
symbol2: <Test #x"b30568656c6c6f" hello>
symbol3: <Test #x"b305312d322d33" 1-2-3>
@ -197,6 +213,7 @@
symbol11: <Test #x"b3042d2d2d61" ---a>
symbol12: <Test #x"b3042d2d2d31" ---1>
symbol13: <Test #x"b3042b312e78" +1.x>
symbol14: <Test #x"b304f09d849e" |\uD834\uDD1E|>
tag0: @"Unexpected end tag" <DecodeError #x"84">
tag1: @"Invalid tag" <DecodeError #x"10">
tag2: @"Invalid tag" <DecodeError #x"61b10110">

Binary file not shown.

View File

@ -184,6 +184,22 @@
string3: <Test #x"b10568656c6c6f" "hello">
string4: <Test #x"b1 14 616263e6b0b4e6b0b45c2f22080c0a0d0978797a" "abc\u6c34\u6C34\\/\"\b\f\n\r\txyz">
string5: <Test #x"b104f09d849e" "\uD834\uDD1E">
string6: @"Short unicode escape" <ParseError "\"\\u6c\"">
string7: @"Short unicode escape" <ParseError "\"\\u6c3\"">
surrogatepair0str: @"Unmatched high surrogate" <ParseError "\"blah\\uD834\"">
surrogatepair1str: @"Unmatched low surrogate" <ParseError "\"\\uDD1Eblah\"">
surrogatepair2str: @"Unmatched high surrogate" <ParseError "\"blah\\uD834blah\"">
surrogatepair3str: @"Unmatched low surrogate" <ParseError "\"blah\\uDD1Eblah\"">
surrogatepair4str: @"Swapped surrogates" <ParseError "\"blah\\uDD1E\\uD834blah\"">
surrogatepair5str: @"Two high surrogates" <ParseError "\"blah\\uD834\\uD834blah\"">
surrogatepair6str: @"Two low surrogates" <ParseError "\"blah\\uDD1E\\uDD1Eblah\"">
surrogatepair0sym: @"Unmatched high surrogate" <ParseError "|blah\\uD834|">
surrogatepair1sym: @"Unmatched low surrogate" <ParseError "|\\uDD1Eblah|">
surrogatepair2sym: @"Unmatched high surrogate" <ParseError "|blah\\uD834blah|">
surrogatepair3sym: @"Unmatched low surrogate" <ParseError "|blah\\uDD1Eblah|">
surrogatepair4sym: @"Swapped surrogates" <ParseError "|blah\\uDD1E\\uD834blah|">
surrogatepair5sym: @"Two high surrogates" <ParseError "|blah\\uD834\\uD834blah|">
surrogatepair6sym: @"Two low surrogates" <ParseError "|blah\\uDD1E\\uDD1Eblah|">
symbol0: <Test #x"b300" ||>
symbol2: <Test #x"b30568656c6c6f" hello>
symbol3: <Test #x"b305312d322d33" 1-2-3>
@ -197,6 +213,7 @@
symbol11: <Test #x"b3042d2d2d61" ---a>
symbol12: <Test #x"b3042d2d2d31" ---1>
symbol13: <Test #x"b3042b312e78" +1.x>
symbol14: <Test #x"b304f09d849e" |\uD834\uDD1E|>
tag0: @"Unexpected end tag" <DecodeError #x"84">
tag1: @"Invalid tag" <DecodeError #x"10">
tag2: @"Invalid tag" <DecodeError #x"61b10110">