New tests and fixes for bad surrogate pair situations. Closes #43
This commit is contained in:
parent
9595872177
commit
b8fb7abab1
|
@ -221,7 +221,9 @@ class Parser(TextCodec):
|
||||||
def read_string(self, delimiter):
|
def read_string(self, delimiter):
|
||||||
def u16_escape(acc):
|
def u16_escape(acc):
|
||||||
n1 = self.hexnum(4)
|
n1 = self.hexnum(4)
|
||||||
if n1 >= 0xd800 and n1 <= 0xdbff:
|
if n1 >= 0xd800 and n1 <= 0xdfff:
|
||||||
|
if n1 >= 0xdc00:
|
||||||
|
raise DecodeError('Bad first half of surrogate pair')
|
||||||
ok = True
|
ok = True
|
||||||
ok = ok and self.nextchar() == '\\'
|
ok = ok and self.nextchar() == '\\'
|
||||||
ok = ok and self.nextchar() == 'u'
|
ok = ok and self.nextchar() == 'u'
|
||||||
|
|
Binary file not shown.
|
@ -184,6 +184,22 @@
|
||||||
string3: <Test #x"b10568656c6c6f" "hello">
|
string3: <Test #x"b10568656c6c6f" "hello">
|
||||||
string4: <Test #x"b1 14 616263e6b0b4e6b0b45c2f22080c0a0d0978797a" "abc\u6c34\u6C34\\/\"\b\f\n\r\txyz">
|
string4: <Test #x"b1 14 616263e6b0b4e6b0b45c2f22080c0a0d0978797a" "abc\u6c34\u6C34\\/\"\b\f\n\r\txyz">
|
||||||
string5: <Test #x"b104f09d849e" "\uD834\uDD1E">
|
string5: <Test #x"b104f09d849e" "\uD834\uDD1E">
|
||||||
|
string6: @"Short unicode escape" <ParseError "\"\\u6c\"">
|
||||||
|
string7: @"Short unicode escape" <ParseError "\"\\u6c3\"">
|
||||||
|
surrogatepair0str: @"Unmatched high surrogate" <ParseError "\"blah\\uD834\"">
|
||||||
|
surrogatepair1str: @"Unmatched low surrogate" <ParseError "\"\\uDD1Eblah\"">
|
||||||
|
surrogatepair2str: @"Unmatched high surrogate" <ParseError "\"blah\\uD834blah\"">
|
||||||
|
surrogatepair3str: @"Unmatched low surrogate" <ParseError "\"blah\\uDD1Eblah\"">
|
||||||
|
surrogatepair4str: @"Swapped surrogates" <ParseError "\"blah\\uDD1E\\uD834blah\"">
|
||||||
|
surrogatepair5str: @"Two high surrogates" <ParseError "\"blah\\uD834\\uD834blah\"">
|
||||||
|
surrogatepair6str: @"Two low surrogates" <ParseError "\"blah\\uDD1E\\uDD1Eblah\"">
|
||||||
|
surrogatepair0sym: @"Unmatched high surrogate" <ParseError "|blah\\uD834|">
|
||||||
|
surrogatepair1sym: @"Unmatched low surrogate" <ParseError "|\\uDD1Eblah|">
|
||||||
|
surrogatepair2sym: @"Unmatched high surrogate" <ParseError "|blah\\uD834blah|">
|
||||||
|
surrogatepair3sym: @"Unmatched low surrogate" <ParseError "|blah\\uDD1Eblah|">
|
||||||
|
surrogatepair4sym: @"Swapped surrogates" <ParseError "|blah\\uDD1E\\uD834blah|">
|
||||||
|
surrogatepair5sym: @"Two high surrogates" <ParseError "|blah\\uD834\\uD834blah|">
|
||||||
|
surrogatepair6sym: @"Two low surrogates" <ParseError "|blah\\uDD1E\\uDD1Eblah|">
|
||||||
symbol0: <Test #x"b300" ||>
|
symbol0: <Test #x"b300" ||>
|
||||||
symbol2: <Test #x"b30568656c6c6f" hello>
|
symbol2: <Test #x"b30568656c6c6f" hello>
|
||||||
symbol3: <Test #x"b305312d322d33" 1-2-3>
|
symbol3: <Test #x"b305312d322d33" 1-2-3>
|
||||||
|
@ -197,6 +213,7 @@
|
||||||
symbol11: <Test #x"b3042d2d2d61" ---a>
|
symbol11: <Test #x"b3042d2d2d61" ---a>
|
||||||
symbol12: <Test #x"b3042d2d2d31" ---1>
|
symbol12: <Test #x"b3042d2d2d31" ---1>
|
||||||
symbol13: <Test #x"b3042b312e78" +1.x>
|
symbol13: <Test #x"b3042b312e78" +1.x>
|
||||||
|
symbol14: <Test #x"b304f09d849e" |\uD834\uDD1E|>
|
||||||
tag0: @"Unexpected end tag" <DecodeError #x"84">
|
tag0: @"Unexpected end tag" <DecodeError #x"84">
|
||||||
tag1: @"Invalid tag" <DecodeError #x"10">
|
tag1: @"Invalid tag" <DecodeError #x"10">
|
||||||
tag2: @"Invalid tag" <DecodeError #x"61b10110">
|
tag2: @"Invalid tag" <DecodeError #x"61b10110">
|
||||||
|
|
|
@ -177,16 +177,18 @@
|
||||||
(match in-port
|
(match in-port
|
||||||
[(px #px#"^[a-fA-F0-9]{4}" (list hexdigits))
|
[(px #px#"^[a-fA-F0-9]{4}" (list hexdigits))
|
||||||
(define n1 (string->number (bytes->string/utf-8 hexdigits) 16))
|
(define n1 (string->number (bytes->string/utf-8 hexdigits) 16))
|
||||||
(if (<= #xd800 n1 #xdfff) ;; surrogate pair first half
|
(if (<= #xd800 n1 #xdfff) ;; surrogate pair
|
||||||
(match in-port
|
(if (>= n1 #xdc00)
|
||||||
[(px #px#"^\\\\u([a-fA-F0-9]{4})" (list _ hexdigits2))
|
(parse-error "Bad first half of surrogate pair")
|
||||||
(define n2 (string->number (bytes->string/utf-8 hexdigits2) 16))
|
(match in-port
|
||||||
(if (<= #xdc00 n2 #xdfff)
|
[(px #px#"^\\\\u([a-fA-F0-9]{4})" (list _ hexdigits2))
|
||||||
(+ (arithmetic-shift (- n1 #xd800) 10)
|
(define n2 (string->number (bytes->string/utf-8 hexdigits2) 16))
|
||||||
(- n2 #xdc00)
|
(if (<= #xdc00 n2 #xdfff)
|
||||||
#x10000)
|
(+ (arithmetic-shift (- n1 #xd800) 10)
|
||||||
(parse-error "Bad second half of surrogate pair"))]
|
(- n2 #xdc00)
|
||||||
[_ (parse-error "Missing second half of surrogate pair")])
|
#x10000)
|
||||||
|
(parse-error "Bad second half of surrogate pair"))]
|
||||||
|
[_ (parse-error "Missing second half of surrogate pair")]))
|
||||||
n1)]
|
n1)]
|
||||||
[_ (parse-error "Bad string \\u escape")])))))
|
[_ (parse-error "Bad string \\u escape")])))))
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
test:
|
||||||
|
$(MAKE) -C ../.. update-test-data
|
||||||
|
raco test test-main.rkt
|
|
@ -184,6 +184,22 @@
|
||||||
string3: <Test #x"b10568656c6c6f" "hello">
|
string3: <Test #x"b10568656c6c6f" "hello">
|
||||||
string4: <Test #x"b1 14 616263e6b0b4e6b0b45c2f22080c0a0d0978797a" "abc\u6c34\u6C34\\/\"\b\f\n\r\txyz">
|
string4: <Test #x"b1 14 616263e6b0b4e6b0b45c2f22080c0a0d0978797a" "abc\u6c34\u6C34\\/\"\b\f\n\r\txyz">
|
||||||
string5: <Test #x"b104f09d849e" "\uD834\uDD1E">
|
string5: <Test #x"b104f09d849e" "\uD834\uDD1E">
|
||||||
|
string6: @"Short unicode escape" <ParseError "\"\\u6c\"">
|
||||||
|
string7: @"Short unicode escape" <ParseError "\"\\u6c3\"">
|
||||||
|
surrogatepair0str: @"Unmatched high surrogate" <ParseError "\"blah\\uD834\"">
|
||||||
|
surrogatepair1str: @"Unmatched low surrogate" <ParseError "\"\\uDD1Eblah\"">
|
||||||
|
surrogatepair2str: @"Unmatched high surrogate" <ParseError "\"blah\\uD834blah\"">
|
||||||
|
surrogatepair3str: @"Unmatched low surrogate" <ParseError "\"blah\\uDD1Eblah\"">
|
||||||
|
surrogatepair4str: @"Swapped surrogates" <ParseError "\"blah\\uDD1E\\uD834blah\"">
|
||||||
|
surrogatepair5str: @"Two high surrogates" <ParseError "\"blah\\uD834\\uD834blah\"">
|
||||||
|
surrogatepair6str: @"Two low surrogates" <ParseError "\"blah\\uDD1E\\uDD1Eblah\"">
|
||||||
|
surrogatepair0sym: @"Unmatched high surrogate" <ParseError "|blah\\uD834|">
|
||||||
|
surrogatepair1sym: @"Unmatched low surrogate" <ParseError "|\\uDD1Eblah|">
|
||||||
|
surrogatepair2sym: @"Unmatched high surrogate" <ParseError "|blah\\uD834blah|">
|
||||||
|
surrogatepair3sym: @"Unmatched low surrogate" <ParseError "|blah\\uDD1Eblah|">
|
||||||
|
surrogatepair4sym: @"Swapped surrogates" <ParseError "|blah\\uDD1E\\uD834blah|">
|
||||||
|
surrogatepair5sym: @"Two high surrogates" <ParseError "|blah\\uD834\\uD834blah|">
|
||||||
|
surrogatepair6sym: @"Two low surrogates" <ParseError "|blah\\uDD1E\\uDD1Eblah|">
|
||||||
symbol0: <Test #x"b300" ||>
|
symbol0: <Test #x"b300" ||>
|
||||||
symbol2: <Test #x"b30568656c6c6f" hello>
|
symbol2: <Test #x"b30568656c6c6f" hello>
|
||||||
symbol3: <Test #x"b305312d322d33" 1-2-3>
|
symbol3: <Test #x"b305312d322d33" 1-2-3>
|
||||||
|
@ -197,6 +213,7 @@
|
||||||
symbol11: <Test #x"b3042d2d2d61" ---a>
|
symbol11: <Test #x"b3042d2d2d61" ---a>
|
||||||
symbol12: <Test #x"b3042d2d2d31" ---1>
|
symbol12: <Test #x"b3042d2d2d31" ---1>
|
||||||
symbol13: <Test #x"b3042b312e78" +1.x>
|
symbol13: <Test #x"b3042b312e78" +1.x>
|
||||||
|
symbol14: <Test #x"b304f09d849e" |\uD834\uDD1E|>
|
||||||
tag0: @"Unexpected end tag" <DecodeError #x"84">
|
tag0: @"Unexpected end tag" <DecodeError #x"84">
|
||||||
tag1: @"Invalid tag" <DecodeError #x"10">
|
tag1: @"Invalid tag" <DecodeError #x"10">
|
||||||
tag2: @"Invalid tag" <DecodeError #x"61b10110">
|
tag2: @"Invalid tag" <DecodeError #x"61b10110">
|
||||||
|
|
Binary file not shown.
|
@ -184,6 +184,22 @@
|
||||||
string3: <Test #x"b10568656c6c6f" "hello">
|
string3: <Test #x"b10568656c6c6f" "hello">
|
||||||
string4: <Test #x"b1 14 616263e6b0b4e6b0b45c2f22080c0a0d0978797a" "abc\u6c34\u6C34\\/\"\b\f\n\r\txyz">
|
string4: <Test #x"b1 14 616263e6b0b4e6b0b45c2f22080c0a0d0978797a" "abc\u6c34\u6C34\\/\"\b\f\n\r\txyz">
|
||||||
string5: <Test #x"b104f09d849e" "\uD834\uDD1E">
|
string5: <Test #x"b104f09d849e" "\uD834\uDD1E">
|
||||||
|
string6: @"Short unicode escape" <ParseError "\"\\u6c\"">
|
||||||
|
string7: @"Short unicode escape" <ParseError "\"\\u6c3\"">
|
||||||
|
surrogatepair0str: @"Unmatched high surrogate" <ParseError "\"blah\\uD834\"">
|
||||||
|
surrogatepair1str: @"Unmatched low surrogate" <ParseError "\"\\uDD1Eblah\"">
|
||||||
|
surrogatepair2str: @"Unmatched high surrogate" <ParseError "\"blah\\uD834blah\"">
|
||||||
|
surrogatepair3str: @"Unmatched low surrogate" <ParseError "\"blah\\uDD1Eblah\"">
|
||||||
|
surrogatepair4str: @"Swapped surrogates" <ParseError "\"blah\\uDD1E\\uD834blah\"">
|
||||||
|
surrogatepair5str: @"Two high surrogates" <ParseError "\"blah\\uD834\\uD834blah\"">
|
||||||
|
surrogatepair6str: @"Two low surrogates" <ParseError "\"blah\\uDD1E\\uDD1Eblah\"">
|
||||||
|
surrogatepair0sym: @"Unmatched high surrogate" <ParseError "|blah\\uD834|">
|
||||||
|
surrogatepair1sym: @"Unmatched low surrogate" <ParseError "|\\uDD1Eblah|">
|
||||||
|
surrogatepair2sym: @"Unmatched high surrogate" <ParseError "|blah\\uD834blah|">
|
||||||
|
surrogatepair3sym: @"Unmatched low surrogate" <ParseError "|blah\\uDD1Eblah|">
|
||||||
|
surrogatepair4sym: @"Swapped surrogates" <ParseError "|blah\\uDD1E\\uD834blah|">
|
||||||
|
surrogatepair5sym: @"Two high surrogates" <ParseError "|blah\\uD834\\uD834blah|">
|
||||||
|
surrogatepair6sym: @"Two low surrogates" <ParseError "|blah\\uDD1E\\uDD1Eblah|">
|
||||||
symbol0: <Test #x"b300" ||>
|
symbol0: <Test #x"b300" ||>
|
||||||
symbol2: <Test #x"b30568656c6c6f" hello>
|
symbol2: <Test #x"b30568656c6c6f" hello>
|
||||||
symbol3: <Test #x"b305312d322d33" 1-2-3>
|
symbol3: <Test #x"b305312d322d33" 1-2-3>
|
||||||
|
@ -197,6 +213,7 @@
|
||||||
symbol11: <Test #x"b3042d2d2d61" ---a>
|
symbol11: <Test #x"b3042d2d2d61" ---a>
|
||||||
symbol12: <Test #x"b3042d2d2d31" ---1>
|
symbol12: <Test #x"b3042d2d2d31" ---1>
|
||||||
symbol13: <Test #x"b3042b312e78" +1.x>
|
symbol13: <Test #x"b3042b312e78" +1.x>
|
||||||
|
symbol14: <Test #x"b304f09d849e" |\uD834\uDD1E|>
|
||||||
tag0: @"Unexpected end tag" <DecodeError #x"84">
|
tag0: @"Unexpected end tag" <DecodeError #x"84">
|
||||||
tag1: @"Invalid tag" <DecodeError #x"10">
|
tag1: @"Invalid tag" <DecodeError #x"10">
|
||||||
tag2: @"Invalid tag" <DecodeError #x"61b10110">
|
tag2: @"Invalid tag" <DecodeError #x"61b10110">
|
||||||
|
|
Loading…
Reference in New Issue