preserves/implementations/racket/preserves/preserves/read-text-generic.rkt

311 lines
12 KiB
Racket

#lang racket/base
(provide make-preserve-text-reader
make-preserve-string-reader
parse-error
skip-whitespace
skip-whitespace/commas
eof-guard)
(require racket/match)
(require "embedded.rkt")
(require "annotation.rkt")
(require "float.rkt")
(require "float-bytes.rkt")
(require syntax/readerr)
(require (only-in file/sha1 hex-string->bytes))
(require (only-in net/base64 base64-decode))
(define PIPE #\|)
(define (make-preserve-string-reader reader-name reader)
(lambda (s
#:read-syntax? [read-syntax? #f]
#:decode-embedded [decode-embedded #f]
#:source [source "<string>"])
(define p (open-input-string s))
(when read-syntax? (port-count-lines! p))
(define v (reader p source read-syntax? decode-embedded))
(when (eof-object? v)
(parse-error #:raise-proc raise-read-eof-error reader-name p source "Unexpected end of input"))
(skip-whitespace p)
(when (not (eof-object? (peek-char p)))
(parse-error reader-name p source "Unexpected following text"))
v))
(define (parse-error #:raise-proc [raise-proc raise-read-error] reader-name i source fmt . args)
(define-values [line column pos] (port-next-location i))
(raise-proc (format "~a: ~a" reader-name (apply format fmt args))
source
line
column
pos
#f))
(define (skip-whitespace i)
(regexp-match? #px#"^\\s*" i)) ;; side effect: consumes matched portion of input
(define (skip-whitespace/commas i)
(regexp-match? #px#"^(\\s|,)*" i)) ;; side effect: consumes matched portion of input
(define (eof-guard reader-name in-port source v)
(when (eof-object? v)
(parse-error #:raise-proc raise-read-eof-error reader-name in-port source
"Unexpected end of input"))
v)
(define (next-char reader-name in-port source)
(eof-guard reader-name in-port source (read-char in-port)))
(define-match-expander px
(syntax-rules ()
[(_ re pat) (app (lambda (v) (regexp-try-match re v)) pat)]))
(define ((make-preserve-text-reader #:reader-name reader-name
#:on-char on-char0
#:on-hash on-hash0
#:read-annotated-value read-annotated-value0)
in-port source read-syntax? decode-embedded0)
(define read-annotations? read-syntax?)
(define decode-embedded (or decode-embedded0
(lambda (v)
(error reader-name "No decode-embedded function supplied"))))
;;---------------------------------------------------------------------------
;; Basic I/O
(define (parse-error* fmt . args)
(apply parse-error reader-name in-port source fmt args))
(define (next-char*) (next-char reader-name in-port source))
(define (skip-whitespace*) (skip-whitespace in-port))
(define (skip-whitespace/commas*) (skip-whitespace/commas in-port))
;;---------------------------------------------------------------------------
;; Core of parser
(define (next) (wrap (pos) (next*)))
(define (next*)
(skip-whitespace*)
(match (next-char*)
[#\" (read-string #\")]
[(== PIPE) (string->symbol (read-string PIPE))]
[#\@ (annotate-next-with (next))]
[#\# (match (next-char*)
[(or #\space #\tab) (annotate-next-with (read-comment-line))]
[(or #\newline #\return) (annotate-next-with "")]
[#\f (unless (delimiter-follows?) (parse-error* "Delimiter must follow #f")) #f]
[#\t (unless (delimiter-follows?) (parse-error* "Delimiter must follow #t")) #t]
[#\" (read-literal-binary)]
[#\x (match (next-char*)
[#\" (read-hex-binary '())]
[#\f (read-hex-float 'float)]
[#\d (read-hex-float 'double)]
[c (parse-error* "Invalid #x syntax: ~v" c)])]
[#\[ (read-base64-binary '())]
[#\! (embedded (decode-embedded (next)))]
[c (on-hash c)])]
[c (on-char c)]))
(define on-char
(on-char0 in-port source next parse-error*
(lambda (c) (read-raw-symbol-or-number (list c)))))
(define on-hash
(on-hash0 in-port source next parse-error*
(lambda (c) (parse-error* "Invalid # syntax: ~v" c))))
(define read-annotated-value
(read-annotated-value0 in-port source next parse-error*))
(define (annotate-next-with a)
(if read-annotations?
(annotate (read-annotated-value) a)
(read-annotated-value)))
;;---------------------------------------------------------------------------
;; Source location tracking
(define pos
(if read-syntax?
(lambda ()
(define-values (line column position) (port-next-location in-port))
(list line column position))
(lambda () #f)))
(define wrap
(if read-syntax?
(lambda (pos0 v)
(if (annotated? v)
v
(let ()
(match-define (list line0 column0 position0) pos0)
(match-define (list line1 column1 position1) (pos))
(define loc (and line0 column0 position0 position1
(srcloc source line0 column0 position0 (- position1 position0))))
(annotated '() loc v))))
(lambda (pos0 v) v)))
;;---------------------------------------------------------------------------
;; String-like things
(define (read-stringlike xform-item finish terminator-char hexescape-char hexescape-proc)
(let loop ((acc '()))
(match (next-char*)
[(== terminator-char) (finish (reverse acc))]
[#\\ (match (next-char*)
[(== hexescape-char) (loop (cons (hexescape-proc) acc))]
[(and c (or (== terminator-char) #\\ #\/)) (loop (cons (xform-item c) acc))]
[#\b (loop (cons (xform-item #\u08) acc))]
[#\f (loop (cons (xform-item #\u0C) acc))]
[#\n (loop (cons (xform-item #\u0A) acc))]
[#\r (loop (cons (xform-item #\u0D) acc))]
[#\t (loop (cons (xform-item #\u09) acc))]
[c (parse-error* "Invalid escape code \\~a" c)])]
[c (loop (cons (xform-item c) acc))])))
(define (read-string terminator-char)
(read-stringlike values
list->string
terminator-char
#\u
(lambda ()
(integer->char
(match in-port
[(px #px#"^[a-fA-F0-9]{4}" (list hexdigits))
(define n1 (string->number (bytes->string/utf-8 hexdigits) 16))
(if (<= #xd800 n1 #xdfff) ;; surrogate pair
(if (>= n1 #xdc00)
(parse-error* "Bad first half of surrogate pair")
(match in-port
[(px #px#"^\\\\u([a-fA-F0-9]{4})" (list _ hexdigits2))
(define n2 (string->number (bytes->string/utf-8 hexdigits2) 16))
(if (<= #xdc00 n2 #xdfff)
(+ (arithmetic-shift (- n1 #xd800) 10)
(- n2 #xdc00)
#x10000)
(parse-error* "Bad second half of surrogate pair"))]
[_ (parse-error* "Missing second half of surrogate pair")]))
n1)]
[_ (parse-error* "Bad string \\u escape")])))))
(define (read-literal-binary)
(read-stringlike (lambda (c)
(define b (char->integer c))
(when (>= b 256)
(parse-error* "Invalid code point ~a (~v) in literal binary" b c))
b)
list->bytes
#\"
#\x
(lambda ()
(match in-port
[(px #px#"^[a-fA-F0-9]{2}" (list hexdigits))
(string->number (bytes->string/utf-8 hexdigits) 16)]
[_ (parse-error* "Bad binary \\x escape")]))))
;;---------------------------------------------------------------------------
;; Hex-encoded ByteStrings
(define (hexdigit? ch)
(or (and (char>=? ch #\A) (char<=? ch #\F))
(and (char>=? ch #\a) (char<=? ch #\f))
(and (char>=? ch #\0) (char<=? ch #\9))))
(define (read-hex-binary acc)
(skip-whitespace*)
(define ch (next-char*))
(cond [(eqv? ch #\")
(hex-string->bytes (list->string (reverse acc)))]
[(hexdigit? ch)
(define ch2 (next-char*))
(when (not (hexdigit? ch2))
(parse-error* "Hex-encoded binary digits must come in pairs"))
(read-hex-binary (cons ch2 (cons ch acc)))]
[else
(parse-error* "Invalid hex character")]))
;;---------------------------------------------------------------------------
;; Hex-encoded floating point numbers
(define (read-hex-float precision)
(unless (eqv? (next-char*) #\")
(parse-error* "Missing open-double-quote in hex-encoded floating-point number"))
(define bs (read-hex-binary '()))
(unless (= (bytes-length bs) (match precision ['float 4] ['double 8]))
(parse-error* "Incorrect number of bytes in hex-encoded floating-point number"))
(match precision
['float (bytes->float bs)]
['double (bytes->double bs)]))
;;---------------------------------------------------------------------------
;; Base64-encoded ByteStrings
(define (read-base64-binary acc)
(skip-whitespace*)
(define ch (next-char*))
(cond [(eqv? ch #\])
(base64-decode (string->bytes/latin-1 (list->string (reverse acc))))]
[(char=? ch #\-) (read-base64-binary (cons #\+ acc))]
[(char=? ch #\_) (read-base64-binary (cons #\/ acc))]
[(or (and (char>=? ch #\A) (char<=? ch #\Z))
(and (char>=? ch #\a) (char<=? ch #\z))
(and (char>=? ch #\0) (char<=? ch #\9))
(memv ch '(#\+ #\/ #\=)))
(read-base64-binary (cons ch acc))]
[else
(parse-error* "Invalid base64 character")]))
;;---------------------------------------------------------------------------
;; Comments
(define (read-comment-line)
(define pos0 (pos))
(let loop ((acc '()))
(match (read-char in-port)
[(or (? eof-object?) #\newline #\return)
(wrap pos0 (list->string (reverse acc)))]
[c (loop (cons c acc))])))
;;---------------------------------------------------------------------------
;; "Raw" symbols and numbers
(define (delimiter-follows?)
(define c (peek-char in-port))
(or (eof-object? c)
(char-whitespace? c)
(eqv? c PIPE)
(memv c '(#\< #\> #\[ #\] #\{ #\} #\( #\)
#\# #\: #\" #\@ #\; #\,))))
(define (read-raw-symbol-or-number acc)
(if (delimiter-follows?)
(let ((input (list->string (reverse acc))))
(or (analyze-number input)
(string->symbol input)))
(read-raw-symbol-or-number (cons (read-char in-port) acc))))
(define (analyze-number input)
(match input
[(pregexp #px"^([-+]?\\d+)(((\\.\\d+([eE][-+]?\\d+)?)|([eE][-+]?\\d+))([fF]?))?$"
(list _ whole _ frac _ _ _ f))
(define n (string->number (if frac (string-append whole frac) whole)))
(cond [(not n) #f]
[(and f (positive? (string-length f))) (float n)]
[else n])]
[_ #f]))
;;---------------------------------------------------------------------------
;; Main entry point to parser
(skip-whitespace*)
(match (peek-char in-port)
[(? eof-object?) eof]
[_ (next)]))