#lang racket/base (provide make-preserve-text-reader make-preserve-string-reader parse-error skip-whitespace skip-whitespace/commas eof-guard) (require racket/match) (require "embedded.rkt") (require "annotation.rkt") (require "float.rkt") (require "float-bytes.rkt") (require syntax/readerr) (require (only-in file/sha1 hex-string->bytes)) (require (only-in net/base64 base64-decode)) (define PIPE #\|) (define (make-preserve-string-reader reader-name reader) (lambda (s #:read-syntax? [read-syntax? #f] #:decode-embedded [decode-embedded #f] #:source [source ""]) (define p (open-input-string s)) (when read-syntax? (port-count-lines! p)) (define v (reader p source read-syntax? decode-embedded)) (when (eof-object? v) (parse-error #:raise-proc raise-read-eof-error reader-name p source "Unexpected end of input")) (skip-whitespace p) (when (not (eof-object? (peek-char p))) (parse-error reader-name p source "Unexpected following text")) v)) (define (parse-error #:raise-proc [raise-proc raise-read-error] reader-name i source fmt . args) (define-values [line column pos] (port-next-location i)) (raise-proc (format "~a: ~a" reader-name (apply format fmt args)) source line column pos #f)) (define (skip-whitespace i) (regexp-match? #px#"^\\s*" i)) ;; side effect: consumes matched portion of input (define (skip-whitespace/commas i) (regexp-match? #px#"^(\\s|,)*" i)) ;; side effect: consumes matched portion of input (define (eof-guard reader-name in-port source v) (when (eof-object? v) (parse-error #:raise-proc raise-read-eof-error reader-name in-port source "Unexpected end of input")) v) (define (next-char reader-name in-port source) (eof-guard reader-name in-port source (read-char in-port))) (define-match-expander px (syntax-rules () [(_ re pat) (app (lambda (v) (regexp-try-match re v)) pat)])) (define ((make-preserve-text-reader #:reader-name reader-name #:on-char on-char0 #:on-hash on-hash0 #:read-annotated-value read-annotated-value0) in-port source read-syntax? decode-embedded0) (define read-annotations? read-syntax?) (define decode-embedded (or decode-embedded0 (lambda (v) (error reader-name "No decode-embedded function supplied")))) ;;--------------------------------------------------------------------------- ;; Basic I/O (define (parse-error* fmt . args) (apply parse-error reader-name in-port source fmt args)) (define (next-char*) (next-char reader-name in-port source)) (define (skip-whitespace*) (skip-whitespace in-port)) (define (skip-whitespace/commas*) (skip-whitespace/commas in-port)) ;;--------------------------------------------------------------------------- ;; Core of parser (define (next) (wrap (pos) (next*))) (define (next*) (skip-whitespace*) (match (next-char*) [#\" (read-string #\")] [(== PIPE) (string->symbol (read-string PIPE))] [#\@ (annotate-next-with (next))] [#\# (match (next-char*) [(or #\space #\tab) (annotate-next-with (read-comment-line))] [(or #\newline #\return) (annotate-next-with "")] [#\f (unless (delimiter-follows?) (parse-error* "Delimiter must follow #f")) #f] [#\t (unless (delimiter-follows?) (parse-error* "Delimiter must follow #t")) #t] [#\" (read-literal-binary)] [#\x (match (next-char*) [#\" (read-hex-binary '())] [#\f (read-hex-float 'float)] [#\d (read-hex-float 'double)] [c (parse-error* "Invalid #x syntax: ~v" c)])] [#\[ (read-base64-binary '())] [#\! (embedded (decode-embedded (next)))] [c (on-hash c)])] [c (on-char c)])) (define on-char (on-char0 in-port source next parse-error* (lambda (c) (read-raw-symbol-or-number (list c))))) (define on-hash (on-hash0 in-port source next parse-error* (lambda (c) (parse-error* "Invalid # syntax: ~v" c)))) (define read-annotated-value (read-annotated-value0 in-port source next parse-error*)) (define (annotate-next-with a) (if read-annotations? (annotate (read-annotated-value) a) (read-annotated-value))) ;;--------------------------------------------------------------------------- ;; Source location tracking (define pos (if read-syntax? (lambda () (define-values (line column position) (port-next-location in-port)) (list line column position)) (lambda () #f))) (define wrap (if read-syntax? (lambda (pos0 v) (if (annotated? v) v (let () (match-define (list line0 column0 position0) pos0) (match-define (list line1 column1 position1) (pos)) (define loc (and line0 column0 position0 position1 (srcloc source line0 column0 position0 (- position1 position0)))) (annotated '() loc v)))) (lambda (pos0 v) v))) ;;--------------------------------------------------------------------------- ;; String-like things (define (read-stringlike xform-item finish terminator-char hexescape-char hexescape-proc) (let loop ((acc '())) (match (next-char*) [(== terminator-char) (finish (reverse acc))] [#\\ (match (next-char*) [(== hexescape-char) (loop (cons (hexescape-proc) acc))] [(and c (or (== terminator-char) #\\ #\/)) (loop (cons (xform-item c) acc))] [#\b (loop (cons (xform-item #\u08) acc))] [#\f (loop (cons (xform-item #\u0C) acc))] [#\n (loop (cons (xform-item #\u0A) acc))] [#\r (loop (cons (xform-item #\u0D) acc))] [#\t (loop (cons (xform-item #\u09) acc))] [c (parse-error* "Invalid escape code \\~a" c)])] [c (loop (cons (xform-item c) acc))]))) (define (read-string terminator-char) (read-stringlike values list->string terminator-char #\u (lambda () (integer->char (match in-port [(px #px#"^[a-fA-F0-9]{4}" (list hexdigits)) (define n1 (string->number (bytes->string/utf-8 hexdigits) 16)) (if (<= #xd800 n1 #xdfff) ;; surrogate pair (if (>= n1 #xdc00) (parse-error* "Bad first half of surrogate pair") (match in-port [(px #px#"^\\\\u([a-fA-F0-9]{4})" (list _ hexdigits2)) (define n2 (string->number (bytes->string/utf-8 hexdigits2) 16)) (if (<= #xdc00 n2 #xdfff) (+ (arithmetic-shift (- n1 #xd800) 10) (- n2 #xdc00) #x10000) (parse-error* "Bad second half of surrogate pair"))] [_ (parse-error* "Missing second half of surrogate pair")])) n1)] [_ (parse-error* "Bad string \\u escape")]))))) (define (read-literal-binary) (read-stringlike (lambda (c) (define b (char->integer c)) (when (>= b 256) (parse-error* "Invalid code point ~a (~v) in literal binary" b c)) b) list->bytes #\" #\x (lambda () (match in-port [(px #px#"^[a-fA-F0-9]{2}" (list hexdigits)) (string->number (bytes->string/utf-8 hexdigits) 16)] [_ (parse-error* "Bad binary \\x escape")])))) ;;--------------------------------------------------------------------------- ;; Hex-encoded ByteStrings (define (hexdigit? ch) (or (and (char>=? ch #\A) (char<=? ch #\F)) (and (char>=? ch #\a) (char<=? ch #\f)) (and (char>=? ch #\0) (char<=? ch #\9)))) (define (read-hex-binary acc) (skip-whitespace*) (define ch (next-char*)) (cond [(eqv? ch #\") (hex-string->bytes (list->string (reverse acc)))] [(hexdigit? ch) (define ch2 (next-char*)) (when (not (hexdigit? ch2)) (parse-error* "Hex-encoded binary digits must come in pairs")) (read-hex-binary (cons ch2 (cons ch acc)))] [else (parse-error* "Invalid hex character")])) ;;--------------------------------------------------------------------------- ;; Hex-encoded floating point numbers (define (read-hex-float precision) (unless (eqv? (next-char*) #\") (parse-error* "Missing open-double-quote in hex-encoded floating-point number")) (define bs (read-hex-binary '())) (unless (= (bytes-length bs) (match precision ['float 4] ['double 8])) (parse-error* "Incorrect number of bytes in hex-encoded floating-point number")) (match precision ['float (bytes->float bs)] ['double (bytes->double bs)])) ;;--------------------------------------------------------------------------- ;; Base64-encoded ByteStrings (define (read-base64-binary acc) (skip-whitespace*) (define ch (next-char*)) (cond [(eqv? ch #\]) (base64-decode (string->bytes/latin-1 (list->string (reverse acc))))] [(char=? ch #\-) (read-base64-binary (cons #\+ acc))] [(char=? ch #\_) (read-base64-binary (cons #\/ acc))] [(or (and (char>=? ch #\A) (char<=? ch #\Z)) (and (char>=? ch #\a) (char<=? ch #\z)) (and (char>=? ch #\0) (char<=? ch #\9)) (memv ch '(#\+ #\/ #\=))) (read-base64-binary (cons ch acc))] [else (parse-error* "Invalid base64 character")])) ;;--------------------------------------------------------------------------- ;; Comments (define (read-comment-line) (define pos0 (pos)) (let loop ((acc '())) (match (next-char*) [(or #\newline #\return) (wrap pos0 (list->string (reverse acc)))] [c (loop (cons c acc))]))) ;;--------------------------------------------------------------------------- ;; "Raw" symbols and numbers (define (delimiter-follows?) (define c (peek-char in-port)) (or (eof-object? c) (char-whitespace? c) (eqv? c PIPE) (memv c '(#\< #\> #\[ #\] #\{ #\} #\( #\) #\# #\: #\" #\@ #\; #\,)))) (define (read-raw-symbol-or-number acc) (if (delimiter-follows?) (let ((input (list->string (reverse acc)))) (or (analyze-number input) (string->symbol input))) (read-raw-symbol-or-number (cons (read-char in-port) acc)))) (define (analyze-number input) (match input [(pregexp #px"^([-+]?\\d+)(((\\.\\d+([eE][-+]?\\d+)?)|([eE][-+]?\\d+))([fF]?))?$" (list _ whole _ frac _ _ _ f)) (define n (string->number (if frac (string-append whole frac) whole))) (cond [(not n) #f] [(and f (positive? (string-length f))) (float n)] [else n])] [_ #f])) ;;--------------------------------------------------------------------------- ;; Main entry point to parser (skip-whitespace*) (match (peek-char in-port) [(? eof-object?) eof] [_ (next)]))