#lang racket/base (provide read-preserve/text string->preserve) (require racket/match) (require racket/set) (require "annotation.rkt") (require "read-binary.rkt") (require "record.rkt") (require "float.rkt") (require syntax/readerr) (require (only-in file/sha1 hex-string->bytes)) (require (only-in net/base64 base64-decode)) (define PIPE #\|) (define (parse-error* #:raise-proc [raise-proc raise-read-error] i source fmt . args) (define-values [line column pos] (port-next-location i)) (raise-proc (format "read-preserve: ~a" (apply format fmt args)) source line column pos #f)) (define (default-decode-embedded v) (error 'read-preserve/text "No decode-embedded function supplied")) (define (string->preserve s #:read-syntax? [read-syntax? #f] #:decode-embedded [decode-embedded #f] #:source [source ""]) (define p (open-input-string s)) (when read-syntax? (port-count-lines! p)) (define v (read-preserve/text p #:read-syntax? read-syntax? #:decode-embedded decode-embedded #:source source)) (when (eof-object? v) (parse-error* #:raise-proc raise-read-eof-error p source "Unexpected end of input")) (skip-whitespace* p) (when (not (eof-object? (peek-char p))) (parse-error* p source "Unexpected text following preserve")) v) (define (skip-whitespace* i) (regexp-match? #px#"^(\\s|,)*" i)) ;; side effect: consumes matched portion of input (define-match-expander px (syntax-rules () [(_ re pat) (app (lambda (v) (regexp-try-match re v)) pat)])) (define (read-preserve/text [in-port (current-input-port)] #:read-syntax? [read-syntax? #f] #:decode-embedded [decode-embedded0 #f] #:source [source (object-name in-port)]) (define read-annotations? read-syntax?) (define decode-embedded (or decode-embedded0 default-decode-embedded)) ;;--------------------------------------------------------------------------- ;; Core of parser (define (next) (wrap (pos) (next*))) (define (next*) (skip-whitespace) (match (next-char) [#\- (read-intpart (list #\-) (next-char))] [(and c (or #\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)) (read-intpart '() c)] [#\" (read-string #\")] [(== PIPE) (string->symbol (read-string PIPE))] [#\; (annotate-next-with (read-comment-line))] [#\@ (annotate-next-with (next))] [#\: (parse-error "Unexpected key/value separator between items")] [#\# (match (next-char) [#\f #f] [#\t #t] [#\{ (sequence-fold (set) set-add* values #\})] [#\" (read-literal-binary)] [#\x (if (eqv? (next-char) #\") (read-hex-binary '()) (parse-error "Expected open-quote at start of hex ByteString"))] [#\[ (read-base64-binary '())] [#\= (define bs (read-preserve/text in-port #:read-syntax? #t #:source source)) (when (not (bytes? (annotated-item bs))) (parse-error "ByteString must follow #=")) (when (not (null? (annotated-annotations bs))) (parse-error "Annotations not permitted after #=")) (bytes->preserve (annotated-item bs) (lambda (message . args) (apply parse-error (string-append "Embedded binary value: " message) args)) #:read-syntax? read-syntax? #:on-short (lambda () (parse-error "Incomplete embedded binary value")))] [#\! (decode-embedded (next))] [c (parse-error "Invalid # syntax: ~v" c)])] [#\< (match (read-sequence #\>) ['() (parse-error "Missing record label")] [(cons label fields) (record label fields)])] [#\[ (read-sequence #\])] [#\{ (read-dictionary)] [#\> (parse-error "Unexpected >")] [#\] (parse-error "Unexpected ]")] [#\} (parse-error "Unexpected }")] [c (read-raw-symbol (list c))])) (define (set-add* s e) (when (set-member? s e) (parse-error "Duplicate set element: ~v" e)) (set-add s e)) (define (annotate-next-with a) (if read-annotations? (annotate (next) a) (next))) ;;--------------------------------------------------------------------------- ;; Basic I/O (define (parse-error fmt . args) (apply parse-error* in-port source fmt args)) (define (eof-guard v) (when (eof-object? v) (parse-error* #:raise-proc raise-read-eof-error in-port source "Unexpected end of input")) v) (define (next-char) (eof-guard (read-char in-port))) (define (skip-whitespace) (skip-whitespace* in-port)) ;;--------------------------------------------------------------------------- ;; Source location tracking (define pos (if read-syntax? (lambda () (define-values (line column position) (port-next-location in-port)) (list line column position)) (lambda () #f))) (define wrap (if read-syntax? (lambda (pos0 v) (if (annotated? v) v (let () (match-define (list line0 column0 position0) pos0) (match-define (list line1 column1 position1) (pos)) (define loc (and line0 column0 position0 position1 (srcloc source line0 column0 position0 (- position1 position0)))) (annotated '() loc v)))) (lambda (pos0 v) v))) ;;--------------------------------------------------------------------------- ;; Numbers (define (read-intpart acc-rev ch) (match ch [#\0 (read-fracexp (cons ch acc-rev))] [_ (read-digit+ acc-rev read-fracexp ch)])) (define (read-digit* acc-rev k) (match (peek-char in-port) [(? char? (? char-numeric?)) (read-digit* (cons (read-char in-port) acc-rev) k)] [_ (k acc-rev)])) (define (read-digit+ acc-rev k [ch (read-char in-port)]) (match ch [(? char? (? char-numeric?)) (read-digit* (cons ch acc-rev) k)] [_ (parse-error "Incomplete number")])) (define (read-fracexp acc-rev) (match (peek-char in-port) [#\. (read-digit+ (cons (read-char in-port) acc-rev) read-exp)] [_ (read-exp acc-rev)])) (define (read-exp acc-rev) (match (peek-char in-port) [(or #\e #\E) (read-sign-and-exp (cons (read-char in-port) acc-rev))] [_ (finish-number acc-rev)])) (define (read-sign-and-exp acc-rev) (match (peek-char in-port) [(or #\+ #\-) (read-digit+ (cons (read-char in-port) acc-rev) finish-number)] [_ (read-digit+ acc-rev finish-number)])) (define (finish-number acc-rev) (define s (list->string (reverse acc-rev))) (define n (string->number s 10)) (when (not n) (parse-error "Invalid number: ~v" s)) (if (flonum? n) (match (peek-char in-port) [(or #\f #\F) (read-char in-port) (float n)] [_ n]) n)) ;;--------------------------------------------------------------------------- ;; String-like things (define (read-stringlike xform-item finish terminator-char hexescape-char hexescape-proc) (let loop ((acc '())) (match (next-char) [(== terminator-char) (finish (reverse acc))] [#\\ (match (next-char) [(== hexescape-char) (loop (cons (hexescape-proc) acc))] [(and c (or (== terminator-char) #\\ #\/)) (loop (cons (xform-item c) acc))] [#\b (loop (cons (xform-item #\u08) acc))] [#\f (loop (cons (xform-item #\u0C) acc))] [#\n (loop (cons (xform-item #\u0A) acc))] [#\r (loop (cons (xform-item #\u0D) acc))] [#\t (loop (cons (xform-item #\u09) acc))] [c (parse-error "Invalid escape code \\~a" c)])] [c (loop (cons (xform-item c) acc))]))) (define (read-string terminator-char) (read-stringlike values list->string terminator-char #\u (lambda () (integer->char (match in-port [(px #px#"^[a-fA-F0-9]{4}" (list hexdigits)) (define n1 (string->number (bytes->string/utf-8 hexdigits) 16)) (if (<= #xd800 n1 #xdfff) ;; surrogate pair first half (match in-port [(px #px#"^\\\\u([a-fA-F0-9]{4})" (list _ hexdigits2)) (define n2 (string->number (bytes->string/utf-8 hexdigits2) 16)) (if (<= #xdc00 n2 #xdfff) (+ (arithmetic-shift (- n1 #xd800) 10) (- n2 #xdc00) #x10000) (parse-error "Bad second half of surrogate pair"))] [_ (parse-error "Missing second half of surrogate pair")]) n1)] [_ (parse-error "Bad string \\u escape")]))))) (define (read-literal-binary) (read-stringlike (lambda (c) (define b (char->integer c)) (when (>= b 256) (parse-error "Invalid code point ~a (~v) in literal binary" b c)) b) list->bytes #\" #\x (lambda () (match in-port [(px #px#"^[a-fA-F0-9]{2}" (list hexdigits)) (string->number (bytes->string/utf-8 hexdigits) 16)] [_ (parse-error "Bad binary \\x escape")])))) ;;--------------------------------------------------------------------------- ;; Hex-encoded ByteStrings (define (hexdigit? ch) (or (and (char>=? ch #\A) (char<=? ch #\F)) (and (char>=? ch #\a) (char<=? ch #\f)) (and (char>=? ch #\0) (char<=? ch #\9)))) (define (read-hex-binary acc) (skip-whitespace) (define ch (next-char)) (cond [(eqv? ch #\") (hex-string->bytes (list->string (reverse acc)))] [(hexdigit? ch) (define ch2 (next-char)) (when (not (hexdigit? ch2)) (parse-error "Hex-encoded binary digits must come in pairs")) (read-hex-binary (cons ch2 (cons ch acc)))] [else (parse-error "Invalid hex character")])) ;;--------------------------------------------------------------------------- ;; Base64-encoded ByteStrings (define (read-base64-binary acc) (skip-whitespace) (define ch (next-char)) (cond [(eqv? ch #\]) (base64-decode (string->bytes/latin-1 (list->string (reverse acc))))] [(or (and (char>=? ch #\A) (char<=? ch #\Z)) (and (char>=? ch #\a) (char<=? ch #\z)) (and (char>=? ch #\0) (char<=? ch #\9)) (memv ch '(#\+ #\/ #\- #\_ #\=))) (read-base64-binary (cons ch acc))] [else (parse-error "Invalid base64 character")])) ;;--------------------------------------------------------------------------- ;; Comments (define (read-comment-line) (define pos0 (pos)) (let loop ((acc '())) (match (next-char) [(or #\newline #\return) (wrap pos0 (list->string (reverse acc)))] [c (loop (cons c acc))]))) ;;--------------------------------------------------------------------------- ;; Collections (define (sequence-fold acc accumulate-one finish terminator-char) (let loop ((acc acc)) (skip-whitespace) (match (eof-guard (peek-char in-port)) [(== terminator-char) (read-char in-port) (finish acc)] [_ (loop (accumulate-one acc (next)))]))) (define (read-sequence terminator) (sequence-fold '() (lambda (acc v) (cons v acc)) reverse terminator)) (define (read-dictionary) (sequence-fold (hash) (lambda (acc k) (skip-whitespace) (match (peek-char in-port) [#\: (read-char in-port) (when (hash-has-key? acc k) (parse-error "Duplicate key: ~v" k)) (hash-set acc k (next))] [_ (parse-error "Missing expected key/value separator")])) values #\})) ;;--------------------------------------------------------------------------- ;; "Raw" symbols (define (read-raw-symbol acc) (match (peek-char in-port) [(or (? eof-object?) (? char? (or #\( #\) #\{ #\} #\[ #\] #\< #\> #\" #\; #\, #\@ #\# #\: (== PIPE) (? char-whitespace?)))) (string->symbol (list->string (reverse acc)))] [_ (read-raw-symbol (cons (read-char in-port) acc))])) ;;--------------------------------------------------------------------------- ;; Main entry point to parser (skip-whitespace) (match (peek-char in-port) [(? eof-object?) eof] [_ (next)]))