diff --git a/implementations/racket/preserves/preserves/read-text-generic.rkt b/implementations/racket/preserves/preserves/read-text-generic.rkt new file mode 100644 index 0000000..bdc88ce --- /dev/null +++ b/implementations/racket/preserves/preserves/read-text-generic.rkt @@ -0,0 +1,290 @@ +#lang racket/base + +(provide make-preserve-text-reader + + parse-error + skip-whitespace + skip-whitespace/commas + eof-guard) + +(require racket/match) +(require "embedded.rkt") +(require "annotation.rkt") +(require "float.rkt") +(require "float-bytes.rkt") +(require syntax/readerr) +(require (only-in file/sha1 hex-string->bytes)) +(require (only-in net/base64 base64-decode)) + +(define PIPE #\|) + +(define (parse-error #:raise-proc [raise-proc raise-read-error] reader-name i source fmt . args) + (define-values [line column pos] (port-next-location i)) + (raise-proc (format "~a: ~a" reader-name (apply format fmt args)) + source + line + column + pos + #f)) + +(define (skip-whitespace i) + (regexp-match? #px#"^\\s*" i)) ;; side effect: consumes matched portion of input + +(define (skip-whitespace/commas i) + (regexp-match? #px#"^(\\s|,)*" i)) ;; side effect: consumes matched portion of input + +(define (eof-guard reader-name in-port source v) + (when (eof-object? v) + (parse-error #:raise-proc raise-read-eof-error reader-name in-port source + "Unexpected end of input")) + v) + +(define (next-char reader-name in-port source) + (eof-guard reader-name in-port source (read-char in-port))) + +(define-match-expander px + (syntax-rules () + [(_ re pat) (app (lambda (v) (regexp-try-match re v)) pat)])) + +(define ((make-preserve-text-reader #:reader-name reader-name + #:on-char on-char0 + #:on-hash on-hash0) + in-port source read-syntax? decode-embedded0) + (define read-annotations? read-syntax?) + (define decode-embedded (or decode-embedded0 + (lambda (v) + (error reader-name "No decode-embedded function supplied")))) + + ;;--------------------------------------------------------------------------- + ;; Basic I/O + + (define (parse-error* fmt . args) + (apply parse-error reader-name in-port source fmt args)) + + (define (next-char*) (next-char reader-name in-port source)) + + (define (skip-whitespace*) (skip-whitespace in-port)) + (define (skip-whitespace/commas*) (skip-whitespace/commas in-port)) + + ;;--------------------------------------------------------------------------- + ;; Core of parser + + (define (next) (wrap (pos) (next*))) + + (define (next*) + (skip-whitespace*) + (match (next-char*) + [#\" (read-string #\")] + [(== PIPE) (string->symbol (read-string PIPE))] + + [#\@ (annotate-next-with (next))] + + [#\# (match (next-char*) + [(or #\space #\tab) (annotate-next-with (read-comment-line))] + [(or #\newline #\return) (annotate-next-with "")] + [#\f (unless (delimiter-follows?) (parse-error* "Delimiter must follow #f")) #f] + [#\t (unless (delimiter-follows?) (parse-error* "Delimiter must follow #t")) #t] + [#\" (read-literal-binary)] + [#\x (match (next-char*) + [#\" (read-hex-binary '())] + [#\f (read-hex-float 'float)] + [#\d (read-hex-float 'double)] + [c (parse-error* "Invalid #x syntax: ~v" c)])] + [#\[ (read-base64-binary '())] + [#\! (embedded (decode-embedded (next)))] + [c (on-hash c)])] + + [c (on-char c)])) + + (define on-char + (on-char0 in-port source next parse-error* + (lambda (c) (read-raw-symbol-or-number (list c))))) + + (define on-hash + (on-hash0 in-port source next parse-error* + (lambda (c) (parse-error* "Invalid # syntax: ~v" c)))) + + (define (annotate-next-with a) + (if read-annotations? + (annotate (next) a) + (next))) + + ;;--------------------------------------------------------------------------- + ;; Source location tracking + + (define pos + (if read-syntax? + (lambda () + (define-values (line column position) (port-next-location in-port)) + (list line column position)) + (lambda () #f))) + + (define wrap + (if read-syntax? + (lambda (pos0 v) + (if (annotated? v) + v + (let () + (match-define (list line0 column0 position0) pos0) + (match-define (list line1 column1 position1) (pos)) + (define loc (and line0 column0 position0 position1 + (srcloc source line0 column0 position0 (- position1 position0)))) + (annotated '() loc v)))) + (lambda (pos0 v) v))) + + ;;--------------------------------------------------------------------------- + ;; String-like things + + (define (read-stringlike xform-item finish terminator-char hexescape-char hexescape-proc) + (let loop ((acc '())) + (match (next-char*) + [(== terminator-char) (finish (reverse acc))] + [#\\ (match (next-char*) + [(== hexescape-char) (loop (cons (hexescape-proc) acc))] + [(and c (or (== terminator-char) #\\ #\/)) (loop (cons (xform-item c) acc))] + [#\b (loop (cons (xform-item #\u08) acc))] + [#\f (loop (cons (xform-item #\u0C) acc))] + [#\n (loop (cons (xform-item #\u0A) acc))] + [#\r (loop (cons (xform-item #\u0D) acc))] + [#\t (loop (cons (xform-item #\u09) acc))] + [c (parse-error* "Invalid escape code \\~a" c)])] + [c (loop (cons (xform-item c) acc))]))) + + (define (read-string terminator-char) + (read-stringlike values + list->string + terminator-char + #\u + (lambda () + (integer->char + (match in-port + [(px #px#"^[a-fA-F0-9]{4}" (list hexdigits)) + (define n1 (string->number (bytes->string/utf-8 hexdigits) 16)) + (if (<= #xd800 n1 #xdfff) ;; surrogate pair + (if (>= n1 #xdc00) + (parse-error* "Bad first half of surrogate pair") + (match in-port + [(px #px#"^\\\\u([a-fA-F0-9]{4})" (list _ hexdigits2)) + (define n2 (string->number (bytes->string/utf-8 hexdigits2) 16)) + (if (<= #xdc00 n2 #xdfff) + (+ (arithmetic-shift (- n1 #xd800) 10) + (- n2 #xdc00) + #x10000) + (parse-error* "Bad second half of surrogate pair"))] + [_ (parse-error* "Missing second half of surrogate pair")])) + n1)] + [_ (parse-error* "Bad string \\u escape")]))))) + + (define (read-literal-binary) + (read-stringlike (lambda (c) + (define b (char->integer c)) + (when (>= b 256) + (parse-error* "Invalid code point ~a (~v) in literal binary" b c)) + b) + list->bytes + #\" + #\x + (lambda () + (match in-port + [(px #px#"^[a-fA-F0-9]{2}" (list hexdigits)) + (string->number (bytes->string/utf-8 hexdigits) 16)] + [_ (parse-error* "Bad binary \\x escape")])))) + + ;;--------------------------------------------------------------------------- + ;; Hex-encoded ByteStrings + + (define (hexdigit? ch) + (or (and (char>=? ch #\A) (char<=? ch #\F)) + (and (char>=? ch #\a) (char<=? ch #\f)) + (and (char>=? ch #\0) (char<=? ch #\9)))) + + (define (read-hex-binary acc) + (skip-whitespace*) + (define ch (next-char*)) + (cond [(eqv? ch #\") + (hex-string->bytes (list->string (reverse acc)))] + [(hexdigit? ch) + (define ch2 (next-char*)) + (when (not (hexdigit? ch2)) + (parse-error* "Hex-encoded binary digits must come in pairs")) + (read-hex-binary (cons ch2 (cons ch acc)))] + [else + (parse-error* "Invalid hex character")])) + + ;;--------------------------------------------------------------------------- + ;; Hex-encoded floating point numbers + + (define (read-hex-float precision) + (unless (eqv? (next-char*) #\") + (parse-error* "Missing open-double-quote in hex-encoded floating-point number")) + (define bs (read-hex-binary '())) + (unless (= (bytes-length bs) (match precision ['float 4] ['double 8])) + (parse-error* "Incorrect number of bytes in hex-encoded floating-point number")) + (match precision + ['float (bytes->float bs)] + ['double (bytes->double bs)])) + + ;;--------------------------------------------------------------------------- + ;; Base64-encoded ByteStrings + + (define (read-base64-binary acc) + (skip-whitespace*) + (define ch (next-char*)) + (cond [(eqv? ch #\]) + (base64-decode (string->bytes/latin-1 (list->string (reverse acc))))] + [(char=? ch #\-) (read-base64-binary (cons #\+ acc))] + [(char=? ch #\_) (read-base64-binary (cons #\/ acc))] + [(or (and (char>=? ch #\A) (char<=? ch #\Z)) + (and (char>=? ch #\a) (char<=? ch #\z)) + (and (char>=? ch #\0) (char<=? ch #\9)) + (memv ch '(#\+ #\/ #\=))) + (read-base64-binary (cons ch acc))] + [else + (parse-error* "Invalid base64 character")])) + + ;;--------------------------------------------------------------------------- + ;; Comments + + (define (read-comment-line) + (define pos0 (pos)) + (let loop ((acc '())) + (match (next-char*) + [(or #\newline #\return) + (wrap pos0 (list->string (reverse acc)))] + [c (loop (cons c acc))]))) + + ;;--------------------------------------------------------------------------- + ;; "Raw" symbols and numbers + + (define (delimiter-follows?) + (define c (peek-char in-port)) + (or (eof-object? c) + (char-whitespace? c) + (eqv? c PIPE) + (memv c '(#\< #\> #\[ #\] #\{ #\} #\( #\) + #\# #\: #\" #\@ #\; #\,)))) + + (define (read-raw-symbol-or-number acc) + (if (delimiter-follows?) + (let ((input (list->string (reverse acc)))) + (or (analyze-number input) + (string->symbol input))) + (read-raw-symbol-or-number (cons (read-char in-port) acc)))) + + (define (analyze-number input) + (match input + [(pregexp #px"^([-+]?\\d+)(((\\.\\d+([eE][-+]?\\d+)?)|([eE][-+]?\\d+))([fF]?))?$" + (list _ whole _ frac _ _ _ f)) + (define n (string->number (if frac (string-append whole frac) whole))) + (cond [(not n) #f] + [(and f (positive? (string-length f))) (float n)] + [else n])] + [_ #f])) + + ;;--------------------------------------------------------------------------- + ;; Main entry point to parser + + (skip-whitespace*) + (match (peek-char in-port) + [(? eof-object?) eof] + [_ (next)])) diff --git a/implementations/racket/preserves/preserves/read-text.rkt b/implementations/racket/preserves/preserves/read-text.rkt index 5fba960..5a46450 100644 --- a/implementations/racket/preserves/preserves/read-text.rkt +++ b/implementations/racket/preserves/preserves/read-text.rkt @@ -5,29 +5,11 @@ (require racket/match) (require racket/set) -(require "embedded.rkt") -(require "annotation.rkt") -(require "read-binary.rkt") (require "record.rkt") -(require "float.rkt") -(require "float-bytes.rkt") +(require "read-text-generic.rkt") (require syntax/readerr) -(require (only-in file/sha1 hex-string->bytes)) -(require (only-in net/base64 base64-decode)) -(define PIPE #\|) - -(define (parse-error* #:raise-proc [raise-proc raise-read-error] i source fmt . args) - (define-values [line column pos] (port-next-location i)) - (raise-proc (format "read-preserve: ~a" (apply format fmt args)) - source - line - column - pos - #f)) - -(define (default-decode-embedded v) - (error 'read-preserve/text "No decode-embedded function supplied")) +(define *reader-name* 'read-preserve/text) (define (string->preserve s #:read-syntax? [read-syntax? #f] @@ -40,302 +22,74 @@ #:decode-embedded decode-embedded #:source source)) (when (eof-object? v) - (parse-error* #:raise-proc raise-read-eof-error p source "Unexpected end of input")) - (skip-whitespace* p) + (parse-error #:raise-proc raise-read-eof-error *reader-name* p source "Unexpected end of input")) + (skip-whitespace p) (when (not (eof-object? (peek-char p))) - (parse-error* p source "Unexpected text following preserve")) + (parse-error *reader-name* p source "Unexpected text following preserve")) v) -(define (skip-whitespace* i) - (regexp-match? #px#"^\\s*" i)) ;; side effect: consumes matched portion of input +(define text-reader + (make-preserve-text-reader + #:reader-name *reader-name* + #:on-hash (lambda (in-port source next parse-error* default) + (match-lambda + [#\{ (sequence-fold in-port + source + next + skip-whitespace/commas + (set) + (lambda (s e) + (when (set-member? s e) + (parse-error* "Duplicate set element: ~v" e)) + (set-add s e)) + values + #\})] + [c (default c)])) + #:on-char (lambda (in-port source next parse-error* default) + (match-lambda + [#\< (match (read-sequence in-port source next skip-whitespace #\>) + ['() (parse-error* "Missing record label")] + [(cons label fields) (record label fields)])] + [#\[ (read-sequence in-port source next skip-whitespace/commas #\])] + [#\{ (read-dictionary in-port source next parse-error*)] -(define-match-expander px - (syntax-rules () - [(_ re pat) (app (lambda (v) (regexp-try-match re v)) pat)])) + [#\; (parse-error* "Semicolon is reserved syntax")] + [#\: (parse-error* "Unexpected key/value separator between items")] + [#\> (parse-error* "Unexpected >")] + [#\] (parse-error* "Unexpected ]")] + [#\} (parse-error* "Unexpected }")] + [#\, (parse-error* "Unexpected ,")] + + [c (default c)])))) + +(define (sequence-fold in-port source next skip-ws acc accumulate-one finish terminator-char) + (let loop ((acc acc)) + (skip-ws in-port) + (match (eof-guard *reader-name* in-port source (peek-char in-port)) + [(== terminator-char) (read-char in-port) (finish acc)] + [_ (loop (accumulate-one acc (next)))]))) + +(define (read-sequence in-port source next skip-ws terminator) + (sequence-fold in-port source next skip-ws '() (lambda (acc v) (cons v acc)) reverse terminator)) + +(define (read-dictionary in-port source next parse-error*) + (sequence-fold in-port + source + next + skip-whitespace/commas + (hash) + (lambda (acc k) + (skip-whitespace in-port) + (match (peek-char in-port) + [#\: (read-char in-port) + (when (hash-has-key? acc k) (parse-error* "Duplicate key: ~v" k)) + (hash-set acc k (next))] + [_ (parse-error* "Missing expected key/value separator")])) + values + #\})) (define (read-preserve/text [in-port (current-input-port)] #:read-syntax? [read-syntax? #f] - #:decode-embedded [decode-embedded0 #f] + #:decode-embedded [decode-embedded #f] #:source [source (object-name in-port)]) - (define read-annotations? read-syntax?) - (define decode-embedded (or decode-embedded0 default-decode-embedded)) - - ;;--------------------------------------------------------------------------- - ;; Core of parser - - (define (next) (wrap (pos) (next*))) - - (define (next*) - (skip-whitespace) - (match (next-char) - [#\" (read-string #\")] - [(== PIPE) (string->symbol (read-string PIPE))] - - [#\@ (annotate-next-with (next))] - - [#\; (parse-error "Semicolon is reserved syntax")] - [#\: (parse-error "Unexpected key/value separator between items")] - - [#\# (match (next-char) - [(or #\space #\tab) (annotate-next-with (read-comment-line))] - [(or #\newline #\return) (annotate-next-with "")] - [#\f (unless (delimiter-follows?) (parse-error "Delimiter must follow #f")) #f] - [#\t (unless (delimiter-follows?) (parse-error "Delimiter must follow #t")) #t] - [#\{ (sequence-fold #t (set) set-add* values #\})] - [#\" (read-literal-binary)] - [#\x (match (next-char) - [#\" (read-hex-binary '())] - [#\f (read-hex-float 'float)] - [#\d (read-hex-float 'double)] - [c (parse-error "Invalid #x syntax: ~v" c)])] - [#\[ (read-base64-binary '())] - [#\! (embedded (decode-embedded (next)))] - [c (parse-error "Invalid # syntax: ~v" c)])] - - [#\< (match (read-sequence #\> #f) - ['() (parse-error "Missing record label")] - [(cons label fields) (record label fields)])] - [#\[ (read-sequence #\] #t)] - [#\{ (read-dictionary)] - - [#\> (parse-error "Unexpected >")] - [#\] (parse-error "Unexpected ]")] - [#\} (parse-error "Unexpected }")] - [#\, (parse-error "Unexpected ,")] - - [c (read-raw-symbol-or-number (list c))])) - - (define (set-add* s e) - (when (set-member? s e) (parse-error "Duplicate set element: ~v" e)) - (set-add s e)) - - (define (annotate-next-with a) - (if read-annotations? - (annotate (next) a) - (next))) - - ;;--------------------------------------------------------------------------- - ;; Basic I/O - - (define (parse-error fmt . args) - (apply parse-error* in-port source fmt args)) - - (define (eof-guard v) - (when (eof-object? v) - (parse-error* #:raise-proc raise-read-eof-error in-port source "Unexpected end of input")) - v) - - (define (next-char) (eof-guard (read-char in-port))) - - (define (skip-whitespace) (skip-whitespace* in-port)) - - (define (skip-whitespace/commas) - (regexp-match? #px#"^(\\s|,)*" in-port)) ;; side effect: consumes matched portion of input - - ;;--------------------------------------------------------------------------- - ;; Source location tracking - - (define pos - (if read-syntax? - (lambda () - (define-values (line column position) (port-next-location in-port)) - (list line column position)) - (lambda () #f))) - - (define wrap - (if read-syntax? - (lambda (pos0 v) - (if (annotated? v) - v - (let () - (match-define (list line0 column0 position0) pos0) - (match-define (list line1 column1 position1) (pos)) - (define loc (and line0 column0 position0 position1 - (srcloc source line0 column0 position0 (- position1 position0)))) - (annotated '() loc v)))) - (lambda (pos0 v) v))) - - ;;--------------------------------------------------------------------------- - ;; String-like things - - (define (read-stringlike xform-item finish terminator-char hexescape-char hexescape-proc) - (let loop ((acc '())) - (match (next-char) - [(== terminator-char) (finish (reverse acc))] - [#\\ (match (next-char) - [(== hexescape-char) (loop (cons (hexescape-proc) acc))] - [(and c (or (== terminator-char) #\\ #\/)) (loop (cons (xform-item c) acc))] - [#\b (loop (cons (xform-item #\u08) acc))] - [#\f (loop (cons (xform-item #\u0C) acc))] - [#\n (loop (cons (xform-item #\u0A) acc))] - [#\r (loop (cons (xform-item #\u0D) acc))] - [#\t (loop (cons (xform-item #\u09) acc))] - [c (parse-error "Invalid escape code \\~a" c)])] - [c (loop (cons (xform-item c) acc))]))) - - (define (read-string terminator-char) - (read-stringlike values - list->string - terminator-char - #\u - (lambda () - (integer->char - (match in-port - [(px #px#"^[a-fA-F0-9]{4}" (list hexdigits)) - (define n1 (string->number (bytes->string/utf-8 hexdigits) 16)) - (if (<= #xd800 n1 #xdfff) ;; surrogate pair - (if (>= n1 #xdc00) - (parse-error "Bad first half of surrogate pair") - (match in-port - [(px #px#"^\\\\u([a-fA-F0-9]{4})" (list _ hexdigits2)) - (define n2 (string->number (bytes->string/utf-8 hexdigits2) 16)) - (if (<= #xdc00 n2 #xdfff) - (+ (arithmetic-shift (- n1 #xd800) 10) - (- n2 #xdc00) - #x10000) - (parse-error "Bad second half of surrogate pair"))] - [_ (parse-error "Missing second half of surrogate pair")])) - n1)] - [_ (parse-error "Bad string \\u escape")]))))) - - (define (read-literal-binary) - (read-stringlike (lambda (c) - (define b (char->integer c)) - (when (>= b 256) - (parse-error "Invalid code point ~a (~v) in literal binary" b c)) - b) - list->bytes - #\" - #\x - (lambda () - (match in-port - [(px #px#"^[a-fA-F0-9]{2}" (list hexdigits)) - (string->number (bytes->string/utf-8 hexdigits) 16)] - [_ (parse-error "Bad binary \\x escape")])))) - - ;;--------------------------------------------------------------------------- - ;; Hex-encoded ByteStrings - - (define (hexdigit? ch) - (or (and (char>=? ch #\A) (char<=? ch #\F)) - (and (char>=? ch #\a) (char<=? ch #\f)) - (and (char>=? ch #\0) (char<=? ch #\9)))) - - (define (read-hex-binary acc) - (skip-whitespace) - (define ch (next-char)) - (cond [(eqv? ch #\") - (hex-string->bytes (list->string (reverse acc)))] - [(hexdigit? ch) - (define ch2 (next-char)) - (when (not (hexdigit? ch2)) - (parse-error "Hex-encoded binary digits must come in pairs")) - (read-hex-binary (cons ch2 (cons ch acc)))] - [else - (parse-error "Invalid hex character")])) - - ;;--------------------------------------------------------------------------- - ;; Hex-encoded floating point numbers - - (define (read-hex-float precision) - (unless (eqv? (next-char) #\") - (parse-error "Missing open-double-quote in hex-encoded floating-point number")) - (define bs (read-hex-binary '())) - (unless (= (bytes-length bs) (match precision ['float 4] ['double 8])) - (parse-error "Incorrect number of bytes in hex-encoded floating-point number")) - (match precision - ['float (bytes->float bs)] - ['double (bytes->double bs)])) - - ;;--------------------------------------------------------------------------- - ;; Base64-encoded ByteStrings - - (define (read-base64-binary acc) - (skip-whitespace) - (define ch (next-char)) - (cond [(eqv? ch #\]) - (base64-decode (string->bytes/latin-1 (list->string (reverse acc))))] - [(char=? ch #\-) (read-base64-binary (cons #\+ acc))] - [(char=? ch #\_) (read-base64-binary (cons #\/ acc))] - [(or (and (char>=? ch #\A) (char<=? ch #\Z)) - (and (char>=? ch #\a) (char<=? ch #\z)) - (and (char>=? ch #\0) (char<=? ch #\9)) - (memv ch '(#\+ #\/ #\=))) - (read-base64-binary (cons ch acc))] - [else - (parse-error "Invalid base64 character")])) - - ;;--------------------------------------------------------------------------- - ;; Comments - - (define (read-comment-line) - (define pos0 (pos)) - (let loop ((acc '())) - (match (next-char) - [(or #\newline #\return) - (wrap pos0 (list->string (reverse acc)))] - [c (loop (cons c acc))]))) - - ;;--------------------------------------------------------------------------- - ;; Collections - - (define (sequence-fold commas-allowed? acc accumulate-one finish terminator-char) - (let loop ((acc acc)) - (if commas-allowed? - (skip-whitespace/commas) - (skip-whitespace)) - (match (eof-guard (peek-char in-port)) - [(== terminator-char) (read-char in-port) (finish acc)] - [_ (loop (accumulate-one acc (next)))]))) - - (define (read-sequence terminator commas-allowed?) - (sequence-fold commas-allowed? '() (lambda (acc v) (cons v acc)) reverse terminator)) - - (define (read-dictionary) - (sequence-fold #t - (hash) - (lambda (acc k) - (skip-whitespace) - (match (peek-char in-port) - [#\: (read-char in-port) - (when (hash-has-key? acc k) (parse-error "Duplicate key: ~v" k)) - (hash-set acc k (next))] - [_ (parse-error "Missing expected key/value separator")])) - values - #\})) - - ;;--------------------------------------------------------------------------- - ;; "Raw" symbols and numbers - - (define (delimiter-follows?) - (define c (peek-char in-port)) - (or (eof-object? c) - (char-whitespace? c) - (eqv? c PIPE) - (memv c '(#\< #\> #\[ #\] #\{ #\} #\( #\) - #\# #\: #\" #\@ #\; #\,)))) - - (define (read-raw-symbol-or-number acc) - (if (delimiter-follows?) - (let ((input (list->string (reverse acc)))) - (or (analyze-number input) - (string->symbol input))) - (read-raw-symbol-or-number (cons (read-char in-port) acc)))) - - (define (analyze-number input) - (match input - [(pregexp #px"^([-+]?\\d+)(((\\.\\d+([eE][-+]?\\d+)?)|([eE][-+]?\\d+))([fF]?))?$" - (list _ whole _ frac _ _ _ f)) - (define n (string->number (if frac (string-append whole frac) whole))) - (cond [(not n) #f] - [(and f (positive? (string-length f))) (float n)] - [else n])] - [_ #f])) - - ;;--------------------------------------------------------------------------- - ;; Main entry point to parser - - (skip-whitespace) - (match (peek-char in-port) - [(? eof-object?) eof] - [_ (next)])) + (text-reader in-port source read-syntax? decode-embedded))