From 54b33f5f1343198593b6ca168942db7f5db19951 Mon Sep 17 00:00:00 2001 From: Tony Garnock-Jones Date: Thu, 22 Aug 2019 20:59:37 +0100 Subject: [PATCH] Heuristics for choosing base64 vs ascii-ish binary display --- implementations/racket/preserves/main.rkt | 60 ++++++++++++++++++----- 1 file changed, 47 insertions(+), 13 deletions(-) diff --git a/implementations/racket/preserves/main.rkt b/implementations/racket/preserves/main.rkt index 46bf6d6..400a044 100644 --- a/implementations/racket/preserves/main.rkt +++ b/implementations/racket/preserves/main.rkt @@ -13,6 +13,8 @@ read-preserve-syntax string->preserve string->preserve-syntax + (struct-out binary-display-heuristics) + current-binary-display-heuristics write-preserve preserve->string current-value->placeholder @@ -33,6 +35,7 @@ (require "record.rkt") (require "varint.rkt") (require (only-in syntax/readerr raise-read-error raise-read-eof-error)) +(require net/base64) (struct stream-of (kind generator-thunk) #:transparent) @@ -355,7 +358,6 @@ (define (read-preserve [i (current-input-port)] #:read-syntax? [read-syntax? #f] #:source [source (object-name i)]) - (local-require net/base64) (local-require file/sha1) (define-match-expander px @@ -635,6 +637,10 @@ ;;--------------------------------------------------------------------------- +(struct binary-display-heuristics (printable-ascii-proportion max-length) #:transparent) + +(define current-binary-display-heuristics (make-parameter (binary-display-heuristics 3/4 1024))) + (define (write-preserve v0 [o (current-output-port)] #:indent [indent-amount0 #f]) (define indent-amount (match indent-amount0 [#f 0] @@ -721,6 +727,45 @@ (<= #x23 b #x5b) (<= #x5d b #x7e))) + (define (write-binary-stringlike v) + (! "#\"") + (for [(b (in-bytes v))] + (match b + [#x22 (! "\\\"")] + [(? binunescaped?) (write-stringlike-char (integer->char b))] + [_ (write-stringlike-char (integer->char b) + (lambda (c) (! "\\x~a" (~a #:min-width 2 + #:align 'right + #:left-pad-string "0" + (number->string b 16)))))])) + (! "\"")) + + (define (write-binary-base64 outer-distance v) + ;; Racket's encoder breaks lines after 72 characters. + ;; That corresponds to 54 bytes of input binary. + (! "#base64{") + (if (and indenting? (> (bytes-length v) 54)) + (let* ((inner-distance (+ outer-distance indent-amount)) + (line-separator (bytes-append #"\n" (make-bytes inner-distance 32))) + (encoded (base64-encode v line-separator))) + (write-bytes line-separator o) + (write-bytes encoded o 0 (- (bytes-length encoded) indent-amount))) + (write-bytes (base64-encode v #"") o)) + (! "}")) + + (define (write-binary outer-distance v) + (match-define (binary-display-heuristics proportion maxlen) (current-binary-display-heuristics)) + (define vlen (bytes-length v)) + (if (>= vlen maxlen) + (write-binary-base64 outer-distance v) + (let* ((sample-length (min vlen maxlen)) + (printable-ascii-count (for/sum [(i (in-range 0 sample-length)) + (b (in-bytes v))] + (if (or (<= 32 b 126) (= b 9) (= b 10) (= b 13)) 1 0)))) + (if (or (zero? vlen) (>= (/ printable-ascii-count sample-length) proportion)) + (write-binary-stringlike v) + (write-binary-base64 outer-distance v))))) + (define (write-value distance v) (match v [(annotated annotations _ item) @@ -741,18 +786,7 @@ [#\" (! "\\\"")] [_ (write-stringlike-char c)])) (! "\"")] - [(? bytes?) - (! "#\"") - (for [(b (in-bytes v))] - (match b - [#x22 (! "\\\"")] - [(? binunescaped?) (write-stringlike-char (integer->char b))] - [_ (write-stringlike-char (integer->char b) - (lambda (c) (! "\\x~a" (~a #:min-width 2 - #:align 'right - #:left-pad-string "0" - (number->string b 16)))))])) - (! "\"")] + [(? bytes?) (write-binary distance v)] [(? symbol?) (define s (symbol->string v)) ;; FIXME: This regular expression is conservatively correct, but Anglo-chauvinistic.