Optionally-canonicalizing binary output.

This commit is contained in:
Tony Garnock-Jones 2019-10-24 14:35:27 +01:00
parent 02c02b641f
commit 0832b94ce9
1 changed files with 168 additions and 54 deletions

View File

@ -2,6 +2,7 @@
;; Preserve, as in Fruit Preserve, as in a remarkably weak pun on pickling/dehydration etc
(provide (struct-out stream-of)
stream-of->preserve
(all-from-out "record.rkt")
(struct-out annotated)
annotate
@ -22,9 +23,13 @@
encode
decode
decode-syntax
wire-value)
wire-value
preserve-order
preserve<?
canonicalize-preserves?)
(require racket/bytes)
(require (only-in racket/contract any/c))
(require racket/dict)
(require (only-in racket/format ~a))
(require racket/generator)
@ -37,9 +42,22 @@
(require (only-in syntax/readerr raise-read-error raise-read-eof-error))
(require net/base64)
(require (for-syntax racket/base))
(require data/order)
(struct stream-of (kind generator-thunk) #:transparent)
(define (stream-of->preserve s)
(match-define (stream-of kind generator-thunk) s)
(define g (generator-thunk))
(define pieces (for/list [(p (in-producer g (void)))] p))
(match kind
['string (bytes->string/utf-8 (bytes-append* pieces))]
['byte-string (bytes-append* pieces)]
['symbol (string->symbol (bytes->string/utf-8 (bytes-append* pieces)))]
['sequence pieces]
['set (list->set pieces)]
['dictionary (apply hash pieces)]))
;; Syntax properties and syntax objects would be almost perfect for
;; representing annotations, plus position/source tracking as
;; lagniappe, but unfortunately they don't play nicely with data much
@ -159,43 +177,53 @@
(reverse (for/fold [(acc '())] [((k v) (in-dict d))] (cons v (cons k acc)))))
(define (encode-value v)
(match ((current-value->placeholder) v)
(define canonicalizing? (canonicalize-preserves?))
(match (and (not canonicalizing?) ((current-value->placeholder) v))
[(? integer? n)
(bit-string (#b0001 :: bits 4) (n :: (wire-length)))]
[#f
(match v
[#f (bytes #b00000000)]
[#t (bytes #b00000001)]
[(? single-flonum?) (bit-string #b00000010 (v :: float bits 32))]
[(? double-flonum?) (bit-string #b00000011 (v :: float bits 64))]
[(annotated annotations _ item)
(bit-string ((apply bit-string-append
(map (lambda (a) (bit-string #b00000101 ((encode-value a) :: binary)))
annotations)) :: binary)
((encode-value item) :: binary))]
[(stream-of 'string p) (encode-stream 1 1 bytes? (p))]
[(stream-of 'byte-string p) (encode-stream 1 2 bytes? (p))]
[(stream-of 'symbol p) (encode-stream 1 3 bytes? (p))]
[(stream-of 'sequence p) (encode-stream 2 1 (lambda (x) #t) (p))]
[(stream-of 'set p) (encode-stream 2 2 (lambda (x) #t) (p))]
[(stream-of 'dictionary p) (encode-stream 2 3 (lambda (x) #t) (p))]
(let restart ((v v))
(match v
[#f (bytes #b00000000)]
[#t (bytes #b00000001)]
[(? single-flonum?) (bit-string #b00000010 (v :: float bits 32))]
[(? double-flonum?) (bit-string #b00000011 (v :: float bits 64))]
[(annotated annotations _ item)
(if canonicalizing?
(restart item)
(bit-string ((apply bit-string-append
(map (lambda (a) (bit-string #b00000101 ((encode-value a) :: binary)))
annotations)) :: binary)
((encode-value item) :: binary)))]
[(? stream-of?) #:when canonicalizing?
(restart (stream-of->preserve v))]
[(stream-of 'string p) (encode-stream 1 1 bytes? (p))]
[(stream-of 'byte-string p) (encode-stream 1 2 bytes? (p))]
[(stream-of 'symbol p) (encode-stream 1 3 bytes? (p))]
[(stream-of 'sequence p) (encode-stream 2 1 (lambda (x) #t) (p))]
[(stream-of 'set p) (encode-stream 2 2 (lambda (x) #t) (p))]
[(stream-of 'dictionary p) (encode-stream 2 3 (lambda (x) #t) (p))]
[(? integer? x) #:when (<= -3 x 12) (bit-string (#b0011 :: bits 4) (x :: bits 4))]
;; [0 (bytes #b10000000)]
[(? integer?)
(define raw-bit-count (+ (integer-length v) 1)) ;; at least one sign bit
(define byte-count (quotient (+ raw-bit-count 7) 8))
(bit-string (#b0100 :: bits 4) (byte-count :: (wire-length)) (v :: integer bytes byte-count))]
[(? string?) (encode-binary-like 1 (string->bytes/utf-8 v))]
[(? bytes?) (encode-binary-like 2 v)]
[(? symbol?) (encode-binary-like 3 (string->bytes/utf-8 (symbol->string v)))]
[(? integer? x) #:when (<= -3 x 12) (bit-string (#b0011 :: bits 4) (x :: bits 4))]
;; [0 (bytes #b10000000)]
[(? integer?)
(define raw-bit-count (+ (integer-length v) 1)) ;; at least one sign bit
(define byte-count (quotient (+ raw-bit-count 7) 8))
(bit-string (#b0100 :: bits 4) (byte-count :: (wire-length)) (v :: integer bytes byte-count))]
[(? string?) (encode-binary-like 1 (string->bytes/utf-8 v))]
[(? bytes?) (encode-binary-like 2 v)]
[(? symbol?) (encode-binary-like 3 (string->bytes/utf-8 (symbol->string v)))]
[(record label fields) (encode-array-like 0 (cons label fields))]
[(? list?) (encode-array-like 1 v)]
[(? set?) (encode-array-like 2 (set->list v))]
[(? dict?) (encode-array-like 3 (dict-keys-and-values v))]
[(record label fields) (encode-array-like 0 (cons label fields))]
[(? list?) (encode-array-like 1 v)]
[(? set?) (encode-array-like 2 (if canonicalizing?
(canonical-set-elements v)
(set->list v)))]
[(? dict?) (encode-array-like 3 (if canonicalizing?
(canonical-dict-keys-and-values v)
(dict-keys-and-values v)))]
[_ (error 'encode-value "Cannot encode value ~v" v)])]))
[_ (error 'encode-value "Cannot encode value ~v" v)]))]))
;;---------------------------------------------------------------------------
@ -654,6 +682,7 @@
[#t 2] ;; a default
[other other]))
(define indenting? (and indent-amount0 #t))
(define canonicalizing? (canonicalize-preserves?))
(define-syntax-rule (! fmt arg ...) (fprintf o fmt arg ...))
@ -776,22 +805,13 @@
(define (write-value distance v)
(match v
[(annotated annotations _ item)
(for [(a (in-list annotations))]
(! "@")
(write-value (+ distance 1) a)
(!indent* distance))
(when (not canonicalizing?)
(for [(a (in-list annotations))]
(! "@")
(write-value (+ distance 1) a)
(!indent* distance)))
(write-value distance item)]
[(stream-of kind generator-thunk)
(define g (generator-thunk))
(define pieces (for/list [(p (in-producer g (void)))] p))
(write-value distance
(match kind
['string (bytes->string/utf-8 (bytes-append* pieces))]
['byte-string (bytes-append* pieces)]
['symbol (string->symbol (bytes->string/utf-8 (bytes-append* pieces)))]
['sequence pieces]
['set (list->set pieces)]
['dictionary (apply hash pieces)]))]
[(? stream-of?) (write-value distance (stream-of->preserve v))]
[#f (! "#false")]
[#t (! "#true")]
[(? single-flonum?) (! "~vf" (real->double-flonum v))]
@ -818,8 +838,12 @@
(! "|")))]
[(record label fields) (write-record distance label fields)]
[(? list?) (write-sequence distance "[" "," "]" write-value v)]
[(? set?) (write-sequence distance "#set{" "," "}" write-value (set->list v))]
[(? dict?) (write-sequence distance "{" "," "}" write-key-value (dict->list v))]
[(? set?) (write-sequence distance "#set{" "," "}" write-value (if canonicalizing?
(canonical-set-elements v)
(set->list v)))]
[(? dict?) (write-sequence distance "{" "," "}" write-key-value (if canonicalizing?
(canonical-dict-entries v)
(dict->list v)))]
[_ (error 'write-preserve "Cannot encode value ~v" v)]))
@ -830,6 +854,94 @@
;;---------------------------------------------------------------------------
(define (typecode v)
(match v
[(? boolean?) 0]
[(? single-flonum?) 1]
[(? double-flonum?) 2]
[(? integer? x) 3]
[(? string?) 4]
[(? bytes?) 5]
[(? symbol?) 6]
[(record _ _) 7]
[(? list?) 8]
[(? set?) 9]
[(? dict?) 10]
[_ (error 'preserve-order "Cannot compare value ~v" v)]))
(define-syntax chain-order
(syntax-rules ()
[(_ o) o]
[(_ o more ...) (match o
['= (chain-order more ...)]
[other other])]))
(define (prepare-for-order v)
(match v
[(annotated _ _ item) (prepare-for-order item)]
[(? stream-of?) (stream-of->preserve v)]
[_ v]))
(define preserve-order
(order 'preserve-order
any/c
(lambda (a* b*)
(define a (prepare-for-order a*))
(define b (prepare-for-order b*))
(define ta (typecode a))
(define tb (typecode b))
(cond [(< ta tb) '<]
[(> ta tb) '>]
[else (match ta ;; == tb
[7 (chain-order
(preserve-order (record-label a) (record-label b))
(preserve-order (record-fields a)) (preserve-order (record-fields b)))]
[8 (match* (a b)
[('() '()) '=]
[('() _) '<]
[(_ '()) '>]
[((cons a0 a1) (cons b0 b1))
(chain-order (preserve-order a0 b0) (preserve-order a1 b1))])]
[9 (preserve-order (canonical-set-elements a) (canonical-set-elements b))]
[10 (preserve-order (canonical-dict-keys a) (canonical-dict-keys b))]
[_ (datum-order a b)])]))))
(define preserve<? (order-<? preserve-order))
;;---------------------------------------------------------------------------
(define canonicalize-preserves? (make-parameter #f))
(define *canonical-cache* (vector (make-weak-hasheq)
(make-weak-hasheq)
(make-weak-hasheq)
(make-weak-hasheq)))
(define (canonical-set-elements v)
(hash-ref! (vector-ref *canonical-cache* 0)
v
(lambda () (sort (set->list v) preserve<?))))
(define (canonical-dict-entries v)
(hash-ref! (vector-ref *canonical-cache* 1)
v
(lambda () (sort (dict->list v) preserve<? #:key car))))
(define (canonical-dict-keys-and-values v)
(hash-ref! (vector-ref *canonical-cache* 2)
v
(lambda () (let loop ((xs (canonical-dict-entries v)))
(match xs
['() '()]
[(cons (cons kk vv) rest) (cons kk (cons vv (loop rest)))])))))
(define (canonical-dict-keys v)
(hash-ref! (vector-ref *canonical-cache* 3)
v
(lambda () (map car (canonical-dict-entries v)))))
;;---------------------------------------------------------------------------
(module+ test
(require rackunit)
(require racket/runtime-path)
@ -994,10 +1106,10 @@
(define (run-test-case variety t-name loc binary-form annotated-text-form)
(define text-form (strip-annotations annotated-text-form))
(define-values (forward back)
(define-values (forward back can-execute-nondet-with-canonicalization?)
(match (hash-ref samples-txt-expected t-name text-form)
[(asymmetric f b) (values f b)]
[v (values v v)]))
[(asymmetric f b) (values f b #f)] ;; #f because e.g. annotation4 includes annotations
[v (values v v #t)]))
(check-equal? text-form back loc)
(check-equal? (d-strip (encode text-form)) back loc)
(check-equal? (d-strip (encode forward)) back loc)
@ -1009,8 +1121,10 @@
(check-equal? (string->preserve-syntax (preserve->string annotated-text-form))
annotated-text-form
loc)
(unless (memq variety '(nondeterministic))
(check-equal? (encode forward) binary-form loc))
(when (or (not (memq variety '(nondeterministic)))
(and can-execute-nondet-with-canonicalization?))
(parameterize ((canonicalize-preserves? (if (memq variety '(nondeterministic)) #t #f)))
(check-equal? (encode forward) binary-form loc)))
(unless (memq variety '(nondeterministic streaming))
(check-equal? (encode annotated-text-form) binary-form loc)))