From 832fec6c43d23ce489047af283a176051960b29d Mon Sep 17 00:00:00 2001 From: Tony Garnock-Jones Date: Sat, 11 Jun 2022 10:06:51 +0200 Subject: [PATCH] Fix spec issues with len() and IEEE754 endianness (thanks to isd) --- README.md | 1 + ...rves-binary-cheatsheet.md => cheatsheet.md | 22 +- implementations/c/.gitignore | 2 - implementations/c/Makefile | 8 - implementations/c/main.c | 96 -- implementations/c/preserves.h | 1091 ----------------- preserves-binary.md | 11 +- 7 files changed, 21 insertions(+), 1210 deletions(-) rename preserves-binary-cheatsheet.md => cheatsheet.md (63%) delete mode 100644 implementations/c/.gitignore delete mode 100644 implementations/c/Makefile delete mode 100644 implementations/c/main.c delete mode 100644 implementations/c/preserves.h diff --git a/README.md b/README.md index a7bb402..563d91c 100644 --- a/README.md +++ b/README.md @@ -64,6 +64,7 @@ Implementations of the data model, plus Syrup transfer syntax: ## Additional resources + - [Cheat sheet(s) for Preserves syntax](cheatsheet.html) - Some [conventions for common data types](conventions.html) - [Open questions](questions.html); see also the [issues list]({{page.projectpages}}/issues) diff --git a/preserves-binary-cheatsheet.md b/cheatsheet.md similarity index 63% rename from preserves-binary-cheatsheet.md rename to cheatsheet.md index 9e5d9d8..5e7b179 100644 --- a/preserves-binary-cheatsheet.md +++ b/cheatsheet.md @@ -21,10 +21,13 @@ For a value `v`, we write `«v»` for the binary encoding of `v`. [0xA5] ++ S if S ∈ ByteString [0xA6] ++ utf8(S) if S ∈ Symbol -`intbytes(x)` gives the big-endian two's-complement signed binary -representation of `x`, taking exactly as many whole bytes as needed to -unambiguously identify the value and its sign. `intbytes(0)` is the -empty byte sequence. +The functions `binary32(F)` and `binary64(D)` yield big-endian 4- and +8-byte IEEE 754 binary representations of `F` and `D`, respectively. + +The function `intbytes(x)` gives the big-endian two's-complement signed +binary representation of `x`, taking exactly as many whole bytes as +needed to unambiguously identify the value and its sign. `intbytes(0)` +is the empty byte sequence. ### Compounds @@ -33,11 +36,12 @@ empty byte sequence. «#{E_1...E_m}» = [0xA9] ++ seq(«E_1», ..., «E_m») «{K_1:V_1...K_m:V_m}» = [0xAA] ++ seq(«K_1», «V_1», ..., «K_m», «V_m») - seq(R_1, ... R_m) = len(R_1) ++ R_1 ++...++ len(R_m) ++ R_m + seq(R_1, ..., R_m) = len(|R_1|) ++ R_1 ++...++ len(|R_m|) ++ R_m - len(m) = e(m, 128) - where e(v, d) = [v + d] if v < 128 - e(v / 128, 0) ++ [(v % 128) + d] if v ≥ 128 + len(m) = e(m, 128) + + e(v, d) = [v + d] if v < 128 + e(v / 128, 0) ++ [(v % 128) + d] if v ≥ 128 ### Embeddeds @@ -49,4 +53,4 @@ To annotate a `Repr` `r` (that *MUST NOT* itself already be annotated) with some sequence of `Value`s `[v_1, ..., v_m]`, surround `r` as follows: - [0xBE] ++ len(r) ++ r ++ len(v_1) ++ v_1 ++...++ len(v_m) ++ v_m + [0xBE] ++ len(|r|) ++ r ++ len(|«v_1»|) ++ «v_1» ++...++ len(|«v_m»|) ++ «v_m» diff --git a/implementations/c/.gitignore b/implementations/c/.gitignore deleted file mode 100644 index 9be4083..0000000 --- a/implementations/c/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -m.output.txt -m diff --git a/implementations/c/Makefile b/implementations/c/Makefile deleted file mode 100644 index 08f1b99..0000000 --- a/implementations/c/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -m: main.c preserves.h - gcc -Wall -Wextra -Werror -g3 -o $@ main.c - -go: m - cat ../../tests/samples.bin | ./m | tee m.output.txt - -clean: - rm -f m diff --git a/implementations/c/main.c b/implementations/c/main.c deleted file mode 100644 index 8d2dbb5..0000000 --- a/implementations/c/main.c +++ /dev/null @@ -1,96 +0,0 @@ -#include -#include -#include -#include - -#include - -#define PRESERVES_IMPLEMENTATION -#include "preserves.h" - -static double now() { - struct timeval tv; - if (gettimeofday(&tv, NULL) < 0) { - perror("gettimeofday"); - } - return (double) tv.tv_sec + ((double) tv.tv_usec) / 1000000.0; -} - -int main(__attribute__ ((unused)) int argc, - __attribute__ ((unused)) char const * const argv[]) -{ - preserves_bytes_t input = preserves_create_bytes(); - bool silent = getenv("SILENT") != NULL; - - double start = now(); - - { - preserves_bytes_t chunk = preserves_create_bytes(); - if (preserves_resize_bytes(&chunk, 131072) == -1) { - perror("allocating chunk"); - return EXIT_FAILURE; - } - - while (true) { - size_t count = fread(chunk.ptr, 1, chunk.len, stdin); - if (count == 0) { - if (ferror(stdin)) { - perror("reading"); - return EXIT_FAILURE; - } - break; - } - if (preserves_extend_bytes(&input, preserves_bytes_subsequence(&chunk, 0, count)) == -1) { - perror("appending"); - return EXIT_FAILURE; - } - } - - preserves_free_bytes(&chunk); - } - - double mid = now(); - - { - preserves_reader_t reader = preserves_create_reader(); - preserves_reader_result_t result = preserves_read_binary(&reader, &input, 1); - more_input: - if (result.index == NULL) { - perror("parsing"); - return EXIT_FAILURE; - } - - if (!silent) { - printf("Size of index: %lu bytes; %lu entries\n", - reader.index_pos * sizeof(preserves_index_entry_t), - reader.index_pos); - } - - if (!silent) { - for (preserves_index_entry_t *i = result.index; i != result.end_marker; i++) { - preserves_dump_index_entry(stdout, &reader.input, i, true); - } - preserves_dump_index_entry(stdout, &reader.input, result.end_marker, true); - } - - if (result.end_marker->data._err == PRESERVES_END_MORE_INPUT_REMAINING) { - if (!silent) { - printf("\n"); - } - reader.index_pos = 0; - result = preserves_read_binary_continue(&reader, 1); - goto more_input; - } - - preserves_free_reader(&reader); - } - - double end = now(); - - printf("stage 1: %g s\n", mid - start); - printf("stage 2: %g s\n", end - mid); - printf("total: %g s\n", end - start); - - preserves_free_bytes(&input); - return EXIT_SUCCESS; -} diff --git a/implementations/c/preserves.h b/implementations/c/preserves.h deleted file mode 100644 index 446b21d..0000000 --- a/implementations/c/preserves.h +++ /dev/null @@ -1,1091 +0,0 @@ -/// SPDX-License-Identifier: Apache-2.0 -/// SPDX-FileCopyrightText: Copyright © 2022 Tony Garnock-Jones - -#ifndef libpreserves_26109214_f3bd_44c8_95ba_8c650c954965 -#define libpreserves_26109214_f3bd_44c8_95ba_8c650c954965 - -// Single file header. #define PRESERVES_IMPLEMENTATION to get the implementations. - -#ifdef PRESERVES_IMPLEMENTATION -#define PRESERVES_INLINE -#define PRESERVES_IMPLEMENTATION_CHUNK(...) __VA_ARGS__ -#else -#define PRESERVES_INLINE static inline -#define PRESERVES_IMPLEMENTATION_CHUNK(...) -#endif - -#define PRESERVES_OUTOFLINE(declaration, ...) \ - extern declaration; \ - PRESERVES_IMPLEMENTATION_CHUNK(inline declaration __VA_ARGS__) - -#include -#include -#include -#include -#include // for ntohl, htonl - -/////////////////////////////////////////////////////////////////////////// -// General-purpose fat pointer, for e.g. strings, binary blobs, etc. - -typedef struct preserves_bytes { - bool borrowed:1; - size_t len:(sizeof(size_t) * 8 - 1); - void *ptr; -} preserves_bytes_t; - -#define PRESERVES_ARRAY_ELEMENT_COUNT(bytes_ptr, element_type) \ - ((bytes_ptr)->len / sizeof(element_type)) - -#define PRESERVES_ARRAY_ELEMENT(bytes_ptr, element_type, index) \ - (((element_type *) (bytes_ptr)->ptr)[index]) - -#define PRESERVES_RESIZE_ARRAY(bytes_ptr, element_type, size) \ - preserves_resize_bytes(bytes_ptr, sizeof(element_type) * (size)) - -#define PRESERVES_ARRAY_ACCESS(bytes_ptr, element_type, length_var, base_ptr_var) \ - size_t length_var = PRESERVES_ARRAY_ELEMENT_COUNT(bytes_ptr, element_type); \ - element_type *base_ptr_var = &PRESERVES_ARRAY_ELEMENT(bytes_ptr, element_type, 0) - -PRESERVES_INLINE preserves_bytes_t preserves_create_bytes(void) { - return (preserves_bytes_t) { .borrowed = 0, .len = 0, .ptr = NULL }; -} - -PRESERVES_INLINE int preserves_resize_bytes(preserves_bytes_t *bs, size_t size) { - if (bs->borrowed) abort(); - - if (size == 0) { - free(bs->ptr); - bs->ptr = NULL; - bs->len = 0; - return 0; - } - - void *ptr = realloc(bs->ptr, size); - if (ptr == NULL) return -1; - - bs->ptr = ptr; - if (size > bs->len) { - memset(((uint8_t *) bs->ptr) + bs->len, 0, size - bs->len); - } - bs->len = size; - return 0; -} - -PRESERVES_INLINE void preserves_free_bytes(preserves_bytes_t *bs) { - if (!bs->borrowed) preserves_resize_bytes(bs, 0); - *bs = (preserves_bytes_t) { .borrowed = 0, .len = 0, .ptr = NULL }; -} - -PRESERVES_INLINE void preserves_bytes_move(preserves_bytes_t *dest, preserves_bytes_t *src) { - preserves_free_bytes(dest); - *dest = *src; - *src = preserves_create_bytes(); -} - -PRESERVES_INLINE int preserves_extend_bytes(preserves_bytes_t *dest, preserves_bytes_t src) { - if (dest->borrowed) abort(); - - void *ptr = realloc(dest->ptr, dest->len + src.len); - if (ptr == NULL) return -1; - - dest->ptr = ptr; - memcpy(((uint8_t *) dest->ptr) + dest->len, src.ptr, src.len); - dest->len += src.len; - return 0; -} - -PRESERVES_INLINE preserves_bytes_t preserves_bytes_subsequence(preserves_bytes_t *bs, - size_t offset, - size_t len) { - if (offset >= bs->len) return preserves_create_bytes(); - if (len > bs->len) return preserves_create_bytes(); - if (offset > (bs->len - len)) len = bs->len - offset; - return (preserves_bytes_t) { .borrowed = 1, .len = len, .ptr = ((uint8_t *) bs->ptr) + offset }; -} - -/////////////////////////////////////////////////////////////////////////// -// Memory arenas - -typedef struct preserves_pool { - size_t pagesize; - preserves_bytes_t page_pointers; // for allocations smaller than pagesize - preserves_bytes_t large_block_pointers; // for allocations larger than or equal to pagesize - size_t next_page; - uint8_t *alloc_block_base; - size_t alloc_block_used; -} preserves_pool_t; - -PRESERVES_INLINE preserves_pool_t preserves_create_pool(size_t pagesize) { - return (preserves_pool_t) { - .pagesize = pagesize, - .page_pointers = preserves_create_bytes(), - .large_block_pointers = preserves_create_bytes(), - .next_page = 0, - .alloc_block_base = NULL, - .alloc_block_used = pagesize, - }; -} - -PRESERVES_INLINE void preserves_free_blocklist(preserves_bytes_t *bl) { - PRESERVES_ARRAY_ACCESS(bl, void *, num_blocks, blocklist); - for (size_t i = 0; i < num_blocks; i++) { - free(blocklist[i]); - } - preserves_free_bytes(bl); -} - -PRESERVES_INLINE void preserves_recycle_pool(preserves_pool_t *pool) { - preserves_free_blocklist(&pool->large_block_pointers); - pool->next_page = 0; - pool->alloc_block_base = NULL; - pool->alloc_block_used = pool->pagesize; -} - -PRESERVES_INLINE void preserves_free_pool(preserves_pool_t *pool) { - preserves_recycle_pool(pool); - preserves_free_blocklist(&pool->page_pointers); -} - -PRESERVES_OUTOFLINE(void *_preserves_pool_record_block(preserves_bytes_t *bl, size_t blocksize), { - void *ptr = calloc(1, blocksize); - if (ptr == NULL) return NULL; - if (preserves_resize_bytes(bl, bl->len + sizeof(void *)) == -1) { - free(ptr); - return NULL; - } - PRESERVES_ARRAY_ACCESS(bl, void *, num_blocks, blocklist); - blocklist[num_blocks - 1] = ptr; - return ptr; - }); - -PRESERVES_OUTOFLINE -( - int _preserves_pool_add_page_and_alloc(preserves_pool_t *pool, preserves_bytes_t *bs, size_t count), { - if (pool->next_page >= PRESERVES_ARRAY_ELEMENT_COUNT(&pool->page_pointers, void *)) { - void *ptr = _preserves_pool_record_block(&pool->page_pointers, pool->pagesize); - if (ptr == NULL) return -1; - pool->alloc_block_base = ptr; - pool->next_page = PRESERVES_ARRAY_ELEMENT_COUNT(&pool->page_pointers, void *); - } else { - pool->alloc_block_base = PRESERVES_ARRAY_ELEMENT(&pool->page_pointers, void *, pool->next_page); - pool->next_page++; - } - - pool->alloc_block_used = count; - *bs = (preserves_bytes_t) { .borrowed = 1, .len = count, .ptr = pool->alloc_block_base }; - return 0; - }); - -PRESERVES_INLINE int preserves_pool_alloc_bytes_align(preserves_pool_t *pool, - preserves_bytes_t *bs, - size_t count, - size_t alignment) { - preserves_free_bytes(bs); - if (count == 0) return 0; - - count = (count + alignment - 1) & (~(alignment - 1)); - // ^ round up to nearest `alignment`-byte boundary - - if (count > pool->pagesize) { - void *ptr = _preserves_pool_record_block(&pool->large_block_pointers, count); - if (ptr == NULL) return -1; - *bs = (preserves_bytes_t) { .borrowed = 1, .len = count, .ptr = ptr }; - return 0; - } - - if (pool->alloc_block_used + count <= pool->pagesize) { - *bs = (preserves_bytes_t) { - .borrowed = 1, - .len = count, - .ptr = pool->alloc_block_base + pool->alloc_block_used, - }; - pool->alloc_block_used += count; - return 0; - } - - return _preserves_pool_add_page_and_alloc(pool, bs, count); -} - -PRESERVES_INLINE int preserves_pool_alloc_bytes(preserves_pool_t *pool, - preserves_bytes_t *bs, - size_t count) { - return preserves_pool_alloc_bytes_align(pool, bs, count, 16); -} - -/////////////////////////////////////////////////////////////////////////// -// Binary codec details - -typedef enum preserves_binary_format_tag { - PRESERVES_BINARY_FORMAT_TAG_FALSE = 0x80, - PRESERVES_BINARY_FORMAT_TAG_TRUE = 0x81, - PRESERVES_BINARY_FORMAT_TAG_FLOAT = 0x82, - PRESERVES_BINARY_FORMAT_TAG_DOUBLE = 0x83, - PRESERVES_BINARY_FORMAT_TAG_END = 0x84, - PRESERVES_BINARY_FORMAT_TAG_ANNOTATION = 0x85, - PRESERVES_BINARY_FORMAT_TAG_EMBEDDED = 0x86, - - PRESERVES_BINARY_FORMAT_TAG_SMALL_INTEGER_LO = 0x90, - PRESERVES_BINARY_FORMAT_TAG_SMALL_INTEGER_HI = 0x9F, - - PRESERVES_BINARY_FORMAT_TAG_MEDIUM_INTEGER_LO = 0xA0, - PRESERVES_BINARY_FORMAT_TAG_MEDIUM_INTEGER_HI = 0xAF, - - PRESERVES_BINARY_FORMAT_TAG_LARGE_INTEGER = 0xB0, - PRESERVES_BINARY_FORMAT_TAG_STRING = 0xB1, - PRESERVES_BINARY_FORMAT_TAG_BYTE_STRING = 0xB2, - PRESERVES_BINARY_FORMAT_TAG_SYMBOL = 0xB3, - - PRESERVES_BINARY_FORMAT_TAG_RECORD = 0xB4, - PRESERVES_BINARY_FORMAT_TAG_SEQUENCE = 0xB5, - PRESERVES_BINARY_FORMAT_TAG_SET = 0xB6, - PRESERVES_BINARY_FORMAT_TAG_DICTIONARY = 0xB7, -} preserves_binary_format_tag_t; - -PRESERVES_OUTOFLINE -( - char const *preserves_binary_format_tag_name(preserves_binary_format_tag_t tag), { - switch (tag) { - case PRESERVES_BINARY_FORMAT_TAG_FALSE: return "FALSE"; - case PRESERVES_BINARY_FORMAT_TAG_TRUE: return "TRUE"; - case PRESERVES_BINARY_FORMAT_TAG_FLOAT: return "FLOAT"; - case PRESERVES_BINARY_FORMAT_TAG_DOUBLE: return "DOUBLE"; - case PRESERVES_BINARY_FORMAT_TAG_END: return "END"; - case PRESERVES_BINARY_FORMAT_TAG_ANNOTATION: return "ANNOTATION"; - case PRESERVES_BINARY_FORMAT_TAG_EMBEDDED: return "EMBEDDED"; - case PRESERVES_BINARY_FORMAT_TAG_LARGE_INTEGER: return "LARGE_INTEGER"; - case PRESERVES_BINARY_FORMAT_TAG_STRING: return "STRING"; - case PRESERVES_BINARY_FORMAT_TAG_BYTE_STRING: return "BYTE_STRING"; - case PRESERVES_BINARY_FORMAT_TAG_SYMBOL: return "SYMBOL"; - case PRESERVES_BINARY_FORMAT_TAG_RECORD: return "RECORD"; - case PRESERVES_BINARY_FORMAT_TAG_SEQUENCE: return "SEQUENCE"; - case PRESERVES_BINARY_FORMAT_TAG_SET: return "SET"; - case PRESERVES_BINARY_FORMAT_TAG_DICTIONARY: return "DICTIONARY"; - default: - if ((tag >= PRESERVES_BINARY_FORMAT_TAG_SMALL_INTEGER_LO) && - (tag <= PRESERVES_BINARY_FORMAT_TAG_SMALL_INTEGER_HI)) { - return "SMALL_INTEGER"; - } else if ((tag >= PRESERVES_BINARY_FORMAT_TAG_MEDIUM_INTEGER_LO) && - (tag <= PRESERVES_BINARY_FORMAT_TAG_MEDIUM_INTEGER_HI)) { - return "MEDIUM_INTEGER"; - } else { - return "UNKNOWN"; - } - } - }); - -/////////////////////////////////////////////////////////////////////////// -// Index representation - -typedef enum preserves_type_tag { - PRESERVES_BOOLEAN = 0, - PRESERVES_FLOAT, - PRESERVES_DOUBLE, - - PRESERVES_SIGNED_INTEGER, - PRESERVES_STRING, - PRESERVES_BYTE_STRING, - PRESERVES_COMPACT, - PRESERVES_SYMBOL, - - PRESERVES_RECORD, - PRESERVES_SEQUENCE, - PRESERVES_SET, - PRESERVES_DICTIONARY, - - PRESERVES_EMBEDDED, - PRESERVES_ANNOTATION, - PRESERVES_END_MARKER, -} preserves_type_tag_t; - -PRESERVES_OUTOFLINE(char const *preserves_type_tag_name(preserves_type_tag_t type), { - switch (type) { - case PRESERVES_BOOLEAN: return "BOOLEAN"; - case PRESERVES_FLOAT: return "FLOAT"; - case PRESERVES_DOUBLE: return "DOUBLE"; - case PRESERVES_SIGNED_INTEGER: return "SIGNED_INTEGER"; - case PRESERVES_STRING: return "STRING"; - case PRESERVES_BYTE_STRING: return "BYTE_STRING"; - case PRESERVES_COMPACT: return "COMPACT"; - case PRESERVES_SYMBOL: return "SYMBOL"; - case PRESERVES_RECORD: return "RECORD"; - case PRESERVES_SEQUENCE: return "SEQUENCE"; - case PRESERVES_SET: return "SET"; - case PRESERVES_DICTIONARY: return "DICTIONARY"; - case PRESERVES_EMBEDDED: return "EMBEDDED"; - case PRESERVES_ANNOTATION: return "ANNOTATION"; - case PRESERVES_END_MARKER: return "END_MARKER"; - default: return "UNKNOWN"; - } - }); - -typedef enum preserves_error_code { - PRESERVES_END_SYSTEM_ERROR = -2, - PRESERVES_END_NO_ERROR = -1, - - PRESERVES_END_EOF = 0, - PRESERVES_END_MORE_INPUT_REMAINING, - PRESERVES_END_INCOMPLETE_INPUT, - PRESERVES_END_UNEXPECTED_END, - PRESERVES_END_DICTIONARY_MISSING_VALUE, - PRESERVES_END_RECORD_MISSING_LABEL, - PRESERVES_END_VARINT_TOO_BIG, - PRESERVES_END_INVALID_UTF8, - PRESERVES_END_INVALID_TAG, -} preserves_error_code_t; - -PRESERVES_OUTOFLINE(char const *preserves_error_code_name(preserves_error_code_t code), { - switch (code) { - case PRESERVES_END_SYSTEM_ERROR: return "SYSTEM_ERROR"; - case PRESERVES_END_NO_ERROR: return "NO_ERROR"; - case PRESERVES_END_EOF: return "EOF"; - case PRESERVES_END_MORE_INPUT_REMAINING: return "MORE_INPUT_REMAINING"; - case PRESERVES_END_INCOMPLETE_INPUT: return "INCOMPLETE_INPUT"; - case PRESERVES_END_UNEXPECTED_END: return "UNEXPECTED_END"; - case PRESERVES_END_DICTIONARY_MISSING_VALUE: return "DICTIONARY_MISSING_VALUE"; - case PRESERVES_END_RECORD_MISSING_LABEL: return "RECORD_MISSING_LABEL"; - case PRESERVES_END_VARINT_TOO_BIG: return "VARINT_TOO_BIG"; - case PRESERVES_END_INVALID_UTF8: return "INVALID_UTF8"; - case PRESERVES_END_INVALID_TAG: return "INVALID_TAG"; - default: return "UNKNOWN"; - } - }); - -typedef enum preserves_index_entry_representation { - PRESERVES_REPR_NONE = 0, - PRESERVES_INT_SIGNED, - PRESERVES_INT_UNSIGNED, - PRESERVES_INT_LARGE_BINARY, - PRESERVES_INT_LARGE_TEXT, - PRESERVES_LITERAL, - PRESERVES_ESCAPED, - PRESERVES_HEX, - PRESERVES_BASE64, -} preserves_index_entry_representation_t; - -PRESERVES_OUTOFLINE -( - char const *preserves_index_entry_representation_name(preserves_index_entry_representation_t repr), { - switch (repr) { - case PRESERVES_REPR_NONE: return "REPR_NONE"; - case PRESERVES_INT_SIGNED: return "INT_SIGNED"; - case PRESERVES_INT_UNSIGNED: return "INT_UNSIGNED"; - case PRESERVES_INT_LARGE_BINARY: return "INT_LARGE_BINARY"; - case PRESERVES_INT_LARGE_TEXT: return "INT_LARGE_TEXT"; - case PRESERVES_LITERAL: return "LITERAL"; - case PRESERVES_ESCAPED: return "ESCAPED"; - case PRESERVES_HEX: return "HEX"; - case PRESERVES_BASE64: return "BASE64"; - default: return "UNKNOWN"; - } - }); - -/* - PRESERVES_BOOLEAN: repr==PRESERVES_REPR_NONE, len=0, data._boolean - PRESERVES_FLOAT: repr=PRESERVES_REPR_NONE, len=0, data._float - PRESERVES_DOUBLE: repr=PRESERVES_REPR_NONE, len=0, data._double - - PRESERVES_SIGNED_INTEGER: - - repr==PRESERVES_INT_SIGNED -> len=0, data._signed - - repr==PRESERVES_INT_UNSIGNED -> len=0, data._unsigned - - repr==PRESERVES_INT_LARGE_BINARY -> len, data._unsigned as absolute offset within input - - repr==PRESERVES_INT_LARGE_TEXT -> len, data._unsigned as absolute offset within input - PRESERVES_STRING: - - repr=PRESERVES_LITERAL -> len, data._unsigned as absolute offset within input to utf-8 bytes - - repr=PRESERVES_ESCAPED -> len, data._unsigned as absolute offset within input to utf-8 bytes - that need String-style backslash-escapes interpreted - PRESERVES_BYTE_STRING: - - repr=PRESERVES_LITERAL -> len, data._unsigned as absolute offset within input to utf-8 bytes - - repr=PRESERVES_ESCAPED -> len, data._unsigned as absolute offset within input to utf-8 bytes - that need ByteString-style backslash-escapes interpreted - - repr=PRESERVES_HEX -> len, data._unsigned as absolute offset within input to ASCII bytes of hex - - repr=PRESERVES_BASE64 -> len, data._unsigned as absolute offset within input to ASCII bytes of base64 - PRESERVES_COMPACT: - - repr as for BYTE_STRING, but bytes denote a nested binary-encoded value. - PRESERVES_SYMBOL: - - repr=PRESERVES_LITERAL -> len, data._unsigned as absolute offset within input to utf-8 bytes - - repr=PRESERVES_ESCAPED -> len, data._unsigned as absolute offset within input to utf-8 bytes - that need Symbol-style backslash-escapes interpreted - - PRESERVES_RECORD, PRESERVES_SEQUENCE, PRESERVES_SET, PRESERVES_DICTIONARY: - - repr==PRESERVES_REPR_NONE, - - len counts number of items: - - PRESERVES_RECORD -> number of fields plus one (for the label) - - PRESERVES_SEQUENCE -> number of items - - PRESERVES_SET -> number of items - - PRESERVES_DICTIONARY -> twice the number of key-value pairs - - data._unsigned as relative offset within index to next item, - starting from this entry; zero means "no end known" - - PRESERVES_EMBEDDED: repr==PRESERVES_REPR_NONE, len==0, following item is the embedded value - PRESERVES_ANNOTATION: - - repr=PRESERVES_REPR_NONE, - - len counts number of annotations, - - data._unsigned as relative offset within index to annotated - item, starting from this entry; zero means "no end known" - - the annotated item will not be a PRESERVES_ANNOTATION - - PRESERVES_END_MARKER: repr==PRESERVES_REPR_NONE, len==0, data._err - */ -typedef struct preserves_index_entry { - preserves_type_tag_t type:4; - preserves_index_entry_representation_t repr:4; - uint64_t len:56; - - union { - bool _boolean; - float _float; - double _double; - int64_t _signed; - uint64_t _unsigned; - preserves_error_code_t _err; - } data; -} preserves_index_entry_t; - -#ifndef NDEBUG -extern void preserves_dump_index_entry(FILE* f, preserves_bytes_t *input, preserves_index_entry_t *i, bool add_newline); -#endif - -typedef struct preserves_reader { - preserves_bytes_t input; - preserves_bytes_t index; - preserves_bytes_t stack; - size_t stack_top; /* ascending empty */ - size_t input_pos; /* ascending full */ - size_t index_pos; /* ascending empty */ - bool annotation_tag_seen; -} preserves_reader_t; - -typedef struct preserves_reader_result { - preserves_index_entry_t *index; - preserves_index_entry_t *end_marker; -} preserves_reader_result_t; - -PRESERVES_INLINE preserves_reader_result_t preserves_reader_error_result(void) { - return (preserves_reader_result_t) { .index = NULL, .end_marker = NULL }; -} - -PRESERVES_INLINE preserves_reader_t preserves_create_reader(void) { - return (preserves_reader_t) { - .input = preserves_create_bytes(), - .index = preserves_create_bytes(), - .stack = preserves_create_bytes(), - .stack_top = 0, - .input_pos = 0, - .index_pos = 0, - .annotation_tag_seen = false, - }; -} - -PRESERVES_OUTOFLINE(void preserves_free_reader(preserves_reader_t *r), { - preserves_free_bytes(&r->input); - preserves_free_bytes(&r->index); - preserves_free_bytes(&r->stack); - r->stack_top = 0; - r->input_pos = 0; - r->index_pos = 0; - r->annotation_tag_seen = false; - }); - -PRESERVES_IMPLEMENTATION_CHUNK -( -#define MINIMUM_PRESERVES_READER_STACK_SIZE 32 - typedef uint64_t preserves_index_offset_t; - - static inline bool _preserves_reader_ateof(preserves_reader_t *r) { - return (r->input_pos >= r->input.len); - } - - static inline int _preserves_reader_peek(preserves_reader_t *r) { - if (_preserves_reader_ateof(r)) return -1; - return PRESERVES_ARRAY_ELEMENT(&r->input, uint8_t, r->input_pos); - } - - static inline int _preserves_reader_next(preserves_reader_t *r) { - if (r->input_pos >= r->input.len) return -1; - int result = PRESERVES_ARRAY_ELEMENT(&r->input, uint8_t, r->input_pos); - r->input_pos++; - return result; - } - - static inline void *_preserves_reader_next_bytes(preserves_reader_t *r, size_t count) { - preserves_bytes_t bs = preserves_bytes_subsequence(&r->input, r->input_pos, count); - if (bs.len != count) return NULL; - r->input_pos += count; - return bs.ptr; - } - - static inline preserves_index_entry_t *_preserves_reader_index_entry(preserves_reader_t *r, - size_t i) { - size_t limit = PRESERVES_ARRAY_ELEMENT_COUNT(&r->index, preserves_index_entry_t); - while (i >= limit) { - limit = limit * 2; - if (limit < 16) limit = 16; - if (PRESERVES_RESIZE_ARRAY(&r->index, preserves_index_entry_t, limit) == -1) { - return NULL; - } - } - return &PRESERVES_ARRAY_ELEMENT(&r->index, preserves_index_entry_t, i); - } - - static inline size_t _preserves_reader_stack_peek(preserves_reader_t *r) { - if (r->stack_top >= PRESERVES_ARRAY_ELEMENT_COUNT(&r->stack, size_t)) { - abort(); - } - return PRESERVES_ARRAY_ELEMENT(&r->stack, size_t, r->stack_top - 1); - } - - static inline preserves_index_entry_t *_preserves_reader_stack_top_entry(preserves_reader_t *r) { - return _preserves_reader_index_entry(r, _preserves_reader_stack_peek(r)); - } - - static inline void _preserves_reader_stack_drop(preserves_reader_t *r) { - if (r->stack_top == 0) abort(); - /* printf("popping "); */ - /* preserves_dump_index_entry(stdout, &r->input, _preserves_reader_stack_top_entry(r), true); */ - r->stack_top--; - } - - static inline preserves_index_entry_t *_preserves_reader_finish_seq(preserves_reader_t *r) { - size_t base_index = _preserves_reader_stack_peek(r); - preserves_index_entry_t *base = _preserves_reader_stack_top_entry(r); - base->data._unsigned = r->index_pos - base_index; - _preserves_reader_stack_drop(r); - return base; - } - - static inline bool _preserves_reader_in_annotations(preserves_reader_t *r) { - return (r->stack_top > 0) && - (_preserves_reader_stack_top_entry(r)->type == PRESERVES_ANNOTATION); - } - - static inline void _preserves_reader_inc_collection_len(preserves_reader_t *r, size_t *count_ptr) { - if (r->stack_top > 0) { - check_for_embedded: - preserves_index_entry_t *base = _preserves_reader_stack_top_entry(r); - if (base->type == PRESERVES_EMBEDDED) { - _preserves_reader_stack_drop(r); - goto check_for_embedded; - } else { - base->len++; - } - /* printf("added to base, which is now "); */ - /* preserves_dump_index_entry(stdout, &r->input, base, true); */ - } else { - (*count_ptr)--; - } - } - - static inline preserves_index_entry_t *_preserves_reader_emit_entry(preserves_reader_t *r, - size_t *count_ptr, - preserves_index_entry_t e) { - if (!r->annotation_tag_seen && _preserves_reader_in_annotations(r)) { - /* printf("(popping annotation collector)\n"); */ - _preserves_reader_finish_seq(r); - } - if (count_ptr != NULL) { - _preserves_reader_inc_collection_len(r, count_ptr); - } - - /* printf("-- emitting: "); */ - /* preserves_dump_index_entry(stdout, &r->input, &e, true); */ - - preserves_index_entry_t *ix = _preserves_reader_index_entry(r, r->index_pos); - if (ix == NULL) return NULL; - *ix = e; - r->index_pos++; - - r->annotation_tag_seen = false; - - return ix; - } - - static inline preserves_reader_result_t _preserves_reader_finish(preserves_reader_t *r, - preserves_error_code_t code) { - if (code == PRESERVES_END_SYSTEM_ERROR) { - return preserves_reader_error_result(); - } else { - preserves_index_entry_t *index = _preserves_reader_index_entry(r, 0); - if (index == NULL) return preserves_reader_error_result(); - preserves_index_entry_t *end_marker = - _preserves_reader_emit_entry(r, NULL, (preserves_index_entry_t) { - .type = PRESERVES_END_MARKER, - .repr = PRESERVES_REPR_NONE, - .len = 0, - .data = { ._err = code }, - }); - if (end_marker == NULL) return preserves_reader_error_result(); - return (preserves_reader_result_t) { .index = index, .end_marker = end_marker }; - } - } - - static inline int _preserves_reader_varint(preserves_reader_t *r, size_t *v) { - unsigned int shift_amount = 0; - size_t result = 0; - while (true) { - int b = _preserves_reader_next(r); - if (b == -1) return -1; - result |= (b & 0x7f) << shift_amount; - if (b & 0x80) { - shift_amount += 7; - if (shift_amount > ((sizeof(size_t) * 8) - 7)) return -2; - } else { - *v = result; - return 0; - } - } - } - - static inline preserves_index_entry_t *_preserves_emit_small_int(preserves_reader_t *r, - size_t *count_ptr, - bool is_unsigned, - int64_t value) { - return _preserves_reader_emit_entry(r, count_ptr, (preserves_index_entry_t) { - .type = PRESERVES_SIGNED_INTEGER, - .repr = is_unsigned ? PRESERVES_INT_UNSIGNED : PRESERVES_INT_SIGNED, - .len = 0, - .data = { ._signed = value }, - }); - } - - static inline int _preserves_reader_decode_intbytes(preserves_reader_t *r, - size_t *count_ptr, - size_t len) { - size_t starting_pos = r->input_pos; - uint8_t *bs = _preserves_reader_next_bytes(r, len); - if (bs == NULL) return -1; - - bool is_unsigned = false; - size_t remaining = len; - while ((remaining > 0) && (*bs == 0)) { - is_unsigned = true; - bs++; - remaining--; - } - - if (remaining == 0) { - // This shouldn't happen, but it does have a denotation. - return (_preserves_emit_small_int(r, count_ptr, is_unsigned, 0) == NULL) ? -1 : 0; - } - - if (remaining > 8) { - if (is_unsigned && (*bs & 0x80)) { - remaining++; - bs--; - } - return (_preserves_reader_emit_entry(r, count_ptr, (preserves_index_entry_t) { - .type = PRESERVES_SIGNED_INTEGER, - .repr = PRESERVES_INT_LARGE_BINARY, - .len = remaining, - .data = { ._unsigned = starting_pos + (len - remaining) }, - }) == NULL) ? -1 : 0; - } - - uint64_t buf = 0; - while (remaining > 0) { - remaining--; - buf = buf | ((*bs) << (remaining << 3)); - bs++; - } - - int64_t value = *(int64_t *)&buf; - return (_preserves_emit_small_int(r, count_ptr, is_unsigned, value) == NULL) ? -1 : 0; - } - - static inline bool utf8_tail(uint8_t b) { - return (b >= 0x80 && b <= 0xbf); - } - - static inline int check_utf8(uint8_t *bs, size_t len) { - // https://datatracker.ietf.org/doc/html/rfc3629#section-4 - while (len > 0) { - uint8_t b0 = *bs++; - len--; - if (b0 >= 0x80) { - if (len < 1) return -1; - uint8_t b1 = *bs++; - len--; - if (b0 >= 0xc2 && b0 <= 0xdf) { - if (!utf8_tail(b1)) return -1; - } else { - if (len < 1) return -1; - uint8_t b2 = *bs++; - len--; - if (b0 == 0xe0) { - if (!(b1 >= 0xa0 && b1 <= 0xbf && utf8_tail(b2))) return -1; - } else if (b0 >= 0xe1 && b0 <= 0xec) { - if (!(utf8_tail(b1) && utf8_tail(b2))) return -1; - } else if (b0 == 0xed) { - if (!(b1 >= 0x80 && b1 <= 0x9f && utf8_tail(b2))) return -1; - } else if (b0 >= 0xee && b0 <= 0xef) { - if (!(utf8_tail(b1) && utf8_tail(b2))) return -1; - } else { - if (len < 1) return -1; - uint8_t b3 = *bs++; - len--; - if (b0 == 0xf0) { - if (!(b1 >= 0x90 && b1 <= 0xbf && utf8_tail(b2) && utf8_tail(b3))) return -1; - } else if (b0 >= 0xf1 && b0 <= 0xf3) { - if (!(utf8_tail(b1) && utf8_tail(b2) && utf8_tail(b3))) return -1; - } else if (b0 == 0xf4) { - if (!(b1 >= 0x80 && b1 <= 0x8f && utf8_tail(b2) && utf8_tail(b3))) return -1; - } else { - // ok! - } - } - } - } - } - return 0; - } - - static inline preserves_error_code_t _preserves_reader_read_stringlike(preserves_reader_t *r, - size_t *count_ptr, - preserves_type_tag_t type, - bool should_check_utf8) { - size_t len = 0; - switch (_preserves_reader_varint(r, &len)) { - case -1: return PRESERVES_END_INCOMPLETE_INPUT; - case -2: return PRESERVES_END_VARINT_TOO_BIG; - default: break; - } - size_t starting_pos = r->input_pos; - uint8_t *maybe_utf = _preserves_reader_next_bytes(r, len); - if (should_check_utf8 && (check_utf8(maybe_utf, len) == -1)) { - return PRESERVES_END_INVALID_UTF8; - } - if (_preserves_reader_emit_entry(r, count_ptr, (preserves_index_entry_t) { - .type = type, - .repr = PRESERVES_LITERAL, - .len = len, - .data = { ._unsigned = starting_pos }, - }) == NULL) { - return PRESERVES_END_SYSTEM_ERROR; - } - return PRESERVES_END_NO_ERROR; - } - - static inline preserves_index_entry_t *_preserves_reader_push(preserves_reader_t *r, - preserves_type_tag_t type) { - preserves_index_entry_t *ix = _preserves_reader_emit_entry(r, NULL, (preserves_index_entry_t) { - .type = type, .repr = PRESERVES_REPR_NONE, .len = 0, .data = { ._unsigned = 0 }}); - if (ix == NULL) return NULL; - - size_t limit = PRESERVES_ARRAY_ELEMENT_COUNT(&r->stack, size_t); - if (r->stack_top >= limit) { - limit += 32; - if (PRESERVES_RESIZE_ARRAY(&r->stack, size_t, limit) == -1) return NULL; - } - PRESERVES_ARRAY_ELEMENT(&r->stack, size_t, r->stack_top) = r->index_pos - 1; - r->stack_top++; - return ix; - } -) - -PRESERVES_INLINE preserves_index_entry_t *preserves_skip_annotations(preserves_index_entry_t *ix) { - if (ix == NULL) return NULL; - if (ix->type != PRESERVES_ANNOTATION) return ix; - ix += ix->data._unsigned; - if (ix->type == PRESERVES_ANNOTATION) abort(); - return ix; -} - -#define RETURN_ON_FAIL(e) if ((e) == NULL) return preserves_reader_error_result() -PRESERVES_OUTOFLINE -( - preserves_reader_result_t preserves_read_binary_continue(preserves_reader_t *r, size_t count), { - while (count) { - /* for (int i = r->stack_top - 1; i >= 0; i--) { */ - /* size_t ip = PRESERVES_ARRAY_ELEMENT(&r->stack, size_t, i); */ - /* printf(" %02d: (%5lu) ", i, ip); */ - /* preserves_dump_index_entry(stdout, &r->input, _preserves_reader_index_entry(r, ip), true); */ - /* } */ - /* printf("pos %lu (%05lx), count %lu, annotation tag seen %d: ", */ - /* r->input_pos, */ - /* r->input_pos, */ - /* count, */ - /* r->annotation_tag_seen); */ - int b = _preserves_reader_next(r); - /* printf("tag 0x%02x %s\n", b, preserves_binary_format_tag_name(b)); */ - if (b == -1) return _preserves_reader_finish(r, PRESERVES_END_INCOMPLETE_INPUT); - - switch (b) { - case PRESERVES_BINARY_FORMAT_TAG_FALSE: - RETURN_ON_FAIL(_preserves_reader_emit_entry(r, &count, (preserves_index_entry_t) { - .type = PRESERVES_BOOLEAN, .repr = PRESERVES_REPR_NONE, .len = 0, .data = { - ._boolean = false - }})); - break; - - case PRESERVES_BINARY_FORMAT_TAG_TRUE: - RETURN_ON_FAIL(_preserves_reader_emit_entry(r, &count, (preserves_index_entry_t) { - .type = PRESERVES_BOOLEAN, .repr = PRESERVES_REPR_NONE, .len = 0, .data = { - ._boolean = true - }})); - break; - - case PRESERVES_BINARY_FORMAT_TAG_FLOAT: { - uint8_t *bs = _preserves_reader_next_bytes(r, 4); - if (bs == NULL) return _preserves_reader_finish(r, PRESERVES_END_INCOMPLETE_INPUT); - uint32_t i; - memcpy(&i, bs, 4); - i = ntohl(i); - float f; - memcpy(&f, &i, 4); - RETURN_ON_FAIL(_preserves_reader_emit_entry(r, &count, (preserves_index_entry_t) { - .type = PRESERVES_FLOAT, .repr = PRESERVES_REPR_NONE, .len = 0, .data = { - ._float = f - }})); - break; - } - - case PRESERVES_BINARY_FORMAT_TAG_DOUBLE: { - uint8_t *bs = _preserves_reader_next_bytes(r, 8); - if (bs == NULL) return _preserves_reader_finish(r, PRESERVES_END_INCOMPLETE_INPUT); - uint32_t lo, hi; - memcpy(&hi, bs, 4); - memcpy(&lo, bs + 4, 4); - lo = ntohl(lo); - hi = ntohl(hi); - uint64_t i = (((uint64_t) hi) << 32) | ((uint64_t) lo); - double f; - memcpy(&f, &i, 8); - RETURN_ON_FAIL(_preserves_reader_emit_entry(r, &count, (preserves_index_entry_t) { - .type = PRESERVES_DOUBLE, .repr = PRESERVES_REPR_NONE, .len = 0, .data = { - ._double = f - }})); - break; - } - - case PRESERVES_BINARY_FORMAT_TAG_END: - if (r->stack_top == 0) { - return _preserves_reader_finish(r, PRESERVES_END_UNEXPECTED_END); - } - preserves_index_entry_t *base = _preserves_reader_finish_seq(r); - _preserves_reader_inc_collection_len(r, &count); - if ((base->type == PRESERVES_DICTIONARY) && ((base->len % 2) != 0)) { - return _preserves_reader_finish(r, PRESERVES_END_DICTIONARY_MISSING_VALUE); - } - if ((base->type == PRESERVES_RECORD) && (base->len == 0)) { - return _preserves_reader_finish(r, PRESERVES_END_RECORD_MISSING_LABEL); - } - break; - - case PRESERVES_BINARY_FORMAT_TAG_ANNOTATION: - if (r->annotation_tag_seen || !_preserves_reader_in_annotations(r)) { - RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_ANNOTATION)); - } - r->annotation_tag_seen = true; - break; - - case PRESERVES_BINARY_FORMAT_TAG_EMBEDDED: - RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_EMBEDDED)); - break; - - case PRESERVES_BINARY_FORMAT_TAG_LARGE_INTEGER: { - size_t len = 0; - switch (_preserves_reader_varint(r, &len)) { - case -1: return _preserves_reader_finish(r, PRESERVES_END_INCOMPLETE_INPUT); - case -2: return _preserves_reader_finish(r, PRESERVES_END_VARINT_TOO_BIG); - default: break; - } - if (_preserves_reader_decode_intbytes(r, &count, len) == -1) { - return _preserves_reader_finish(r, PRESERVES_END_INCOMPLETE_INPUT); - } - break; - } - - case PRESERVES_BINARY_FORMAT_TAG_STRING: { - preserves_error_code_t code = - _preserves_reader_read_stringlike(r, &count, PRESERVES_STRING, true); - if (code != PRESERVES_END_NO_ERROR) return _preserves_reader_finish(r, code); - break; - } - - case PRESERVES_BINARY_FORMAT_TAG_BYTE_STRING: { - preserves_error_code_t code = - _preserves_reader_read_stringlike(r, &count, PRESERVES_BYTE_STRING, false); - if (code != PRESERVES_END_NO_ERROR) return _preserves_reader_finish(r, code); - break; - } - - case PRESERVES_BINARY_FORMAT_TAG_SYMBOL: { - preserves_error_code_t code = - _preserves_reader_read_stringlike(r, &count, PRESERVES_SYMBOL, true); - if (code != PRESERVES_END_NO_ERROR) return _preserves_reader_finish(r, code); - break; - } - - case PRESERVES_BINARY_FORMAT_TAG_RECORD: - RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_RECORD)); - break; - - case PRESERVES_BINARY_FORMAT_TAG_SEQUENCE: - RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_SEQUENCE)); - break; - - case PRESERVES_BINARY_FORMAT_TAG_SET: - RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_SET)); - break; - - case PRESERVES_BINARY_FORMAT_TAG_DICTIONARY: - RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_DICTIONARY)); - break; - - default: - if ((b >= PRESERVES_BINARY_FORMAT_TAG_SMALL_INTEGER_LO) && - (b <= PRESERVES_BINARY_FORMAT_TAG_SMALL_INTEGER_HI)) { - int64_t value = b - PRESERVES_BINARY_FORMAT_TAG_SMALL_INTEGER_LO; - if (value > 12) value -= 16; - RETURN_ON_FAIL(_preserves_emit_small_int(r, &count, false, value)); - } else if ((b >= PRESERVES_BINARY_FORMAT_TAG_MEDIUM_INTEGER_LO) && - (b <= PRESERVES_BINARY_FORMAT_TAG_MEDIUM_INTEGER_HI)) { - size_t len = (b - PRESERVES_BINARY_FORMAT_TAG_MEDIUM_INTEGER_LO) + 1; - if (_preserves_reader_decode_intbytes(r, &count, len) == -1) { - return _preserves_reader_finish(r, PRESERVES_END_INCOMPLETE_INPUT); - } - } else { - return _preserves_reader_finish(r, PRESERVES_END_INVALID_TAG); - } - break; - } - } - - return _preserves_reader_finish(r, - (_preserves_reader_ateof(r) ? PRESERVES_END_EOF : - (r->stack_top > 0) ? PRESERVES_END_INCOMPLETE_INPUT : - PRESERVES_END_MORE_INPUT_REMAINING)); - } -); -#undef RETURN_ON_FAIL - -PRESERVES_OUTOFLINE -( - preserves_reader_result_t preserves_read_binary(preserves_reader_t *r, - preserves_bytes_t *input, - size_t count), { - { - size_t required_stack_bytes = MINIMUM_PRESERVES_READER_STACK_SIZE * sizeof(size_t); - if (r->stack.len < required_stack_bytes) { - if (preserves_resize_bytes(&r->stack, required_stack_bytes) == -1) { - return preserves_reader_error_result(); - } - } - } - - r->stack_top = 0; - r->input_pos = 0; - r->index_pos = 0; - r->annotation_tag_seen = false; - preserves_bytes_move(&r->input, input); - - return preserves_read_binary_continue(r, count); - } -); - -/////////////////////////////////////////////////////////////////////////// -// Debug utilities - -#ifndef NDEBUG - -PRESERVES_IMPLEMENTATION_CHUNK -( - static void preserves_dump_bytes(FILE *f, - preserves_bytes_t *data) { - fprintf(f, ">>>"); - for (size_t i = 0; i < data->len; i++) { - uint8_t c = PRESERVES_ARRAY_ELEMENT(data, uint8_t, i); - if (c < 0x20 || c >= 0x80) { - fprintf(f, "\\x%02x", c); - } else { - fprintf(f, "%c", c); - } - } - fprintf(f, "<<<"); - } - - void preserves_dump_index_entry(FILE *f, - preserves_bytes_t *input, - preserves_index_entry_t *i, - bool add_newline) { - fprintf(f, - "%s %s length %lu", - preserves_type_tag_name(i->type), - i->repr == PRESERVES_REPR_NONE ? "-" : preserves_index_entry_representation_name(i->repr), - (size_t) i->len); - switch (i->type) { - case PRESERVES_BOOLEAN: - fprintf(f, i->data._boolean ? " #t" : " #f"); - break; - - case PRESERVES_FLOAT: - fprintf(f, " %f", i->data._float); - break; - - case PRESERVES_DOUBLE: - fprintf(f, " %f", i->data._double); - break; - - case PRESERVES_STRING: - case PRESERVES_BYTE_STRING: - case PRESERVES_COMPACT: - case PRESERVES_SYMBOL: { - fprintf(f, " offset %lu ", i->data._unsigned); - preserves_bytes_t data = preserves_bytes_subsequence(input, i->data._unsigned, i->len); - preserves_dump_bytes(f, &data); - break; - } - - case PRESERVES_RECORD: - case PRESERVES_SEQUENCE: - case PRESERVES_SET: - case PRESERVES_DICTIONARY: - fprintf(f, " skip %lu", i->data._unsigned - 1); - break; - - case PRESERVES_EMBEDDED: - case PRESERVES_ANNOTATION: - break; - - case PRESERVES_END_MARKER: - fprintf(f, ": %s", preserves_error_code_name(i->data._err)); - break; - - case PRESERVES_SIGNED_INTEGER: - switch (i->repr) { - case PRESERVES_INT_SIGNED: - fprintf(f, ": %ld", i->data._signed); - break; - case PRESERVES_INT_UNSIGNED: - default: - fprintf(f, ": %lu", i->data._unsigned); - break; - case PRESERVES_INT_LARGE_BINARY: - case PRESERVES_INT_LARGE_TEXT: { - fprintf(f, " offset %lu ", i->data._unsigned); - preserves_bytes_t data = preserves_bytes_subsequence(input, i->data._unsigned, i->len); - preserves_dump_bytes(f, &data); - break; - } - } - break; - - default: - fprintf(f, ": %lu (%ld)", i->data._unsigned, i->data._signed); - break; - } - - if (add_newline) { - fprintf(f, "\n"); - } - } -) - -#endif - -/////////////////////////////////////////////////////////////////////////// - -#undef PRESERVES_INLINE -#undef PRESERVES_IMPLEMENTATION_CHUNK -#undef PRESERVES_OUTOFLINE - -#endif diff --git a/preserves-binary.md b/preserves-binary.md index 1dd9dca..f188d60 100644 --- a/preserves-binary.md +++ b/preserves-binary.md @@ -53,6 +53,8 @@ defined recursively as follows: big-endian, unlike [LEB128][] encoding ([as used by Google][google-varint] in protobufs). +We write `len(|r|)` for the varint-encoding of the length of `Repr` `r`. + The following table illustrates varint-encoding. | Number, `m` | `m` in binary, grouped into 7-bit chunks | `len(m)` bytes | @@ -71,7 +73,8 @@ varint-encoding of `m` *MUST NOT* start with `0`. «[X_1...X_m]» = [0xA8] ++ seq(«X_1», ..., «X_m») «#{E_1...E_m}» = [0xA9] ++ seq(«E_1», ..., «E_m») «{K_1:V_1...K_m:V_m}» = [0xAA] ++ seq(«K_1», «V_1», ..., «K_m», «V_m») - where seq(R_1, ... R_m) = len(R_1) ++ R_1 ++...++ len(R_m) ++ R_m + + seq(R_1, ..., R_m) = len(|R_1|) ++ R_1 ++...++ len(|R_m|) ++ R_m There is *no* ordering requirement on the `E_i` elements or `K_i`/`V_i` pairs.[^no-sorting-rationale] They may appear in any @@ -172,7 +175,7 @@ represent the denoted object, prefixed with `[0xBF]`. To annotate a `Repr` `r` with some sequence of `Value`s `[v_1, ..., v_m]`, surround `r` as follows: - [0xBE] ++ len(r) ++ r ++ len(v_1) ++ v_1 ++...++ len(v_m) ++ v_m + [0xBE] ++ len(|r|) ++ r ++ len(|«v_1»|) ++ «v_1» ++...++ len(|«v_m»|) ++ «v_m» The `Repr` `r` *MUST NOT* already have annotations; that is, it must not begin with `0xBE`. @@ -180,7 +183,7 @@ For example, the `Repr` corresponding to textual syntax `@a@b[]`, i.e. an empty sequence annotated with two symbols, `a` and `b`, is «@a @b []» - = [0xBE] ++ len(«[]») ++ «[]» ++ len(«a») ++ «a» ++ len(«b») ++ «b» + = [0xBE] ++ len(|«[]»|) ++ «[]» ++ len(|«a»|) ++ «a» ++ len(|«b»|) ++ «b» = [0xBE, 0x81, 0xA8, 0x82, 0xA6, 0x61, 0x82, 0xA6, 0x62] ## Security Considerations @@ -230,7 +233,7 @@ undetermined number of `Value`s across, say, a TCP/IP connection: - If the binary syntax is to be used for the connection, start the connection with byte `0xA8` (sequence). After the initial byte, send - each value `v` as `len(«v») ++ «v»`. A side effect of this approach + each value `v` as `len(|«v»|) ++ «v»`. A side effect of this approach is that the entire stream, when complete, is a valid `Sequence` `Repr`.