From 9a718548c04907fb140938d80c246a9ad3c3e7d1 Mon Sep 17 00:00:00 2001 From: Tony Garnock-Jones Date: Fri, 14 Jan 2022 00:27:18 +0100 Subject: [PATCH] First steps toward a C library for Preserves --- implementations/c/.gitignore | 2 + implementations/c/Makefile | 8 + implementations/c/main.c | 63 ++ implementations/c/preserves.h | 1085 +++++++++++++++++++++++++++++++++ 4 files changed, 1158 insertions(+) create mode 100644 implementations/c/.gitignore create mode 100644 implementations/c/Makefile create mode 100644 implementations/c/main.c create mode 100644 implementations/c/preserves.h diff --git a/implementations/c/.gitignore b/implementations/c/.gitignore new file mode 100644 index 0000000..9be4083 --- /dev/null +++ b/implementations/c/.gitignore @@ -0,0 +1,2 @@ +m.output.txt +m diff --git a/implementations/c/Makefile b/implementations/c/Makefile new file mode 100644 index 0000000..08f1b99 --- /dev/null +++ b/implementations/c/Makefile @@ -0,0 +1,8 @@ +m: main.c preserves.h + gcc -Wall -Wextra -Werror -g3 -o $@ main.c + +go: m + cat ../../tests/samples.bin | ./m | tee m.output.txt + +clean: + rm -f m diff --git a/implementations/c/main.c b/implementations/c/main.c new file mode 100644 index 0000000..63e7969 --- /dev/null +++ b/implementations/c/main.c @@ -0,0 +1,63 @@ +#include +#include +#include +#include + +#define PRESERVES_IMPLEMENTATION +#include "preserves.h" + +int main(__attribute__ ((unused)) int argc, + __attribute__ ((unused)) char const * const argv[]) +{ + preserves_bytes_t input = preserves_create_bytes(); + + { + preserves_bytes_t chunk = preserves_create_bytes(); + if (preserves_resize_bytes(&chunk, 131072) == -1) { + perror("allocating chunk"); + return EXIT_FAILURE; + } + + while (true) { + size_t count = fread(chunk.ptr, 1, chunk.len, stdin); + if (count == 0) { + if (ferror(stdin)) { + perror("reading"); + return EXIT_FAILURE; + } + break; + } + if (preserves_extend_bytes(&input, preserves_bytes_subsequence(&chunk, 0, count)) == -1) { + 
perror("appending"); + return EXIT_FAILURE; + } + } + + preserves_free_bytes(&chunk); + } + + { + preserves_reader_t reader = preserves_create_reader(); + preserves_reader_result_t result = preserves_read_binary(&reader, &input, 1); + if (result.index == NULL) { + perror("parsing"); + return EXIT_FAILURE; + } + + printf("Size of index: %lu bytes; %lu entries\n", + reader.index_pos * sizeof(preserves_index_entry_t), + reader.index_pos); + + if (true) { + for (preserves_index_entry_t *i = result.index; i != result.end_marker; i++) { + preserves_dump_index_entry(stdout, &reader.input, i, true); + } + preserves_dump_index_entry(stdout, &reader.input, result.end_marker, true); + } + + preserves_free_reader(&reader); + } + + preserves_free_bytes(&input); + return EXIT_SUCCESS; +} diff --git a/implementations/c/preserves.h b/implementations/c/preserves.h new file mode 100644 index 0000000..c80d71a --- /dev/null +++ b/implementations/c/preserves.h @@ -0,0 +1,1085 @@ +/// SPDX-License-Identifier: Apache-2.0 +/// SPDX-FileCopyrightText: Copyright © 2022 Tony Garnock-Jones + +#ifndef libpreserves_26109214_f3bd_44c8_95ba_8c650c954965 +#define libpreserves_26109214_f3bd_44c8_95ba_8c650c954965 + +// Single file header. #define PRESERVES_IMPLEMENTATION to get the implementations. + +#ifdef PRESERVES_IMPLEMENTATION +#define PRESERVES_INLINE +#define PRESERVES_IMPLEMENTATION_CHUNK(...) __VA_ARGS__ +#else +#define PRESERVES_INLINE static inline +#define PRESERVES_IMPLEMENTATION_CHUNK(...) +#endif + +#define PRESERVES_OUTOFLINE(declaration, ...) \ + extern declaration; \ + PRESERVES_IMPLEMENTATION_CHUNK(inline declaration __VA_ARGS__) + +#include +#include +#include +#include +#include // for ntohl, htonl + +/////////////////////////////////////////////////////////////////////////// +// General-purpose fat pointer, for e.g. strings, binary blobs, etc. 
+ +typedef struct preserves_bytes { + bool borrowed:1; + size_t len:(sizeof(size_t) * 8 - 1); + void *ptr; +} preserves_bytes_t; + +#define PRESERVES_ARRAY_ELEMENT_COUNT(bytes_ptr, element_type) \ + ((bytes_ptr)->len / sizeof(element_type)) + +#define PRESERVES_ARRAY_ELEMENT(bytes_ptr, element_type, index) \ + (((element_type *) (bytes_ptr)->ptr)[index]) + +#define PRESERVES_RESIZE_ARRAY(bytes_ptr, element_type, size) \ + preserves_resize_bytes(bytes_ptr, sizeof(element_type) * (size)) + +#define PRESERVES_ARRAY_ACCESS(bytes_ptr, element_type, length_var, base_ptr_var) \ + size_t length_var = PRESERVES_ARRAY_ELEMENT_COUNT(bytes_ptr, element_type); \ + element_type *base_ptr_var = &PRESERVES_ARRAY_ELEMENT(bytes_ptr, element_type, 0) + +PRESERVES_INLINE preserves_bytes_t preserves_create_bytes(void) { + return (preserves_bytes_t) { .borrowed = 0, .len = 0, .ptr = NULL }; +} + +PRESERVES_INLINE int preserves_resize_bytes(preserves_bytes_t *bs, size_t size) { + if (bs->borrowed) abort(); + + if (size == 0) { + free(bs->ptr); + bs->ptr = NULL; + bs->len = 0; + return 0; + } + + void *ptr = realloc(bs->ptr, size); + if (ptr == NULL) return -1; + + bs->ptr = ptr; + if (size > bs->len) { + memset(((uint8_t *) bs->ptr) + bs->len, 0, size - bs->len); + } + bs->len = size; + return 0; +} + +PRESERVES_INLINE void preserves_free_bytes(preserves_bytes_t *bs) { + if (!bs->borrowed) preserves_resize_bytes(bs, 0); + *bs = (preserves_bytes_t) { .borrowed = 0, .len = 0, .ptr = NULL }; +} + +PRESERVES_INLINE void preserves_bytes_move(preserves_bytes_t *dest, preserves_bytes_t *src) { + preserves_free_bytes(dest); + *dest = *src; + *src = preserves_create_bytes(); +} + +PRESERVES_INLINE int preserves_extend_bytes(preserves_bytes_t *dest, preserves_bytes_t src) { + if (dest->borrowed) abort(); + + void *ptr = realloc(dest->ptr, dest->len + src.len); + if (ptr == NULL) return -1; + + dest->ptr = ptr; + memcpy(((uint8_t *) dest->ptr) + dest->len, src.ptr, src.len); + dest->len += src.len; 
+ return 0; +} + +PRESERVES_INLINE preserves_bytes_t preserves_bytes_subsequence(preserves_bytes_t *bs, + size_t offset, + size_t len) { + if (offset >= bs->len) return preserves_create_bytes(); + if (len > bs->len) return preserves_create_bytes(); + if (offset > (bs->len - len)) len = bs->len - offset; + return (preserves_bytes_t) { .borrowed = 1, .len = len, .ptr = ((uint8_t *) bs->ptr) + offset }; +} + +/////////////////////////////////////////////////////////////////////////// +// Memory arenas + +typedef struct preserves_pool { + size_t pagesize; + preserves_bytes_t page_pointers; // for allocations smaller than pagesize + preserves_bytes_t large_block_pointers; // for allocations larger than or equal to pagesize + size_t next_page; + uint8_t *alloc_block_base; + size_t alloc_block_used; +} preserves_pool_t; + +PRESERVES_INLINE preserves_pool_t preserves_create_pool(size_t pagesize) { + return (preserves_pool_t) { + .pagesize = pagesize, + .page_pointers = preserves_create_bytes(), + .large_block_pointers = preserves_create_bytes(), + .next_page = 0, + .alloc_block_base = NULL, + .alloc_block_used = pagesize, + }; +} + +PRESERVES_INLINE void preserves_free_blocklist(preserves_bytes_t *bl) { + PRESERVES_ARRAY_ACCESS(bl, void *, num_blocks, blocklist); + for (size_t i = 0; i < num_blocks; i++) { + free(blocklist[i]); + } + preserves_free_bytes(bl); +} + +PRESERVES_INLINE void preserves_recycle_pool(preserves_pool_t *pool) { + preserves_free_blocklist(&pool->large_block_pointers); + pool->next_page = 0; + pool->alloc_block_base = NULL; + pool->alloc_block_used = pool->pagesize; +} + +PRESERVES_INLINE void preserves_free_pool(preserves_pool_t *pool) { + preserves_recycle_pool(pool); + preserves_free_blocklist(&pool->page_pointers); +} + +PRESERVES_OUTOFLINE(void *_preserves_pool_record_block(preserves_bytes_t *bl, size_t blocksize), { + void *ptr = calloc(1, blocksize); + if (ptr == NULL) return NULL; + if (preserves_resize_bytes(bl, bl->len + sizeof(void *)) == 
-1) { + free(ptr); + return NULL; + } + PRESERVES_ARRAY_ACCESS(bl, void *, num_blocks, blocklist); + blocklist[num_blocks - 1] = ptr; + return ptr; + }); + +PRESERVES_OUTOFLINE +( + int _preserves_pool_add_page_and_alloc(preserves_pool_t *pool, preserves_bytes_t *bs, size_t count), { + if (pool->next_page >= PRESERVES_ARRAY_ELEMENT_COUNT(&pool->page_pointers, void *)) { + void *ptr = _preserves_pool_record_block(&pool->page_pointers, pool->pagesize); + if (ptr == NULL) return -1; + pool->alloc_block_base = ptr; + pool->next_page = PRESERVES_ARRAY_ELEMENT_COUNT(&pool->page_pointers, void *); + } else { + pool->alloc_block_base = PRESERVES_ARRAY_ELEMENT(&pool->page_pointers, void *, pool->next_page); + pool->next_page++; + } + + pool->alloc_block_used = count; + *bs = (preserves_bytes_t) { .borrowed = 1, .len = count, .ptr = pool->alloc_block_base }; + return 0; + }); + +PRESERVES_INLINE int preserves_pool_alloc_bytes_align(preserves_pool_t *pool, + preserves_bytes_t *bs, + size_t count, + size_t alignment) { + preserves_free_bytes(bs); + if (count == 0) return 0; + + count = (count + alignment - 1) & (~(alignment - 1)); + // ^ round up to nearest `alignment`-byte boundary + + if (count > pool->pagesize) { + void *ptr = _preserves_pool_record_block(&pool->large_block_pointers, count); + if (ptr == NULL) return -1; + *bs = (preserves_bytes_t) { .borrowed = 1, .len = count, .ptr = ptr }; + return 0; + } + + if (pool->alloc_block_used + count <= pool->pagesize) { + *bs = (preserves_bytes_t) { + .borrowed = 1, + .len = count, + .ptr = pool->alloc_block_base + pool->alloc_block_used, + }; + pool->alloc_block_used += count; + return 0; + } + + return _preserves_pool_add_page_and_alloc(pool, bs, count); +} + +PRESERVES_INLINE int preserves_pool_alloc_bytes(preserves_pool_t *pool, + preserves_bytes_t *bs, + size_t count) { + return preserves_pool_alloc_bytes_align(pool, bs, count, 16); +} + +/////////////////////////////////////////////////////////////////////////// +// 
Binary codec details + +typedef enum preserves_binary_format_tag { + PRESERVES_BINARY_FORMAT_TAG_FALSE = 0x80, + PRESERVES_BINARY_FORMAT_TAG_TRUE = 0x81, + PRESERVES_BINARY_FORMAT_TAG_FLOAT = 0x82, + PRESERVES_BINARY_FORMAT_TAG_DOUBLE = 0x83, + PRESERVES_BINARY_FORMAT_TAG_END = 0x84, + PRESERVES_BINARY_FORMAT_TAG_ANNOTATION = 0x85, + PRESERVES_BINARY_FORMAT_TAG_EMBEDDED = 0x86, + + PRESERVES_BINARY_FORMAT_TAG_SMALL_INTEGER_LO = 0x90, + PRESERVES_BINARY_FORMAT_TAG_SMALL_INTEGER_HI = 0x9F, + + PRESERVES_BINARY_FORMAT_TAG_MEDIUM_INTEGER_LO = 0xA0, + PRESERVES_BINARY_FORMAT_TAG_MEDIUM_INTEGER_HI = 0xAF, + + PRESERVES_BINARY_FORMAT_TAG_LARGE_INTEGER = 0xB0, + PRESERVES_BINARY_FORMAT_TAG_STRING = 0xB1, + PRESERVES_BINARY_FORMAT_TAG_BYTE_STRING = 0xB2, + PRESERVES_BINARY_FORMAT_TAG_SYMBOL = 0xB3, + + PRESERVES_BINARY_FORMAT_TAG_RECORD = 0xB4, + PRESERVES_BINARY_FORMAT_TAG_SEQUENCE = 0xB5, + PRESERVES_BINARY_FORMAT_TAG_SET = 0xB6, + PRESERVES_BINARY_FORMAT_TAG_DICTIONARY = 0xB7, +} preserves_binary_format_tag_t; + +PRESERVES_OUTOFLINE +( + char const *preserves_binary_format_tag_name(preserves_binary_format_tag_t tag), { + switch (tag) { + case PRESERVES_BINARY_FORMAT_TAG_FALSE: return "FALSE"; + case PRESERVES_BINARY_FORMAT_TAG_TRUE: return "TRUE"; + case PRESERVES_BINARY_FORMAT_TAG_FLOAT: return "FLOAT"; + case PRESERVES_BINARY_FORMAT_TAG_DOUBLE: return "DOUBLE"; + case PRESERVES_BINARY_FORMAT_TAG_END: return "END"; + case PRESERVES_BINARY_FORMAT_TAG_ANNOTATION: return "ANNOTATION"; + case PRESERVES_BINARY_FORMAT_TAG_EMBEDDED: return "EMBEDDED"; + case PRESERVES_BINARY_FORMAT_TAG_LARGE_INTEGER: return "LARGE_INTEGER"; + case PRESERVES_BINARY_FORMAT_TAG_STRING: return "STRING"; + case PRESERVES_BINARY_FORMAT_TAG_BYTE_STRING: return "BYTE_STRING"; + case PRESERVES_BINARY_FORMAT_TAG_SYMBOL: return "SYMBOL"; + case PRESERVES_BINARY_FORMAT_TAG_RECORD: return "RECORD"; + case PRESERVES_BINARY_FORMAT_TAG_SEQUENCE: return "SEQUENCE"; + case PRESERVES_BINARY_FORMAT_TAG_SET: 
return "SET"; + case PRESERVES_BINARY_FORMAT_TAG_DICTIONARY: return "DICTIONARY"; + default: + if ((tag >= PRESERVES_BINARY_FORMAT_TAG_SMALL_INTEGER_LO) && + (tag <= PRESERVES_BINARY_FORMAT_TAG_SMALL_INTEGER_HI)) { + return "SMALL_INTEGER"; + } else if ((tag >= PRESERVES_BINARY_FORMAT_TAG_MEDIUM_INTEGER_LO) && + (tag <= PRESERVES_BINARY_FORMAT_TAG_MEDIUM_INTEGER_HI)) { + return "MEDIUM_INTEGER"; + } else { + return "UNKNOWN"; + } + } + }); + +/////////////////////////////////////////////////////////////////////////// +// Index representation + +typedef enum preserves_type_tag { + PRESERVES_BOOLEAN = 0, + PRESERVES_FLOAT, + PRESERVES_DOUBLE, + + PRESERVES_SIGNED_INTEGER, + PRESERVES_STRING, + PRESERVES_BYTE_STRING, + PRESERVES_SYMBOL, + + PRESERVES_RECORD, + PRESERVES_SEQUENCE, + PRESERVES_SET, + PRESERVES_DICTIONARY, + + PRESERVES_EMBEDDED, + PRESERVES_ANNOTATION, + PRESERVES_END_MARKER, +} preserves_type_tag_t; + +PRESERVES_OUTOFLINE(char const *preserves_type_tag_name(preserves_type_tag_t type), { + switch (type) { + case PRESERVES_BOOLEAN: return "BOOLEAN"; + case PRESERVES_FLOAT: return "FLOAT"; + case PRESERVES_DOUBLE: return "DOUBLE"; + case PRESERVES_SIGNED_INTEGER: return "SIGNED_INTEGER"; + case PRESERVES_STRING: return "STRING"; + case PRESERVES_BYTE_STRING: return "BYTE_STRING"; + case PRESERVES_SYMBOL: return "SYMBOL"; + case PRESERVES_RECORD: return "RECORD"; + case PRESERVES_SEQUENCE: return "SEQUENCE"; + case PRESERVES_SET: return "SET"; + case PRESERVES_DICTIONARY: return "DICTIONARY"; + case PRESERVES_EMBEDDED: return "EMBEDDED"; + case PRESERVES_ANNOTATION: return "ANNOTATION"; + case PRESERVES_END_MARKER: return "END_MARKER"; + default: return "UNKNOWN"; + } + }); + +typedef enum preserves_error_code { + PRESERVES_END_SYSTEM_ERROR = -2, + PRESERVES_END_NO_ERROR = -1, + + PRESERVES_END_EOF = 0, + PRESERVES_END_MORE_INPUT_REMAINING, + PRESERVES_END_INCOMPLETE_INPUT, + PRESERVES_END_UNEXPECTED_END, + PRESERVES_END_DICTIONARY_MISSING_VALUE, + 
PRESERVES_END_RECORD_MISSING_LABEL, + PRESERVES_END_VARINT_TOO_BIG, + PRESERVES_END_INVALID_UTF8, + PRESERVES_END_INVALID_TAG, +} preserves_error_code_t; + +PRESERVES_OUTOFLINE(char const *preserves_error_code_name(preserves_error_code_t code), { + switch (code) { + case PRESERVES_END_SYSTEM_ERROR: return "SYSTEM_ERROR"; + case PRESERVES_END_NO_ERROR: return "NO_ERROR"; + case PRESERVES_END_EOF: return "EOF"; + case PRESERVES_END_MORE_INPUT_REMAINING: return "MORE_INPUT_REMAINING"; + case PRESERVES_END_INCOMPLETE_INPUT: return "INCOMPLETE_INPUT"; + case PRESERVES_END_UNEXPECTED_END: return "UNEXPECTED_END"; + case PRESERVES_END_DICTIONARY_MISSING_VALUE: return "DICTIONARY_MISSING_VALUE"; + case PRESERVES_END_RECORD_MISSING_LABEL: return "RECORD_MISSING_LABEL"; + case PRESERVES_END_VARINT_TOO_BIG: return "VARINT_TOO_BIG"; + case PRESERVES_END_INVALID_UTF8: return "INVALID_UTF8"; + case PRESERVES_END_INVALID_TAG: return "INVALID_TAG"; + default: return "UNKNOWN"; + } + }); + +typedef enum preserves_index_entry_representation { + PRESERVES_REPR_NONE = 0, + PRESERVES_INT_SIGNED, + PRESERVES_INT_UNSIGNED, + PRESERVES_INT_LARGE_BINARY, + PRESERVES_INT_LARGE_TEXT, + PRESERVES_LITERAL, + PRESERVES_ESCAPED, + PRESERVES_HEX, + PRESERVES_BASE64, +} preserves_index_entry_representation_t; + +PRESERVES_OUTOFLINE +( + char const *preserves_index_entry_representation_name(preserves_index_entry_representation_t repr), { + switch (repr) { + case PRESERVES_REPR_NONE: return "REPR_NONE"; + case PRESERVES_INT_SIGNED: return "INT_SIGNED"; + case PRESERVES_INT_UNSIGNED: return "INT_UNSIGNED"; + case PRESERVES_INT_LARGE_BINARY: return "INT_LARGE_BINARY"; + case PRESERVES_INT_LARGE_TEXT: return "INT_LARGE_TEXT"; + case PRESERVES_LITERAL: return "LITERAL"; + case PRESERVES_ESCAPED: return "ESCAPED"; + case PRESERVES_HEX: return "HEX"; + case PRESERVES_BASE64: return "BASE64"; + default: return "UNKNOWN"; + } + }); + +/* + PRESERVES_BOOLEAN: repr==PRESERVES_REPR_NONE, len=0, data._boolean + 
PRESERVES_FLOAT: repr=PRESERVES_REPR_NONE, len=0, data._float + PRESERVES_DOUBLE: repr=PRESERVES_REPR_NONE, len=0, data._double + + PRESERVES_SIGNED_INTEGER: + - repr==PRESERVES_INT_SIGNED -> len=0, data._signed + - repr==PRESERVES_INT_UNSIGNED -> len=0, data._unsigned + - repr==PRESERVES_INT_LARGE_BINARY -> len, data._unsigned as absolute offset within input + - repr==PRESERVES_INT_LARGE_TEXT -> len, data._unsigned as absolute offset within input + PRESERVES_STRING: + - repr=PRESERVES_LITERAL -> len, data._unsigned as absolute offset within input to utf-8 bytes + - repr=PRESERVES_ESCAPED -> len, data._unsigned as absolute offset within input to utf-8 bytes + that need String-style backslash-escapes interpreted + PRESERVES_BYTE_STRING: + - repr=PRESERVES_LITERAL -> len, data._unsigned as absolute offset within input to raw bytes (not necessarily valid utf-8) + - repr=PRESERVES_ESCAPED -> len, data._unsigned as absolute offset within input to utf-8 bytes + that need ByteString-style backslash-escapes interpreted + - repr=PRESERVES_HEX -> len, data._unsigned as absolute offset within input to ASCII bytes of hex + - repr=PRESERVES_BASE64 -> len, data._unsigned as absolute offset within input to ASCII bytes of base64 + PRESERVES_SYMBOL: + - repr=PRESERVES_LITERAL -> len, data._unsigned as absolute offset within input to utf-8 bytes + - repr=PRESERVES_ESCAPED -> len, data._unsigned as absolute offset within input to utf-8 bytes + that need Symbol-style backslash-escapes interpreted + + PRESERVES_RECORD, PRESERVES_SEQUENCE, PRESERVES_SET, PRESERVES_DICTIONARY: + - repr==PRESERVES_REPR_NONE, + - len counts number of items: + - PRESERVES_RECORD -> number of fields plus one (for the label) + - PRESERVES_SEQUENCE -> number of items + - PRESERVES_SET -> number of items + - PRESERVES_DICTIONARY -> twice the number of key-value pairs + - data._unsigned as relative offset within index to next item, + starting from this entry; zero means "no end known" + + PRESERVES_EMBEDDED: repr==PRESERVES_REPR_NONE, 
len==0, following item is the embedded value + PRESERVES_ANNOTATION: + - repr=PRESERVES_REPR_NONE, + - len counts number of annotations, + - data._unsigned as relative offset within index to annotated + item, starting from this entry; zero means "no end known" + - the annotated item will not be a PRESERVES_ANNOTATION + + PRESERVES_END_MARKER: repr==PRESERVES_REPR_NONE, len==0, data._err + */ +typedef struct preserves_index_entry { + preserves_type_tag_t type:4; + preserves_index_entry_representation_t repr:4; + uint64_t len:56; + + union { + bool _boolean; + float _float; + double _double; + int64_t _signed; + uint64_t _unsigned; + preserves_error_code_t _err; + } data; +} preserves_index_entry_t; + +#ifndef NDEBUG +extern void preserves_dump_index_entry(FILE* f, preserves_bytes_t *input, preserves_index_entry_t *i, bool add_newline); +#endif + +typedef struct preserves_reader { + preserves_bytes_t input; + preserves_bytes_t index; + preserves_bytes_t stack; + size_t stack_top; /* ascending empty */ + size_t input_pos; /* ascending full */ + size_t index_pos; /* ascending empty */ + bool annotation_tag_seen; +} preserves_reader_t; + +typedef struct preserves_reader_result { + preserves_index_entry_t *index; + preserves_index_entry_t *end_marker; +} preserves_reader_result_t; + +PRESERVES_INLINE preserves_reader_result_t preserves_reader_error_result(void) { + return (preserves_reader_result_t) { .index = NULL, .end_marker = NULL }; +} + +PRESERVES_INLINE preserves_reader_t preserves_create_reader(void) { + return (preserves_reader_t) { + .input = preserves_create_bytes(), + .index = preserves_create_bytes(), + .stack = preserves_create_bytes(), + .stack_top = 0, + .input_pos = 0, + .index_pos = 0, + .annotation_tag_seen = false, + }; +} + +PRESERVES_OUTOFLINE(void preserves_free_reader(preserves_reader_t *r), { + preserves_free_bytes(&r->input); + preserves_free_bytes(&r->index); + preserves_free_bytes(&r->stack); + r->stack_top = 0; + r->input_pos = 0; + 
r->index_pos = 0; + r->annotation_tag_seen = false; + }); + +PRESERVES_IMPLEMENTATION_CHUNK +( +#define MINIMUM_PRESERVES_READER_STACK_SIZE 32 + typedef uint64_t preserves_index_offset_t; + + static inline bool _preserves_reader_ateof(preserves_reader_t *r) { + return (r->input_pos >= r->input.len); + } + + static inline int _preserves_reader_peek(preserves_reader_t *r) { + if (_preserves_reader_ateof(r)) return -1; + return PRESERVES_ARRAY_ELEMENT(&r->input, uint8_t, r->input_pos); + } + + static inline int _preserves_reader_next(preserves_reader_t *r) { + if (r->input_pos >= r->input.len) return -1; + int result = PRESERVES_ARRAY_ELEMENT(&r->input, uint8_t, r->input_pos); + r->input_pos++; + return result; + } + + static inline void *_preserves_reader_next_bytes(preserves_reader_t *r, size_t count) { + preserves_bytes_t bs = preserves_bytes_subsequence(&r->input, r->input_pos, count); + if (bs.len != count) return NULL; + r->input_pos += count; + return bs.ptr; + } + + static inline preserves_index_entry_t *_preserves_reader_index_entry(preserves_reader_t *r, + size_t i) { + size_t limit = PRESERVES_ARRAY_ELEMENT_COUNT(&r->index, preserves_index_entry_t); + while (i >= limit) { + limit = limit * 2; + if (limit < 16) limit = 16; + if (PRESERVES_RESIZE_ARRAY(&r->index, preserves_index_entry_t, limit) == -1) { + return NULL; + } + } + return &PRESERVES_ARRAY_ELEMENT(&r->index, preserves_index_entry_t, i); + } + + static inline size_t _preserves_reader_stack_peek(preserves_reader_t *r) { + if (r->stack_top >= PRESERVES_ARRAY_ELEMENT_COUNT(&r->stack, size_t)) { + abort(); + } + return PRESERVES_ARRAY_ELEMENT(&r->stack, size_t, r->stack_top - 1); + } + + static inline preserves_index_entry_t *_preserves_reader_stack_top_entry(preserves_reader_t *r) { + return _preserves_reader_index_entry(r, _preserves_reader_stack_peek(r)); + } + + static inline void _preserves_reader_stack_drop(preserves_reader_t *r) { + if (r->stack_top == 0) abort(); + /* printf("popping "); */ + 
/* preserves_dump_index_entry(stdout, &r->input, _preserves_reader_stack_top_entry(r), true); */ + r->stack_top--; + } + + static inline preserves_index_entry_t *_preserves_reader_finish_seq(preserves_reader_t *r) { + size_t base_index = _preserves_reader_stack_peek(r); + preserves_index_entry_t *base = _preserves_reader_stack_top_entry(r); + base->data._unsigned = r->index_pos - base_index; + _preserves_reader_stack_drop(r); + return base; + } + + static inline bool _preserves_reader_in_annotations(preserves_reader_t *r) { + return (r->stack_top > 0) && + (_preserves_reader_stack_top_entry(r)->type == PRESERVES_ANNOTATION); + } + + static inline void _preserves_reader_inc_collection_len(preserves_reader_t *r, size_t *count_ptr) { + if (r->stack_top > 0) { + check_for_embedded: + preserves_index_entry_t *base = _preserves_reader_stack_top_entry(r); + if (base->type == PRESERVES_EMBEDDED) { + _preserves_reader_stack_drop(r); + goto check_for_embedded; + } else { + base->len++; + } + /* printf("added to base, which is now "); */ + /* preserves_dump_index_entry(stdout, &r->input, base, true); */ + } else { + (*count_ptr)--; + } + } + + static inline preserves_index_entry_t *_preserves_reader_emit_entry(preserves_reader_t *r, + size_t *count_ptr, + preserves_index_entry_t e) { + if (!r->annotation_tag_seen && _preserves_reader_in_annotations(r)) { + /* printf("(popping annotation collector)\n"); */ + _preserves_reader_finish_seq(r); + } + if (count_ptr != NULL) { + _preserves_reader_inc_collection_len(r, count_ptr); + } + + /* printf("-- emitting: "); */ + /* preserves_dump_index_entry(stdout, &r->input, &e, true); */ + + preserves_index_entry_t *ix = _preserves_reader_index_entry(r, r->index_pos); + if (ix == NULL) return NULL; + *ix = e; + r->index_pos++; + + r->annotation_tag_seen = false; + + return ix; + } + + static inline preserves_reader_result_t _preserves_reader_finish(preserves_reader_t *r, + preserves_error_code_t code) { + if (code == 
PRESERVES_END_SYSTEM_ERROR) { + return preserves_reader_error_result(); + } else { + preserves_index_entry_t *index = _preserves_reader_index_entry(r, 0); + if (index == NULL) return preserves_reader_error_result(); + preserves_index_entry_t *end_marker = + _preserves_reader_emit_entry(r, NULL, (preserves_index_entry_t) { + .type = PRESERVES_END_MARKER, + .repr = PRESERVES_REPR_NONE, + .len = 0, + .data = { ._err = code }, + }); + if (end_marker == NULL) return preserves_reader_error_result(); + return (preserves_reader_result_t) { .index = index, .end_marker = end_marker }; + } + } + + static inline int _preserves_reader_varint(preserves_reader_t *r, size_t *v) { + unsigned int shift_amount = 0; + size_t result = 0; + while (true) { + int b = _preserves_reader_next(r); + if (b == -1) return -1; + result |= (b & 0x7f) << shift_amount; + if (b & 0x80) { + shift_amount += 7; + if (shift_amount > ((sizeof(size_t) * 8) - 7)) return -2; + } else { + *v = result; + return 0; + } + } + } + + static inline preserves_index_entry_t *_preserves_emit_small_int(preserves_reader_t *r, + size_t *count_ptr, + bool is_unsigned, + int64_t value) { + return _preserves_reader_emit_entry(r, count_ptr, (preserves_index_entry_t) { + .type = PRESERVES_SIGNED_INTEGER, + .repr = is_unsigned ? PRESERVES_INT_UNSIGNED : PRESERVES_INT_SIGNED, + .len = 0, + .data = { ._signed = value }, + }); + } + + static inline int _preserves_reader_decode_intbytes(preserves_reader_t *r, + size_t *count_ptr, + size_t len) { + size_t starting_pos = r->input_pos; + uint8_t *bs = _preserves_reader_next_bytes(r, len); + if (bs == NULL) return -1; + + bool is_unsigned = false; + size_t remaining = len; + while ((remaining > 0) && (*bs == 0)) { + is_unsigned = true; + bs++; + remaining--; + } + + if (remaining == 0) { + // This shouldn't happen, but it does have a denotation. + return (_preserves_emit_small_int(r, count_ptr, is_unsigned, 0) == NULL) ? 
-1 : 0; + } + + if (remaining > 8) { + if (is_unsigned && (*bs & 0x80)) { + remaining++; + bs--; + } + return (_preserves_reader_emit_entry(r, count_ptr, (preserves_index_entry_t) { + .type = PRESERVES_SIGNED_INTEGER, + .repr = PRESERVES_INT_LARGE_BINARY, + .len = remaining, + .data = { ._unsigned = starting_pos + (len - remaining) }, + }) == NULL) ? -1 : 0; + } + + uint64_t buf = 0; + while (remaining > 0) { + remaining--; + buf = buf | ((*bs) << (remaining << 3)); + bs++; + } + + int64_t value = *(int64_t *)&buf; + return (_preserves_emit_small_int(r, count_ptr, is_unsigned, value) == NULL) ? -1 : 0; + } + + static inline bool utf8_tail(uint8_t b) { + return (b >= 0x80 && b <= 0xbf); + } + + static inline int check_utf8(uint8_t *bs, size_t len) { + // https://datatracker.ietf.org/doc/html/rfc3629#section-4 + while (len > 0) { + uint8_t b0 = *bs++; + len--; + if (b0 >= 0x80) { + if (len < 1) return -1; + uint8_t b1 = *bs++; + len--; + if (b0 >= 0xc2 && b0 <= 0xdf) { + if (!utf8_tail(b1)) return -1; + } else { + if (len < 1) return -1; + uint8_t b2 = *bs++; + len--; + if (b0 == 0xe0) { + if (!(b1 >= 0xa0 && b1 <= 0xbf && utf8_tail(b2))) return -1; + } else if (b0 >= 0xe1 && b0 <= 0xec) { + if (!(utf8_tail(b1) && utf8_tail(b2))) return -1; + } else if (b0 == 0xed) { + if (!(b1 >= 0x80 && b1 <= 0x9f && utf8_tail(b2))) return -1; + } else if (b0 >= 0xee && b0 <= 0xef) { + if (!(utf8_tail(b1) && utf8_tail(b2))) return -1; + } else { + if (len < 1) return -1; + uint8_t b3 = *bs++; + len--; + if (b0 == 0xf0) { + if (!(b1 >= 0x90 && b1 <= 0xbf && utf8_tail(b2) && utf8_tail(b3))) return -1; + } else if (b0 >= 0xf1 && b0 <= 0xf3) { + if (!(utf8_tail(b1) && utf8_tail(b2) && utf8_tail(b3))) return -1; + } else if (b0 == 0xf4) { + if (!(b1 >= 0x80 && b1 <= 0x8f && utf8_tail(b2) && utf8_tail(b3))) return -1; + } else { + // ok! 
+ } + } + } + } + } + return 0; + } + + static inline preserves_error_code_t _preserves_reader_read_stringlike(preserves_reader_t *r, + size_t *count_ptr, + preserves_type_tag_t type) { + size_t len = 0; + switch (_preserves_reader_varint(r, &len)) { + case -1: return PRESERVES_END_INCOMPLETE_INPUT; + case -2: return PRESERVES_END_VARINT_TOO_BIG; + default: break; + } + size_t starting_pos = r->input_pos; + uint8_t *maybe_utf = _preserves_reader_next_bytes(r, len); + if ((type != PRESERVES_BYTE_STRING) && (check_utf8(maybe_utf, len) == -1)) { + return PRESERVES_END_INVALID_UTF8; + } + if (_preserves_reader_emit_entry(r, count_ptr, (preserves_index_entry_t) { + .type = type, + .repr = PRESERVES_LITERAL, + .len = len, + .data = { ._unsigned = starting_pos }, + }) == NULL) { + return PRESERVES_END_SYSTEM_ERROR; + } + return PRESERVES_END_NO_ERROR; + } + + static inline preserves_index_entry_t *_preserves_reader_push(preserves_reader_t *r, + preserves_type_tag_t type) { + preserves_index_entry_t *ix = _preserves_reader_emit_entry(r, NULL, (preserves_index_entry_t) { + .type = type, .repr = PRESERVES_REPR_NONE, .len = 0, .data = { ._unsigned = 0 }}); + if (ix == NULL) return NULL; + + size_t limit = PRESERVES_ARRAY_ELEMENT_COUNT(&r->stack, size_t); + if (r->stack_top >= limit) { + limit += 32; + if (PRESERVES_RESIZE_ARRAY(&r->stack, size_t, limit) == -1) return NULL; + } + PRESERVES_ARRAY_ELEMENT(&r->stack, size_t, r->stack_top) = r->index_pos - 1; + r->stack_top++; + return ix; + } +) + +PRESERVES_INLINE preserves_index_entry_t *preserves_skip_annotations(preserves_index_entry_t *ix) { + if (ix == NULL) return NULL; + if (ix->type != PRESERVES_ANNOTATION) return ix; + ix += ix->data._unsigned; + if (ix->type == PRESERVES_ANNOTATION) abort(); + return ix; +} + +#define RETURN_ON_FAIL(e) if ((e) == NULL) return preserves_reader_error_result() +PRESERVES_OUTOFLINE +( + preserves_reader_result_t preserves_read_binary_continue(preserves_reader_t *r, size_t count), { + while 
(count) { + /* for (int i = r->stack_top - 1; i >= 0; i--) { */ + /* size_t ip = PRESERVES_ARRAY_ELEMENT(&r->stack, size_t, i); */ + /* printf(" %02d: (%5lu) ", i, ip); */ + /* preserves_dump_index_entry(stdout, &r->input, _preserves_reader_index_entry(r, ip), true); */ + /* } */ + /* printf("pos %lu (%05lx), count %lu, annotation tag seen %d: ", */ + /* r->input_pos, */ + /* r->input_pos, */ + /* count, */ + /* r->annotation_tag_seen); */ + int b = _preserves_reader_next(r); + /* printf("tag 0x%02x %s\n", b, preserves_binary_format_tag_name(b)); */ + if (b == -1) return _preserves_reader_finish(r, PRESERVES_END_INCOMPLETE_INPUT); + + switch (b) { + case PRESERVES_BINARY_FORMAT_TAG_FALSE: + RETURN_ON_FAIL(_preserves_reader_emit_entry(r, &count, (preserves_index_entry_t) { + .type = PRESERVES_BOOLEAN, .repr = PRESERVES_REPR_NONE, .len = 0, .data = { + ._boolean = false + }})); + break; + + case PRESERVES_BINARY_FORMAT_TAG_TRUE: + RETURN_ON_FAIL(_preserves_reader_emit_entry(r, &count, (preserves_index_entry_t) { + .type = PRESERVES_BOOLEAN, .repr = PRESERVES_REPR_NONE, .len = 0, .data = { + ._boolean = true + }})); + break; + + case PRESERVES_BINARY_FORMAT_TAG_FLOAT: { + uint8_t *bs = _preserves_reader_next_bytes(r, 4); + if (bs == NULL) return _preserves_reader_finish(r, PRESERVES_END_INCOMPLETE_INPUT); + uint32_t i; + memcpy(&i, bs, 4); + i = ntohl(i); + float f; + memcpy(&f, &i, 4); + RETURN_ON_FAIL(_preserves_reader_emit_entry(r, &count, (preserves_index_entry_t) { + .type = PRESERVES_FLOAT, .repr = PRESERVES_REPR_NONE, .len = 0, .data = { + ._float = f + }})); + break; + } + + case PRESERVES_BINARY_FORMAT_TAG_DOUBLE: { + uint8_t *bs = _preserves_reader_next_bytes(r, 8); + if (bs == NULL) return _preserves_reader_finish(r, PRESERVES_END_INCOMPLETE_INPUT); + uint32_t lo, hi; + memcpy(&hi, bs, 4); + memcpy(&lo, bs + 4, 4); + lo = ntohl(lo); + hi = ntohl(hi); + uint64_t i = (((uint64_t) hi) << 32) | ((uint64_t) lo); + double f; + memcpy(&f, &i, 8); + 
RETURN_ON_FAIL(_preserves_reader_emit_entry(r, &count, (preserves_index_entry_t) { + .type = PRESERVES_DOUBLE, .repr = PRESERVES_REPR_NONE, .len = 0, .data = { + ._double = f + }})); + break; + } + + case PRESERVES_BINARY_FORMAT_TAG_END: + if (r->stack_top == 0) { + return _preserves_reader_finish(r, PRESERVES_END_UNEXPECTED_END); + } + preserves_index_entry_t *base = _preserves_reader_finish_seq(r); + _preserves_reader_inc_collection_len(r, &count); + if ((base->type == PRESERVES_DICTIONARY) && ((base->len % 2) != 0)) { + return _preserves_reader_finish(r, PRESERVES_END_DICTIONARY_MISSING_VALUE); + } + if ((base->type == PRESERVES_RECORD) && (base->len == 0)) { + return _preserves_reader_finish(r, PRESERVES_END_RECORD_MISSING_LABEL); + } + break; + + case PRESERVES_BINARY_FORMAT_TAG_ANNOTATION: + if (r->annotation_tag_seen || !_preserves_reader_in_annotations(r)) { + RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_ANNOTATION)); + } + r->annotation_tag_seen = true; + break; + + case PRESERVES_BINARY_FORMAT_TAG_EMBEDDED: + RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_EMBEDDED)); + break; + + case PRESERVES_BINARY_FORMAT_TAG_LARGE_INTEGER: { + size_t len = 0; + switch (_preserves_reader_varint(r, &len)) { + case -1: return _preserves_reader_finish(r, PRESERVES_END_INCOMPLETE_INPUT); + case -2: return _preserves_reader_finish(r, PRESERVES_END_VARINT_TOO_BIG); + default: break; + } + if (_preserves_reader_decode_intbytes(r, &count, len) == -1) { + return _preserves_reader_finish(r, PRESERVES_END_INCOMPLETE_INPUT); + } + break; + } + + case PRESERVES_BINARY_FORMAT_TAG_STRING: { + preserves_error_code_t code = + _preserves_reader_read_stringlike(r, &count, PRESERVES_STRING); + if (code != PRESERVES_END_NO_ERROR) return _preserves_reader_finish(r, code); + break; + } + + case PRESERVES_BINARY_FORMAT_TAG_BYTE_STRING: { + preserves_error_code_t code = + _preserves_reader_read_stringlike(r, &count, PRESERVES_BYTE_STRING); + if (code != PRESERVES_END_NO_ERROR) return 
_preserves_reader_finish(r, code); + break; + } + + case PRESERVES_BINARY_FORMAT_TAG_SYMBOL: { + preserves_error_code_t code = + _preserves_reader_read_stringlike(r, &count, PRESERVES_SYMBOL); + if (code != PRESERVES_END_NO_ERROR) return _preserves_reader_finish(r, code); + break; + } + + case PRESERVES_BINARY_FORMAT_TAG_RECORD: + RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_RECORD)); + break; + + case PRESERVES_BINARY_FORMAT_TAG_SEQUENCE: + RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_SEQUENCE)); + break; + + case PRESERVES_BINARY_FORMAT_TAG_SET: + RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_SET)); + break; + + case PRESERVES_BINARY_FORMAT_TAG_DICTIONARY: + RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_DICTIONARY)); + break; + + default: + if ((b >= PRESERVES_BINARY_FORMAT_TAG_SMALL_INTEGER_LO) && + (b <= PRESERVES_BINARY_FORMAT_TAG_SMALL_INTEGER_HI)) { + int64_t value = b - PRESERVES_BINARY_FORMAT_TAG_SMALL_INTEGER_LO; + if (value > 12) value -= 16; + RETURN_ON_FAIL(_preserves_emit_small_int(r, &count, false, value)); + } else if ((b >= PRESERVES_BINARY_FORMAT_TAG_MEDIUM_INTEGER_LO) && + (b <= PRESERVES_BINARY_FORMAT_TAG_MEDIUM_INTEGER_HI)) { + size_t len = (b - PRESERVES_BINARY_FORMAT_TAG_MEDIUM_INTEGER_LO) + 1; + if (_preserves_reader_decode_intbytes(r, &count, len) == -1) { + return _preserves_reader_finish(r, PRESERVES_END_INCOMPLETE_INPUT); + } + } else { + return _preserves_reader_finish(r, PRESERVES_END_INVALID_TAG); + } + break; + } + } + + return _preserves_reader_finish(r, + (_preserves_reader_ateof(r) ? PRESERVES_END_EOF : + (r->stack_top > 0) ? 
PRESERVES_END_INCOMPLETE_INPUT : + PRESERVES_END_MORE_INPUT_REMAINING)); + } +); +#undef RETURN_ON_FAIL + +PRESERVES_OUTOFLINE +( + preserves_reader_result_t preserves_read_binary(preserves_reader_t *r, + preserves_bytes_t *input, + size_t count), { + { + size_t required_stack_bytes = MINIMUM_PRESERVES_READER_STACK_SIZE * sizeof(size_t); + if (r->stack.len < required_stack_bytes) { + if (preserves_resize_bytes(&r->stack, required_stack_bytes) == -1) { + return preserves_reader_error_result(); + } + } + } + + r->stack_top = 0; + r->input_pos = 0; + r->index_pos = 0; + r->annotation_tag_seen = false; + preserves_bytes_move(&r->input, input); + + return preserves_read_binary_continue(r, count); + } +); + +/////////////////////////////////////////////////////////////////////////// +// Debug utilities + +#ifndef NDEBUG + +PRESERVES_IMPLEMENTATION_CHUNK +( + static void preserves_dump_bytes(FILE *f, + preserves_bytes_t *data) { + fprintf(f, ">>>"); + for (size_t i = 0; i < data->len; i++) { + uint8_t c = PRESERVES_ARRAY_ELEMENT(data, uint8_t, i); + if (c < 0x20 || c >= 0x80) { + fprintf(f, "\\x%02x", c); + } else { + fprintf(f, "%c", c); + } + } + fprintf(f, "<<<"); + } + + void preserves_dump_index_entry(FILE *f, + preserves_bytes_t *input, + preserves_index_entry_t *i, + bool add_newline) { + fprintf(f, + "%s %s length %lu", + preserves_type_tag_name(i->type), + i->repr == PRESERVES_REPR_NONE ? "-" : preserves_index_entry_representation_name(i->repr), + (size_t) i->len); + switch (i->type) { + case PRESERVES_BOOLEAN: + fprintf(f, i->data._boolean ? 
" #t" : " #f"); + break; + + case PRESERVES_FLOAT: + fprintf(f, " %f", i->data._float); + break; + + case PRESERVES_DOUBLE: + fprintf(f, " %f", i->data._double); + break; + + case PRESERVES_STRING: + case PRESERVES_BYTE_STRING: + case PRESERVES_SYMBOL: { + fprintf(f, " offset %lu ", i->data._unsigned); + preserves_bytes_t data = preserves_bytes_subsequence(input, i->data._unsigned, i->len); + preserves_dump_bytes(f, &data); + break; + } + + case PRESERVES_RECORD: + case PRESERVES_SEQUENCE: + case PRESERVES_SET: + case PRESERVES_DICTIONARY: + fprintf(f, " skip %lu", i->data._unsigned - 1); + break; + + case PRESERVES_EMBEDDED: + case PRESERVES_ANNOTATION: + break; + + case PRESERVES_END_MARKER: + fprintf(f, ": %s", preserves_error_code_name(i->data._err)); + break; + + case PRESERVES_SIGNED_INTEGER: + switch (i->repr) { + case PRESERVES_INT_SIGNED: + fprintf(f, ": %ld", i->data._signed); + break; + case PRESERVES_INT_UNSIGNED: + default: + fprintf(f, ": %lu", i->data._unsigned); + break; + case PRESERVES_INT_LARGE_BINARY: + case PRESERVES_INT_LARGE_TEXT: { + fprintf(f, " offset %lu ", i->data._unsigned); + preserves_bytes_t data = preserves_bytes_subsequence(input, i->data._unsigned, i->len); + preserves_dump_bytes(f, &data); + break; + } + } + break; + + default: + fprintf(f, ": %lu (%ld)", i->data._unsigned, i->data._signed); + break; + } + + if (add_newline) { + fprintf(f, "\n"); + } + } +) + +#endif + +/////////////////////////////////////////////////////////////////////////// + +#undef PRESERVES_INLINE +#undef PRESERVES_IMPLEMENTATION_CHUNK +#undef PRESERVES_OUTOFLINE + +#endif