/// SPDX-License-Identifier: Apache-2.0
/// SPDX-FileCopyrightText: Copyright © 2022 Tony Garnock-Jones

#ifndef libpreserves_26109214_f3bd_44c8_95ba_8c650c954965
#define libpreserves_26109214_f3bd_44c8_95ba_8c650c954965

// Single file header. #define PRESERVES_IMPLEMENTATION (in exactly one
// translation unit, before including this file) to get the implementations.

#ifdef PRESERVES_IMPLEMENTATION
#define PRESERVES_INLINE
#define PRESERVES_IMPLEMENTATION_CHUNK(...) __VA_ARGS__
#else
#define PRESERVES_INLINE static inline
#define PRESERVES_IMPLEMENTATION_CHUNK(...)
#endif

// Declares `declaration` as an external function everywhere, and defines it
// (with the body given as the remaining arguments) only in the implementation
// translation unit.
#define PRESERVES_OUTOFLINE(declaration, ...)                           \
  extern declaration;                                                   \
  PRESERVES_IMPLEMENTATION_CHUNK(inline declaration __VA_ARGS__)

#include <stdbool.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <arpa/inet.h> // for ntohl, htonl

///////////////////////////////////////////////////////////////////////////
// General-purpose fat pointer, for e.g. strings, binary blobs, etc.

// `borrowed` buffers are views onto memory owned by someone else: they are
// never freed or resized through this structure.
typedef struct preserves_bytes {
  bool borrowed:1;
  size_t len:(sizeof(size_t) * 8 - 1);
  void *ptr;
} preserves_bytes_t;

#define PRESERVES_ARRAY_ELEMENT_COUNT(bytes_ptr, element_type)  \
  ((bytes_ptr)->len / sizeof(element_type))

#define PRESERVES_ARRAY_ELEMENT(bytes_ptr, element_type, index) \
  (((element_type *) (bytes_ptr)->ptr)[index])

#define PRESERVES_RESIZE_ARRAY(bytes_ptr, element_type, size)           \
  preserves_resize_bytes(bytes_ptr, sizeof(element_type) * (size))

// Declares two locals: the element count and a typed base pointer.
#define PRESERVES_ARRAY_ACCESS(bytes_ptr, element_type, length_var, base_ptr_var) \
  size_t length_var = PRESERVES_ARRAY_ELEMENT_COUNT(bytes_ptr, element_type); \
  element_type *base_ptr_var = &PRESERVES_ARRAY_ELEMENT(bytes_ptr, element_type, 0)

// Returns an empty, owned (non-borrowed) byte buffer.
PRESERVES_INLINE preserves_bytes_t preserves_create_bytes(void) {
  return (preserves_bytes_t) { .borrowed = 0, .len = 0, .ptr = NULL };
}

// Resizes an owned buffer to `size` bytes, zero-filling any newly added tail.
// Resizing to zero frees the storage. Aborts if `bs` is borrowed.
// Returns 0 on success, -1 if allocation fails (buffer left untouched).
PRESERVES_INLINE int preserves_resize_bytes(preserves_bytes_t *bs, size_t size) {
  if (bs->borrowed) abort();
  if (size == 0) {
    free(bs->ptr);
    bs->ptr = NULL;
    bs->len = 0;
    return 0;
  }
  void *ptr = realloc(bs->ptr, size);
  if (ptr == NULL) return -1;
  bs->ptr = ptr;
  if (size > bs->len) {
    memset(((uint8_t *) bs->ptr) + bs->len, 0, size - bs->len);
  }
  bs->len = size;
  return 0;
}

// Releases the storage of an owned buffer (borrowed buffers are merely
// forgotten) and resets `bs` to the empty owned state.
PRESERVES_INLINE void preserves_free_bytes(preserves_bytes_t *bs) {
  if (!bs->borrowed) preserves_resize_bytes(bs, 0);
  *bs = (preserves_bytes_t) { .borrowed = 0, .len = 0, .ptr = NULL };
}

// Frees `dest`, transfers the contents (and ownership status) of `src` into
// it, and leaves `src` empty.
PRESERVES_INLINE void preserves_bytes_move(preserves_bytes_t *dest, preserves_bytes_t *src) {
  preserves_free_bytes(dest);
  *dest = *src;
  *src = preserves_create_bytes();
}

// Appends a copy of `src` to the owned buffer `dest`. Aborts if `dest` is
// borrowed. Returns 0 on success, -1 on allocation failure or size overflow.
PRESERVES_INLINE int preserves_extend_bytes(preserves_bytes_t *dest, preserves_bytes_t src) {
  if (dest->borrowed) abort();
  // Nothing to append: avoid realloc(ptr, 0) and memcpy from a NULL source.
  if (src.len == 0) return 0;
  size_t new_len = dest->len + src.len;
  // `len` is a (width - 1)-bit field; reject sums that wrap or do not fit.
  if ((new_len < src.len) || (new_len > (SIZE_MAX >> 1))) return -1;
  void *ptr = realloc(dest->ptr, new_len);
  if (ptr == NULL) return -1;
  dest->ptr = ptr;
  memcpy(((uint8_t *) dest->ptr) + dest->len, src.ptr, src.len);
  dest->len = new_len;
  return 0;
}

// Returns a borrowed view of up to `len` bytes of `bs` starting at `offset`.
// The view is empty if `offset` is at or past the end, or if `len` exceeds
// the whole buffer; it is truncated if it would run off the end.
PRESERVES_INLINE preserves_bytes_t preserves_bytes_subsequence(preserves_bytes_t *bs,
                                                               size_t offset,
                                                               size_t len)
{
  if (offset >= bs->len) return preserves_create_bytes();
  if (len > bs->len) return preserves_create_bytes();
  if (offset > (bs->len - len)) len = bs->len - offset;
  return (preserves_bytes_t) {
    .borrowed = 1,
    .len = len,
    .ptr = ((uint8_t *) bs->ptr) + offset
  };
}

///////////////////////////////////////////////////////////////////////////
// Memory arenas

typedef struct preserves_pool {
  size_t pagesize;
  preserves_bytes_t page_pointers; // for allocations no larger than pagesize
  preserves_bytes_t large_block_pointers; // for allocations larger than pagesize
  size_t next_page;
  uint8_t *alloc_block_base;
  size_t alloc_block_used;
} preserves_pool_t;

// Creates an empty pool. `alloc_block_used` starts at `pagesize` so that the
// first allocation is forced to acquire a page.
PRESERVES_INLINE preserves_pool_t preserves_create_pool(size_t pagesize) {
  return (preserves_pool_t) {
    .pagesize = pagesize,
    .page_pointers = preserves_create_bytes(),
    .large_block_pointers = preserves_create_bytes(),
    .next_page = 0,
    .alloc_block_base = NULL,
    .alloc_block_used = pagesize,
  };
}

// Frees every block recorded in the `void *` array `bl`, then the array itself.
PRESERVES_INLINE void preserves_free_blocklist(preserves_bytes_t *bl) {
  PRESERVES_ARRAY_ACCESS(bl, void *, num_blocks, blocklist);
  for (size_t i = 0; i < num_blocks; i++) {
    free(blocklist[i]);
  }
  preserves_free_bytes(bl);
}

// Frees large blocks and rewinds the pool so existing pages get reused.
// The pages themselves stay allocated.
PRESERVES_INLINE void preserves_recycle_pool(preserves_pool_t *pool) {
  preserves_free_blocklist(&pool->large_block_pointers);
  pool->next_page = 0;
  pool->alloc_block_base = NULL;
  pool->alloc_block_used = pool->pagesize;
}

// Releases everything owned by the pool, pages included.
PRESERVES_INLINE void preserves_free_pool(preserves_pool_t *pool) {
  preserves_recycle_pool(pool);
  preserves_free_blocklist(&pool->page_pointers);
}

// Allocates a zeroed block of `blocksize` bytes and appends its address to
// the block list `bl`. Returns the block, or NULL on allocation failure.
PRESERVES_OUTOFLINE(void *_preserves_pool_record_block(preserves_bytes_t *bl, size_t blocksize), {
    void *ptr = calloc(1, blocksize);
    if (ptr == NULL) return NULL;
    if (preserves_resize_bytes(bl, bl->len + sizeof(void *)) == -1) {
      free(ptr);
      return NULL;
    }
    PRESERVES_ARRAY_ACCESS(bl, void *, num_blocks, blocklist);
    blocklist[num_blocks - 1] = ptr;
    return ptr;
  });

// Slow path of the pool allocator: moves on to the next page (allocating a
// fresh one if no recycled page remains) and carves `count` bytes from its
// start. Returns 0 on success, -1 on allocation failure.
PRESERVES_OUTOFLINE
(int _preserves_pool_add_page_and_alloc(preserves_pool_t *pool, preserves_bytes_t *bs, size_t count), {
  if (pool->next_page >= PRESERVES_ARRAY_ELEMENT_COUNT(&pool->page_pointers, void *)) {
    void *ptr = _preserves_pool_record_block(&pool->page_pointers, pool->pagesize);
    if (ptr == NULL) return -1;
    pool->alloc_block_base = ptr;
    pool->next_page = PRESERVES_ARRAY_ELEMENT_COUNT(&pool->page_pointers, void *);
  } else {
    pool->alloc_block_base = PRESERVES_ARRAY_ELEMENT(&pool->page_pointers, void *, pool->next_page);
    pool->next_page++;
  }
  pool->alloc_block_used = count;
  *bs = (preserves_bytes_t) { .borrowed = 1, .len = count, .ptr = pool->alloc_block_base };
  return 0;
});

// Allocates `count` bytes (rounded up to a multiple of `alignment`, which
// must be a nonzero power of two) from the pool and stores a borrowed view in
// `bs`. Any previous contents of `bs` are freed first. A zero `count` leaves
// `bs` empty. Returns 0 on success, -1 on allocation failure.
PRESERVES_INLINE int preserves_pool_alloc_bytes_align(preserves_pool_t *pool,
                                                      preserves_bytes_t *bs,
                                                      size_t count,
                                                      size_t alignment)
{
  preserves_free_bytes(bs);
  if (count == 0) return 0;

  count = (count + alignment - 1) & (~(alignment - 1));
  // ^ round up to nearest `alignment`-byte boundary

  if (count > pool->pagesize) {
    void *ptr = _preserves_pool_record_block(&pool->large_block_pointers, count);
    if (ptr == NULL) return -1;
    *bs = (preserves_bytes_t) { .borrowed = 1, .len = count, .ptr = ptr };
    return 0;
  }

  if (pool->alloc_block_used + count <= pool->pagesize) {
    *bs = (preserves_bytes_t) {
      .borrowed = 1,
      .len = count,
      .ptr = pool->alloc_block_base + pool->alloc_block_used,
    };
    pool->alloc_block_used += count;
    return 0;
  }

  return _preserves_pool_add_page_and_alloc(pool, bs, count);
}

// As preserves_pool_alloc_bytes_align, with 16-byte size rounding.
PRESERVES_INLINE int preserves_pool_alloc_bytes(preserves_pool_t *pool,
                                                preserves_bytes_t *bs,
                                                size_t count)
{
  return preserves_pool_alloc_bytes_align(pool, bs, count, 16);
}

///////////////////////////////////////////////////////////////////////////
// Binary codec details

typedef enum preserves_binary_format_tag {
  PRESERVES_BINARY_FORMAT_TAG_FALSE = 0x80,
  PRESERVES_BINARY_FORMAT_TAG_TRUE = 0x81,
  PRESERVES_BINARY_FORMAT_TAG_END = 0x84,
  PRESERVES_BINARY_FORMAT_TAG_ANNOTATION = 0x85,
  PRESERVES_BINARY_FORMAT_TAG_EMBEDDED = 0x86,
  PRESERVES_BINARY_FORMAT_TAG_IEEE754 = 0x87,

  PRESERVES_BINARY_FORMAT_TAG_SIGNED_INTEGER = 0xB0,
  PRESERVES_BINARY_FORMAT_TAG_STRING = 0xB1,
  PRESERVES_BINARY_FORMAT_TAG_BYTE_STRING = 0xB2,
  PRESERVES_BINARY_FORMAT_TAG_SYMBOL = 0xB3,
  PRESERVES_BINARY_FORMAT_TAG_RECORD = 0xB4,
  PRESERVES_BINARY_FORMAT_TAG_SEQUENCE = 0xB5,
  PRESERVES_BINARY_FORMAT_TAG_SET = 0xB6,
  PRESERVES_BINARY_FORMAT_TAG_DICTIONARY = 0xB7,
} preserves_binary_format_tag_t;

// Returns a static, human-readable name for a binary format tag.
PRESERVES_OUTOFLINE
(char const *preserves_binary_format_tag_name(preserves_binary_format_tag_t tag), {
  switch (tag) {
    case PRESERVES_BINARY_FORMAT_TAG_FALSE: return "FALSE";
    case PRESERVES_BINARY_FORMAT_TAG_TRUE: return "TRUE";
    case PRESERVES_BINARY_FORMAT_TAG_END: return "END";
    case PRESERVES_BINARY_FORMAT_TAG_ANNOTATION: return "ANNOTATION";
    case PRESERVES_BINARY_FORMAT_TAG_EMBEDDED: return "EMBEDDED";
    case PRESERVES_BINARY_FORMAT_TAG_IEEE754: return "IEEE754";
    case PRESERVES_BINARY_FORMAT_TAG_SIGNED_INTEGER: return "SIGNED_INTEGER";
    case PRESERVES_BINARY_FORMAT_TAG_STRING: return "STRING";
    case PRESERVES_BINARY_FORMAT_TAG_BYTE_STRING: return "BYTE_STRING";
    case PRESERVES_BINARY_FORMAT_TAG_SYMBOL: return "SYMBOL";
    case PRESERVES_BINARY_FORMAT_TAG_RECORD: return "RECORD";
    case PRESERVES_BINARY_FORMAT_TAG_SEQUENCE: return "SEQUENCE";
    case PRESERVES_BINARY_FORMAT_TAG_SET: return "SET";
    case PRESERVES_BINARY_FORMAT_TAG_DICTIONARY: return "DICTIONARY";
    default: return "UNKNOWN";
  }
});

///////////////////////////////////////////////////////////////////////////
// Index representation

typedef enum preserves_type_tag {
  PRESERVES_BOOLEAN = 0,
  PRESERVES_DOUBLE,

  PRESERVES_SIGNED_INTEGER,
  PRESERVES_STRING,
  PRESERVES_BYTE_STRING,
  PRESERVES_COMPACT,
  PRESERVES_SYMBOL,

  PRESERVES_RECORD,
  PRESERVES_SEQUENCE,
  PRESERVES_SET,
  PRESERVES_DICTIONARY,

  PRESERVES_EMBEDDED,

  PRESERVES_ANNOTATION,

  PRESERVES_END_MARKER,
} preserves_type_tag_t;

// Returns a static, human-readable name for an index entry type.
PRESERVES_OUTOFLINE(char const *preserves_type_tag_name(preserves_type_tag_t type), {
  switch (type) {
    case PRESERVES_BOOLEAN: return "BOOLEAN";
    case PRESERVES_DOUBLE: return "DOUBLE";
    case PRESERVES_SIGNED_INTEGER: return "SIGNED_INTEGER";
    case PRESERVES_STRING: return "STRING";
    case PRESERVES_BYTE_STRING: return "BYTE_STRING";
    case PRESERVES_COMPACT: return "COMPACT";
    case PRESERVES_SYMBOL: return "SYMBOL";
    case PRESERVES_RECORD: return "RECORD";
    case PRESERVES_SEQUENCE: return "SEQUENCE";
    case PRESERVES_SET: return "SET";
    case PRESERVES_DICTIONARY: return "DICTIONARY";
    case PRESERVES_EMBEDDED: return "EMBEDDED";
    case PRESERVES_ANNOTATION: return "ANNOTATION";
    case PRESERVES_END_MARKER: return "END_MARKER";
    default: return "UNKNOWN";
  }
});

typedef enum preserves_error_code {
  PRESERVES_END_SYSTEM_ERROR = -2,
  PRESERVES_END_NO_ERROR = -1,
  PRESERVES_END_EOF = 0,
  PRESERVES_END_MORE_INPUT_REMAINING,

  PRESERVES_END_INCOMPLETE_INPUT,
  PRESERVES_END_UNEXPECTED_END,
  PRESERVES_END_DICTIONARY_MISSING_VALUE,
  PRESERVES_END_RECORD_MISSING_LABEL,
  PRESERVES_END_VARINT_TOO_BIG,
  PRESERVES_END_INVALID_UTF8,
  PRESERVES_END_INVALID_TAG,
  PRESERVES_END_INVALID_IEEE754,
} preserves_error_code_t;

// Returns a static, human-readable name for a reader completion code.
PRESERVES_OUTOFLINE(char const *preserves_error_code_name(preserves_error_code_t code), {
  switch (code) {
    case PRESERVES_END_SYSTEM_ERROR: return "SYSTEM_ERROR";
    case PRESERVES_END_NO_ERROR: return "NO_ERROR";
    case PRESERVES_END_EOF: return "EOF";
    case PRESERVES_END_MORE_INPUT_REMAINING: return "MORE_INPUT_REMAINING";
    case PRESERVES_END_INCOMPLETE_INPUT: return "INCOMPLETE_INPUT";
    case PRESERVES_END_UNEXPECTED_END: return "UNEXPECTED_END";
    case PRESERVES_END_DICTIONARY_MISSING_VALUE: return "DICTIONARY_MISSING_VALUE";
    case PRESERVES_END_RECORD_MISSING_LABEL: return "RECORD_MISSING_LABEL";
    case PRESERVES_END_VARINT_TOO_BIG: return "VARINT_TOO_BIG";
    case PRESERVES_END_INVALID_UTF8: return "INVALID_UTF8";
    case PRESERVES_END_INVALID_TAG: return "INVALID_TAG";
    case PRESERVES_END_INVALID_IEEE754: return "INVALID_IEEE754";
    default: return "UNKNOWN";
  }
});

typedef enum preserves_index_entry_representation {
  PRESERVES_REPR_NONE = 0,

  PRESERVES_INT_SIGNED,
  PRESERVES_INT_UNSIGNED,
  PRESERVES_INT_LARGE_BINARY,
  PRESERVES_INT_LARGE_TEXT,

  PRESERVES_LITERAL,
  PRESERVES_ESCAPED,
  PRESERVES_HEX,
  PRESERVES_BASE64,
} preserves_index_entry_representation_t;

// Returns a static, human-readable name for an index entry representation.
PRESERVES_OUTOFLINE
(char const *preserves_index_entry_representation_name(preserves_index_entry_representation_t repr), {
  switch (repr) {
    case PRESERVES_REPR_NONE: return "REPR_NONE";
    case PRESERVES_INT_SIGNED: return "INT_SIGNED";
    case PRESERVES_INT_UNSIGNED: return "INT_UNSIGNED";
    case PRESERVES_INT_LARGE_BINARY: return "INT_LARGE_BINARY";
    case PRESERVES_INT_LARGE_TEXT: return "INT_LARGE_TEXT";
    case PRESERVES_LITERAL: return "LITERAL";
    case PRESERVES_ESCAPED: return "ESCAPED";
    case PRESERVES_HEX: return "HEX";
    case PRESERVES_BASE64: return "BASE64";
    default: return "UNKNOWN";
  }
});

/*
  PRESERVES_BOOLEAN: repr==PRESERVES_REPR_NONE, len=0, data._boolean

  PRESERVES_DOUBLE: repr==PRESERVES_REPR_NONE, len=0, data._double

  PRESERVES_SIGNED_INTEGER:
    - repr==PRESERVES_INT_SIGNED -> len=0, data._signed
    - repr==PRESERVES_INT_UNSIGNED -> len=0, data._unsigned
    - repr==PRESERVES_INT_LARGE_BINARY -> len, data._unsigned as absolute offset within input
    - repr==PRESERVES_INT_LARGE_TEXT -> len, data._unsigned as absolute offset within input

  PRESERVES_STRING:
    - repr==PRESERVES_LITERAL -> len, data._unsigned as absolute offset within input to utf-8 bytes
    - repr==PRESERVES_ESCAPED -> len, data._unsigned as absolute offset within input to utf-8 bytes
        that need String-style backslash-escapes interpreted

  PRESERVES_BYTE_STRING:
    - repr==PRESERVES_LITERAL -> len, data._unsigned as absolute offset within input to raw bytes
    - repr==PRESERVES_ESCAPED -> len, data._unsigned as absolute offset within input to utf-8 bytes
        that need ByteString-style backslash-escapes interpreted
    - repr==PRESERVES_HEX -> len, data._unsigned as absolute offset within input to ASCII bytes of hex
    - repr==PRESERVES_BASE64 -> len, data._unsigned as absolute offset within input to ASCII bytes
        of base64

  PRESERVES_COMPACT:
    - repr as for BYTE_STRING, but bytes denote a nested binary-encoded value.

  PRESERVES_SYMBOL:
    - repr==PRESERVES_LITERAL -> len, data._unsigned as absolute offset within input to utf-8 bytes
    - repr==PRESERVES_ESCAPED -> len, data._unsigned as absolute offset within input to utf-8 bytes
        that need Symbol-style backslash-escapes interpreted

  PRESERVES_RECORD, PRESERVES_SEQUENCE, PRESERVES_SET, PRESERVES_DICTIONARY:
    - repr==PRESERVES_REPR_NONE,
    - len counts number of items:
        - PRESERVES_RECORD -> number of fields plus one (for the label)
        - PRESERVES_SEQUENCE -> number of items
        - PRESERVES_SET -> number of items
        - PRESERVES_DICTIONARY -> twice the number of key-value pairs
    - data._unsigned as relative offset within index to next item,
        starting from this entry; zero means "no end known"

  PRESERVES_EMBEDDED: repr==PRESERVES_REPR_NONE, len==0, following item is the embedded value

  PRESERVES_ANNOTATION:
    - repr==PRESERVES_REPR_NONE,
    - len counts number of annotations,
    - data._unsigned as relative offset within index to annotated item,
        starting from this entry; zero means "no end known"
    - the annotated item will not be a PRESERVES_ANNOTATION

  PRESERVES_END_MARKER: repr==PRESERVES_REPR_NONE, len==0, data._err
*/
typedef struct preserves_index_entry {
  preserves_type_tag_t type:4;
  preserves_index_entry_representation_t repr:4;
  uint64_t len:56;
  union {
    bool _boolean;
    double _double;
    int64_t _signed;
    uint64_t _unsigned;
    preserves_error_code_t _err;
  } data;
} preserves_index_entry_t;

#ifndef NDEBUG
extern void preserves_dump_index_entry(FILE* f,
                                       preserves_bytes_t *input,
                                       preserves_index_entry_t *i,
                                       bool add_newline);
#endif

typedef struct preserves_reader {
  preserves_bytes_t input;
  preserves_bytes_t index;
  preserves_bytes_t stack;
  size_t stack_top; /* ascending empty */
  size_t input_pos; /* ascending full */
  size_t index_pos; /* ascending empty */
  bool annotation_tag_seen;
} preserves_reader_t;

// Both pointers are NULL on a system error (allocation failure); otherwise
// `index` is the first index entry and `end_marker` the final
// PRESERVES_END_MARKER entry carrying the completion code.
typedef struct preserves_reader_result {
  preserves_index_entry_t *index;
  preserves_index_entry_t *end_marker;
} preserves_reader_result_t;

// The result value that signals a system error.
PRESERVES_INLINE preserves_reader_result_t preserves_reader_error_result(void) {
  return (preserves_reader_result_t) { .index = NULL, .end_marker = NULL };
}

// Returns a reader with no input, index or stack storage.
PRESERVES_INLINE preserves_reader_t preserves_create_reader(void) {
  return (preserves_reader_t) {
    .input = preserves_create_bytes(),
    .index = preserves_create_bytes(),
    .stack = preserves_create_bytes(),
    .stack_top = 0,
    .input_pos = 0,
    .index_pos = 0,
    .annotation_tag_seen = false,
  };
}

// Releases all storage held by the reader and resets its positions.
PRESERVES_OUTOFLINE(void preserves_free_reader(preserves_reader_t *r), {
  preserves_free_bytes(&r->input);
  preserves_free_bytes(&r->index);
  preserves_free_bytes(&r->stack);
  r->stack_top = 0;
  r->input_pos = 0;
  r->index_pos = 0;
  r->annotation_tag_seen = false;
});

// Reader internals. Guarded with #ifdef rather than wrapped in
// PRESERVES_IMPLEMENTATION_CHUNK(...) because preprocessing directives inside
// macro arguments have undefined behaviour.
#ifdef PRESERVES_IMPLEMENTATION

#define MINIMUM_PRESERVES_READER_STACK_SIZE 32

typedef uint64_t preserves_index_offset_t;

// True once every input byte has been consumed.
static inline bool _preserves_reader_ateof(preserves_reader_t *r) {
  return (r->input_pos >= r->input.len);
}

// Returns the next input byte without consuming it, or -1 at end of input.
static inline int _preserves_reader_peek(preserves_reader_t *r) {
  if (_preserves_reader_ateof(r)) return -1;
  return PRESERVES_ARRAY_ELEMENT(&r->input, uint8_t, r->input_pos);
}

// Consumes and returns the next input byte, or -1 at end of input.
static inline int _preserves_reader_next(preserves_reader_t *r) {
  int result = _preserves_reader_peek(r);
  if (result == -1) return -1;
  r->input_pos++;
  return result;
}

// Consumes `count` bytes and returns a pointer to the first of them.
// Returns NULL (consuming nothing) if fewer than `count` bytes remain.
// NB: also returns NULL for `count == 0` when positioned at end of input.
static inline void *_preserves_reader_next_bytes(preserves_reader_t *r, size_t count) {
  preserves_bytes_t bs = preserves_bytes_subsequence(&r->input, r->input_pos, count);
  if (bs.len != count) return NULL;
  r->input_pos += count;
  return bs.ptr;
}

// Returns a pointer to index entry `i`, growing the index array (doubling,
// minimum 16 entries) as needed. Returns NULL on allocation failure.
// Growth may move the array, invalidating previously obtained entry pointers.
static inline preserves_index_entry_t *_preserves_reader_index_entry(preserves_reader_t *r, size_t i) {
  size_t limit = PRESERVES_ARRAY_ELEMENT_COUNT(&r->index, preserves_index_entry_t);
  while (i >= limit) {
    limit = limit * 2;
    if (limit < 16) limit = 16;
    if (PRESERVES_RESIZE_ARRAY(&r->index, preserves_index_entry_t, limit) == -1) {
      return NULL;
    }
  }
  return &PRESERVES_ARRAY_ELEMENT(&r->index, preserves_index_entry_t, i);
}

// Returns the index position stored on top of the stack.
// Aborts if the stack is empty or `stack_top` lies beyond the allocation.
static inline size_t _preserves_reader_stack_peek(preserves_reader_t *r) {
  // `stack_top` points one past the top element, so a completely full stack
  // (stack_top == capacity) is legitimate, while an empty one is not.
  if ((r->stack_top == 0)
      || (r->stack_top > PRESERVES_ARRAY_ELEMENT_COUNT(&r->stack, size_t))) {
    abort();
  }
  return PRESERVES_ARRAY_ELEMENT(&r->stack, size_t, r->stack_top - 1);
}

// Returns the index entry referred to by the top of the stack.
static inline preserves_index_entry_t *_preserves_reader_stack_top_entry(preserves_reader_t *r) {
  return _preserves_reader_index_entry(r, _preserves_reader_stack_peek(r));
}

// Pops the stack. Aborts if it is empty.
static inline void _preserves_reader_stack_drop(preserves_reader_t *r) {
  if (r->stack_top == 0) abort();
  r->stack_top--;
}

// Closes the entry on top of the stack: records the relative offset to the
// entry following it, pops it, and returns it.
static inline preserves_index_entry_t *_preserves_reader_finish_seq(preserves_reader_t *r) {
  size_t base_index = _preserves_reader_stack_peek(r);
  preserves_index_entry_t *base = _preserves_reader_stack_top_entry(r);
  base->data._unsigned = r->index_pos - base_index;
  _preserves_reader_stack_drop(r);
  return base;
}

// True if the innermost open entry is an annotation collector.
static inline bool _preserves_reader_in_annotations(preserves_reader_t *r) {
  return (r->stack_top > 0) && (_preserves_reader_stack_top_entry(r)->type == PRESERVES_ANNOTATION);
}

// Accounts for one completed value: pops any EMBEDDED wrappers it completes,
// then bumps the length of the enclosing collection, or, at top level,
// decrements the caller's count of values still to be read.
static inline void _preserves_reader_inc_collection_len(preserves_reader_t *r, size_t *count_ptr) {
  while (r->stack_top > 0) {
    preserves_index_entry_t *base = _preserves_reader_stack_top_entry(r);
    if (base->type != PRESERVES_EMBEDDED) {
      base->len++;
      return;
    }
    // The value just read was the payload of this EMBEDDED wrapper; the
    // wrapper is now complete, so account for it in *its* parent instead.
    _preserves_reader_stack_drop(r);
  }
  (*count_ptr)--;
}

// Appends `e` to the index and returns a pointer to the stored entry, or NULL
// on allocation failure. If `count_ptr` is non-NULL the entry is counted as a
// completed value (see _preserves_reader_inc_collection_len).
static inline preserves_index_entry_t *_preserves_reader_emit_entry(preserves_reader_t *r,
                                                                    size_t *count_ptr,
                                                                    preserves_index_entry_t e)
{
  if (!r->annotation_tag_seen && _preserves_reader_in_annotations(r)) {
    // No annotation tag preceded this entry, so it is the annotated value
    // itself: close the annotation collector first.
    _preserves_reader_finish_seq(r);
  }

  if (count_ptr != NULL) {
    _preserves_reader_inc_collection_len(r, count_ptr);
  }

  preserves_index_entry_t *ix = _preserves_reader_index_entry(r, r->index_pos);
  if (ix == NULL) return NULL;
  *ix = e;
  r->index_pos++;
  r->annotation_tag_seen = false;
  return ix;
}

// Terminates the index with an END_MARKER carrying `code` and builds the
// result. PRESERVES_END_SYSTEM_ERROR, or failure to store the marker, yields
// the error result.
static inline preserves_reader_result_t _preserves_reader_finish(preserves_reader_t *r,
                                                                 preserves_error_code_t code)
{
  if (code == PRESERVES_END_SYSTEM_ERROR) {
    return preserves_reader_error_result();
  }

  preserves_index_entry_t *end_marker =
    _preserves_reader_emit_entry(r, NULL, (preserves_index_entry_t) {
        .type = PRESERVES_END_MARKER,
        .repr = PRESERVES_REPR_NONE,
        .len = 0,
        .data = { ._err = code },
      });
  if (end_marker == NULL) return preserves_reader_error_result();

  // Fetch the base pointer only *after* emitting: emitting may grow (and
  // therefore move) the index array.
  preserves_index_entry_t *index = &PRESERVES_ARRAY_ELEMENT(&r->index, preserves_index_entry_t, 0);
  return (preserves_reader_result_t) { .index = index, .end_marker = end_marker };
}

// Reads a little-endian base-128 varint. On success sets `*code` to
// PRESERVES_END_NO_ERROR and returns the value; otherwise sets `*code` to
// INCOMPLETE_INPUT or VARINT_TOO_BIG and returns 0.
static inline size_t _preserves_reader_varint(preserves_reader_t *r, preserves_error_code_t *code) {
  unsigned int shift_amount = 0;
  size_t result = 0;
  while (true) {
    int b = _preserves_reader_next(r);
    if (b == -1) {
      *code = PRESERVES_END_INCOMPLETE_INPUT;
      return 0;
    }
    // Widen before shifting: shifting an `int` by 28 or more bits overflows.
    result |= ((size_t) (b & 0x7f)) << shift_amount;
    if (b & 0x80) {
      shift_amount += 7;
      if (shift_amount > ((sizeof(size_t) * 8) - 7)) {
        *code = PRESERVES_END_VARINT_TOO_BIG;
        return 0;
      }
    } else {
      *code = PRESERVES_END_NO_ERROR;
      return result;
    }
  }
}

// Emits a SIGNED_INTEGER entry whose value fits in 64 bits.
static inline preserves_index_entry_t *_preserves_emit_small_int(preserves_reader_t *r,
                                                                 size_t *count_ptr,
                                                                 bool is_unsigned,
                                                                 int64_t value)
{
  return _preserves_reader_emit_entry(r, count_ptr, (preserves_index_entry_t) {
      .type = PRESERVES_SIGNED_INTEGER,
      .repr = is_unsigned ? PRESERVES_INT_UNSIGNED : PRESERVES_INT_SIGNED,
      .len = 0,
      .data = { ._signed = value },
    });
}

// Decodes `len` bytes of big-endian two's-complement integer and emits the
// corresponding entry. Values that do not fit in 64 bits are emitted as
// INT_LARGE_BINARY references into the input. Returns 0 on success, -1 if the
// input is too short or the entry could not be stored.
static inline int _preserves_reader_decode_intbytes(preserves_reader_t *r,
                                                    size_t *count_ptr,
                                                    size_t len)
{
  if (len == 0) {
    // An empty body denotes zero; handled up front because
    // _preserves_reader_next_bytes cannot distinguish "zero bytes at end of
    // input" from "input too short".
    return (_preserves_emit_small_int(r, count_ptr, false, 0) == NULL) ? -1 : 0;
  }

  size_t starting_pos = r->input_pos;
  uint8_t *bs = _preserves_reader_next_bytes(r, len);
  if (bs == NULL) return -1;

  // Leading zero bytes mark the value as non-negative.
  bool is_unsigned = false;
  size_t remaining = len;
  while ((remaining > 0) && (*bs == 0)) {
    is_unsigned = true;
    bs++;
    remaining--;
  }

  if (remaining == 0) {
    // This shouldn't happen, but it does have a denotation.
    return (_preserves_emit_small_int(r, count_ptr, is_unsigned, 0) == NULL) ? -1 : 0;
  }

  if (remaining > 8) {
    if (is_unsigned && (*bs & 0x80)) {
      // Keep one leading zero so the referenced bytes still read as positive.
      remaining++;
      bs--;
    }
    return (_preserves_reader_emit_entry(r, count_ptr, (preserves_index_entry_t) {
          .type = PRESERVES_SIGNED_INTEGER,
          .repr = PRESERVES_INT_LARGE_BINARY,
          .len = remaining,
          .data = { ._unsigned = starting_pos + (len - remaining) },
        }) == NULL) ? -1 : 0;
  }

  // Sign-extend: a value with no leading zero byte and its top bit set is
  // negative, so start from all-ones and shift the payload in.
  uint64_t buf = (!is_unsigned && (*bs & 0x80)) ? UINT64_MAX : 0;
  while (remaining > 0) {
    remaining--;
    buf = (buf << 8) | (uint64_t) *bs;
    bs++;
  }
  int64_t value;
  memcpy(&value, &buf, sizeof value);
  return (_preserves_emit_small_int(r, count_ptr, is_unsigned, value) == NULL) ? -1 : 0;
}

// True if `b` is a UTF-8 continuation byte.
static inline bool utf8_tail(uint8_t b) {
  return (b >= 0x80 && b <= 0xbf);
}

// Validates `len` bytes at `bs` as well-formed UTF-8 per
// https://datatracker.ietf.org/doc/html/rfc3629#section-4
// Returns 0 if valid, -1 otherwise.
static inline int check_utf8(uint8_t const *bs, size_t len) {
  size_t i = 0;
  while (i < len) {
    uint8_t b0 = bs[i];
    if (b0 < 0x80) {
      i++;
      continue;
    }

    size_t tail_count;
    // Permitted range for the *first* continuation byte; narrowed for lead
    // bytes where the full range would admit overlong forms, surrogates or
    // code points beyond U+10FFFF.
    uint8_t lo = 0x80;
    uint8_t hi = 0xbf;
    if (b0 >= 0xc2 && b0 <= 0xdf) {
      tail_count = 1;
    } else if (b0 >= 0xe0 && b0 <= 0xef) {
      tail_count = 2;
      if (b0 == 0xe0) lo = 0xa0;
      if (b0 == 0xed) hi = 0x9f;
    } else if (b0 >= 0xf0 && b0 <= 0xf4) {
      tail_count = 3;
      if (b0 == 0xf0) lo = 0x90;
      if (b0 == 0xf4) hi = 0x8f;
    } else {
      // Stray continuation byte, overlong lead (C0/C1) or lead above F4.
      return -1;
    }

    if ((len - i - 1) < tail_count) return -1;
    if (bs[i + 1] < lo || bs[i + 1] > hi) return -1;
    for (size_t k = 2; k <= tail_count; k++) {
      if (!utf8_tail(bs[i + k])) return -1;
    }
    i += tail_count + 1;
  }
  return 0;
}

// Reads a varint length followed by that many bytes and emits an entry of
// the given `type` referring to them, optionally validating them as UTF-8.
// Returns PRESERVES_END_NO_ERROR on success, or the relevant error code.
static inline preserves_error_code_t _preserves_reader_read_stringlike(preserves_reader_t *r,
                                                                       size_t *count_ptr,
                                                                       preserves_type_tag_t type,
                                                                       bool should_check_utf8)
{
  preserves_error_code_t varint_err = PRESERVES_END_NO_ERROR;
  size_t len = _preserves_reader_varint(r, &varint_err);
  if (varint_err != PRESERVES_END_NO_ERROR) return varint_err;

  size_t starting_pos = r->input_pos;
  uint8_t *maybe_utf = _preserves_reader_next_bytes(r, len);
  // NULL is only legitimate for an empty body at end of input.
  if ((maybe_utf == NULL) && (len > 0)) {
    return PRESERVES_END_INCOMPLETE_INPUT;
  }
  if (should_check_utf8 && (check_utf8(maybe_utf, len) == -1)) {
    return PRESERVES_END_INVALID_UTF8;
  }

  if (_preserves_reader_emit_entry(r, count_ptr, (preserves_index_entry_t) {
        .type = type,
        .repr = PRESERVES_LITERAL,
        .len = len,
        .data = { ._unsigned = starting_pos },
      }) == NULL) {
    return PRESERVES_END_SYSTEM_ERROR;
  }
  return PRESERVES_END_NO_ERROR;
}

// Emits an opening entry of `type` and pushes its index position on the
// stack (growing the stack by 32 slots when full). Returns the entry, or
// NULL on allocation failure.
static inline preserves_index_entry_t *_preserves_reader_push(preserves_reader_t *r,
                                                              preserves_type_tag_t type)
{
  preserves_index_entry_t *ix = _preserves_reader_emit_entry(r, NULL, (preserves_index_entry_t) {
      .type = type,
      .repr = PRESERVES_REPR_NONE,
      .len = 0,
      .data = { ._unsigned = 0 }});
  if (ix == NULL) return NULL;

  size_t limit = PRESERVES_ARRAY_ELEMENT_COUNT(&r->stack, size_t);
  if (r->stack_top >= limit) {
    limit += 32;
    if (PRESERVES_RESIZE_ARRAY(&r->stack, size_t, limit) == -1) return NULL;
  }
  PRESERVES_ARRAY_ELEMENT(&r->stack, size_t, r->stack_top) = r->index_pos - 1;
  r->stack_top++;

  return ix;
}

#endif /* PRESERVES_IMPLEMENTATION */

// Given an index entry, returns the entry of the value it denotes, skipping
// over an ANNOTATION header (and its annotations) if present. NULL-safe.
// Aborts if the annotated item is itself an ANNOTATION entry.
PRESERVES_INLINE preserves_index_entry_t *preserves_skip_annotations(preserves_index_entry_t *ix) {
  if (ix == NULL) return NULL;
  if (ix->type != PRESERVES_ANNOTATION) return ix;
  ix += ix->data._unsigned;
  if (ix->type == PRESERVES_ANNOTATION) abort();
  return ix;
}

#define RETURN_ON_FAIL(e)                                               \
  do { if ((e) == NULL) return preserves_reader_error_result(); } while (0)

// Continues decoding from the reader's current position until `count`
// top-level values have been read, the input ends, or an error occurs.
// The returned end marker carries the completion code.
PRESERVES_OUTOFLINE
(preserves_reader_result_t preserves_read_binary_continue(preserves_reader_t *r, size_t count), {
  while (count) {
    int b = _preserves_reader_next(r);
    if (b == -1) return _preserves_reader_finish(r, PRESERVES_END_INCOMPLETE_INPUT);

    switch (b) {
      case PRESERVES_BINARY_FORMAT_TAG_FALSE:
        RETURN_ON_FAIL(_preserves_reader_emit_entry(r, &count, (preserves_index_entry_t) {
              .type = PRESERVES_BOOLEAN,
              .repr = PRESERVES_REPR_NONE,
              .len = 0,
              .data = { ._boolean = false }}));
        break;

      case PRESERVES_BINARY_FORMAT_TAG_TRUE:
        RETURN_ON_FAIL(_preserves_reader_emit_entry(r, &count, (preserves_index_entry_t) {
              .type = PRESERVES_BOOLEAN,
              .repr = PRESERVES_REPR_NONE,
              .len = 0,
              .data = { ._boolean = true }}));
        break;

      case PRESERVES_BINARY_FORMAT_TAG_IEEE754: {
        preserves_error_code_t varint_err = PRESERVES_END_NO_ERROR;
        size_t len = _preserves_reader_varint(r, &varint_err);
        if (varint_err != PRESERVES_END_NO_ERROR) return _preserves_reader_finish(r, varint_err);
        uint8_t *bs = _preserves_reader_next_bytes(r, len);
        if (bs == NULL) return _preserves_reader_finish(r, PRESERVES_END_INCOMPLETE_INPUT);
        switch (len) {
          case 8: {
            // Big-endian IEEE 754 binary64: swap each half into host order.
            uint32_t lo, hi;
            memcpy(&hi, bs, 4);
            memcpy(&lo, bs + 4, 4);
            lo = ntohl(lo);
            hi = ntohl(hi);
            uint64_t i = (((uint64_t) hi) << 32) | ((uint64_t) lo);
            double f;
            memcpy(&f, &i, 8);
            RETURN_ON_FAIL(_preserves_reader_emit_entry(r, &count, (preserves_index_entry_t) {
                  .type = PRESERVES_DOUBLE,
                  .repr = PRESERVES_REPR_NONE,
                  .len = 0,
                  .data = { ._double = f }}));
            break;
          }
          default:
            return _preserves_reader_finish(r, PRESERVES_END_INVALID_IEEE754);
        }
        break;
      }

      case PRESERVES_BINARY_FORMAT_TAG_END: {
        if (r->stack_top == 0) {
          return _preserves_reader_finish(r, PRESERVES_END_UNEXPECTED_END);
        }
        preserves_index_entry_t *base = _preserves_reader_finish_seq(r);
        _preserves_reader_inc_collection_len(r, &count);
        if ((base->type == PRESERVES_DICTIONARY) && ((base->len % 2) != 0)) {
          return _preserves_reader_finish(r, PRESERVES_END_DICTIONARY_MISSING_VALUE);
        }
        if ((base->type == PRESERVES_RECORD) && (base->len == 0)) {
          return _preserves_reader_finish(r, PRESERVES_END_RECORD_MISSING_LABEL);
        }
        break;
      }

      case PRESERVES_BINARY_FORMAT_TAG_ANNOTATION:
        // Open a new annotation collector unless one is already collecting
        // annotations for the value we are about to read.
        if (r->annotation_tag_seen || !_preserves_reader_in_annotations(r)) {
          RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_ANNOTATION));
        }
        r->annotation_tag_seen = true;
        break;

      case PRESERVES_BINARY_FORMAT_TAG_EMBEDDED:
        RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_EMBEDDED));
        break;

      case PRESERVES_BINARY_FORMAT_TAG_SIGNED_INTEGER: {
        preserves_error_code_t varint_err = PRESERVES_END_NO_ERROR;
        size_t len = _preserves_reader_varint(r, &varint_err);
        if (varint_err != PRESERVES_END_NO_ERROR) return _preserves_reader_finish(r, varint_err);
        if (_preserves_reader_decode_intbytes(r, &count, len) == -1) {
          return _preserves_reader_finish(r, PRESERVES_END_INCOMPLETE_INPUT);
        }
        break;
      }

      case PRESERVES_BINARY_FORMAT_TAG_STRING: {
        preserves_error_code_t code =
          _preserves_reader_read_stringlike(r, &count, PRESERVES_STRING, true);
        if (code != PRESERVES_END_NO_ERROR) return _preserves_reader_finish(r, code);
        break;
      }

      case PRESERVES_BINARY_FORMAT_TAG_BYTE_STRING: {
        preserves_error_code_t code =
          _preserves_reader_read_stringlike(r, &count, PRESERVES_BYTE_STRING, false);
        if (code != PRESERVES_END_NO_ERROR) return _preserves_reader_finish(r, code);
        break;
      }

      case PRESERVES_BINARY_FORMAT_TAG_SYMBOL: {
        preserves_error_code_t code =
          _preserves_reader_read_stringlike(r, &count, PRESERVES_SYMBOL, true);
        if (code != PRESERVES_END_NO_ERROR) return _preserves_reader_finish(r, code);
        break;
      }

      case PRESERVES_BINARY_FORMAT_TAG_RECORD:
        RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_RECORD));
        break;

      case PRESERVES_BINARY_FORMAT_TAG_SEQUENCE:
        RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_SEQUENCE));
        break;

      case PRESERVES_BINARY_FORMAT_TAG_SET:
        RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_SET));
        break;

      case PRESERVES_BINARY_FORMAT_TAG_DICTIONARY:
        RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_DICTIONARY));
        break;

      default:
        return _preserves_reader_finish(r, PRESERVES_END_INVALID_TAG);
    }
  }

  return _preserves_reader_finish(r, (_preserves_reader_ateof(r) ? PRESERVES_END_EOF :
                                      (r->stack_top > 0) ? PRESERVES_END_INCOMPLETE_INPUT :
                                      PRESERVES_END_MORE_INPUT_REMAINING));
});

#undef RETURN_ON_FAIL

// Starts decoding `input` from scratch, reading up to `count` top-level
// values. The contents of `input` are moved into the reader (leaving `input`
// empty); any previous reader input is freed. Index storage is reused.
PRESERVES_OUTOFLINE
(preserves_reader_result_t preserves_read_binary(preserves_reader_t *r,
                                                 preserves_bytes_t *input,
                                                 size_t count), {
  {
    size_t required_stack_bytes = MINIMUM_PRESERVES_READER_STACK_SIZE * sizeof(size_t);
    if (r->stack.len < required_stack_bytes) {
      if (preserves_resize_bytes(&r->stack, required_stack_bytes) == -1) {
        return preserves_reader_error_result();
      }
    }
  }

  r->stack_top = 0;
  r->input_pos = 0;
  r->index_pos = 0;
  r->annotation_tag_seen = false;
  preserves_bytes_move(&r->input, input);

  return preserves_read_binary_continue(r, count);
});

///////////////////////////////////////////////////////////////////////////
// Debug utilities

#ifndef NDEBUG
#ifdef PRESERVES_IMPLEMENTATION

// Writes `data` to `f` between >>> and <<<, hex-escaping bytes outside
// the range 0x20..0x7f.
static void preserves_dump_bytes(FILE *f, preserves_bytes_t *data) {
  fprintf(f, ">>>");
  for (size_t i = 0; i < data->len; i++) {
    uint8_t c = PRESERVES_ARRAY_ELEMENT(data, uint8_t, i);
    if (c < 0x20 || c >= 0x80) {
      fprintf(f, "\\x%02x", c);
    } else {
      fprintf(f, "%c", c);
    }
  }
  fprintf(f, "<<<");
}

// Writes a one-line human-readable description of index entry `i` to `f`,
// resolving offsets against `input`.
void preserves_dump_index_entry(FILE *f,
                                preserves_bytes_t *input,
                                preserves_index_entry_t *i,
                                bool add_newline)
{
  fprintf(f, "%s %s length %zu",
          preserves_type_tag_name(i->type),
          i->repr == PRESERVES_REPR_NONE ? "-" : preserves_index_entry_representation_name(i->repr),
          (size_t) i->len);
  switch (i->type) {
    case PRESERVES_BOOLEAN:
      fputs(i->data._boolean ? " #t" : " #f", f);
      break;
    case PRESERVES_DOUBLE:
      fprintf(f, " %f", i->data._double);
      break;

    case PRESERVES_STRING:
    case PRESERVES_BYTE_STRING:
    case PRESERVES_COMPACT:
    case PRESERVES_SYMBOL: {
      fprintf(f, " offset %" PRIu64 " ", i->data._unsigned);
      preserves_bytes_t data = preserves_bytes_subsequence(input, i->data._unsigned, i->len);
      preserves_dump_bytes(f, &data);
      break;
    }

    case PRESERVES_RECORD:
    case PRESERVES_SEQUENCE:
    case PRESERVES_SET:
    case PRESERVES_DICTIONARY:
      fprintf(f, " skip %" PRIu64, i->data._unsigned - 1);
      break;

    case PRESERVES_ANNOTATION:
      fprintf(f, " annotated after %" PRIu64, i->data._unsigned - 1);
      break;

    case PRESERVES_EMBEDDED:
      break;

    case PRESERVES_END_MARKER:
      fprintf(f, ": %s", preserves_error_code_name(i->data._err));
      break;

    case PRESERVES_SIGNED_INTEGER:
      switch (i->repr) {
        case PRESERVES_INT_SIGNED:
          fprintf(f, ": %" PRId64, i->data._signed);
          break;
        case PRESERVES_INT_LARGE_BINARY:
        case PRESERVES_INT_LARGE_TEXT: {
          fprintf(f, " offset %" PRIu64 " ", i->data._unsigned);
          preserves_bytes_t data = preserves_bytes_subsequence(input, i->data._unsigned, i->len);
          preserves_dump_bytes(f, &data);
          break;
        }
        case PRESERVES_INT_UNSIGNED:
        default:
          fprintf(f, ": %" PRIu64, i->data._unsigned);
          break;
      }
      break;

    default:
      fprintf(f, ": %" PRIu64 " (%" PRId64 ")", i->data._unsigned, i->data._signed);
      break;
  }
  if (add_newline) {
    fprintf(f, "\n");
  }
}

#endif /* PRESERVES_IMPLEMENTATION */
#endif /* !NDEBUG */

///////////////////////////////////////////////////////////////////////////

#undef PRESERVES_INLINE
#undef PRESERVES_IMPLEMENTATION_CHUNK
#undef PRESERVES_OUTOFLINE

#endif