preserves/implementations/c/preserves.h

1053 lines
37 KiB
C

/// SPDX-License-Identifier: Apache-2.0
/// SPDX-FileCopyrightText: Copyright © 2022 Tony Garnock-Jones <tonyg@leastfixedpoint.com>
#ifndef libpreserves_26109214_f3bd_44c8_95ba_8c650c954965
#define libpreserves_26109214_f3bd_44c8_95ba_8c650c954965
// Single file header. #define PRESERVES_IMPLEMENTATION before including this
// header (normally in a single translation unit) to get the implementations.
//
// PRESERVES_INLINE: prefix for the small helper functions defined in this
//   header. Expands to nothing when PRESERVES_IMPLEMENTATION is defined, and to
//   `static inline` otherwise.
// PRESERVES_IMPLEMENTATION_CHUNK(...): expands to its arguments when
//   PRESERVES_IMPLEMENTATION is defined, and to nothing otherwise.
#ifdef PRESERVES_IMPLEMENTATION
#define PRESERVES_INLINE
#define PRESERVES_IMPLEMENTATION_CHUNK(...) __VA_ARGS__
#else
#define PRESERVES_INLINE static inline
#define PRESERVES_IMPLEMENTATION_CHUNK(...)
#endif
// PRESERVES_OUTOFLINE(declaration, body): always emits `extern declaration;`,
// and, only when PRESERVES_IMPLEMENTATION is defined, additionally emits
// `inline declaration body` (the function definition itself).
#define PRESERVES_OUTOFLINE(declaration, ...) \
extern declaration; \
PRESERVES_IMPLEMENTATION_CHUNK(inline declaration __VA_ARGS__)
#include <inttypes.h> // for PRIu64, PRId64 (debug dump)
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h> // for FILE, fprintf (debug dump)
#include <stdlib.h>
#include <string.h>
#include <arpa/inet.h> // for ntohl, htonl
///////////////////////////////////////////////////////////////////////////
// General-purpose fat pointer, for e.g. strings, binary blobs, etc.
// A (pointer, byte length) pair plus an ownership flag.
typedef struct preserves_bytes {
bool borrowed:1; // when set, preserves_free_bytes does not free ptr, and resizing/extending aborts
size_t len:(sizeof(size_t) * 8 - 1); // length in bytes; one bit narrower than size_t, leaving room for `borrowed`
void *ptr; // start of the data; NULL for a freshly created (empty) value
} preserves_bytes_t;
// Helpers for treating a preserves_bytes_t as an array of `element_type`.
// Each takes a POINTER to a preserves_bytes_t as `bytes_ptr`.

// Number of whole `element_type` items that fit in the byte length.
#define PRESERVES_ARRAY_ELEMENT_COUNT(bytes_ptr, element_type) \
((bytes_ptr)->len / sizeof(element_type))
// The `index`th element, as an lvalue. No bounds checking is performed.
#define PRESERVES_ARRAY_ELEMENT(bytes_ptr, element_type, index) \
(((element_type *) (bytes_ptr)->ptr)[index])
// Resize to hold exactly `size` elements; yields the result of
// preserves_resize_bytes (0 on success, -1 on allocation failure).
#define PRESERVES_RESIZE_ARRAY(bytes_ptr, element_type, size) \
preserves_resize_bytes(bytes_ptr, sizeof(element_type) * (size))
// Declares two local variables in the current scope: `length_var` (element
// count) and `base_ptr_var` (pointer to element 0). Expands to two
// declarations, so it must be used where declarations are allowed.
#define PRESERVES_ARRAY_ACCESS(bytes_ptr, element_type, length_var, base_ptr_var) \
size_t length_var = PRESERVES_ARRAY_ELEMENT_COUNT(bytes_ptr, element_type); \
element_type *base_ptr_var = &PRESERVES_ARRAY_ELEMENT(bytes_ptr, element_type, 0)
// Returns an empty, owned (non-borrowed) byte buffer: length 0, NULL pointer.
PRESERVES_INLINE preserves_bytes_t preserves_create_bytes(void) {
    preserves_bytes_t empty = { .borrowed = 0, .len = 0, .ptr = NULL };
    return empty;
}
// Resizes an owned buffer to exactly `size` bytes.
//
// - Aborts if `bs` is borrowed.
// - A size of 0 frees the storage and leaves an empty buffer.
// - When growing, the newly added tail is zero-filled.
//
// Returns 0 on success, or -1 if reallocation fails (in which case `bs` is
// left untouched).
PRESERVES_INLINE int preserves_resize_bytes(preserves_bytes_t *bs, size_t size) {
    if (bs->borrowed) abort();

    if (size != 0) {
        void *resized = realloc(bs->ptr, size);
        if (resized == NULL) return -1;

        size_t previous_len = bs->len;
        bs->ptr = resized;
        if (previous_len < size) {
            // Zero only the freshly exposed bytes.
            memset(((uint8_t *) resized) + previous_len, 0, size - previous_len);
        }
        bs->len = size;
        return 0;
    }

    free(bs->ptr);
    bs->ptr = NULL;
    bs->len = 0;
    return 0;
}
// Releases the storage of `bs` if it is owned (borrowed storage is left
// alone), then resets `bs` to the empty, non-borrowed state.
PRESERVES_INLINE void preserves_free_bytes(preserves_bytes_t *bs) {
    if (bs->borrowed == 0) {
        preserves_resize_bytes(bs, 0);
    }
    bs->borrowed = 0;
    bs->len = 0;
    bs->ptr = NULL;
}
// Transfers the contents of `src` into `dest`: whatever `dest` previously held
// is freed first, and `src` is left empty afterwards.
PRESERVES_INLINE void preserves_bytes_move(preserves_bytes_t *dest, preserves_bytes_t *src) {
    preserves_free_bytes(dest);
    preserves_bytes_t moved = *src;
    *src = preserves_create_bytes();
    *dest = moved;
}
// Appends a copy of the bytes of `src` to the end of the owned buffer `dest`.
//
// - Aborts if `dest` is borrowed.
// - Appending zero bytes is a no-op that always succeeds. (Previously this
//   could call realloc with size 0 and memcpy from a NULL pointer.)
// - `src` must not point into `dest`'s own storage: realloc may move it.
//
// Returns 0 on success, or -1 if the combined length is not representable in
// the `len` field or if reallocation fails; `dest` is unchanged on failure.
PRESERVES_INLINE int preserves_extend_bytes(preserves_bytes_t *dest, preserves_bytes_t src) {
    if (dest->borrowed) abort();

    size_t old_len = dest->len;
    size_t extra_len = src.len;
    if (extra_len == 0) return 0;

    // `len` is one bit narrower than size_t; refuse lengths it cannot hold.
    size_t max_len = SIZE_MAX >> 1;
    if (extra_len > max_len - old_len) return -1;

    void *ptr = realloc(dest->ptr, old_len + extra_len);
    if (ptr == NULL) return -1;
    dest->ptr = ptr;
    memcpy(((uint8_t *) ptr) + old_len, src.ptr, extra_len);
    dest->len = old_len + extra_len;
    return 0;
}
// Returns a borrowed view of up to `len` bytes of `bs`, starting at `offset`.
//
// - If `offset` is at or past the end of `bs`, or if `len` exceeds the total
//   length of `bs`, an empty buffer (NULL pointer, length 0) is returned.
// - Otherwise, if fewer than `len` bytes remain after `offset`, the view is
//   shortened to the bytes that are available.
PRESERVES_INLINE preserves_bytes_t preserves_bytes_subsequence(preserves_bytes_t *bs,
                                                               size_t offset,
                                                               size_t len) {
    size_t total = bs->len;
    if ((offset >= total) || (len > total)) {
        return preserves_create_bytes();
    }

    size_t available = total - offset;
    size_t view_len = (len > available) ? available : len;

    preserves_bytes_t view = {
        .borrowed = 1,
        .len = view_len,
        .ptr = ((uint8_t *) bs->ptr) + offset,
    };
    return view;
}
///////////////////////////////////////////////////////////////////////////
// Memory arenas
// Arena allocator state. Small allocations are carved out of fixed-size pages;
// allocations whose (rounded-up) size exceeds `pagesize` get a block of their own.
typedef struct preserves_pool {
size_t pagesize; // size in bytes of each page
preserves_bytes_t page_pointers; // array of void*: pages, for allocations no larger than pagesize
preserves_bytes_t large_block_pointers; // array of void*: individual blocks, for allocations larger than pagesize
size_t next_page; // index into page_pointers of the next page to (re)use
uint8_t *alloc_block_base; // page currently being carved up, or NULL if none
size_t alloc_block_used; // bytes already handed out from the current page; equals pagesize when there is no usable page
} preserves_pool_t;
// Builds an empty pool that will allocate pages of `pagesize` bytes. No memory
// is allocated yet; the "used" counter starts at `pagesize` so that the first
// small allocation is forced to obtain a page.
PRESERVES_INLINE preserves_pool_t preserves_create_pool(size_t pagesize) {
    preserves_pool_t pool;
    pool.pagesize = pagesize;
    pool.page_pointers = preserves_create_bytes();
    pool.large_block_pointers = preserves_create_bytes();
    pool.next_page = 0;
    pool.alloc_block_base = NULL;
    pool.alloc_block_used = pagesize;
    return pool;
}
// Treats `bl` as an array of `void *`, frees every block it points to, then
// frees the array itself (leaving `bl` empty).
PRESERVES_INLINE void preserves_free_blocklist(preserves_bytes_t *bl) {
    size_t remaining = PRESERVES_ARRAY_ELEMENT_COUNT(bl, void *);
    void **cursor = &PRESERVES_ARRAY_ELEMENT(bl, void *, 0);
    while (remaining > 0) {
        free(*cursor);
        cursor++;
        remaining--;
    }
    preserves_free_bytes(bl);
}
// Resets the pool for reuse: all large blocks are freed, while the pages are
// kept and will be handed out again starting from the first one.
PRESERVES_INLINE void preserves_recycle_pool(preserves_pool_t *pool) {
    // Mark "no current page" first; these fields are independent of the blocklist.
    pool->alloc_block_base = NULL;
    pool->alloc_block_used = pool->pagesize;
    pool->next_page = 0;
    preserves_free_blocklist(&pool->large_block_pointers);
}
// Releases everything owned by the pool: large blocks (via recycling) and then
// all pages. The pool is left in its empty, reusable state.
PRESERVES_INLINE void preserves_free_pool(preserves_pool_t *pool) {
    preserves_bytes_t *pages = &pool->page_pointers;
    preserves_recycle_pool(pool);
    preserves_free_blocklist(pages);
}
// Allocates a zero-filled block of `blocksize` bytes and appends its address to
// the `void *` array held in `bl`. Returns the block, or NULL if either the
// block or the array growth could not be allocated (nothing is leaked).
PRESERVES_OUTOFLINE(void *_preserves_pool_record_block(preserves_bytes_t *bl, size_t blocksize), {
    void *block = calloc(1, blocksize);
    if (block != NULL) {
        if (preserves_resize_bytes(bl, bl->len + sizeof(void *)) != -1) {
            size_t last = PRESERVES_ARRAY_ELEMENT_COUNT(bl, void *) - 1;
            PRESERVES_ARRAY_ELEMENT(bl, void *, last) = block;
        } else {
            free(block);
            block = NULL;
        }
    }
    return block;
});
// Slow path of small allocation: makes the next page current (reusing a
// previously allocated page when one is available, otherwise allocating a new
// one) and hands out its first `count` bytes in `*bs` as a borrowed buffer.
// Returns 0 on success, -1 if a new page could not be allocated.
PRESERVES_OUTOFLINE
(
 int _preserves_pool_add_page_and_alloc(preserves_pool_t *pool, preserves_bytes_t *bs, size_t count), {
    size_t known_pages = PRESERVES_ARRAY_ELEMENT_COUNT(&pool->page_pointers, void *);
    uint8_t *page;

    if (pool->next_page < known_pages) {
        // Reuse a page retained from before the pool was recycled.
        page = PRESERVES_ARRAY_ELEMENT(&pool->page_pointers, void *, pool->next_page);
        pool->next_page++;
    } else {
        page = _preserves_pool_record_block(&pool->page_pointers, pool->pagesize);
        if (page == NULL) return -1;
        pool->next_page = PRESERVES_ARRAY_ELEMENT_COUNT(&pool->page_pointers, void *);
    }

    pool->alloc_block_base = page;
    pool->alloc_block_used = count;

    preserves_bytes_t handed_out = { .borrowed = 1, .len = count, .ptr = page };
    *bs = handed_out;
    return 0;
 });
// Allocates `count` bytes from `pool`, storing a borrowed buffer in `*bs`.
//
// - Whatever `*bs` previously held is released first.
// - `count == 0` leaves `*bs` empty and succeeds.
// - `count` is rounded up to a multiple of `alignment`, which must be a power
//   of two (0 is treated as 1, i.e. no alignment).
// - Requests larger than the page size get a dedicated zero-filled block;
//   smaller ones are carved from the current page, moving on to a further page
//   when the current one is too full.
// - The starting offset within the page is also rounded up to `alignment`, so
//   callers mixing different alignments on one pool still get aligned
//   pointers. (Previously only the size was rounded.)
//
// Returns 0 on success, -1 on allocation failure.
PRESERVES_INLINE int preserves_pool_alloc_bytes_align(preserves_pool_t *pool,
                                                      preserves_bytes_t *bs,
                                                      size_t count,
                                                      size_t alignment) {
    preserves_free_bytes(bs);
    if (count == 0) return 0;
    if (alignment == 0) alignment = 1;

    size_t mask = alignment - 1;
    count = (count + mask) & ~mask;
    // ^ round up to nearest `alignment`-byte boundary

    if (count > pool->pagesize) {
        void *ptr = _preserves_pool_record_block(&pool->large_block_pointers, count);
        if (ptr == NULL) return -1;
        *bs = (preserves_bytes_t) { .borrowed = 1, .len = count, .ptr = ptr };
        return 0;
    }

    // Align the starting offset too. With no current page, alloc_block_used
    // equals pagesize, so this test fails and we fall through to the slow path.
    size_t offset = (pool->alloc_block_used + mask) & ~mask;
    if ((offset <= pool->pagesize) && (count <= pool->pagesize - offset)) {
        *bs = (preserves_bytes_t) {
            .borrowed = 1,
            .len = count,
            .ptr = pool->alloc_block_base + offset,
        };
        pool->alloc_block_used = offset + count;
        return 0;
    }

    return _preserves_pool_add_page_and_alloc(pool, bs, count);
}
// Allocates `count` bytes from `pool` into `*bs` using the default alignment
// of 16 bytes. Returns 0 on success, -1 on allocation failure.
PRESERVES_INLINE int preserves_pool_alloc_bytes(preserves_pool_t *pool,
                                                preserves_bytes_t *bs,
                                                size_t count) {
    size_t const default_alignment = 16;
    return preserves_pool_alloc_bytes_align(pool, bs, count, default_alignment);
}
///////////////////////////////////////////////////////////////////////////
// Binary codec details
// Tag byte values of the binary encoding, as dispatched on by
// preserves_read_binary_continue. Any other tag byte is rejected there with
// PRESERVES_END_INVALID_TAG.
typedef enum preserves_binary_format_tag {
PRESERVES_BINARY_FORMAT_TAG_FALSE = 0x80,
PRESERVES_BINARY_FORMAT_TAG_TRUE = 0x81,
PRESERVES_BINARY_FORMAT_TAG_END = 0x84, // closes the innermost open collection/record
PRESERVES_BINARY_FORMAT_TAG_ANNOTATION = 0x85,
PRESERVES_BINARY_FORMAT_TAG_EMBEDDED = 0x86,
PRESERVES_BINARY_FORMAT_TAG_IEEE754 = 0x87, // followed by a varint length and that many bytes
PRESERVES_BINARY_FORMAT_TAG_SIGNED_INTEGER = 0xB0, // followed by a varint length and that many bytes
PRESERVES_BINARY_FORMAT_TAG_STRING = 0xB1, // followed by a varint length and that many bytes
PRESERVES_BINARY_FORMAT_TAG_BYTE_STRING = 0xB2, // followed by a varint length and that many bytes
PRESERVES_BINARY_FORMAT_TAG_SYMBOL = 0xB3, // followed by a varint length and that many bytes
PRESERVES_BINARY_FORMAT_TAG_RECORD = 0xB4, // items follow, up to the matching END
PRESERVES_BINARY_FORMAT_TAG_SEQUENCE = 0xB5, // items follow, up to the matching END
PRESERVES_BINARY_FORMAT_TAG_SET = 0xB6, // items follow, up to the matching END
PRESERVES_BINARY_FORMAT_TAG_DICTIONARY = 0xB7, // keys and values follow, up to the matching END
} preserves_binary_format_tag_t;
// Maps a binary-format tag to a short printable name. Values that are not a
// known tag yield "UNKNOWN". The returned string is a constant; do not free it.
PRESERVES_OUTOFLINE
(
 char const *preserves_binary_format_tag_name(preserves_binary_format_tag_t tag), {
    char const *name = "UNKNOWN";
    switch (tag) {
        case PRESERVES_BINARY_FORMAT_TAG_FALSE: name = "FALSE"; break;
        case PRESERVES_BINARY_FORMAT_TAG_TRUE: name = "TRUE"; break;
        case PRESERVES_BINARY_FORMAT_TAG_END: name = "END"; break;
        case PRESERVES_BINARY_FORMAT_TAG_ANNOTATION: name = "ANNOTATION"; break;
        case PRESERVES_BINARY_FORMAT_TAG_EMBEDDED: name = "EMBEDDED"; break;
        case PRESERVES_BINARY_FORMAT_TAG_IEEE754: name = "IEEE754"; break;
        case PRESERVES_BINARY_FORMAT_TAG_SIGNED_INTEGER: name = "SIGNED_INTEGER"; break;
        case PRESERVES_BINARY_FORMAT_TAG_STRING: name = "STRING"; break;
        case PRESERVES_BINARY_FORMAT_TAG_BYTE_STRING: name = "BYTE_STRING"; break;
        case PRESERVES_BINARY_FORMAT_TAG_SYMBOL: name = "SYMBOL"; break;
        case PRESERVES_BINARY_FORMAT_TAG_RECORD: name = "RECORD"; break;
        case PRESERVES_BINARY_FORMAT_TAG_SEQUENCE: name = "SEQUENCE"; break;
        case PRESERVES_BINARY_FORMAT_TAG_SET: name = "SET"; break;
        case PRESERVES_BINARY_FORMAT_TAG_DICTIONARY: name = "DICTIONARY"; break;
        default: break;
    }
    return name;
 });
///////////////////////////////////////////////////////////////////////////
// Index representation
// Kind of an index entry. Stored in the 4-bit `type` field of
// preserves_index_entry_t, so the values must stay within what 4 bits can hold.
typedef enum preserves_type_tag {
PRESERVES_BOOLEAN = 0,
PRESERVES_DOUBLE,
PRESERVES_SIGNED_INTEGER,
PRESERVES_STRING,
PRESERVES_BYTE_STRING,
PRESERVES_COMPACT,
PRESERVES_SYMBOL,
PRESERVES_RECORD,
PRESERVES_SEQUENCE,
PRESERVES_SET,
PRESERVES_DICTIONARY,
PRESERVES_EMBEDDED,
PRESERVES_ANNOTATION,
PRESERVES_END_MARKER, // final entry of an index; carries a preserves_error_code_t
} preserves_type_tag_t;
// Maps an index-entry type tag to a short printable name. Unrecognized values
// yield "UNKNOWN". The returned string is a constant; do not free it.
PRESERVES_OUTOFLINE(char const *preserves_type_tag_name(preserves_type_tag_t type), {
    char const *name = "UNKNOWN";
    switch (type) {
        case PRESERVES_BOOLEAN: name = "BOOLEAN"; break;
        case PRESERVES_DOUBLE: name = "DOUBLE"; break;
        case PRESERVES_SIGNED_INTEGER: name = "SIGNED_INTEGER"; break;
        case PRESERVES_STRING: name = "STRING"; break;
        case PRESERVES_BYTE_STRING: name = "BYTE_STRING"; break;
        case PRESERVES_COMPACT: name = "COMPACT"; break;
        case PRESERVES_SYMBOL: name = "SYMBOL"; break;
        case PRESERVES_RECORD: name = "RECORD"; break;
        case PRESERVES_SEQUENCE: name = "SEQUENCE"; break;
        case PRESERVES_SET: name = "SET"; break;
        case PRESERVES_DICTIONARY: name = "DICTIONARY"; break;
        case PRESERVES_EMBEDDED: name = "EMBEDDED"; break;
        case PRESERVES_ANNOTATION: name = "ANNOTATION"; break;
        case PRESERVES_END_MARKER: name = "END_MARKER"; break;
        default: break;
    }
    return name;
});
// Outcome of a read, stored in the END_MARKER index entry (data._err).
typedef enum preserves_error_code {
PRESERVES_END_SYSTEM_ERROR = -2, // the reader returns a NULL result (no end marker) for this code
PRESERVES_END_NO_ERROR = -1, // used internally by the reader helpers to signal "no error"
PRESERVES_END_EOF = 0, // requested values were read and the input is exhausted
PRESERVES_END_MORE_INPUT_REMAINING, // requested values were read; unread input remains
PRESERVES_END_INCOMPLETE_INPUT, // input ended in the middle of a value
PRESERVES_END_UNEXPECTED_END, // END tag seen with no open collection
PRESERVES_END_DICTIONARY_MISSING_VALUE, // dictionary closed with an odd number of items
PRESERVES_END_RECORD_MISSING_LABEL, // record closed with no items at all
PRESERVES_END_VARINT_TOO_BIG, // length prefix does not fit in a size_t
PRESERVES_END_INVALID_UTF8, // string or symbol content failed UTF-8 validation
PRESERVES_END_INVALID_TAG, // unrecognized tag byte
PRESERVES_END_INVALID_IEEE754, // IEEE754 payload length other than 8 bytes
} preserves_error_code_t;
// Maps a reader outcome code to a short printable name. Unrecognized values
// yield "UNKNOWN". The returned string is a constant; do not free it.
PRESERVES_OUTOFLINE(char const *preserves_error_code_name(preserves_error_code_t code), {
    char const *name = "UNKNOWN";
    switch (code) {
        case PRESERVES_END_SYSTEM_ERROR: name = "SYSTEM_ERROR"; break;
        case PRESERVES_END_NO_ERROR: name = "NO_ERROR"; break;
        case PRESERVES_END_EOF: name = "EOF"; break;
        case PRESERVES_END_MORE_INPUT_REMAINING: name = "MORE_INPUT_REMAINING"; break;
        case PRESERVES_END_INCOMPLETE_INPUT: name = "INCOMPLETE_INPUT"; break;
        case PRESERVES_END_UNEXPECTED_END: name = "UNEXPECTED_END"; break;
        case PRESERVES_END_DICTIONARY_MISSING_VALUE: name = "DICTIONARY_MISSING_VALUE"; break;
        case PRESERVES_END_RECORD_MISSING_LABEL: name = "RECORD_MISSING_LABEL"; break;
        case PRESERVES_END_VARINT_TOO_BIG: name = "VARINT_TOO_BIG"; break;
        case PRESERVES_END_INVALID_UTF8: name = "INVALID_UTF8"; break;
        case PRESERVES_END_INVALID_TAG: name = "INVALID_TAG"; break;
        case PRESERVES_END_INVALID_IEEE754: name = "INVALID_IEEE754"; break;
        default: break;
    }
    return name;
});
// How the payload of an index entry is represented; which values apply to
// which entry type is described in the layout comment that follows the
// *_name function below. Stored in the 4-bit `repr` field of
// preserves_index_entry_t.
typedef enum preserves_index_entry_representation {
PRESERVES_REPR_NONE = 0,
PRESERVES_INT_SIGNED, // integer held directly in data._signed
PRESERVES_INT_UNSIGNED, // integer held directly in data._unsigned
PRESERVES_INT_LARGE_BINARY, // integer too wide for 64 bits; bytes live in the input
PRESERVES_INT_LARGE_TEXT,
PRESERVES_LITERAL,
PRESERVES_ESCAPED,
PRESERVES_HEX,
PRESERVES_BASE64,
} preserves_index_entry_representation_t;
// Maps an index-entry representation to a short printable name. Unrecognized
// values yield "UNKNOWN". The returned string is a constant; do not free it.
PRESERVES_OUTOFLINE
(
 char const *preserves_index_entry_representation_name(preserves_index_entry_representation_t repr), {
    char const *name = "UNKNOWN";
    switch (repr) {
        case PRESERVES_REPR_NONE: name = "REPR_NONE"; break;
        case PRESERVES_INT_SIGNED: name = "INT_SIGNED"; break;
        case PRESERVES_INT_UNSIGNED: name = "INT_UNSIGNED"; break;
        case PRESERVES_INT_LARGE_BINARY: name = "INT_LARGE_BINARY"; break;
        case PRESERVES_INT_LARGE_TEXT: name = "INT_LARGE_TEXT"; break;
        case PRESERVES_LITERAL: name = "LITERAL"; break;
        case PRESERVES_ESCAPED: name = "ESCAPED"; break;
        case PRESERVES_HEX: name = "HEX"; break;
        case PRESERVES_BASE64: name = "BASE64"; break;
        default: break;
    }
    return name;
 });
/*
PRESERVES_BOOLEAN: repr==PRESERVES_REPR_NONE, len=0, data._boolean
PRESERVES_DOUBLE: repr=PRESERVES_REPR_NONE, len=0, data._double
PRESERVES_SIGNED_INTEGER:
- repr==PRESERVES_INT_SIGNED -> len=0, data._signed
- repr==PRESERVES_INT_UNSIGNED -> len=0, data._unsigned
- repr==PRESERVES_INT_LARGE_BINARY -> len, data._unsigned as absolute offset within input
- repr==PRESERVES_INT_LARGE_TEXT -> len, data._unsigned as absolute offset within input
PRESERVES_STRING:
- repr=PRESERVES_LITERAL -> len, data._unsigned as absolute offset within input to utf-8 bytes
- repr=PRESERVES_ESCAPED -> len, data._unsigned as absolute offset within input to utf-8 bytes
that need String-style backslash-escapes interpreted
PRESERVES_BYTE_STRING:
- repr=PRESERVES_LITERAL -> len, data._unsigned as absolute offset within input to the raw bytes (not necessarily utf-8)
- repr=PRESERVES_ESCAPED -> len, data._unsigned as absolute offset within input to utf-8 bytes
that need ByteString-style backslash-escapes interpreted
- repr=PRESERVES_HEX -> len, data._unsigned as absolute offset within input to ASCII bytes of hex
- repr=PRESERVES_BASE64 -> len, data._unsigned as absolute offset within input to ASCII bytes of base64
PRESERVES_COMPACT:
- repr as for BYTE_STRING, but bytes denote a nested binary-encoded value.
PRESERVES_SYMBOL:
- repr=PRESERVES_LITERAL -> len, data._unsigned as absolute offset within input to utf-8 bytes
- repr=PRESERVES_ESCAPED -> len, data._unsigned as absolute offset within input to utf-8 bytes
that need Symbol-style backslash-escapes interpreted
PRESERVES_RECORD, PRESERVES_SEQUENCE, PRESERVES_SET, PRESERVES_DICTIONARY:
- repr==PRESERVES_REPR_NONE,
- len counts number of items:
- PRESERVES_RECORD -> number of fields plus one (for the label)
- PRESERVES_SEQUENCE -> number of items
- PRESERVES_SET -> number of items
- PRESERVES_DICTIONARY -> twice the number of key-value pairs
- data._unsigned as relative offset within index to next item,
starting from this entry; zero means "no end known"
PRESERVES_EMBEDDED: repr==PRESERVES_REPR_NONE, len==0, following item is the embedded value
PRESERVES_ANNOTATION:
- repr==PRESERVES_REPR_NONE,
- len counts number of annotations,
- data._unsigned as relative offset within index to annotated
item, starting from this entry; zero means "no end known"
- the annotated item will not be a PRESERVES_ANNOTATION
PRESERVES_END_MARKER: repr==PRESERVES_REPR_NONE, len==0, data._err
*/
// One entry of the index produced by the reader. The meaning of `repr`, `len`
// and `data` for each `type` is spelled out in the comment directly above.
// NOTE(review): `type` and `repr` are enum-typed 4-bit bitfields; this relies
// on the compiler treating them as unsigned so values 8..15 are representable
// — confirm for the target compilers.
typedef struct preserves_index_entry {
preserves_type_tag_t type:4; // what kind of entry this is
preserves_index_entry_representation_t repr:4; // how the payload is represented
uint64_t len:56; // byte length or item count, depending on `type`
union {
bool _boolean;
double _double;
int64_t _signed;
uint64_t _unsigned; // value, absolute input offset, or relative index offset, depending on `type`/`repr`
preserves_error_code_t _err; // for PRESERVES_END_MARKER
} data;
} preserves_index_entry_t;
// Debug-only (absent when NDEBUG is defined): prints a human-readable
// description of index entry `i` to `f`. `input` is the buffer that offsets in
// the entry refer to; a newline is appended when `add_newline` is true.
#ifndef NDEBUG
extern void preserves_dump_index_entry(FILE* f, preserves_bytes_t *input, preserves_index_entry_t *i, bool add_newline);
#endif
// State of the binary reader.
typedef struct preserves_reader {
preserves_bytes_t input; // bytes being decoded; taken over from the caller by preserves_read_binary
preserves_bytes_t index; // array of preserves_index_entry_t, grown on demand
preserves_bytes_t stack; // array of size_t: index positions of the currently open entries
size_t stack_top; /* ascending empty */ // number of entries on the stack
size_t input_pos; /* ascending full */ // offset of the next unread input byte
size_t index_pos; /* ascending empty */ // position where the next index entry will be written
bool annotation_tag_seen; // set by an ANNOTATION tag, cleared when the next entry is emitted
} preserves_reader_t;
// Result of a read. Both pointers are NULL on a system error (e.g. allocation
// failure). Otherwise they point into the reader's index buffer, so they are
// only valid until the reader is next used or freed.
typedef struct preserves_reader_result {
preserves_index_entry_t *index; // first entry of the index
preserves_index_entry_t *end_marker; // the PRESERVES_END_MARKER entry; data._err holds the outcome
} preserves_reader_result_t;
// The result value used to report a system error: both pointers are NULL.
PRESERVES_INLINE preserves_reader_result_t preserves_reader_error_result(void) {
    preserves_reader_result_t failed;
    failed.index = NULL;
    failed.end_marker = NULL;
    return failed;
}
// Builds a reader with no input, an empty index, an empty stack and all
// positions at zero. Nothing is allocated until the reader is used.
PRESERVES_INLINE preserves_reader_t preserves_create_reader(void) {
    preserves_reader_t reader;
    reader.input = preserves_create_bytes();
    reader.index = preserves_create_bytes();
    reader.stack = preserves_create_bytes();
    reader.stack_top = 0;
    reader.input_pos = 0;
    reader.index_pos = 0;
    reader.annotation_tag_seen = false;
    return reader;
}
// Releases the reader's input, index and stack buffers and returns every other
// field to its initial value, leaving the reader ready for reuse.
PRESERVES_OUTOFLINE(void preserves_free_reader(preserves_reader_t *r), {
    r->annotation_tag_seen = false;
    r->index_pos = 0;
    r->input_pos = 0;
    r->stack_top = 0;
    preserves_free_bytes(&r->input);
    preserves_free_bytes(&r->index);
    preserves_free_bytes(&r->stack);
});
PRESERVES_IMPLEMENTATION_CHUNK
(
// Minimum capacity, in size_t slots, that preserves_read_binary ensures for
// the reader's stack before decoding starts.
#define MINIMUM_PRESERVES_READER_STACK_SIZE 32
// NOTE(review): this typedef is not referenced in the portion of the file
// visible here — confirm whether it is still needed.
typedef uint64_t preserves_index_offset_t;
// True when every input byte has been consumed.
static inline bool _preserves_reader_ateof(preserves_reader_t *r) {
    size_t total = r->input.len;
    return !(r->input_pos < total);
}
// Returns the next input byte (0..255) without consuming it, or -1 at end of
// input.
static inline int _preserves_reader_peek(preserves_reader_t *r) {
    return _preserves_reader_ateof(r)
        ? -1
        : PRESERVES_ARRAY_ELEMENT(&r->input, uint8_t, r->input_pos);
}
// Consumes and returns the next input byte (0..255), or returns -1 (consuming
// nothing) at end of input.
static inline int _preserves_reader_next(preserves_reader_t *r) {
    int byte = _preserves_reader_peek(r);
    if (byte != -1) {
        r->input_pos++;
    }
    return byte;
}
// Consumes exactly `count` input bytes and returns a pointer to the first of
// them, or returns NULL (consuming nothing) when fewer than `count` bytes
// remain. Note that a request for zero bytes made at the very end of the input
// also yields NULL, because the underlying empty subsequence has a NULL
// pointer; callers must tell that case apart by checking `count`.
static inline void *_preserves_reader_next_bytes(preserves_reader_t *r, size_t count) {
    preserves_bytes_t window = preserves_bytes_subsequence(&r->input, r->input_pos, count);
    if (window.len == count) {
        r->input_pos += count;
        return window.ptr;
    }
    return NULL;
}
// Returns a pointer to index entry number `i`, growing the index (doubling
// its capacity, with a minimum of 16 entries) until `i` is in range. Returns
// NULL if growing fails. Growing may move the index, which invalidates
// previously returned entry pointers.
static inline preserves_index_entry_t *_preserves_reader_index_entry(preserves_reader_t *r,
                                                                     size_t i) {
    size_t capacity = PRESERVES_ARRAY_ELEMENT_COUNT(&r->index, preserves_index_entry_t);
    while (capacity <= i) {
        size_t doubled = capacity * 2;
        capacity = (doubled < 16) ? 16 : doubled;
        if (PRESERVES_RESIZE_ARRAY(&r->index, preserves_index_entry_t, capacity) == -1) {
            return NULL;
        }
    }
    preserves_index_entry_t *entries = (preserves_index_entry_t *) r->index.ptr;
    return &entries[i];
}
// Returns the index position stored in the topmost stack slot.
//
// Aborts if the stack is empty or if `stack_top` exceeds the allocated stack
// capacity. (The previous check aborted when the stack was exactly full, and
// let an empty stack read the slot before the start of the array.)
static inline size_t _preserves_reader_stack_peek(preserves_reader_t *r) {
    size_t capacity = PRESERVES_ARRAY_ELEMENT_COUNT(&r->stack, size_t);
    if ((r->stack_top == 0) || (r->stack_top > capacity)) {
        abort();
    }
    return PRESERVES_ARRAY_ELEMENT(&r->stack, size_t, r->stack_top - 1);
}
// Returns the index entry that the topmost stack slot refers to.
static inline preserves_index_entry_t *_preserves_reader_stack_top_entry(preserves_reader_t *r) {
    size_t top_index = _preserves_reader_stack_peek(r);
    return _preserves_reader_index_entry(r, top_index);
}
// Pops the topmost stack slot. Aborts if the stack is already empty.
static inline void _preserves_reader_stack_drop(preserves_reader_t *r) {
    if (r->stack_top == 0) {
        abort();
    }
    /* printf("popping "); */
    /* preserves_dump_index_entry(stdout, &r->input, _preserves_reader_stack_top_entry(r), true); */
    r->stack_top -= 1;
}
// Closes the innermost open entry: records in its data._unsigned the distance
// (in index entries) from the entry itself to the current write position, pops
// it off the stack, and returns it.
static inline preserves_index_entry_t *_preserves_reader_finish_seq(preserves_reader_t *r) {
    size_t opened_at = _preserves_reader_stack_peek(r);
    preserves_index_entry_t *opened = _preserves_reader_stack_top_entry(r);
    uint64_t span = r->index_pos - opened_at;
    opened->data._unsigned = span;
    _preserves_reader_stack_drop(r);
    return opened;
}
// True when the innermost open entry is an annotation collector.
static inline bool _preserves_reader_in_annotations(preserves_reader_t *r) {
    if (r->stack_top == 0) {
        return false;
    }
    return _preserves_reader_stack_top_entry(r)->type == PRESERVES_ANNOTATION;
}
// Accounts for one completed value.
//
// Any EMBEDDED markers on top of the stack are popped first (the value just
// completed was their content). Then, if an enclosing entry remains open, its
// item count (`len`) is incremented; otherwise the value was a top-level one
// and `*count_ptr` (the number of top-level values still wanted) is
// decremented.
//
// This replaces a label/goto formulation that (a) placed a label directly
// before a declaration, which is not valid before C23, and (b) peeked at an
// empty stack, without decrementing the count, when a top-level embedded value
// was completed.
static inline void _preserves_reader_inc_collection_len(preserves_reader_t *r, size_t *count_ptr) {
    while (r->stack_top > 0) {
        preserves_index_entry_t *base = _preserves_reader_stack_top_entry(r);
        if (base->type != PRESERVES_EMBEDDED) {
            base->len++;
            /* printf("added to base, which is now "); */
            /* preserves_dump_index_entry(stdout, &r->input, base, true); */
            return;
        }
        _preserves_reader_stack_drop(r);
    }
    (*count_ptr)--;
}
// Appends entry `e` to the index and returns a pointer to the stored copy, or
// NULL if the index could not be grown.
//
// Before storing: if an annotation collector is open and no ANNOTATION tag
// immediately preceded this entry, the collector is closed. When `count_ptr`
// is non-NULL the entry is also counted as a completed value (see
// _preserves_reader_inc_collection_len); pass NULL for entries that merely
// open something or mark the end.
static inline preserves_index_entry_t *_preserves_reader_emit_entry(preserves_reader_t *r,
                                                                    size_t *count_ptr,
                                                                    preserves_index_entry_t e) {
    if (!r->annotation_tag_seen && _preserves_reader_in_annotations(r)) {
        /* printf("(popping annotation collector)\n"); */
        _preserves_reader_finish_seq(r);
    }

    if (count_ptr != NULL) {
        _preserves_reader_inc_collection_len(r, count_ptr);
    }

    /* printf("-- emitting: "); */
    /* preserves_dump_index_entry(stdout, &r->input, &e, true); */

    preserves_index_entry_t *slot = _preserves_reader_index_entry(r, r->index_pos);
    if (slot != NULL) {
        *slot = e;
        r->index_pos++;
        r->annotation_tag_seen = false;
    }
    return slot;
}
// Ends a read with outcome `code` and builds the result handed to the caller.
//
// For PRESERVES_END_SYSTEM_ERROR the NULL error result is returned. Otherwise
// an END_MARKER entry carrying `code` is appended to the index, and the result
// points at the first index entry and at that end marker. The NULL error
// result is also returned if the index cannot be grown.
//
// The pointer to the first entry is fetched only AFTER the end marker has been
// emitted: emitting may grow (and therefore move) the index, which would leave
// a pointer obtained earlier dangling.
static inline preserves_reader_result_t _preserves_reader_finish(preserves_reader_t *r,
                                                                 preserves_error_code_t code) {
    if (code == PRESERVES_END_SYSTEM_ERROR) {
        return preserves_reader_error_result();
    }

    preserves_index_entry_t *end_marker =
        _preserves_reader_emit_entry(r, NULL, (preserves_index_entry_t) {
                .type = PRESERVES_END_MARKER,
                .repr = PRESERVES_REPR_NONE,
                .len = 0,
                .data = { ._err = code },
            });
    if (end_marker == NULL) return preserves_reader_error_result();

    preserves_index_entry_t *index = _preserves_reader_index_entry(r, 0);
    if (index == NULL) return preserves_reader_error_result();

    return (preserves_reader_result_t) { .index = index, .end_marker = end_marker };
}
// Reads a variable-length unsigned integer: 7 payload bits per byte, least
// significant group first, with the high bit of each byte set when another
// byte follows.
//
// On success stores PRESERVES_END_NO_ERROR in `*code` and returns the value.
// On failure returns 0 and stores PRESERVES_END_INCOMPLETE_INPUT (input ran
// out) or PRESERVES_END_VARINT_TOO_BIG (more groups than fit in a size_t).
static inline size_t _preserves_reader_varint(preserves_reader_t *r, preserves_error_code_t *code) {
    unsigned int shift_amount = 0;
    size_t result = 0;
    while (true) {
        int b = _preserves_reader_next(r);
        if (b == -1) {
            *code = PRESERVES_END_INCOMPLETE_INPUT;
            return 0;
        }
        // Widen before shifting: shifting the `int` operand by 28 or more bits
        // would overflow or be undefined.
        result |= ((size_t) (b & 0x7f)) << shift_amount;
        if (b & 0x80) {
            shift_amount += 7;
            if (shift_amount > ((sizeof(size_t) * 8) - 7)) {
                *code = PRESERVES_END_VARINT_TOO_BIG;
                return 0;
            }
        } else {
            *code = PRESERVES_END_NO_ERROR;
            return result;
        }
    }
}
// Emits a SIGNED_INTEGER entry whose value is stored directly in the entry.
// `is_unsigned` selects the INT_UNSIGNED representation (the 64 bits of `value`
// are then to be read as unsigned) instead of INT_SIGNED. Returns the stored
// entry, or NULL if the index could not be grown.
static inline preserves_index_entry_t *_preserves_emit_small_int(preserves_reader_t *r,
                                                                 size_t *count_ptr,
                                                                 bool is_unsigned,
                                                                 int64_t value) {
    preserves_index_entry_t entry = {
        .type = PRESERVES_SIGNED_INTEGER,
        .repr = is_unsigned ? PRESERVES_INT_UNSIGNED : PRESERVES_INT_SIGNED,
        .len = 0,
        .data = { ._signed = value },
    };
    return _preserves_reader_emit_entry(r, count_ptr, entry);
}
// Decodes a big-endian two's-complement integer occupying the next `len` input
// bytes and emits the corresponding SIGNED_INTEGER entry.
//
// - Leading zero bytes are skipped; their presence marks the value as
//   non-negative, so up to 64 significant bits can be kept as INT_UNSIGNED.
// - Values with more than 8 significant bytes are not decoded; an
//   INT_LARGE_BINARY entry referring to the bytes in the input is emitted.
// - Otherwise the value is assembled into 64 bits, sign-extending when the
//   first significant byte has its top bit set and no leading zero was seen.
//
// Returns 0 on success, -1 if the input is too short or the index cannot grow.
//
// Fixes relative to the previous version: negative values shorter than 8 bytes
// are now sign-extended (a single 0xFF byte is -1, not 255); bytes are widened
// before shifting; and a zero-length integer at the very end of the input is
// accepted as zero instead of being reported as truncated input.
static inline int _preserves_reader_decode_intbytes(preserves_reader_t *r,
                                                    size_t *count_ptr,
                                                    size_t len) {
    size_t starting_pos = r->input_pos;
    uint8_t *bs = _preserves_reader_next_bytes(r, len);
    // NULL with len == 0 just means "zero bytes requested at end of input".
    if ((bs == NULL) && (len > 0)) return -1;

    bool is_unsigned = false;
    size_t remaining = len;
    while ((remaining > 0) && (*bs == 0)) {
        is_unsigned = true;
        bs++;
        remaining--;
    }

    if (remaining == 0) {
        // This shouldn't happen, but it does have a denotation.
        return (_preserves_emit_small_int(r, count_ptr, is_unsigned, 0) == NULL) ? -1 : 0;
    }

    if (remaining > 8) {
        if (is_unsigned && (*bs & 0x80)) {
            // Keep one leading zero so the stored bytes still read as non-negative.
            remaining++;
            bs--;
        }
        return (_preserves_reader_emit_entry(r, count_ptr, (preserves_index_entry_t) {
                    .type = PRESERVES_SIGNED_INTEGER,
                    .repr = PRESERVES_INT_LARGE_BINARY,
                    .len = remaining,
                    .data = { ._unsigned = starting_pos + (len - remaining) },
                }) == NULL) ? -1 : 0;
    }

    bool negative = !is_unsigned && ((*bs & 0x80) != 0);
    uint64_t buf = negative ? ~(uint64_t) 0 : 0;
    while (remaining > 0) {
        buf = (buf << 8) | (uint64_t) *bs;
        bs++;
        remaining--;
    }

    int64_t value;
    memcpy(&value, &buf, sizeof value);
    return (_preserves_emit_small_int(r, count_ptr, is_unsigned, value) == NULL) ? -1 : 0;
}
// True when `b` is a UTF-8 continuation byte (0x80..0xBF).
static inline bool utf8_tail(uint8_t b) {
    return (b >= 0x80 && b <= 0xbf);
}

// Validates that the `len` bytes at `bs` are well-formed UTF-8 according to
// https://datatracker.ietf.org/doc/html/rfc3629#section-4
// (no overlong forms, no surrogates, nothing above U+10FFFF).
//
// Returns 0 when valid (including for len == 0, in which case `bs` is not
// dereferenced) and -1 otherwise.
//
// Bytes that can never start a sequence (0x80..0xC1 and 0xF5..0xFF) are now
// rejected; previously they fell through to an "ok" branch and were accepted.
static inline int check_utf8(uint8_t *bs, size_t len) {
    while (len > 0) {
        uint8_t b0 = *bs++;
        len--;

        if (b0 < 0x80) {
            continue; // ASCII
        }

        if (b0 >= 0xc2 && b0 <= 0xdf) {
            // Two-byte sequence.
            if (len < 1) return -1;
            if (!utf8_tail(bs[0])) return -1;
            bs += 1;
            len -= 1;
        } else if (b0 >= 0xe0 && b0 <= 0xef) {
            // Three-byte sequence; the second byte's range depends on the lead
            // (0xE0 excludes overlong forms, 0xED excludes surrogates).
            if (len < 2) return -1;
            uint8_t lo = (b0 == 0xe0) ? 0xa0 : 0x80;
            uint8_t hi = (b0 == 0xed) ? 0x9f : 0xbf;
            if (!(bs[0] >= lo && bs[0] <= hi && utf8_tail(bs[1]))) return -1;
            bs += 2;
            len -= 2;
        } else if (b0 >= 0xf0 && b0 <= 0xf4) {
            // Four-byte sequence; 0xF0 excludes overlong forms, 0xF4 caps the
            // range at U+10FFFF.
            if (len < 3) return -1;
            uint8_t lo = (b0 == 0xf0) ? 0x90 : 0x80;
            uint8_t hi = (b0 == 0xf4) ? 0x8f : 0xbf;
            if (!(bs[0] >= lo && bs[0] <= hi && utf8_tail(bs[1]) && utf8_tail(bs[2]))) return -1;
            bs += 3;
            len -= 3;
        } else {
            // Stray continuation byte, overlong lead (0xC0/0xC1), or 0xF5..0xFF.
            return -1;
        }
    }
    return 0;
}
// Reads a length-prefixed run of bytes (a varint length followed by that many
// bytes) and emits a LITERAL entry of the given `type` that refers to those
// bytes in the input. When `should_check_utf8` is set the bytes must be valid
// UTF-8.
//
// Returns PRESERVES_END_NO_ERROR on success; otherwise the varint error,
// PRESERVES_END_INCOMPLETE_INPUT when fewer than `len` bytes remain,
// PRESERVES_END_INVALID_UTF8, or PRESERVES_END_SYSTEM_ERROR when the index
// cannot be grown.
//
// The truncated-input check is new: previously a NULL result was passed on to
// the UTF-8 checker (a NULL dereference) or, for byte strings, produced an
// entry extending past the end of the input.
static inline preserves_error_code_t _preserves_reader_read_stringlike(preserves_reader_t *r,
                                                                       size_t *count_ptr,
                                                                       preserves_type_tag_t type,
                                                                       bool should_check_utf8) {
    preserves_error_code_t varint_err = PRESERVES_END_NO_ERROR;
    size_t len = _preserves_reader_varint(r, &varint_err);
    if (varint_err != PRESERVES_END_NO_ERROR) return varint_err;

    size_t starting_pos = r->input_pos;
    uint8_t *maybe_utf = _preserves_reader_next_bytes(r, len);
    // NULL with len == 0 only means "empty content at end of input", which is fine.
    if ((maybe_utf == NULL) && (len > 0)) {
        return PRESERVES_END_INCOMPLETE_INPUT;
    }

    if (should_check_utf8 && (len > 0) && (check_utf8(maybe_utf, len) == -1)) {
        return PRESERVES_END_INVALID_UTF8;
    }

    if (_preserves_reader_emit_entry(r, count_ptr, (preserves_index_entry_t) {
                .type = type,
                .repr = PRESERVES_LITERAL,
                .len = len,
                .data = { ._unsigned = starting_pos },
            }) == NULL) {
        return PRESERVES_END_SYSTEM_ERROR;
    }
    return PRESERVES_END_NO_ERROR;
}
// Opens a new entry of the given `type`: emits it to the index (with zero
// length and offset, and without counting it as a completed value) and pushes
// its index position onto the stack, growing the stack by 32 slots when it is
// full. Returns the emitted entry, or NULL on allocation failure.
static inline preserves_index_entry_t *_preserves_reader_push(preserves_reader_t *r,
                                                              preserves_type_tag_t type) {
    preserves_index_entry_t opening = {
        .type = type, .repr = PRESERVES_REPR_NONE, .len = 0, .data = { ._unsigned = 0 }};
    preserves_index_entry_t *ix = _preserves_reader_emit_entry(r, NULL, opening);
    if (ix == NULL) return NULL;

    size_t capacity = PRESERVES_ARRAY_ELEMENT_COUNT(&r->stack, size_t);
    if (r->stack_top >= capacity) {
        if (PRESERVES_RESIZE_ARRAY(&r->stack, size_t, capacity + 32) == -1) return NULL;
    }

    // The entry just emitted sits one position before the write cursor.
    size_t *slots = (size_t *) r->stack.ptr;
    slots[r->stack_top] = r->index_pos - 1;
    r->stack_top += 1;
    return ix;
}
)
// Given an index entry, returns the entry for the underlying value: if `ix` is
// an ANNOTATION entry, the entry it annotates (located via the relative offset
// in data._unsigned); otherwise `ix` itself. NULL is passed through. Aborts if
// the annotated entry is itself an ANNOTATION.
PRESERVES_INLINE preserves_index_entry_t *preserves_skip_annotations(preserves_index_entry_t *ix) {
    if ((ix == NULL) || (ix->type != PRESERVES_ANNOTATION)) {
        return ix;
    }
    preserves_index_entry_t *annotated = ix + ix->data._unsigned;
    if (annotated->type == PRESERVES_ANNOTATION) {
        abort();
    }
    return annotated;
}
// Bail out of the reader with the NULL error result when an emit/push helper
// reports allocation failure. Wrapped in do/while so it behaves as a single
// statement wherever it is used.
#define RETURN_ON_FAIL(e) \
    do { if ((e) == NULL) return preserves_reader_error_result(); } while (0)

// Continues decoding binary-format input from the reader's current position
// until `count` further top-level values have been completed, appending to the
// reader's index.
//
// Returns a result whose end marker carries the outcome:
// - PRESERVES_END_EOF: done, and the input is exhausted;
// - PRESERVES_END_INCOMPLETE_INPUT: done counting but a collection is still
//   open, or the input ended in the middle of a value;
// - PRESERVES_END_MORE_INPUT_REMAINING: done, with unread input left over;
// - or one of the format errors (see preserves_error_code_t).
// The NULL error result is returned when memory cannot be allocated.
//
// NOTE(review): for SIGNED_INTEGER, an allocation failure inside the integer
// decoder is indistinguishable from truncated input here and is reported as
// INCOMPLETE_INPUT.
PRESERVES_OUTOFLINE
(
 preserves_reader_result_t preserves_read_binary_continue(preserves_reader_t *r, size_t count), {
    while (count) {
        /* for (int i = r->stack_top - 1; i >= 0; i--) { */
        /*   size_t ip = PRESERVES_ARRAY_ELEMENT(&r->stack, size_t, i); */
        /*   printf("  %02d: (%5lu) ", i, ip); */
        /*   preserves_dump_index_entry(stdout, &r->input, _preserves_reader_index_entry(r, ip), true); */
        /* } */
        /* printf("pos %lu (%05lx), count %lu, annotation tag seen %d: ", */
        /*        r->input_pos, */
        /*        r->input_pos, */
        /*        count, */
        /*        r->annotation_tag_seen); */
        int b = _preserves_reader_next(r);
        /* printf("tag 0x%02x %s\n", b, preserves_binary_format_tag_name(b)); */
        if (b == -1) return _preserves_reader_finish(r, PRESERVES_END_INCOMPLETE_INPUT);

        switch (b) {
            case PRESERVES_BINARY_FORMAT_TAG_FALSE:
                RETURN_ON_FAIL(_preserves_reader_emit_entry(r, &count, (preserves_index_entry_t) {
                            .type = PRESERVES_BOOLEAN, .repr = PRESERVES_REPR_NONE, .len = 0, .data = {
                                ._boolean = false
                            }}));
                break;

            case PRESERVES_BINARY_FORMAT_TAG_TRUE:
                RETURN_ON_FAIL(_preserves_reader_emit_entry(r, &count, (preserves_index_entry_t) {
                            .type = PRESERVES_BOOLEAN, .repr = PRESERVES_REPR_NONE, .len = 0, .data = {
                                ._boolean = true
                            }}));
                break;

            case PRESERVES_BINARY_FORMAT_TAG_IEEE754: {
                preserves_error_code_t varint_err = PRESERVES_END_NO_ERROR;
                size_t len = _preserves_reader_varint(r, &varint_err);
                if (varint_err != PRESERVES_END_NO_ERROR) return _preserves_reader_finish(r, varint_err);
                uint8_t *bs = _preserves_reader_next_bytes(r, len);
                // A NULL result for a zero-length payload is not truncation; let
                // it be rejected as an invalid length below.
                if ((bs == NULL) && (len > 0)) {
                    return _preserves_reader_finish(r, PRESERVES_END_INCOMPLETE_INPUT);
                }
                switch (len) {
                    case 8: {
                        // Big-endian 64-bit pattern, assembled from two 32-bit halves.
                        uint32_t lo, hi;
                        memcpy(&hi, bs, 4);
                        memcpy(&lo, bs + 4, 4);
                        lo = ntohl(lo);
                        hi = ntohl(hi);
                        uint64_t i = (((uint64_t) hi) << 32) | ((uint64_t) lo);
                        double f;
                        memcpy(&f, &i, 8);
                        RETURN_ON_FAIL(_preserves_reader_emit_entry(r, &count, (preserves_index_entry_t) {
                                    .type = PRESERVES_DOUBLE, .repr = PRESERVES_REPR_NONE, .len = 0, .data = {
                                        ._double = f
                                    }}));
                        break;
                    }
                    default:
                        return _preserves_reader_finish(r, PRESERVES_END_INVALID_IEEE754);
                }
                break;
            }

            case PRESERVES_BINARY_FORMAT_TAG_END: {
                if (r->stack_top == 0) {
                    return _preserves_reader_finish(r, PRESERVES_END_UNEXPECTED_END);
                }
                preserves_index_entry_t *base = _preserves_reader_finish_seq(r);
                // The collection just closed counts as one item of whatever encloses it.
                _preserves_reader_inc_collection_len(r, &count);
                if ((base->type == PRESERVES_DICTIONARY) && ((base->len % 2) != 0)) {
                    return _preserves_reader_finish(r, PRESERVES_END_DICTIONARY_MISSING_VALUE);
                }
                if ((base->type == PRESERVES_RECORD) && (base->len == 0)) {
                    return _preserves_reader_finish(r, PRESERVES_END_RECORD_MISSING_LABEL);
                }
                break;
            }

            case PRESERVES_BINARY_FORMAT_TAG_ANNOTATION:
                // Open a new collector unless one is already open and still
                // waiting for further annotations.
                if (r->annotation_tag_seen || !_preserves_reader_in_annotations(r)) {
                    RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_ANNOTATION));
                }
                r->annotation_tag_seen = true;
                break;

            case PRESERVES_BINARY_FORMAT_TAG_EMBEDDED:
                RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_EMBEDDED));
                break;

            case PRESERVES_BINARY_FORMAT_TAG_SIGNED_INTEGER: {
                preserves_error_code_t varint_err = PRESERVES_END_NO_ERROR;
                size_t len = _preserves_reader_varint(r, &varint_err);
                if (varint_err != PRESERVES_END_NO_ERROR) return _preserves_reader_finish(r, varint_err);
                if (_preserves_reader_decode_intbytes(r, &count, len) == -1) {
                    return _preserves_reader_finish(r, PRESERVES_END_INCOMPLETE_INPUT);
                }
                break;
            }

            case PRESERVES_BINARY_FORMAT_TAG_STRING: {
                preserves_error_code_t code =
                    _preserves_reader_read_stringlike(r, &count, PRESERVES_STRING, true);
                if (code != PRESERVES_END_NO_ERROR) return _preserves_reader_finish(r, code);
                break;
            }

            case PRESERVES_BINARY_FORMAT_TAG_BYTE_STRING: {
                preserves_error_code_t code =
                    _preserves_reader_read_stringlike(r, &count, PRESERVES_BYTE_STRING, false);
                if (code != PRESERVES_END_NO_ERROR) return _preserves_reader_finish(r, code);
                break;
            }

            case PRESERVES_BINARY_FORMAT_TAG_SYMBOL: {
                preserves_error_code_t code =
                    _preserves_reader_read_stringlike(r, &count, PRESERVES_SYMBOL, true);
                if (code != PRESERVES_END_NO_ERROR) return _preserves_reader_finish(r, code);
                break;
            }

            case PRESERVES_BINARY_FORMAT_TAG_RECORD:
                RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_RECORD));
                break;

            case PRESERVES_BINARY_FORMAT_TAG_SEQUENCE:
                RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_SEQUENCE));
                break;

            case PRESERVES_BINARY_FORMAT_TAG_SET:
                RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_SET));
                break;

            case PRESERVES_BINARY_FORMAT_TAG_DICTIONARY:
                RETURN_ON_FAIL(_preserves_reader_push(r, PRESERVES_DICTIONARY));
                break;

            default:
                return _preserves_reader_finish(r, PRESERVES_END_INVALID_TAG);
        }
    }

    return _preserves_reader_finish(r,
                                    (_preserves_reader_ateof(r) ? PRESERVES_END_EOF :
                                     (r->stack_top > 0) ? PRESERVES_END_INCOMPLETE_INPUT :
                                     PRESERVES_END_MORE_INPUT_REMAINING));
 }
);
#undef RETURN_ON_FAIL
// Starts a fresh read of up to `count` top-level values from `input`.
//
// The reader takes over the contents of `*input` (which is left empty; any
// input the reader held before is released), its positions are reset, and its
// stack is grown to the minimum size if needed. Decoding then proceeds as in
// preserves_read_binary_continue, whose result is returned. The NULL error
// result is returned if the stack cannot be allocated, in which case `*input`
// is not taken over.
PRESERVES_OUTOFLINE
(
 preserves_reader_result_t preserves_read_binary(preserves_reader_t *r,
                                                 preserves_bytes_t *input,
                                                 size_t count), {
    size_t const min_stack_bytes = MINIMUM_PRESERVES_READER_STACK_SIZE * sizeof(size_t);
    if ((r->stack.len < min_stack_bytes) &&
        (preserves_resize_bytes(&r->stack, min_stack_bytes) == -1)) {
        return preserves_reader_error_result();
    }

    r->annotation_tag_seen = false;
    r->stack_top = r->input_pos = r->index_pos = 0;

    preserves_bytes_move(&r->input, input);
    return preserves_read_binary_continue(r, count);
 }
);
///////////////////////////////////////////////////////////////////////////
// Debug utilities
#ifndef NDEBUG
PRESERVES_IMPLEMENTATION_CHUNK
(
// Debug helper: writes the bytes of `data` to `f` between ">>>" and "<<<".
// Bytes below 0x20 or at/above 0x80 are written as \xNN escapes; all others
// are written verbatim.
static void preserves_dump_bytes(FILE *f,
                                 preserves_bytes_t *data) {
    uint8_t const *cursor = (uint8_t const *) data->ptr;
    size_t remaining = data->len;

    fprintf(f, ">>>");
    for (; remaining > 0; remaining--, cursor++) {
        uint8_t c = *cursor;
        bool printable = (c >= 0x20) && (c < 0x80);
        if (printable) {
            fprintf(f, "%c", c);
        } else {
            fprintf(f, "\\x%02x", c);
        }
    }
    fprintf(f, "<<<");
}
// Debug helper: writes a one-line, human-readable description of index entry
// `i` to `f`: its type, representation and length, followed by type-specific
// detail. `input` is the buffer that absolute offsets in the entry refer to.
// A trailing newline is written when `add_newline` is true.
//
// The 64-bit fields are printed with the <inttypes.h> PRIu64/PRId64 macros and
// the length with %zu, so the format strings match their arguments on every
// platform (the previous %lu/%ld only matched where `long` is 64 bits wide).
void preserves_dump_index_entry(FILE *f,
                                preserves_bytes_t *input,
                                preserves_index_entry_t *i,
                                bool add_newline) {
    fprintf(f,
            "%s %s length %zu",
            preserves_type_tag_name(i->type),
            i->repr == PRESERVES_REPR_NONE ? "-" : preserves_index_entry_representation_name(i->repr),
            (size_t) i->len);

    switch (i->type) {
        case PRESERVES_BOOLEAN:
            fputs(i->data._boolean ? " #t" : " #f", f);
            break;

        case PRESERVES_DOUBLE:
            fprintf(f, " %f", i->data._double);
            break;

        case PRESERVES_STRING:
        case PRESERVES_BYTE_STRING:
        case PRESERVES_COMPACT:
        case PRESERVES_SYMBOL: {
            fprintf(f, " offset %" PRIu64 " ", i->data._unsigned);
            preserves_bytes_t data = preserves_bytes_subsequence(input, i->data._unsigned, i->len);
            preserves_dump_bytes(f, &data);
            break;
        }

        case PRESERVES_RECORD:
        case PRESERVES_SEQUENCE:
        case PRESERVES_SET:
        case PRESERVES_DICTIONARY:
            // Unsigned arithmetic: an offset of 0 ("no end known") wraps around.
            fprintf(f, " skip %" PRIu64, i->data._unsigned - 1);
            break;

        case PRESERVES_ANNOTATION:
            fprintf(f, " annotated after %" PRIu64, i->data._unsigned - 1);
            break;

        case PRESERVES_EMBEDDED:
            break;

        case PRESERVES_END_MARKER:
            fprintf(f, ": %s", preserves_error_code_name(i->data._err));
            break;

        case PRESERVES_SIGNED_INTEGER:
            switch (i->repr) {
                case PRESERVES_INT_SIGNED:
                    fprintf(f, ": %" PRId64, i->data._signed);
                    break;
                case PRESERVES_INT_LARGE_BINARY:
                case PRESERVES_INT_LARGE_TEXT: {
                    fprintf(f, " offset %" PRIu64 " ", i->data._unsigned);
                    preserves_bytes_t data = preserves_bytes_subsequence(input, i->data._unsigned, i->len);
                    preserves_dump_bytes(f, &data);
                    break;
                }
                case PRESERVES_INT_UNSIGNED:
                default:
                    fprintf(f, ": %" PRIu64, i->data._unsigned);
                    break;
            }
            break;

        default:
            fprintf(f, ": %" PRIu64 " (%" PRId64 ")", i->data._unsigned, i->data._signed);
            break;
    }

    if (add_newline) {
        fprintf(f, "\n");
    }
}
)
#endif
///////////////////////////////////////////////////////////////////////////
#undef PRESERVES_INLINE
#undef PRESERVES_IMPLEMENTATION_CHUNK
#undef PRESERVES_OUTOFLINE
#endif