Fix spec issues with len() and IEEE754 endianness (thanks to isd)

This commit is contained in:
Tony Garnock-Jones 2022-06-11 10:06:51 +02:00
parent 959b0f90b4
commit 832fec6c43
7 changed files with 21 additions and 1210 deletions

View File

@ -64,6 +64,7 @@ Implementations of the data model, plus Syrup transfer syntax:
## Additional resources
- [Cheat sheet(s) for Preserves syntax](cheatsheet.html)
- Some [conventions for common data types](conventions.html)
- [Open questions](questions.html); see also the
[issues list]({{page.projectpages}}/issues)

View File

@ -21,10 +21,13 @@ For a value `v`, we write `«v»` for the binary encoding of `v`.
[0xA5] ++ S if S ∈ ByteString
[0xA6] ++ utf8(S) if S ∈ Symbol
`intbytes(x)` gives the big-endian two's-complement signed binary
representation of `x`, taking exactly as many whole bytes as needed to
unambiguously identify the value and its sign. `intbytes(0)` is the
empty byte sequence.
The functions `binary32(F)` and `binary64(D)` yield big-endian 4- and
8-byte IEEE 754 binary representations of `F` and `D`, respectively.
The function `intbytes(x)` gives the big-endian two's-complement signed
binary representation of `x`, taking exactly as many whole bytes as
needed to unambiguously identify the value and its sign. `intbytes(0)`
is the empty byte sequence.
### Compounds
@ -33,11 +36,12 @@ empty byte sequence.
«#{E_1...E_m}» = [0xA9] ++ seq(«E_1», ..., «E_m»)
«{K_1:V_1...K_m:V_m}» = [0xAA] ++ seq(«K_1», «V_1», ..., «K_m», «V_m»)
seq(R_1, ... R_m) = len(R_1) ++ R_1 ++...++ len(R_m) ++ R_m
seq(R_1, ..., R_m) = len(|R_1|) ++ R_1 ++...++ len(|R_m|) ++ R_m
len(m) = e(m, 128)
where e(v, d) = [v + d] if v < 128
e(v / 128, 0) ++ [(v % 128) + d] if v ≥ 128
len(m) = e(m, 128)
e(v, d) = [v + d] if v < 128
e(v / 128, 0) ++ [(v % 128) + d] if v ≥ 128
### Embeddeds
@ -49,4 +53,4 @@ To annotate a `Repr` `r` (that *MUST NOT* itself already be annotated)
with some sequence of `Value`s `[v_1, ..., v_m]`, surround `r` as
follows:
[0xBE] ++ len(r) ++ r ++ len(v_1) ++ v_1 ++...++ len(v_m) ++ v_m
[0xBE] ++ len(|r|) ++ r ++ len(v_1»|) ++ «v_1» ++...++ len(v_m»|) ++ «v_m»

View File

@ -1,2 +0,0 @@
m.output.txt
m

View File

@ -1,8 +0,0 @@
m: main.c preserves.h
gcc -Wall -Wextra -Werror -g3 -o $@ main.c
go: m
cat ../../tests/samples.bin | ./m | tee m.output.txt
clean:
rm -f m

View File

@ -1,96 +0,0 @@
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <sys/time.h>
#define PRESERVES_IMPLEMENTATION
#include "preserves.h"
static double now() {
struct timeval tv;
if (gettimeofday(&tv, NULL) < 0) {
perror("gettimeofday");
}
return (double) tv.tv_sec + ((double) tv.tv_usec) / 1000000.0;
}
int main(__attribute__ ((unused)) int argc,
__attribute__ ((unused)) char const * const argv[])
{
preserves_bytes_t input = preserves_create_bytes();
bool silent = getenv("SILENT") != NULL;
double start = now();
{
preserves_bytes_t chunk = preserves_create_bytes();
if (preserves_resize_bytes(&chunk, 131072) == -1) {
perror("allocating chunk");
return EXIT_FAILURE;
}
while (true) {
size_t count = fread(chunk.ptr, 1, chunk.len, stdin);
if (count == 0) {
if (ferror(stdin)) {
perror("reading");
return EXIT_FAILURE;
}
break;
}
if (preserves_extend_bytes(&input, preserves_bytes_subsequence(&chunk, 0, count)) == -1) {
perror("appending");
return EXIT_FAILURE;
}
}
preserves_free_bytes(&chunk);
}
double mid = now();
{
preserves_reader_t reader = preserves_create_reader();
preserves_reader_result_t result = preserves_read_binary(&reader, &input, 1);
more_input:
if (result.index == NULL) {
perror("parsing");
return EXIT_FAILURE;
}
if (!silent) {
printf("Size of index: %lu bytes; %lu entries\n",
reader.index_pos * sizeof(preserves_index_entry_t),
reader.index_pos);
}
if (!silent) {
for (preserves_index_entry_t *i = result.index; i != result.end_marker; i++) {
preserves_dump_index_entry(stdout, &reader.input, i, true);
}
preserves_dump_index_entry(stdout, &reader.input, result.end_marker, true);
}
if (result.end_marker->data._err == PRESERVES_END_MORE_INPUT_REMAINING) {
if (!silent) {
printf("\n");
}
reader.index_pos = 0;
result = preserves_read_binary_continue(&reader, 1);
goto more_input;
}
preserves_free_reader(&reader);
}
double end = now();
printf("stage 1: %g s\n", mid - start);
printf("stage 2: %g s\n", end - mid);
printf("total: %g s\n", end - start);
preserves_free_bytes(&input);
return EXIT_SUCCESS;
}

File diff suppressed because it is too large Load Diff

View File

@ -53,6 +53,8 @@ defined recursively as follows:
big-endian, unlike [LEB128][] encoding ([as used by
Google][google-varint] in protobufs).
We write `len(|r|)` for the varint-encoding of the length of `Repr` `r`.
The following table illustrates varint-encoding.
| Number, `m` | `m` in binary, grouped into 7-bit chunks | `len(m)` bytes |
@ -71,7 +73,8 @@ varint-encoding of `m` *MUST NOT* start with `0`.
«[X_1...X_m]» = [0xA8] ++ seq(«X_1», ..., «X_m»)
«#{E_1...E_m}» = [0xA9] ++ seq(«E_1», ..., «E_m»)
«{K_1:V_1...K_m:V_m}» = [0xAA] ++ seq(«K_1», «V_1», ..., «K_m», «V_m»)
where seq(R_1, ... R_m) = len(R_1) ++ R_1 ++...++ len(R_m) ++ R_m
seq(R_1, ..., R_m) = len(|R_1|) ++ R_1 ++...++ len(|R_m|) ++ R_m
There is *no* ordering requirement on the `E_i` elements or
`K_i`/`V_i` pairs.[^no-sorting-rationale] They may appear in any
@ -172,7 +175,7 @@ represent the denoted object, prefixed with `[0xBF]`.
To annotate a `Repr` `r` with some sequence of `Value`s `[v_1, ...,
v_m]`, surround `r` as follows:
[0xBE] ++ len(r) ++ r ++ len(v_1) ++ v_1 ++...++ len(v_m) ++ v_m
[0xBE] ++ len(|r|) ++ r ++ len(v_1»|) ++ «v_1» ++...++ len(v_m»|) ++ «v_m»
The `Repr` `r` *MUST NOT* already have annotations; that is, it must not begin with `0xBE`.
@ -180,7 +183,7 @@ For example, the `Repr` corresponding to textual syntax `@a@b[]`, i.e.
an empty sequence annotated with two symbols, `a` and `b`, is
«@a @b []»
= [0xBE] ++ len(«[]») ++ «[]» ++ len(«a») ++ «a» ++ len(«b») ++ «b»
= [0xBE] ++ len(|«[]»|) ++ «[]» ++ len(|«a»|) ++ «a» ++ len(|«b»|) ++ «b»
= [0xBE, 0x81, 0xA8, 0x82, 0xA6, 0x61, 0x82, 0xA6, 0x62]
## Security Considerations
@ -230,7 +233,7 @@ undetermined number of `Value`s across, say, a TCP/IP connection:
- If the binary syntax is to be used for the connection, start the
connection with byte `0xA8` (sequence). After the initial byte, send
each value `v` as `len(«v») ++ «v»`. A side effect of this approach
each value `v` as `len(|«v»|) ++ «v»`. A side effect of this approach
is that the entire stream, when complete, is a valid `Sequence`
`Repr`.