Fix spec issues with len() and IEEE754 endianness (thanks to isd)
This commit is contained in:
parent
959b0f90b4
commit
832fec6c43
|
@ -64,6 +64,7 @@ Implementations of the data model, plus Syrup transfer syntax:
|
|||
|
||||
## Additional resources
|
||||
|
||||
- [Cheat sheet(s) for Preserves syntax](cheatsheet.html)
|
||||
- Some [conventions for common data types](conventions.html)
|
||||
- [Open questions](questions.html); see also the
|
||||
[issues list]({{page.projectpages}}/issues)
|
||||
|
|
|
@ -21,10 +21,13 @@ For a value `v`, we write `«v»` for the binary encoding of `v`.
|
|||
[0xA5] ++ S if S ∈ ByteString
|
||||
[0xA6] ++ utf8(S) if S ∈ Symbol
|
||||
|
||||
`intbytes(x)` gives the big-endian two's-complement signed binary
|
||||
representation of `x`, taking exactly as many whole bytes as needed to
|
||||
unambiguously identify the value and its sign. `intbytes(0)` is the
|
||||
empty byte sequence.
|
||||
The functions `binary32(F)` and `binary64(D)` yield big-endian 4- and
|
||||
8-byte IEEE 754 binary representations of `F` and `D`, respectively.
|
||||
|
||||
The function `intbytes(x)` gives the big-endian two's-complement signed
|
||||
binary representation of `x`, taking exactly as many whole bytes as
|
||||
needed to unambiguously identify the value and its sign. `intbytes(0)`
|
||||
is the empty byte sequence.
|
||||
|
||||
### Compounds
|
||||
|
||||
|
@ -33,11 +36,12 @@ empty byte sequence.
|
|||
«#{E_1...E_m}» = [0xA9] ++ seq(«E_1», ..., «E_m»)
|
||||
«{K_1:V_1...K_m:V_m}» = [0xAA] ++ seq(«K_1», «V_1», ..., «K_m», «V_m»)
|
||||
|
||||
seq(R_1, ... R_m) = len(R_1) ++ R_1 ++...++ len(R_m) ++ R_m
|
||||
seq(R_1, ..., R_m) = len(|R_1|) ++ R_1 ++...++ len(|R_m|) ++ R_m
|
||||
|
||||
len(m) = e(m, 128)
|
||||
where e(v, d) = [v + d] if v < 128
|
||||
e(v / 128, 0) ++ [(v % 128) + d] if v ≥ 128
|
||||
len(m) = e(m, 128)
|
||||
|
||||
e(v, d) = [v + d] if v < 128
|
||||
e(v / 128, 0) ++ [(v % 128) + d] if v ≥ 128
|
||||
|
||||
### Embeddeds
|
||||
|
||||
|
@ -49,4 +53,4 @@ To annotate a `Repr` `r` (that *MUST NOT* itself already be annotated)
|
|||
with some sequence of `Value`s `[v_1, ..., v_m]`, surround `r` as
|
||||
follows:
|
||||
|
||||
[0xBE] ++ len(r) ++ r ++ len(v_1) ++ v_1 ++...++ len(v_m) ++ v_m
|
||||
[0xBE] ++ len(|r|) ++ r ++ len(|«v_1»|) ++ «v_1» ++...++ len(|«v_m»|) ++ «v_m»
|
|
@ -1,2 +0,0 @@
|
|||
m.output.txt
|
||||
m
|
|
@ -1,8 +0,0 @@
|
|||
m: main.c preserves.h
|
||||
gcc -Wall -Wextra -Werror -g3 -o $@ main.c
|
||||
|
||||
go: m
|
||||
cat ../../tests/samples.bin | ./m | tee m.output.txt
|
||||
|
||||
clean:
|
||||
rm -f m
|
|
@ -1,96 +0,0 @@
|
|||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include <sys/time.h>
|
||||
|
||||
#define PRESERVES_IMPLEMENTATION
|
||||
#include "preserves.h"
|
||||
|
||||
static double now() {
|
||||
struct timeval tv;
|
||||
if (gettimeofday(&tv, NULL) < 0) {
|
||||
perror("gettimeofday");
|
||||
}
|
||||
return (double) tv.tv_sec + ((double) tv.tv_usec) / 1000000.0;
|
||||
}
|
||||
|
||||
int main(__attribute__ ((unused)) int argc,
|
||||
__attribute__ ((unused)) char const * const argv[])
|
||||
{
|
||||
preserves_bytes_t input = preserves_create_bytes();
|
||||
bool silent = getenv("SILENT") != NULL;
|
||||
|
||||
double start = now();
|
||||
|
||||
{
|
||||
preserves_bytes_t chunk = preserves_create_bytes();
|
||||
if (preserves_resize_bytes(&chunk, 131072) == -1) {
|
||||
perror("allocating chunk");
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
while (true) {
|
||||
size_t count = fread(chunk.ptr, 1, chunk.len, stdin);
|
||||
if (count == 0) {
|
||||
if (ferror(stdin)) {
|
||||
perror("reading");
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (preserves_extend_bytes(&input, preserves_bytes_subsequence(&chunk, 0, count)) == -1) {
|
||||
perror("appending");
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
|
||||
preserves_free_bytes(&chunk);
|
||||
}
|
||||
|
||||
double mid = now();
|
||||
|
||||
{
|
||||
preserves_reader_t reader = preserves_create_reader();
|
||||
preserves_reader_result_t result = preserves_read_binary(&reader, &input, 1);
|
||||
more_input:
|
||||
if (result.index == NULL) {
|
||||
perror("parsing");
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
if (!silent) {
|
||||
printf("Size of index: %lu bytes; %lu entries\n",
|
||||
reader.index_pos * sizeof(preserves_index_entry_t),
|
||||
reader.index_pos);
|
||||
}
|
||||
|
||||
if (!silent) {
|
||||
for (preserves_index_entry_t *i = result.index; i != result.end_marker; i++) {
|
||||
preserves_dump_index_entry(stdout, &reader.input, i, true);
|
||||
}
|
||||
preserves_dump_index_entry(stdout, &reader.input, result.end_marker, true);
|
||||
}
|
||||
|
||||
if (result.end_marker->data._err == PRESERVES_END_MORE_INPUT_REMAINING) {
|
||||
if (!silent) {
|
||||
printf("\n");
|
||||
}
|
||||
reader.index_pos = 0;
|
||||
result = preserves_read_binary_continue(&reader, 1);
|
||||
goto more_input;
|
||||
}
|
||||
|
||||
preserves_free_reader(&reader);
|
||||
}
|
||||
|
||||
double end = now();
|
||||
|
||||
printf("stage 1: %g s\n", mid - start);
|
||||
printf("stage 2: %g s\n", end - mid);
|
||||
printf("total: %g s\n", end - start);
|
||||
|
||||
preserves_free_bytes(&input);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -53,6 +53,8 @@ defined recursively as follows:
|
|||
big-endian, unlike [LEB128][] encoding ([as used by
|
||||
Google][google-varint] in protobufs).
|
||||
|
||||
We write `len(|r|)` for the varint-encoding of the length of `Repr` `r`.
|
||||
|
||||
The following table illustrates varint-encoding.
|
||||
|
||||
| Number, `m` | `m` in binary, grouped into 7-bit chunks | `len(m)` bytes |
|
||||
|
@ -71,7 +73,8 @@ varint-encoding of `m` *MUST NOT* start with `0`.
|
|||
«[X_1...X_m]» = [0xA8] ++ seq(«X_1», ..., «X_m»)
|
||||
«#{E_1...E_m}» = [0xA9] ++ seq(«E_1», ..., «E_m»)
|
||||
«{K_1:V_1...K_m:V_m}» = [0xAA] ++ seq(«K_1», «V_1», ..., «K_m», «V_m»)
|
||||
where seq(R_1, ... R_m) = len(R_1) ++ R_1 ++...++ len(R_m) ++ R_m
|
||||
|
||||
seq(R_1, ..., R_m) = len(|R_1|) ++ R_1 ++...++ len(|R_m|) ++ R_m
|
||||
|
||||
There is *no* ordering requirement on the `E_i` elements or
|
||||
`K_i`/`V_i` pairs.[^no-sorting-rationale] They may appear in any
|
||||
|
@ -172,7 +175,7 @@ represent the denoted object, prefixed with `[0xBF]`.
|
|||
To annotate a `Repr` `r` with some sequence of `Value`s `[v_1, ...,
|
||||
v_m]`, surround `r` as follows:
|
||||
|
||||
[0xBE] ++ len(r) ++ r ++ len(v_1) ++ v_1 ++...++ len(v_m) ++ v_m
|
||||
[0xBE] ++ len(|r|) ++ r ++ len(|«v_1»|) ++ «v_1» ++...++ len(|«v_m»|) ++ «v_m»
|
||||
|
||||
The `Repr` `r` *MUST NOT* already have annotations; that is, it must not begin with `0xBE`.
|
||||
|
||||
|
@ -180,7 +183,7 @@ For example, the `Repr` corresponding to textual syntax `@a@b[]`, i.e.
|
|||
an empty sequence annotated with two symbols, `a` and `b`, is
|
||||
|
||||
«@a @b []»
|
||||
= [0xBE] ++ len(«[]») ++ «[]» ++ len(«a») ++ «a» ++ len(«b») ++ «b»
|
||||
= [0xBE] ++ len(|«[]»|) ++ «[]» ++ len(|«a»|) ++ «a» ++ len(|«b»|) ++ «b»
|
||||
= [0xBE, 0x81, 0xA8, 0x82, 0xA6, 0x61, 0x82, 0xA6, 0x62]
|
||||
|
||||
## Security Considerations
|
||||
|
@ -230,7 +233,7 @@ undetermined number of `Value`s across, say, a TCP/IP connection:
|
|||
|
||||
- If the binary syntax is to be used for the connection, start the
|
||||
connection with byte `0xA8` (sequence). After the initial byte, send
|
||||
each value `v` as `len(«v») ++ «v»`. A side effect of this approach
|
||||
each value `v` as `len(|«v»|) ++ «v»`. A side effect of this approach
|
||||
is that the entire stream, when complete, is a valid `Sequence`
|
||||
`Repr`.
|
||||
|
||||
|
|
Loading…
Reference in New Issue