Fix spec issues with len() and IEEE754 endianness (thanks to isd)

2022-06-11 10:06:51 +02:00 · 2022-06-11 10:06:51 +02:00 · 832fec6c43
parent 959b0f90b4
commit 832fec6c43
7 changed files with 21 additions and 1210 deletions
--- a/README.md
+++ b/README.md
@ -64,6 +64,7 @@ Implementations of the data model, plus Syrup transfer syntax:

 ## Additional resources

+ - [Cheat sheet(s) for Preserves syntax](cheatsheet.html)
 - Some [conventions for common data types](conventions.html)
 - [Open questions](questions.html); see also the
   [issues list]({{page.projectpages}}/issues)
--- a/preserves-binary-cheatsheet.md
+++ b/preserves-binary-cheatsheet.md
@ -21,10 +21,13 @@ For a value `v`, we write `«v»` for the binary encoding of `v`.
                                 [0xA5] ++ S        if S ∈ ByteString
                                 [0xA6] ++ utf8(S)  if S ∈ Symbol

-`intbytes(x)` gives the big-endian two's-complement signed binary
-representation of `x`, taking exactly as many whole bytes as needed to
-unambiguously identify the value and its sign. `intbytes(0)` is the
-empty byte sequence.
+The functions `binary32(F)` and `binary64(D)` yield big-endian 4- and
+8-byte IEEE 754 binary representations of `F` and `D`, respectively.
+
+The function `intbytes(x)` gives the big-endian two's-complement signed
+binary representation of `x`, taking exactly as many whole bytes as
+needed to unambiguously identify the value and its sign. `intbytes(0)`
+is the empty byte sequence.

 ### Compounds

@ -33,11 +36,12 @@ empty byte sequence.
                «#{E_1...E_m}» = [0xA9] ++ seq(«E_1», ..., «E_m»)
         «{K_1:V_1...K_m:V_m}» = [0xAA] ++ seq(«K_1», «V_1», ..., «K_m», «V_m»)

-    seq(R_1, ... R_m) = len(R_1) ++ R_1 ++...++ len(R_m) ++ R_m
+            seq(R_1, ..., R_m) = len(|R_1|) ++ R_1 ++...++ len(|R_m|) ++ R_m

-    len(m) = e(m, 128)
-           where e(v, d) = [v + d]                           if v < 128
-                           e(v / 128, 0) ++ [(v % 128) + d]  if v ≥ 128
+                        len(m) = e(m, 128)
+
+                       e(v, d) = [v + d]                           if v < 128
+                                 e(v / 128, 0) ++ [(v % 128) + d]  if v ≥ 128

 ### Embeddeds

@ -49,4 +53,4 @@ To annotate a `Repr` `r` (that *MUST NOT* itself already be annotated)
 with some sequence of `Value`s `[v_1, ..., v_m]`, surround `r` as
 follows:

-    [0xBE] ++ len(r) ++ r ++ len(v_1) ++ v_1 ++...++ len(v_m) ++ v_m
+    [0xBE] ++ len(|r|) ++ r ++ len(|«v_1»|) ++ «v_1» ++...++ len(|«v_m»|) ++ «v_m»
--- a/implementations/c/.gitignore
+++ b/implementations/c/.gitignore
@ -1,2 +0,0 @@
-m.output.txt
-m
--- a/implementations/c/Makefile
+++ b/implementations/c/Makefile
@ -1,8 +0,0 @@
-m: main.c preserves.h
-	gcc -Wall -Wextra -Werror -g3 -o $@ main.c
-
-go: m
-	cat ../../tests/samples.bin | ./m | tee m.output.txt
-
-clean:
-	rm -f m
--- a/implementations/c/main.c
+++ b/implementations/c/main.c
@ -1,96 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <stdbool.h>
-
-#include <sys/time.h>
-
-#define PRESERVES_IMPLEMENTATION
-#include "preserves.h"
-
-static double now() {
-  struct timeval tv;
-  if (gettimeofday(&tv, NULL) < 0) {
-    perror("gettimeofday");
-  }
-  return (double) tv.tv_sec + ((double) tv.tv_usec) / 1000000.0;
-}
-
-int main(__attribute__ ((unused)) int argc,
-         __attribute__ ((unused)) char const * const argv[])
-{
-  preserves_bytes_t input = preserves_create_bytes();
-  bool silent = getenv("SILENT") != NULL;
-
-  double start = now();
-
-  {
-    preserves_bytes_t chunk = preserves_create_bytes();
-    if (preserves_resize_bytes(&chunk, 131072) == -1) {
-      perror("allocating chunk");
-      return EXIT_FAILURE;
-    }
-
-    while (true) {
-      size_t count = fread(chunk.ptr, 1, chunk.len, stdin);
-      if (count == 0) {
-        if (ferror(stdin)) {
-          perror("reading");
-          return EXIT_FAILURE;
-        }
-        break;
-      }
-      if (preserves_extend_bytes(&input, preserves_bytes_subsequence(&chunk, 0, count)) == -1) {
-        perror("appending");
-        return EXIT_FAILURE;
-      }
-    }
-
-    preserves_free_bytes(&chunk);
-  }
-
-  double mid = now();
-
-  {
-    preserves_reader_t reader = preserves_create_reader();
-    preserves_reader_result_t result = preserves_read_binary(&reader, &input, 1);
-  more_input:
-    if (result.index == NULL) {
-      perror("parsing");
-      return EXIT_FAILURE;
-    }
-
-    if (!silent) {
-      printf("Size of index: %lu bytes; %lu entries\n",
-             reader.index_pos * sizeof(preserves_index_entry_t),
-             reader.index_pos);
-    }
-
-    if (!silent) {
-      for (preserves_index_entry_t *i = result.index; i != result.end_marker; i++) {
-        preserves_dump_index_entry(stdout, &reader.input, i, true);
-      }
-      preserves_dump_index_entry(stdout, &reader.input, result.end_marker, true);
-    }
-
-    if (result.end_marker->data._err == PRESERVES_END_MORE_INPUT_REMAINING) {
-      if (!silent) {
-        printf("\n");
-      }
-      reader.index_pos = 0;
-      result = preserves_read_binary_continue(&reader, 1);
-      goto more_input;
-    }
-
-    preserves_free_reader(&reader);
-  }
-
-  double end = now();
-
-  printf("stage 1: %g s\n", mid - start);
-  printf("stage 2: %g s\n", end - mid);
-  printf("total: %g s\n", end - start);
-
-  preserves_free_bytes(&input);
-  return EXIT_SUCCESS;
-}
--- a/implementations/c/preserves.h
+++ b/implementations/c/preserves.h
--- a/preserves-binary.md
+++ b/preserves-binary.md
@ -53,6 +53,8 @@ defined recursively as follows:
    big-endian, unlike [LEB128][] encoding ([as used by
    Google][google-varint] in protobufs).

+We write `len(|r|)` for the varint-encoding of the length of `Repr` `r`.
+
 The following table illustrates varint-encoding.

 | Number, `m` | `m` in binary, grouped into 7-bit chunks  | `len(m)` bytes  |
@ -71,7 +73,8 @@ varint-encoding of `m` *MUST NOT* start with `0`.
            «[X_1...X_m]» = [0xA8] ++ seq(«X_1», ..., «X_m»)
           «#{E_1...E_m}» = [0xA9] ++ seq(«E_1», ..., «E_m»)
    «{K_1:V_1...K_m:V_m}» = [0xAA] ++ seq(«K_1», «V_1», ..., «K_m», «V_m»)
-                          where seq(R_1, ... R_m) = len(R_1) ++ R_1 ++...++ len(R_m) ++ R_m
+
+       seq(R_1, ..., R_m) = len(|R_1|) ++ R_1 ++...++ len(|R_m|) ++ R_m

 There is *no* ordering requirement on the `E_i` elements or
 `K_i`/`V_i` pairs.[^no-sorting-rationale] They may appear in any
@ -172,7 +175,7 @@ represent the denoted object, prefixed with `[0xBF]`.
 To annotate a `Repr` `r` with some sequence of `Value`s `[v_1, ...,
 v_m]`, surround `r` as follows:

-    [0xBE] ++ len(r) ++ r ++ len(v_1) ++ v_1 ++...++ len(v_m) ++ v_m
+    [0xBE] ++ len(|r|) ++ r ++ len(|«v_1»|) ++ «v_1» ++...++ len(|«v_m»|) ++ «v_m»

 The `Repr` `r` *MUST NOT* already have annotations; that is, it must not begin with `0xBE`.

@ -180,7 +183,7 @@ For example, the `Repr` corresponding to textual syntax `@a@b[]`, i.e.
 an empty sequence annotated with two symbols, `a` and `b`, is

    «@a @b []»
-      = [0xBE] ++ len(«[]») ++ «[]» ++ len(«a») ++ «a» ++ len(«b») ++ «b»
+      = [0xBE] ++ len(|«[]»|) ++ «[]» ++ len(|«a»|) ++ «a» ++ len(|«b»|) ++ «b»
      = [0xBE, 0x81, 0xA8, 0x82, 0xA6, 0x61, 0x82, 0xA6, 0x62]

 ## Security Considerations
@ -230,7 +233,7 @@ undetermined number of `Value`s across, say, a TCP/IP connection:

 - If the binary syntax is to be used for the connection, start the
   connection with byte `0xA8` (sequence). After the initial byte, send
-   each value `v` as `len(«v») ++ «v»`. A side effect of this approach
+   each value `v` as `len(|«v»|) ++ «v»`. A side effect of this approach
   is that the entire stream, when complete, is a valid `Sequence`
   `Repr`.