From c89147dd6aecd1543ecb923840c19d954a41adda Mon Sep 17 00:00:00 2001 From: Tony Garnock-Jones Date: Wed, 1 Nov 2023 14:13:36 +0100 Subject: [PATCH] Restrict usage of commas. Bump spec version to 0.992 --- .gitignore | 1 + Makefile | 3 +- _config.yml | 2 +- _includes/cheatsheet-text-plaintext.md | 11 +- _includes/cheatsheet-text.md | 11 +- .../doc/cheatsheet-text-plaintext.md | 11 +- preserves-expressions.md | 104 ++++++++---------- preserves-text.md | 26 +++-- 8 files changed, 82 insertions(+), 87 deletions(-) diff --git a/.gitignore b/.gitignore index f631d1d..fbaadcd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ _site/ +cheatsheet.pdf preserves-expressions.pdf preserves-binary.pdf preserves-schema.pdf diff --git a/Makefile b/Makefile index ac1dc8b..b98e45d 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,8 @@ PDFS=\ preserves-text.pdf \ preserves-binary.pdf \ preserves-schema.pdf \ - preserves-expressions.pdf + preserves-expressions.pdf \ + cheatsheet.pdf all: $(PDFS) diff --git a/_config.yml b/_config.yml index 386f9ad..2d69247 100644 --- a/_config.yml +++ b/_config.yml @@ -14,4 +14,4 @@ defaults: title: "Preserves" version_date: "October 2023" -version: "0.991.0" +version: "0.992.0" diff --git a/_includes/cheatsheet-text-plaintext.md b/_includes/cheatsheet-text-plaintext.md index 813d483..d812b69 100644 --- a/_includes/cheatsheet-text-plaintext.md +++ b/_includes/cheatsheet-text-plaintext.md @@ -3,14 +3,15 @@ Document := Value ws Value := ws (Record | Collection | Atom | Embedded | Annotated) Collection := Sequence | Dictionary | Set Atom := Boolean | ByteString | String | QuotedSymbol | Symbol | Number -ws := (space | tab | cr | lf | `,`)* +ws := (space | tab | cr | lf)* +commas := (ws `,`)* ws delimiter := ws | `<` | `>` | `[` | `]` | `{` | `}` - | `#` | `:` | `"` | `|` | `@` | `;` + | `#` | `:` | `"` | `|` | `@` | `;` | `,` Record := `<` Value+ ws `>` -Sequence := `[` Value* ws `]` -Dictionary := `{` (Value ws `:` Value)* ws `}` -Set := `#{` 
Value* ws `}` +Sequence := `[` (commas Value)* commas `]` +Set := `#{` (commas Value)* commas `}` +Dictionary := `{` (commas Value ws `:` Value)* commas `}` Boolean := `#t` | `#f` ByteString := `#"` binchar* `"` diff --git a/_includes/cheatsheet-text.md b/_includes/cheatsheet-text.md index a2aeb8b..ce1501f 100644 --- a/_includes/cheatsheet-text.md +++ b/_includes/cheatsheet-text.md @@ -3,14 +3,15 @@ | *Value* | := | **ws** (*Record* | *Collection* | *Atom* | *Embedded* | *Annotated*) | | *Collection* | := | *Sequence* | *Dictionary* | *Set* | | *Atom* | := | *Boolean* | *ByteString* | *String* | *QuotedSymbol* | *Symbol* | *Number* | -| **ws** | := | (**space** | **tab** | **cr** | **lf** |`,`) | -| **delimiter** | := | **ws** | `<` | `>` | `[` | `]` | `{` | `}` | `#` | `:` | `"` | `|` | `@` | `;` | +| **ws** | := | (**space** | **tab** | **cr** | **lf**)<sup>⋆</sup> | +| **commas** | := | (**ws** `,`)<sup>⋆</sup> **ws** | +| **delimiter** | := | **ws** | `<` | `>` | `[` | `]` | `{` | `}` | `#` | `:` | `"` | `|` | `@` | `;` | `,` | {:.postcard-grammar.textsyntax} | *Record* | := | `<`*Value*+ **ws**`>` | -| *Sequence* | := | `[`*Value* **ws**`]` | -| *Dictionary* | := | `{` (*Value* **ws**`:`*Value*) **ws**`}` | -| *Set* | := | `#{`*Value* **ws**`}` | +| *Sequence* | := | `[`(**commas** *Value*)<sup>⋆</sup> **commas**`]` | +| *Set* | := | `#{`(**commas** *Value*)<sup>⋆</sup> **commas**`}` | +| *Dictionary* | := | `{` (**commas** *Value* **ws**`:`*Value*)<sup>⋆</sup> **commas**`}` | {:.postcard-grammar.textsyntax} | *Boolean* | := | `#t`|`#f` | diff --git a/implementations/rust/preserves/doc/cheatsheet-text-plaintext.md b/implementations/rust/preserves/doc/cheatsheet-text-plaintext.md index 813d483..d812b69 100644 --- a/implementations/rust/preserves/doc/cheatsheet-text-plaintext.md +++ b/implementations/rust/preserves/doc/cheatsheet-text-plaintext.md @@ -3,14 +3,15 @@ Document := Value ws Value := ws (Record | Collection | Atom | Embedded | Annotated) Collection := Sequence | Dictionary | Set Atom := Boolean | ByteString | 
String | QuotedSymbol | Symbol | Number -ws := (space | tab | cr | lf | `,`)* +ws := (space | tab | cr | lf)* +commas := (ws `,`)* ws delimiter := ws | `<` | `>` | `[` | `]` | `{` | `}` - | `#` | `:` | `"` | `|` | `@` | `;` + | `#` | `:` | `"` | `|` | `@` | `;` | `,` Record := `<` Value+ ws `>` -Sequence := `[` Value* ws `]` -Dictionary := `{` (Value ws `:` Value)* ws `}` -Set := `#{` Value* ws `}` +Sequence := `[` (commas Value)* commas `]` +Set := `#{` (commas Value)* commas `}` +Dictionary := `{` (commas Value ws `:` Value)* commas `}` Boolean := `#t` | `#f` ByteString := `#"` binchar* `"` diff --git a/preserves-expressions.md b/preserves-expressions.md index 836c869..595284c 100644 --- a/preserves-expressions.md +++ b/preserves-expressions.md @@ -25,16 +25,11 @@ which (ab)use Preserves text syntax as a kind of programming notation. The P-expression grammar includes by reference the definition of `Atom` from the [text syntax][], as well as the definitions that `Atom` depends on. -P-expressions take their own approach to inter-token whitespace, -however. - -**Whitespace.** Whitespace `sp` is defined as any number of spaces, -tabs, carriage returns, or line feeds. Commas are *not* considered -whitespace in P-expressions, and so class `sp` is different to class -`ws` from the text syntax. +**Whitespace.** Whitespace `ws` is, as in the text syntax, defined as +any number of spaces, tabs, carriage returns, or line feeds. - sp = *(%x20 / %x09 / CR / LF) + ws = *(%x20 / %x09 / CR / LF) No changes to [the Preserves semantic model](preserves.html) are made. Every Preserves text-syntax term can be parsed as a valid P-expression, @@ -47,20 +42,22 @@ below](#reading-preserves)). Standalone documents containing P-expressions are sequences of individual `Expr`s, followed by trailing whitespace. 
- Document = *Expr sp + Document = *Expr ws A single P-expression `Expr` can be an `Atom` from the [text syntax][], a compound expression, special punctuation, an `Embedded` expression, or -an `Annotated` expression. +an `Annotated` expression. The class `SimpleExpr` includes all of `Expr` +except special punctuation. - Expr = sp (Atom | Compound | Punct | Embedded | Annotated) + Expr = ws (SimpleExpr / Punct) + SimpleExpr = Atom / Compound / Embedded / Annotated Embedded and annotated values are as in the text syntax, differing only -in that uses of `Value` are replaced with `Expr`. +in that uses of `Value` are replaced with `SimpleExpr`. - Embedded = "#!" Expr - Annotated = Annotation Expr - Annotation = "@" Expr / "#" [(%x20 / %x09) linecomment] (CR / LF) + Embedded = "#!" SimpleExpr + Annotated = Annotation SimpleExpr + Annotation = "@" SimpleExpr / "#" [(%x20 / %x09) linecomment] (CR / LF) P-expression special punctuation marks are comma, semicolon, and sequences of one or more colons. @@ -70,11 +67,11 @@ Compound expressions are sequences of `Expr`s with optional trailing `Annotation`s, surrounded by various kinds of parentheses. Compound = Sequence / Record / Block / Group / Set - Sequence = "[" *Expr Trailer sp "]" - Record = "<" *Expr Trailer sp ">" - Block = "{" *Expr Trailer sp "}" - Group = "(" *Expr Trailer sp ")" - Set = "#{" *Expr Trailer sp "}" + Sequence = "[" *Expr Trailer ws "]" + Record = "<" *Expr Trailer ws ">" + Block = "{" *Expr Trailer ws "}" + Group = "(" *Expr Trailer ws ")" + Set = "#{" *Expr Trailer ws "}" In an `Annotated` P-expression, annotations and comments attach to the term following them, just as in the ordinary text syntax. However, it is @@ -117,9 +114,8 @@ sequences of Preserves values. The [previous section](#encoding-pexprs) discussed ways of representing P-expressions using Preserves. 
Here, we discuss *interpreting* P-expressions *as* Preserves so that (1) a Preserves datum (2) written -using Preserves text syntax[^careful-use-of-commas] and then (3) read as -a P-expression can be (4) interpreted from that P-expression to yield -the original datum. +using Preserves text syntax and then (3) read as a P-expression can be +(4) interpreted from that P-expression to yield the original datum. 1. Every `(`..`)` or `;` that appears is an error. 2. Every `:`, `::`, `:::`, ... is an error, except in context of `Block`s as described below. @@ -127,17 +123,13 @@ the original datum. 4. Every `Trailer` that appears is an error.[^discard-trailers-instead-of-error] 5. Every `Record` with no values in it is an error. 6. Every `Block` must contain zero or more repeating triplets of - `Expr`, `:`, `Expr`. Any `Block` not following this pattern is an - error. Each `Block` following the pattern is translated to a - `Dictionary` containing a key/value pair for each triplet. Any - `Block` with duplicate keys (under interpretation) is an error. - 7. Every `Set` containing any duplicate expressions (under interpretation) is an error. - -[^careful-use-of-commas]: Every Preserves datum can be read via a - P-expression reader and then interpreted successfully as Preserves - *if commas are omitted entirely in the text*. If commas are present, - however, they must not appear in certain positions, namely: either - before or after *p* in `@`*p* *q*; or before *p* in `#!`*p*. + `SimpleExpr`, `:`, `SimpleExpr`. Any `Block` not following this + pattern is an error. Each `Block` following the pattern is + translated to a `Dictionary` containing a key/value pair for each + triplet. Any `Block` with duplicate keys (under interpretation) is + an error. + 7. Every `Set` containing any duplicate expressions (under + interpretation) is an error. 
[^discard-trailers-instead-of-error]: **Implementation note.** When implementing parsing of P-expressions into Preserves, consider @@ -283,35 +275,31 @@ P-expression `Expr`s. ## Appendix: Equations for interpreting P-expressions as Preserves -The partial function **uncomma**(*p*) removes all occurrences of `,` -from a P-expression *p*. +The function **uncomma**(*p*) removes all occurrences of `,` from a +P-expression *p* ∈ `Expr` − {`,`}. {:.pseudocode.equations} -| **uncomma** : **Expr** | ⇀ | **Expr** | | -| **uncomma**(`[`*p* ...`]`) | = | `[`**uncomma**(*p*) ...`]` | omitting any *p* = `,` | -| **uncomma**(`<`*p* ...`>`) | = | `<`**uncomma**(*p*) ...`>` | omitting any *p* = `,` | -| **uncomma**(`{`*p* ...`}`) | = | `{`**uncomma**(*p*) ...`}` | omitting any *p* = `,` | -| **uncomma**(`(`*p* ...`)`) | = | `(`**uncomma**(*p*) ...`)` | omitting any *p* = `,` | -| **uncomma**(`#{`*p* ...`}`) | = | `#{`**uncomma**(*p*) ...`}` | omitting any *p* = `,` | -| **uncomma**(`#!`*p*) | = | `#!`**uncomma**(*p*) ...`}` | | -| **uncomma**(`@`*p* *q*) | = | `@`**uncomma**(*p*) **uncomma**(*q*) | | -| **uncomma**(*p*) | = | *p* | if *p* ∈ **Atom** ∪ **Punct** - {`,`} | - -{:.pseudocode.equations} -| **uncomma** : **Document** | ⇀ | **Document** | -| **uncomma**(*p* ...) | = | **uncomma**(*p*) ... 
| +| **uncomma** : **Expr** − {`,`} | ⟶ | **Expr** | | +| **uncomma**(`[`*p* ...`]`) | = | `[`**uncomma**(*p*) ...`]` | omitting any *p* = `,` | +| **uncomma**(`<`*p* ...`>`) | = | `<`**uncomma**(*p*) ...`>` | omitting any *p* = `,` | +| **uncomma**(`{`*p* ...`}`) | = | `{`**uncomma**(*p*) ...`}` | omitting any *p* = `,` | +| **uncomma**(`(`*p* ...`)`) | = | `(`**uncomma**(*p*) ...`)` | omitting any *p* = `,` | +| **uncomma**(`#{`*p* ...`}`) | = | `#{`**uncomma**(*p*) ...`}` | omitting any *p* = `,` | +| **uncomma**(`#!`*p*) | = | `#!`**uncomma**(*p*) | | +| **uncomma**(`@`*p* *q*) | = | `@`**uncomma**(*p*) **uncomma**(*q*) | | +| **uncomma**(*p*) | = | *p* | if *p* ∈ **Atom** ∪ **Punct** − {`,`} | We write ⌞**uncomma**(*p*)⌟ for the partial function mapping a -P-expression *p* ∈ `Expr` to a corresponding Preserves `Value`. +P-expression *p* ∈ `Expr` − {`,`} to a corresponding Preserves `Value`. {:.pseudocode.equations} -| ⌞·⌟ : **Expr** | ⇀ | **Value** | | -| ⌞`[`*p* ...`]`⌟ | = | `[`⌞*p*⌟ ...`]` | | -| ⌞`<`ℓ *p* ...`>`⌟ | = | `<`⌞ℓ⌟ ⌞*p*⌟ ...`>` | | -| ⌞`{`*k*`:`*v* ...`}`⌟ | = | `{`⌞*k*⌟`:`⌞*v*⌟ ...`}` | if all ⌞*k*⌟ ... are distinct | -| ⌞`#{`*p* ...`}`⌟ | = | `#{`⌞*p*⌟ ...`}` | if all ⌞*p*⌟ ... are distinct | -| ⌞`#!`*p*⌟ | = | `#!`⌞*p*⌟ | | -| ⌞`@`*p* *q*⌟ | = | `@`⌞*p*⌟ ⌞*q*⌟ | | -| ⌞*p*⌟ | = | *p* | when *p* ∈ **Atom** | +| ⌞·⌟ : **Expr** − {`,`} | ⇀ | **Value** | | +| ⌞`[`*p* ...`]`⌟ | = | `[`⌞*p*⌟ ...`]` | | +| ⌞`<`ℓ *p* ...`>`⌟ | = | `<`⌞ℓ⌟ ⌞*p*⌟ ...`>` | | +| ⌞`{`*k*`:`*v* ...`}`⌟ | = | `{`⌞*k*⌟`:`⌞*v*⌟ ...`}` | if all ⌞*k*⌟ ... are distinct | +| ⌞`#{`*p* ...`}`⌟ | = | `#{`⌞*p*⌟ ...`}` | if all ⌞*p*⌟ ... are distinct | +| ⌞`#!`*p*⌟ | = | `#!`⌞*p*⌟ | | +| ⌞`@`*p* *q*⌟ | = | `@`⌞*p*⌟ ⌞*q*⌟ | | +| ⌞*p*⌟ | = | *p* | when *p* ∈ **Atom** | ## Notes diff --git a/preserves-text.md b/preserves-text.md index 7cb492b..b8907be 100644 --- a/preserves-text.md +++ b/preserves-text.md @@ -28,10 +28,15 @@ a grammar for recognising sequences of Unicode scalar values. 
UTF-8 where possible. -**Whitespace.** Whitespace is defined as any number of spaces, tabs, -carriage returns, line feeds, or commas. +**Whitespace.** Whitespace `ws` is defined as any number of spaces, tabs, +carriage returns, or line feeds. - ws = *(%x20 / %x09 / CR / LF / ",") + ws = *(%x20 / %x09 / CR / LF) + + +**Commas.** Inside a `Sequence`, `Set`, or `Dictionary`, commas are permitted before, between, and after the elements (or key/value pairs), and are ignored. + + commas = *(ws ",") ws **Delimiters.** Some tokens (`Boolean`, `SymbolOrNumber`) *MUST* be @@ -39,7 +44,7 @@ followed by a `delimiter` or by the end of the input.[^delimiters-lookahead] delimiter = ws / "<" / ">" / "[" / "]" / "{" / "}" - / "#" / ":" / DQUOTE / "|" / "@" / ";" + / "#" / ":" / DQUOTE / "|" / "@" / ";" / "," [^delimiters-lookahead]: The addition of this constraint means that implementations must now use some kind of lookahead to make sure a @@ -73,9 +78,9 @@ printing sets and dictionaries, implementations *SHOULD* order elements resp. keys with respect to the [total order over `Value`s](preserves.html#total-order).[^rationale-print-ordering] - Sequence = "[" *Value ws "]" - Set = "#{" *Value ws "}" - Dictionary = "{" *(Value ws ":" Value) ws "}" + Sequence = "[" *(commas Value) commas "]" + Set = "#{" *(commas Value) commas "}" + Dictionary = "{" *(commas Value ws ":" Value) commas "}" [^printing-collections]: **Implementation note.** When implementing printing of `Value`s using the textual syntax, consider supporting @@ -147,8 +152,8 @@ following the usual rules for double quote and backslash. / %s"\x" 2HEXDIG / "\" DQUOTE -The second is a sequence of pairs of hexadecimal digits interleaved -with whitespace and surrounded by `#x"` and `"`. +The second consists of pairs of hexadecimal digits interleaved with whitespace +and surrounded by `#x"` and `"`. ByteString =/ %s"#x" DQUOTE *(ws 2HEXDIG) ws DQUOTE @@ -320,9 +325,6 @@ give an illusion of progress. 
## Acknowledgements -The treatment of commas as whitespace in the text syntax is inspired -by the same feature of [EDN](https://github.com/edn-format/edn). - The text syntax for `Boolean`s, `Symbol`s, and `ByteString`s is directly inspired by [Racket](https://racket-lang.org/)'s lexical syntax.