preserves/implementations/rust/preserves/src/value/text/reader.rs

735 lines
25 KiB
Rust

//! Implementation of [Reader] for the text syntax.
use crate::error::io_syntax_error;
use crate::error::is_eof_io_error;
use crate::error::syntax_error;
use crate::error::Error;
use crate::error::ExpectedKind;
use crate::error::Received;
use crate::hex;
use crate::value::boundary as B;
use crate::value::reader::BinarySource;
use crate::value::reader::ReaderResult;
use crate::value::repr::Annotations;
use crate::value::CompoundClass;
use crate::value::DomainParse;
use crate::value::IOValue;
use crate::value::IOValueDomainCodec;
use crate::value::Map;
use crate::value::NestedValue;
use crate::value::Reader;
use crate::value::Record;
use crate::value::Set;
use crate::value::Token;
use crate::value::Value;
use crate::value::ViaCodec;
use lazy_static::lazy_static;
use num::bigint::BigInt;
use std::convert::TryInto;
use std::io;
use std::marker::PhantomData;
/// Reader for the Preserves text syntax.
///
/// Borrows a byte source for `'src` and carries the decoder used for embedded values.
pub struct TextReader<'de, 'src, N, Dec, S>
where
    N: NestedValue,
    Dec: DomainParse<N::Embedded>,
    S: BinarySource<'de>,
{
    /// Underlying source of (utf8) bytes.
    pub source: &'src mut S,
    /// Decoder for producing Rust values embedded in the text.
    pub dec: Dec,
    /// Treatment of whitespace before a toplevel term.
    pub toplevel_whitespace_mode: ToplevelWhitespaceMode,
    /// Zero-sized marker tying the `'de` and `N` parameters to the struct.
    phantom: PhantomData<&'de N>,
}
/// [TextReader] chooses `Document` mode to treat whitespace preceding end-of-file as a "no
/// more values" non-error situation, or `Value` mode to treat it as an "expected more input"
/// situation.
///
/// The Preserves syntax for `Value` treats any input at all, even whitespace, as an indicator
/// that a term is to follow. However, when using a TextReader to parse a *series* of `Value`s
/// in a `Document`, whitespace followed by EOF is to be treated as the permitted optional
/// whitespace at the end of a `Document`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ToplevelWhitespaceMode {
    /// Whitespace followed by end-of-file means "no more values".
    Document,
    /// Whitespace followed by end-of-file means "expected more input".
    Value,
}
/// Converts `bs` into a `String`, reporting invalid UTF-8 as an "Invalid UTF-8" syntax error.
fn decode_utf8(bs: Vec<u8>) -> io::Result<String> {
    match String::from_utf8(bs) {
        Ok(text) => Ok(text),
        Err(_) => Err(io_syntax_error("Invalid UTF-8")),
    }
}
/// Appends the UTF-8 encoding of code point `n` to `bs`.
///
/// Fails with a "Bad code point" syntax error when `n` is not a valid `char`.
fn append_codepoint(bs: &mut Vec<u8>, n: u32) -> io::Result<()> {
    match char::from_u32(n) {
        Some(c) => {
            let mut scratch = [0u8; 4];
            bs.extend_from_slice(c.encode_utf8(&mut scratch).as_bytes());
            Ok(())
        }
        None => Err(io_syntax_error("Bad code point")),
    }
}
impl<'de, 'src, N: NestedValue, Dec: DomainParse<N::Embedded>, S: BinarySource<'de>>
TextReader<'de, 'src, N, Dec, S>
{
/// Construct a new reader from a byte (utf8) source and embedded-value decoder.
pub fn new(source: &'src mut S, dec: Dec) -> Self {
TextReader {
source,
dec,
toplevel_whitespace_mode: ToplevelWhitespaceMode::Document,
phantom: PhantomData,
}
}
/// Builder-style setter: consumes the reader and returns it with its
/// `toplevel_whitespace_mode` replaced by `new_mode`.
pub fn toplevel_whitespace_mode(self, new_mode: ToplevelWhitespaceMode) -> Self {
    let mut reader = self;
    reader.toplevel_whitespace_mode = new_mode;
    reader
}
/// Looks at the upcoming byte by delegating to the underlying source's `peek`.
fn peek(&mut self) -> io::Result<u8> {
self.source.peek()
}
/// Advances past the upcoming byte by delegating to the underlying source's `skip`.
fn skip(&mut self) -> io::Result<()> {
self.source.skip()
}
/// Peeks the upcoming byte, skips past it, and returns it.
fn next_byte(&mut self) -> io::Result<u8> {
    let byte = self.peek()?;
    self.skip().map(|()| byte)
}
/// Skips whitespace only; commas are left in place (`skip_commas` is `false`).
fn skip_whitespace(&mut self) {
self.skip_whitespace_and_maybe_commas(false)
}
/// Skips spaces, tabs, carriage returns and newlines, plus commas when `skip_commas` is set.
///
/// Stops at the first other byte or as soon as peeking fails.
fn skip_whitespace_and_maybe_commas(&mut self, skip_commas: bool) {
    // Errors are deliberately swallowed: a failed peek just ends the scan.
    while let Ok(c) = self.peek() {
        let skippable = match c {
            b' ' | b'\t' | b'\r' | b'\n' => true,
            b',' => skip_commas,
            _ => false,
        };
        if !skippable {
            break;
        }
        // A failed skip is deliberately ignored as well.
        let _ = self.skip();
    }
}
// TODO: This is a duplicate of fn expected in PackedReader.
fn expected(&mut self, k: ExpectedKind) -> Error {
match Reader::<N>::demand_next(self, true) {
Ok(v) => Error::Expected(k, Received::ReceivedOtherValue(format!("{:?}", v))),
Err(e) => e.into(),
}
}
/// Collects the annotations at the current position into `vs`.
///
/// `@` introduces an annotation value. `#` followed by a space or tab introduces a comment
/// line whose text is pushed via `N::new`; `#` followed directly by a newline or carriage
/// return pushes `N::new("")`. Any other `#` form is left unread and ends the scan, as does
/// any other leading byte.
fn gather_annotations(&mut self, vs: &mut Vec<N>) -> ReaderResult<()> {
    loop {
        self.skip_whitespace();
        let lead = self.peek()?;
        if lead == b'@' {
            self.skip()?;
            vs.push(self.demand_next(true)?);
            continue;
        }
        if lead != b'#' {
            return Ok(());
        }
        // Remember where the `#` was so non-comment `#` syntax can be handed back intact.
        let m = self.source.mark()?;
        self.skip()?;
        match self.next_byte()? {
            b' ' | b'\t' => vs.push(N::new(self.comment_line()?)),
            b'\n' | b'\r' => vs.push(N::new("")),
            _ => {
                self.source.restore(&m)?;
                return Ok(());
            }
        }
    }
}
/// Reads the next value (with annotations) and re-wraps it so that `annotations` come
/// first, followed by the annotations the value already carried.
fn prepend_annotations_to_next(&mut self, mut annotations: Vec<N>) -> ReaderResult<N> {
    let next_value = Reader::<N>::demand_next(self, true)?;
    let (inner_annotations, body) = next_value.pieces();
    annotations.extend_from_slice(inner_annotations.slice());
    let merged = Annotations::new(Some(annotations));
    Ok(N::wrap(merged, body))
}
/// Discards the annotations at the current position.
///
/// Recognises the same forms as `gather_annotations`: `@` followed by a value, `#` followed
/// by a space or tab and a comment line, and `#` followed directly by a newline or carriage
/// return. Any other `#` form is left unread and ends the scan.
fn skip_annotations(&mut self) -> ReaderResult<()> {
    loop {
        self.skip_whitespace();
        let lead = self.peek()?;
        if lead == b'@' {
            self.skip()?;
            self.skip_value()?;
            continue;
        }
        if lead != b'#' {
            return Ok(());
        }
        // Remember where the `#` was so non-comment `#` syntax can be handed back intact.
        let m = self.source.mark()?;
        self.skip()?;
        match self.next_byte()? {
            b' ' | b'\t' => {
                self.comment_line()?;
            }
            b'\n' | b'\r' => {}
            _ => {
                self.source.restore(&m)?;
                return Ok(());
            }
        }
    }
}
/// Retrieve the next [IOValue] in the input stream.
///
/// Uses a temporary reader over the same source, configured with the [IOValue] codec.
pub fn next_iovalue(&mut self, read_annotations: bool) -> io::Result<IOValue> {
    let codec = ViaCodec::new(IOValueDomainCodec);
    let mut nested = TextReader::new(self.source, codec);
    let value = nested.demand_next(read_annotations)?;
    Ok(value)
}
/// Reads bytes up to and including the next `\r` or `\n` and returns the bytes before
/// that terminator, decoded as UTF-8.
fn comment_line(&mut self) -> io::Result<String> {
    let mut text = Vec::new();
    loop {
        match self.next_byte()? {
            b'\r' | b'\n' => return decode_utf8(text),
            other => text.push(other),
        }
    }
}
/// Reads a hex-encoded floating-point number of `bytecount` bytes.
///
/// Expects an opening double quote, then the hex body read by `read_hex_binary`. The bytes
/// are interpreted big-endian as the bit pattern of an `f32` (4 bytes) or `f64` (8 bytes).
fn read_hex_float(&mut self, bytecount: usize) -> io::Result<N> {
    let opener = self.next_byte()?;
    if opener != b'"' {
        return Err(io_syntax_error(
            "Missing open-double-quote in hex-encoded floating-point number",
        ));
    }
    let bs = self.read_hex_binary()?;
    if bs.len() != bytecount {
        return Err(io_syntax_error(
            "Incorrect number of bytes in hex-encoded floating-point number",
        ));
    }
    // The length was checked against `bytecount` above, so the array conversions succeed.
    if bytecount == 4 {
        let raw: [u8; 4] = bs.try_into().unwrap();
        return Ok(Value::from(f32::from_bits(u32::from_be_bytes(raw))).wrap());
    }
    if bytecount == 8 {
        let raw: [u8; 8] = bs.try_into().unwrap();
        return Ok(Value::from(f64::from_bits(u64::from_be_bytes(raw))).wrap());
    }
    Err(io_syntax_error(
        "Unsupported byte count in hex-encoded floating-point number",
    ))
}
/// Shared scanner for quoted, backslash-escaped literals.
///
/// Bytes are fed one at a time into `seed` through `xform_item` until `terminator` is seen,
/// at which point `seed` is returned. After a backslash:
/// - `hexescape` hands control to `hexescaper`, which reads the rest of the escape itself;
/// - the terminator, `\` and `/` stand for themselves;
/// - `b`, `f`, `n`, `r`, `t` produce 0x08, 0x0c, 0x0a, 0x0d, 0x09 respectively;
/// - anything else is an "Invalid escape code" syntax error.
fn read_stringlike<X, H, R>(
    &mut self,
    mut seed: R,
    xform_item: X,
    terminator: u8,
    hexescape: u8,
    hexescaper: H,
) -> io::Result<R>
where
    X: Fn(&mut R, u8) -> io::Result<()>,
    H: Fn(&mut R, &mut Self) -> io::Result<()>,
{
    loop {
        let c = self.next_byte()?;
        if c == terminator {
            return Ok(seed);
        }
        if c != b'\\' {
            xform_item(&mut seed, c)?;
            continue;
        }
        let esc = self.next_byte()?;
        if esc == hexescape {
            hexescaper(&mut seed, self)?;
            continue;
        }
        let decoded = if esc == terminator || esc == b'\\' || esc == b'/' {
            esc
        } else {
            match esc {
                b'b' => b'\x08',
                b'f' => b'\x0c',
                b'n' => b'\x0a',
                b'r' => b'\x0d',
                b't' => b'\x09',
                _ => return Err(io_syntax_error("Invalid escape code")),
            }
        };
        xform_item(&mut seed, decoded)?;
    }
}
/// Reads exactly `count` hexadecimal digits and returns their value, most significant
/// digit first. A non-hex byte is a "Bad hex escape" syntax error.
fn hexnum(&mut self, count: usize) -> io::Result<u32> {
    let mut acc: u32 = 0;
    let mut remaining = count;
    while remaining > 0 {
        let digit = (self.next_byte()? as char)
            .to_digit(16)
            .ok_or_else(|| io_syntax_error("Bad hex escape"))?;
        acc = (acc << 4) | digit;
        remaining -= 1;
    }
    Ok(acc)
}
/// Reads the body of a literal closed by `delimiter` and decodes it as UTF-8.
///
/// `\uXXXX` escapes are supported. A high surrogate (0xd800..=0xdbff) must be followed
/// immediately by a second `\uXXXX` escape holding a low surrogate (0xdc00..=0xdfff); the
/// pair is combined into a single code point.
fn read_string(&mut self, delimiter: u8) -> io::Result<String> {
    let raw = self.read_stringlike(
        Vec::new(),
        |bs, c| {
            bs.push(c);
            Ok(())
        },
        delimiter,
        b'u',
        |bs, r| {
            let high = r.hexnum(4)?;
            if !(0xd800..=0xdbff).contains(&high) {
                return append_codepoint(bs, high);
            }
            // Short-circuits: the second byte is only consumed when the first was a backslash.
            if r.next_byte()? != b'\\' || r.next_byte()? != b'u' {
                return Err(io_syntax_error("Missing second half of surrogate pair"));
            }
            let low = r.hexnum(4)?;
            if !(0xdc00..=0xdfff).contains(&low) {
                return Err(io_syntax_error("Bad second half of surrogate pair"));
            }
            append_codepoint(bs, ((high - 0xd800) << 10) + (low - 0xdc00) + 0x10000)
        },
    )?;
    decode_utf8(raw)
}
/// Reads the body of a double-quoted binary literal.
///
/// Bytes are taken verbatim; `\xHH` escapes contribute the single byte `HH`.
fn read_literal_binary(&mut self) -> io::Result<N> {
    let bytes = self.read_stringlike(
        Vec::new(),
        |bs, b| {
            bs.push(b);
            Ok(())
        },
        b'"',
        b'x',
        |bs, r| {
            let byte = r.hexnum(2)? as u8;
            bs.push(byte);
            Ok(())
        },
    )?;
    Ok(N::new(&bytes[..]))
}
/// Reads hex digit pairs up to the closing double quote and returns the decoded bytes.
///
/// Whitespace is skipped before each pair, but not between the two digits of a pair.
fn read_hex_binary(&mut self) -> io::Result<Vec<u8>> {
    let mut digits = String::new();
    loop {
        self.skip_whitespace();
        let hi = char::from(self.next_byte()?);
        if hi == '"' {
            break;
        }
        let lo = char::from(self.next_byte()?);
        if !hi.is_digit(16) || !lo.is_digit(16) {
            return Err(io_syntax_error("Invalid hex binary"));
        }
        digits.push(hi);
        digits.push(lo);
    }
    // `digits` holds only validated hex digit pairs at this point.
    Ok(hex::HexParser::Strict.decode(&digits).unwrap())
}
/// Reads base64 text up to the closing `]` and returns the decoded bytes as a value.
///
/// Whitespace before each byte is skipped, `=` is dropped, and the URL-safe characters
/// `-` and `_` are mapped to `+` and `/` before decoding.
fn read_base64_binary(&mut self) -> io::Result<N> {
    let mut encoded = Vec::new();
    loop {
        self.skip_whitespace();
        match self.next_byte()? {
            b']' => break,
            b'=' => {}
            b'-' => encoded.push(b'+'),
            b'_' => encoded.push(b'/'),
            other => encoded.push(other),
        }
    }
    let text = decode_utf8(encoded)?;
    let decoded = base64::decode_config(&text, base64::STANDARD_NO_PAD)
        .map_err(|_| io_syntax_error("Invalid base64 character"))?;
    Ok(N::new(&decoded[..]))
}
/// Reads values until `delimiter` is the upcoming byte, consumes the delimiter, and
/// returns the values in order.
///
/// Whitespace before each item is skipped, along with commas when `skip_commas` is set.
fn upto(&mut self, delimiter: u8, read_annotations: bool, skip_commas: bool) -> io::Result<Vec<N>> {
    let mut items = Vec::new();
    loop {
        self.skip_whitespace_and_maybe_commas(skip_commas);
        if self.peek()? != delimiter {
            items.push(Reader::<N>::demand_next(self, read_annotations)?);
            continue;
        }
        self.skip()?;
        return Ok(items);
    }
}
/// Reads set members up to the closing `}` and builds a set value.
///
/// A repeated member is a "Duplicate set element" syntax error.
fn read_set(&mut self, read_annotations: bool) -> io::Result<N> {
    let mut members = Set::<N>::new();
    for item in self.upto(b'}', read_annotations, true)? {
        // `insert` reports `false` when an equal member is already present.
        if !members.insert(item) {
            return Err(io_syntax_error("Duplicate set element"));
        }
    }
    Ok(N::new(members))
}
/// Reads `key: value` entries up to the closing `}` and builds a dictionary value.
///
/// Whitespace and commas between entries are skipped. A missing `:` or a repeated key is
/// a syntax error; the duplicate check happens before the value is read.
fn read_dictionary(&mut self, read_annotations: bool) -> io::Result<N> {
    let mut entries = Map::new();
    loop {
        self.skip_whitespace_and_maybe_commas(true);
        if self.peek()? == b'}' {
            break;
        }
        let key = Reader::<N>::demand_next(self, read_annotations)?;
        self.skip_whitespace();
        let separator = self.next_byte()?;
        if separator != b':' {
            return Err(io_syntax_error("Missing expected key/value separator"));
        }
        if entries.contains_key(&key) {
            return Err(io_syntax_error("Duplicate key"));
        }
        let value = Reader::<N>::demand_next(self, read_annotations)?;
        entries.insert(key, value);
    }
    self.skip()?;
    Ok(N::new(entries))
}
/// Succeeds when a delimiter (or end of input) follows; otherwise fails with a syntax
/// error carrying `msg`.
fn require_delimiter(&mut self, msg: &'static str) -> io::Result<()> {
    match self.delimiter_follows()? {
        true => Ok(()),
        false => Err(io_syntax_error(msg)),
    }
}
/// Reports whether the upcoming byte ends a bare token (symbol, number, `#t`, `#f`).
///
/// End of input counts as a delimiter; any other peek error is returned. Only `peek` is
/// used, so no input is consumed.
fn delimiter_follows(&mut self) -> io::Result<bool> {
    let c = match self.peek() {
        Err(e) if is_eof_io_error(&e) => return Ok(true),
        Err(e) => return Err(e),
        Ok(c) => c,
    };
    Ok(match c {
        // ASCII whitespace only. Widening the byte to `char` and asking
        // `char::is_whitespace` would also accept 0x85 and 0xA0 (U+0085, U+00A0). In a
        // UTF-8 stream those are continuation bytes (e.g. "à" is 0xC3 0xA0), so a bare
        // symbol containing such a character would be cut in half and then fail to decode.
        b'\t' | b'\n' | 0x0b | 0x0c | b'\r' | b' ' => true,
        b'(' | b')' | b'{' | b'}' | b'[' | b']' | b'<' | b'>' | b'"' | b';' | b','
        | b'@' | b'#' | b':' | b'|' => true,
        _ => false,
    })
}
/// Reads the rest of a bare token whose first bytes are already in `bs`, and classifies it.
///
/// Bytes are consumed until a delimiter follows. The token is then:
/// - a signed integer when it is an optional sign followed by ASCII digits;
/// - a double when it also has a fraction and/or exponent;
/// - a single-precision float when that form additionally ends in `f` or `F`;
/// - a symbol otherwise.
///
/// A token that matches the number syntax but fails to parse is a syntax error.
fn read_raw_symbol_or_number(&mut self, mut bs: Vec<u8>) -> io::Result<N> {
    lazy_static! {
        // `[0-9]` rather than `\d`: the regex crate's `\d` is Unicode-aware, so tokens made
        // of non-ASCII digits would match here and then be rejected by the numeric parsers
        // instead of being read as symbols.
        static ref NUMBER_RE: regex::Regex = regex::Regex::new(
            r"^([-+]?[0-9]+)(((\.[0-9]+([eE][-+]?[0-9]+)?)|([eE][-+]?[0-9]+))([fF]?))?$"
        )
        .unwrap();
    }
    while !self.delimiter_follows()? {
        bs.push(self.next_byte()?);
    }
    let s = decode_utf8(bs)?;
    let m = match NUMBER_RE.captures(&s) {
        None => return Ok(N::symbol(&s)),
        Some(m) => m,
    };
    // Group 2 is the optional fraction/exponent/suffix part; without it this is an integer.
    if m.get(2).is_none() {
        let n = s
            .parse::<BigInt>()
            .map_err(|_| io_syntax_error(&format!("Invalid signed-integer number: {:?}", s)))?;
        return Ok(N::new(n));
    }
    // Group 7 (the `f`/`F` suffix, possibly empty) sits unconditionally inside group 2, so
    // it always participates here; an absent or empty match means double precision.
    let is_single = m.get(7).map_or(false, |suffix| !suffix.as_str().is_empty());
    // Integer part plus fraction/exponent, with any suffix removed.
    let number_text = m[1].to_owned() + &m[3];
    if is_single {
        let f = number_text.parse::<f32>().map_err(|_| {
            io_syntax_error(&format!(
                "Invalid single-precision number: {:?}",
                number_text
            ))
        })?;
        Ok(N::new(f))
    } else {
        let d = number_text.parse::<f64>().map_err(|_| {
            io_syntax_error(&format!(
                "Invalid double-precision number: {:?}",
                number_text
            ))
        })?;
        Ok(N::new(d))
    }
}
}
impl<'de, 'src, N: NestedValue, Dec: DomainParse<N::Embedded>, S: BinarySource<'de>> Reader<'de, N>
for TextReader<'de, 'src, N, Dec, S>
{
/// Reads the next term from the source.
///
/// Returns `Ok(None)` when end of input is reached before a term starts. In `Document`
/// mode leading whitespace is skipped before that check; in `Value` mode it is skipped
/// after it, so whitespace followed by end of input is an error there.
///
/// When `read_annotations` is true, annotations and comments preceding the term are
/// attached to it; otherwise they are skipped.
fn next(&mut self, read_annotations: bool) -> io::Result<Option<N>> {
    'restart: loop {
        match self.toplevel_whitespace_mode {
            ToplevelWhitespaceMode::Document => self.skip_whitespace(),
            ToplevelWhitespaceMode::Value => (),
        }
        match self.peek() {
            Err(e) if is_eof_io_error(&e) => return Ok(None),
            // Other errors resurface from the `peek()?` below.
            _ => (),
        }
        match self.toplevel_whitespace_mode {
            ToplevelWhitespaceMode::Document => (),
            ToplevelWhitespaceMode::Value => self.skip_whitespace(),
        }
        return Ok(Some(match self.peek()? {
            b'"' => {
                self.skip()?;
                N::new(self.read_string(b'"')?)
            }
            b'|' => {
                self.skip()?;
                N::symbol(&self.read_string(b'|')?)
            }
            b';' => {
                return Err(io_syntax_error(
                    "Semicolon is reserved syntax"
                ));
            }
            b'@' => {
                if read_annotations {
                    let mut annotations = Vec::new();
                    self.gather_annotations(&mut annotations)?;
                    self.prepend_annotations_to_next(annotations)?
                } else {
                    self.skip_annotations()?;
                    self.demand_next(read_annotations)?
                }
            }
            b':' => {
                return Err(io_syntax_error(
                    "Unexpected key/value separator between items",
                ));
            }
            b'#' => {
                self.skip()?;
                match self.next_byte()? {
                    // `# text`: a comment running to the end of the line.
                    b' ' | b'\t' => {
                        if read_annotations {
                            let mut annotations = vec![N::new(self.comment_line()?)];
                            self.gather_annotations(&mut annotations)?;
                            self.prepend_annotations_to_next(annotations)?
                        } else {
                            self.comment_line()?;
                            continue 'restart;
                        }
                    }
                    // `#` followed directly by a line ending: an empty comment, handled the
                    // same way `gather_annotations` and `skip_annotations` handle it.
                    b'\n' | b'\r' => {
                        if read_annotations {
                            let mut annotations = vec![N::new("")];
                            self.gather_annotations(&mut annotations)?;
                            self.prepend_annotations_to_next(annotations)?
                        } else {
                            continue 'restart;
                        }
                    }
                    b'f' => {
                        self.require_delimiter("Delimiter must follow #f")?;
                        N::new(false)
                    }
                    b't' => {
                        self.require_delimiter("Delimiter must follow #t")?;
                        N::new(true)
                    }
                    b'{' => self.read_set(read_annotations)?,
                    b'"' => self.read_literal_binary()?,
                    b'x' => match self.next_byte()? {
                        b'"' => N::new(&self.read_hex_binary()?[..]),
                        b'f' => self.read_hex_float(4)?,
                        b'd' => self.read_hex_float(8)?,
                        _ => return Err(io_syntax_error("Invalid #x syntax")),
                    },
                    b'[' => self.read_base64_binary()?,
                    b'!' => {
                        let v = self.next_iovalue(read_annotations)?;
                        Value::Embedded(self.dec.parse_embedded(&v)?).wrap()
                    }
                    other => {
                        return Err(io_syntax_error(&format!("Invalid # syntax: {:?}", other)))
                    }
                }
            }
            b'<' => {
                self.skip()?;
                let vs = self.upto(b'>', read_annotations, false)?;
                if vs.is_empty() {
                    return Err(io_syntax_error("Missing record label"));
                }
                Value::Record(Record(vs)).wrap()
            }
            b'[' => {
                self.skip()?;
                N::new(self.upto(b']', read_annotations, true)?)
            }
            b'{' => {
                self.skip()?;
                self.read_dictionary(read_annotations)?
            }
            b'>' => return Err(io_syntax_error("Unexpected >")),
            b']' => return Err(io_syntax_error("Unexpected ]")),
            b'}' => return Err(io_syntax_error("Unexpected }")),
            b',' => return Err(io_syntax_error("Unexpected ,")),
            // Anything else starts a bare symbol or a number.
            other => {
                self.skip()?;
                self.read_raw_symbol_or_number(vec![other])?
            }
        }));
    }
}
/// Skips annotations and consumes the `<` that opens a record.
///
/// Returns a fresh boundary that has been advanced to expect the record label. If the
/// upcoming byte is not `<`, an `ExpectedKind::Record(arity)` error is produced.
fn open_record(&mut self, arity: Option<usize>) -> ReaderResult<B::Type> {
    self.skip_annotations()?;
    match self.peek()? {
        b'<' => {
            self.skip()?;
            let mut boundary = B::Type::default();
            Reader::<N>::ensure_more_expected(self, &mut boundary, &B::Item::RecordLabel)?;
            Ok(boundary)
        }
        _ => Err(self.expected(ExpectedKind::Record(arity))),
    }
}
/// Skips annotations and consumes the opener of a sequence (`[`) or a set (`#{`),
/// reporting which one was found.
///
/// On anything else the input position is restored and an
/// `ExpectedKind::SequenceOrSet` error is produced.
fn open_sequence_or_set(&mut self) -> ReaderResult<B::Item> {
    self.skip_annotations()?;
    let mark = Reader::<N>::mark(self)?;
    let first = self.next_byte()?;
    if first == b'[' {
        return Ok(B::Item::SequenceValue);
    }
    // The second byte is only consumed when the first was `#`.
    if first == b'#' && self.next_byte()? == b'{' {
        return Ok(B::Item::SetValue);
    }
    Reader::<N>::restore(self, &mark)?;
    Err(self.expected(ExpectedKind::SequenceOrSet))
}
/// Skips annotations and consumes the `[` that opens a sequence, or produces an
/// `ExpectedKind::Sequence` error if something else follows.
fn open_sequence(&mut self) -> ReaderResult<()> {
    self.skip_annotations()?;
    if self.peek()? == b'[' {
        self.skip()?;
        Ok(())
    } else {
        Err(self.expected(ExpectedKind::Sequence))
    }
}
/// Skips annotations and consumes the `#{` that opens a set.
///
/// On anything else the input position is restored and an `ExpectedKind::Set` error is
/// produced.
fn open_set(&mut self) -> ReaderResult<()> {
    self.skip_annotations()?;
    let mark = Reader::<N>::mark(self)?;
    // The second byte is only consumed when the first was `#`.
    if self.next_byte()? == b'#' && self.next_byte()? == b'{' {
        return Ok(());
    }
    Reader::<N>::restore(self, &mark)?;
    Err(self.expected(ExpectedKind::Set))
}
/// Skips annotations and consumes the `{` that opens a dictionary, or produces an
/// `ExpectedKind::Dictionary` error if something else follows.
fn open_dictionary(&mut self) -> ReaderResult<()> {
    self.skip_annotations()?;
    if self.peek()? == b'{' {
        self.skip()?;
        Ok(())
    } else {
        Err(self.expected(ExpectedKind::Dictionary))
    }
}
/// Consumes the separator syntax that belongs between two adjacent items.
///
/// Between a dictionary key and its value a `:` is required (after optional whitespace).
/// Between dictionary entries, set members and sequence members, whitespace and commas
/// are skipped. Every other transition consumes nothing.
#[inline]
fn boundary(&mut self, b: &B::Type) -> ReaderResult<()> {
    match (&b.closing, &b.opening) {
        (Some(B::Item::DictionaryKey), Some(B::Item::DictionaryValue)) => {
            self.skip_whitespace();
            let separator = self.next_byte()?;
            if separator != b':' {
                return Err(syntax_error("Missing expected key/value separator"));
            }
        }
        (Some(B::Item::DictionaryValue), Some(B::Item::DictionaryKey))
        | (Some(B::Item::SetValue), Some(B::Item::SetValue))
        | (Some(B::Item::SequenceValue), Some(B::Item::SequenceValue)) => {
            self.skip_whitespace_and_maybe_commas(true)
        }
        _ => (),
    }
    Ok(())
}
/// Checks whether the compound being read ends here.
///
/// After skipping whitespace, a `>`, `]` or `}` is consumed and `true` is returned.
/// Otherwise `b` is shifted to `i`, the matching boundary syntax is consumed, and `false`
/// is returned.
fn close_compound(&mut self, b: &mut B::Type, i: &B::Item) -> ReaderResult<bool> {
    self.skip_whitespace();
    let at_closer = match self.peek()? {
        b'>' | b']' | b'}' => true,
        _ => false,
    };
    if at_closer {
        self.skip()?;
        return Ok(true);
    }
    b.shift(Some(i.clone()));
    Reader::<N>::boundary(self, b)?;
    Ok(false)
}
/// Skips annotations and consumes the `#!` that introduces an embedded value.
///
/// On anything else the input position is restored and an `ExpectedKind::Embedded` error
/// is produced.
fn open_embedded(&mut self) -> ReaderResult<()> {
    self.skip_annotations()?;
    let mark = Reader::<N>::mark(self)?;
    // The second byte is only consumed when the first was `#`.
    if self.next_byte()? == b'#' && self.next_byte()? == b'!' {
        return Ok(());
    }
    Reader::<N>::restore(self, &mark)?;
    Err(self.expected(ExpectedKind::Embedded))
}
/// Nothing to consume here: this implementation always succeeds without reading input.
fn close_embedded(&mut self) -> ReaderResult<()> {
Ok(())
}
/// Marks are those of the underlying byte source.
type Mark = S::Mark;
/// Obtains a mark by delegating to the underlying source's `mark`.
fn mark(&mut self) -> io::Result<Self::Mark> {
self.source.mark()
}
/// Returns to `mark` by delegating to the underlying source's `restore`.
fn restore(&mut self, mark: &Self::Mark) -> io::Result<()> {
self.source.restore(mark)
}
/// Skips annotations and reads the next token.
///
/// Compound openers (`<`, `[`, `{`, `#{`) and closers (`>`, `]`, `}`) are reported
/// without reading their contents. `#!` reads and parses an embedded value, honouring
/// `read_embedded_annotations`. Anything else is rewound and read as a complete value
/// without annotations.
fn next_token(&mut self, read_embedded_annotations: bool) -> io::Result<Token<N>> {
    self.skip_annotations()?;
    let mark = Reader::<N>::mark(self)?;
    let token = match self.next_byte()? {
        b'<' => Token::Compound(CompoundClass::Record),
        b'[' => Token::Compound(CompoundClass::Sequence),
        b'{' => Token::Compound(CompoundClass::Dictionary),
        b'>' | b']' | b'}' => Token::End,
        b'#' => match self.next_byte()? {
            b'{' => Token::Compound(CompoundClass::Set),
            b'!' => {
                let v = self.next_iovalue(read_embedded_annotations)?;
                Token::Embedded(self.dec.parse_embedded(&v)?)
            }
            _ => {
                // Some other `#` form: rewind and let the value reader handle it.
                Reader::<N>::restore(self, &mark)?;
                Token::Atom(self.demand_next(false)?)
            }
        },
        _ => {
            Reader::<N>::restore(self, &mark)?;
            Token::Atom(self.demand_next(false)?)
        }
    };
    Ok(token)
}
/// Gathers the annotations at the current position, then reads the token that follows
/// them (reading annotations inside embedded values), and returns both.
fn next_annotations_and_token(&mut self) -> io::Result<(Vec<N>, Token<N>)> {
    let mut annotations = Vec::new();
    self.gather_annotations(&mut annotations)?;
    let token = self.next_token(true)?;
    Ok((annotations, token))
}
}