//! Implementation of [Reader] for the text syntax. use crate::error::io_syntax_error; use crate::error::is_eof_io_error; use crate::error::syntax_error; use crate::error::Error; use crate::error::ExpectedKind; use crate::error::Received; use crate::hex; use crate::value::boundary as B; use crate::value::reader::BinarySource; use crate::value::reader::ReaderResult; use crate::value::repr::Annotations; use crate::value::CompoundClass; use crate::value::DomainParse; use crate::value::IOValue; use crate::value::IOValueDomainCodec; use crate::value::Map; use crate::value::NestedValue; use crate::value::Reader; use crate::value::Record; use crate::value::Set; use crate::value::Token; use crate::value::Value; use crate::value::ViaCodec; use lazy_static::lazy_static; use num::bigint::BigInt; use std::convert::TryInto; use std::io; use std::marker::PhantomData; /// The text syntax Preserves reader. pub struct TextReader<'de, 'src, N: NestedValue, Dec: DomainParse, S: BinarySource<'de>> { /// Underlying source of (utf8) bytes. pub source: &'src mut S, /// Decoder for producing Rust values embedded in the text. pub dec: Dec, /// Treatment of whitespace before a toplevel term. pub toplevel_whitespace_mode: ToplevelWhitespaceMode, phantom: PhantomData<&'de N>, } /// [TextReader] chooses `Document` mode to treat whitespace preceding end-of-file as a "no /// more values" non-error situation, or `Value` mode to treat it as an "expected more input" /// situation. /// /// The Preserves syntax for `Value` treats any input at all, even whitespace, as an indicator /// that a term is to follow. However, when using a TextReader to parse a *series* of `Value`s /// in a `Document`, whitespace followed by EOF is to be treated as the permitted optional /// whitespace at the end of a `Document. pub enum ToplevelWhitespaceMode { Document, Value, } fn decode_utf8(bs: Vec) -> io::Result { Ok(String::from_utf8(bs).map_err(|_| io_syntax_error("Invalid UTF-8"))?) } fn append_codepoint(bs: &mut Vec, n: u32) -> io::Result<()> { let c = char::from_u32(n).ok_or_else(|| io_syntax_error("Bad code point"))?; let mut buf = [0; 4]; let _ = c.encode_utf8(&mut buf); bs.extend(&buf[0..c.len_utf8()]); Ok(()) } impl<'de, 'src, N: NestedValue, Dec: DomainParse, S: BinarySource<'de>> TextReader<'de, 'src, N, Dec, S> { /// Construct a new reader from a byte (utf8) source and embedded-value decoder. pub fn new(source: &'src mut S, dec: Dec) -> Self { TextReader { source, dec, toplevel_whitespace_mode: ToplevelWhitespaceMode::Document, phantom: PhantomData, } } pub fn toplevel_whitespace_mode(mut self, new_mode: ToplevelWhitespaceMode) -> Self { self.toplevel_whitespace_mode = new_mode; self } fn peek(&mut self) -> io::Result { self.source.peek() } fn skip(&mut self) -> io::Result<()> { self.source.skip() } fn next_byte(&mut self) -> io::Result { let b = self.source.peek()?; self.source.skip()?; Ok(b) } fn skip_whitespace(&mut self) { self.skip_whitespace_and_maybe_commas(false) } fn skip_whitespace_and_maybe_commas(&mut self, skip_commas: bool) { // Deliberately swallows errors. while let Ok(c) = self.peek() { match c { b' ' | b'\t' | b'\r' | b'\n' => { let _ = self.skip(); () } b',' if skip_commas => { let _ = self.skip(); () } _ => break, } } } // TODO: This is a duplicate of fn expected in PackedReader. fn expected(&mut self, k: ExpectedKind) -> Error { match Reader::::demand_next(self, true) { Ok(v) => Error::Expected(k, Received::ReceivedOtherValue(format!("{:?}", v))), Err(e) => e.into(), } } fn gather_annotations(&mut self, vs: &mut Vec) -> ReaderResult<()> { loop { self.skip_whitespace(); match self.peek()? { b'#' => { let m = self.source.mark()?; self.skip()?; match self.next_byte()? { b' ' | b'\t' => vs.push(N::new(self.comment_line()?)), b'\n' | b'\r' => vs.push(N::new("")), _ => { self.source.restore(&m)?; return Ok(()); } } } b'@' => { self.skip()?; vs.push(self.demand_next(true)?) } _ => return Ok(()), } } } fn prepend_annotations_to_next(&mut self, mut annotations: Vec) -> ReaderResult { let (existing_annotations, v) = Reader::::demand_next(self, true)?.pieces(); annotations.extend_from_slice(existing_annotations.slice()); Ok(N::wrap(Annotations::new(Some(annotations)), v)) } fn skip_annotations(&mut self) -> ReaderResult<()> { loop { self.skip_whitespace(); match self.peek()? { b'#' => { let m = self.source.mark()?; self.skip()?; match self.next_byte()? { b' ' | b'\t' => { self.comment_line()?; () } b'\n' | b'\r' => (), _ => { self.source.restore(&m)?; return Ok(()); } } } b'@' => { self.skip()?; self.skip_value()?; } _ => return Ok(()), } } } /// Retrieve the next [IOValue] in the input stream. pub fn next_iovalue(&mut self, read_annotations: bool) -> io::Result { let mut r = TextReader::new(self.source, ViaCodec::new(IOValueDomainCodec)); let v = r.demand_next(read_annotations)?; Ok(v) } fn comment_line(&mut self) -> io::Result { let mut bs = Vec::new(); loop { let b = self.peek()?; self.skip()?; match b { b'\r' | b'\n' => return Ok(decode_utf8(bs)?), _ => bs.push(b), } } } fn read_hex_float(&mut self, bytecount: usize) -> io::Result { if self.next_byte()? != b'"' { return Err(io_syntax_error( "Missing open-double-quote in hex-encoded floating-point number", )); } let bs = self.read_hex_binary()?; if bs.len() != bytecount { return Err(io_syntax_error( "Incorrect number of bytes in hex-encoded floating-point number", )); } match bytecount { 4 => Ok(Value::from(f32::from_bits(u32::from_be_bytes(bs.try_into().unwrap()))).wrap()), 8 => Ok(Value::from(f64::from_bits(u64::from_be_bytes(bs.try_into().unwrap()))).wrap()), _ => Err(io_syntax_error( "Unsupported byte count in hex-encoded floating-point number", )), } } fn read_stringlike( &mut self, mut seed: R, xform_item: X, terminator: u8, hexescape: u8, hexescaper: H, ) -> io::Result where X: Fn(&mut R, u8) -> io::Result<()>, H: Fn(&mut R, &mut Self) -> io::Result<()>, { loop { match self.next_byte()? { c if c == terminator => return Ok(seed), b'\\' => match self.next_byte()? { c if c == hexescape => hexescaper(&mut seed, self)?, c if c == terminator || c == b'\\' || c == b'/' => xform_item(&mut seed, c)?, b'b' => xform_item(&mut seed, b'\x08')?, b'f' => xform_item(&mut seed, b'\x0c')?, b'n' => xform_item(&mut seed, b'\x0a')?, b'r' => xform_item(&mut seed, b'\x0d')?, b't' => xform_item(&mut seed, b'\x09')?, _ => return Err(io_syntax_error("Invalid escape code")), }, c => xform_item(&mut seed, c)?, } } } fn hexnum(&mut self, count: usize) -> io::Result { let mut v: u32 = 0; for _ in 0..count { let c = self.next_byte()?; match (c as char).to_digit(16) { Some(d) => v = v << 4 | d, None => return Err(io_syntax_error("Bad hex escape")), } } Ok(v) } fn read_string(&mut self, delimiter: u8) -> io::Result { decode_utf8(self.read_stringlike( Vec::new(), |bs, c| Ok(bs.push(c)), delimiter, b'u', |bs, r| { let n1 = r.hexnum(4)?; if (0xd800..=0xdbff).contains(&n1) { let mut ok = true; ok = ok && r.next_byte()? == b'\\'; ok = ok && r.next_byte()? == b'u'; if !ok { Err(io_syntax_error("Missing second half of surrogate pair")) } else { let n2 = r.hexnum(4)?; if (0xdc00..=0xdfff).contains(&n2) { let n = ((n1 - 0xd800) << 10) + (n2 - 0xdc00) + 0x10000; append_codepoint(bs, n) } else { Err(io_syntax_error("Bad second half of surrogate pair")) } } } else { append_codepoint(bs, n1) } }, )?) } fn read_literal_binary(&mut self) -> io::Result { Ok(N::new( &self.read_stringlike( Vec::new(), |bs, b| Ok(bs.push(b)), b'"', b'x', |bs, r| Ok(bs.push(r.hexnum(2)? as u8)), )?[..], )) } fn read_hex_binary(&mut self) -> io::Result> { let mut s = String::new(); loop { self.skip_whitespace(); let c1 = self.next_byte()? as char; if c1 == '"' { return Ok(hex::HexParser::Strict.decode(&s).unwrap()); } let c2 = self.next_byte()? as char; if !(c1.is_digit(16) && c2.is_digit(16)) { return Err(io_syntax_error("Invalid hex binary")); } s.push(c1); s.push(c2); } } fn read_base64_binary(&mut self) -> io::Result { let mut bs = Vec::new(); loop { self.skip_whitespace(); let mut c = self.next_byte()?; if c == b']' { let bs = base64::decode_config(&decode_utf8(bs)?, base64::STANDARD_NO_PAD) .map_err(|_| io_syntax_error("Invalid base64 character"))?; return Ok(N::new(&bs[..])); } if c == b'-' { c = b'+'; } if c == b'_' { c = b'/'; } if c == b'=' { continue; } bs.push(c); } } fn upto(&mut self, delimiter: u8, read_annotations: bool, skip_commas: bool) -> io::Result> { let mut vs = Vec::new(); loop { self.skip_whitespace_and_maybe_commas(skip_commas); if self.peek()? == delimiter { self.skip()?; return Ok(vs); } vs.push(Reader::::demand_next(self, read_annotations)?); } } fn read_set(&mut self, read_annotations: bool) -> io::Result { let items = self.upto(b'}', read_annotations, true)?; let mut s = Set::::new(); for i in items { if s.contains(&i) { return Err(io_syntax_error("Duplicate set element")); } s.insert(i); } Ok(N::new(s)) } fn read_dictionary(&mut self, read_annotations: bool) -> io::Result { let mut d = Map::new(); loop { self.skip_whitespace_and_maybe_commas(true); if self.peek()? == b'}' { self.skip()?; return Ok(N::new(d)); } let k = Reader::::demand_next(self, read_annotations)?; self.skip_whitespace(); if self.next_byte()? != b':' { return Err(io_syntax_error("Missing expected key/value separator")); } if d.contains_key(&k) { return Err(io_syntax_error("Duplicate key")); } let v = Reader::::demand_next(self, read_annotations)?; d.insert(k, v); } } fn require_delimiter(&mut self, msg: &'static str) -> io::Result<()> { if self.delimiter_follows()? { Ok(()) } else { Err(io_syntax_error(msg)) } } fn delimiter_follows(&mut self) -> io::Result { let c = match self.peek() { Err(e) if is_eof_io_error(&e) => return Ok(true), Err(e) => return Err(e)?, Ok(c) if (c as char).is_whitespace() => return Ok(true), Ok(c) => c, }; Ok(match c { b'(' | b')' | b'{' | b'}' | b'[' | b']' | b'<' | b'>' | b'"' | b';' | b',' | b'@' | b'#' | b':' | b'|' | b' ' => true, _ => false, }) } fn read_raw_symbol_or_number(&mut self, mut bs: Vec) -> io::Result { lazy_static! { static ref NUMBER_RE: regex::Regex = regex::Regex::new(r"^([-+]?\d+)(((\.\d+([eE][-+]?\d+)?)|([eE][-+]?\d+))([fF]?))?$") .unwrap(); } while !self.delimiter_follows()? { bs.push(self.next_byte()?); } let s = decode_utf8(bs)?; match NUMBER_RE.captures(&s) { None => Ok(N::symbol(&s)), Some(m) => match m.get(2) { None => Ok(N::new(s.parse::().map_err(|_| { io_syntax_error(&format!("Invalid signed-integer number: {:?}", s)) })?)), Some(_) => { if let Some(maybe_f) = m.get(7) { let s = m[1].to_owned() + &m[3]; if maybe_f.range().is_empty() { Ok(N::new(s.parse::().map_err(|_| { io_syntax_error(&format!( "Invalid double-precision number: {:?}", s )) })?)) } else { Ok(N::new(s.parse::().map_err(|_| { io_syntax_error(&format!( "Invalid single-precision number: {:?}", s )) })?)) } } else { panic!("Internal error: cannot analyze number {:?}", s) } } }, } } } impl<'de, 'src, N: NestedValue, Dec: DomainParse, S: BinarySource<'de>> Reader<'de, N> for TextReader<'de, 'src, N, Dec, S> { fn next(&mut self, read_annotations: bool) -> io::Result> { 'restart: loop { match self.toplevel_whitespace_mode { ToplevelWhitespaceMode::Document => self.skip_whitespace(), ToplevelWhitespaceMode::Value => (), } match self.peek() { Err(e) if is_eof_io_error(&e) => return Ok(None), _ => (), } match self.toplevel_whitespace_mode { ToplevelWhitespaceMode::Document => (), ToplevelWhitespaceMode::Value => self.skip_whitespace(), } return Ok(Some(match self.peek()? { b'"' => { self.skip()?; N::new(self.read_string(b'"')?) } b'|' => { self.skip()?; N::symbol(&self.read_string(b'|')?) } b';' => { return Err(io_syntax_error( "Semicolon is reserved syntax" )); } b'@' => { if read_annotations { let mut annotations = Vec::new(); self.gather_annotations(&mut annotations)?; self.prepend_annotations_to_next(annotations)? } else { self.skip_annotations()?; self.demand_next(read_annotations)? } } b':' => { return Err(io_syntax_error( "Unexpected key/value separator between items", )); } b'#' => { self.skip()?; match self.next_byte()? { b' ' | b'\t' => { if read_annotations { let mut annotations = vec![N::new(self.comment_line()?)]; self.gather_annotations(&mut annotations)?; self.prepend_annotations_to_next(annotations)? } else { self.comment_line()?; continue 'restart; } } b'f' => { self.require_delimiter("Delimiter must follow #f")?; N::new(false) } b't' => { self.require_delimiter("Delimiter must follow #t")?; N::new(true) } b'{' => self.read_set(read_annotations)?, b'"' => self.read_literal_binary()?, b'x' => match self.next_byte()? { b'"' => N::new(&self.read_hex_binary()?[..]), b'f' => self.read_hex_float(4)?, b'd' => self.read_hex_float(8)?, _ => return Err(io_syntax_error("Invalid #x syntax")), }, b'[' => self.read_base64_binary()?, b'!' => { let v = self.next_iovalue(read_annotations)?; Value::Embedded(self.dec.parse_embedded(&v)?).wrap() } other => { return Err(io_syntax_error(&format!("Invalid # syntax: {:?}", other))) } } } b'<' => { self.skip()?; let vs = self.upto(b'>', read_annotations, false)?; if vs.is_empty() { return Err(io_syntax_error("Missing record label")); } Value::Record(Record(vs)).wrap() } b'[' => { self.skip()?; N::new(self.upto(b']', read_annotations, true)?) } b'{' => { self.skip()?; self.read_dictionary(read_annotations)? } b'>' => return Err(io_syntax_error("Unexpected >")), b']' => return Err(io_syntax_error("Unexpected ]")), b'}' => return Err(io_syntax_error("Unexpected }")), b',' => return Err(io_syntax_error("Unexpected ,")), other => { self.skip()?; self.read_raw_symbol_or_number(vec![other])? } })) } } fn open_record(&mut self, arity: Option) -> ReaderResult { self.skip_annotations()?; if self.peek()? != b'<' { return Err(self.expected(ExpectedKind::Record(arity))); } self.skip()?; let mut b = B::Type::default(); Reader::::ensure_more_expected(self, &mut b, &B::Item::RecordLabel)?; Ok(b) } fn open_sequence_or_set(&mut self) -> ReaderResult { self.skip_annotations()?; let mark = Reader::::mark(self)?; match self.next_byte()? { b'#' => match self.next_byte()? { b'{' => return Ok(B::Item::SetValue), _ => (), }, b'[' => return Ok(B::Item::SequenceValue), _ => (), } Reader::::restore(self, &mark)?; Err(self.expected(ExpectedKind::SequenceOrSet)) } fn open_sequence(&mut self) -> ReaderResult<()> { self.skip_annotations()?; if self.peek()? != b'[' { return Err(self.expected(ExpectedKind::Sequence)); } self.skip()?; Ok(()) } fn open_set(&mut self) -> ReaderResult<()> { self.skip_annotations()?; let mark = Reader::::mark(self)?; match self.next_byte()? { b'#' => match self.next_byte()? { b'{' => return Ok(()), _ => (), }, _ => (), } Reader::::restore(self, &mark)?; Err(self.expected(ExpectedKind::Set)) } fn open_dictionary(&mut self) -> ReaderResult<()> { self.skip_annotations()?; if self.peek()? != b'{' { return Err(self.expected(ExpectedKind::Dictionary)); } self.skip()?; Ok(()) } #[inline] fn boundary(&mut self, b: &B::Type) -> ReaderResult<()> { match b { B::Type { closing: Some(B::Item::DictionaryKey), opening: Some(B::Item::DictionaryValue), } => { self.skip_whitespace(); if self.next_byte()? != b':' { return Err(syntax_error("Missing expected key/value separator")); } } B::Type { closing: Some(B::Item::DictionaryValue), opening: Some(B::Item::DictionaryKey), } => self.skip_whitespace_and_maybe_commas(true), B::Type { closing: Some(B::Item::SetValue), opening: Some(B::Item::SetValue), } => self.skip_whitespace_and_maybe_commas(true), B::Type { closing: Some(B::Item::SequenceValue), opening: Some(B::Item::SequenceValue), } => self.skip_whitespace_and_maybe_commas(true), _ => (), } Ok(()) } fn close_compound(&mut self, b: &mut B::Type, i: &B::Item) -> ReaderResult { self.skip_whitespace(); match self.peek()? { b'>' | b']' | b'}' => { self.skip()?; Ok(true) } _ => { b.shift(Some(i.clone())); Reader::::boundary(self, b)?; Ok(false) } } } fn open_embedded(&mut self) -> ReaderResult<()> { self.skip_annotations()?; let mark = Reader::::mark(self)?; match self.next_byte()? { b'#' => match self.next_byte()? { b'!' => return Ok(()), _ => (), }, _ => (), } Reader::::restore(self, &mark)?; Err(self.expected(ExpectedKind::Embedded)) } fn close_embedded(&mut self) -> ReaderResult<()> { Ok(()) } type Mark = S::Mark; fn mark(&mut self) -> io::Result { self.source.mark() } fn restore(&mut self, mark: &Self::Mark) -> io::Result<()> { self.source.restore(mark) } fn next_token(&mut self, read_embedded_annotations: bool) -> io::Result> { self.skip_annotations()?; let mark = Reader::::mark(self)?; Ok(match self.next_byte()? { b'<' => Token::Compound(CompoundClass::Record), b'[' => Token::Compound(CompoundClass::Sequence), b'{' => Token::Compound(CompoundClass::Dictionary), b'>' => Token::End, b']' => Token::End, b'}' => Token::End, b'#' => match self.next_byte()? { b'!' => { let v = self.next_iovalue(read_embedded_annotations)?; Token::Embedded(self.dec.parse_embedded(&v)?) } b'{' => Token::Compound(CompoundClass::Set), _ => { Reader::::restore(self, &mark)?; Token::Atom(self.demand_next(false)?) } }, _ => { Reader::::restore(self, &mark)?; Token::Atom(self.demand_next(false)?) } }) } fn next_annotations_and_token(&mut self) -> io::Result<(Vec, Token)> { let mut annotations = Vec::new(); self.gather_annotations(&mut annotations)?; Ok((annotations, self.next_token(true)?)) } }