preserves/implementations/rust/preserves/src/text/reader.rs

485 lines
17 KiB
Rust

use crate::Atom;
use crate::ValueClass;
use crate::error::Error;
use crate::error::ExpectedKind;
use crate::error::io_eof;
use crate::hex;
use crate::CompoundClass;
use crate::Reader;
use crate::boundary as B;
use crate::reader::NextToken;
use crate::reader::ReaderResult;
use crate::source::BinarySource;
use lazy_static::lazy_static;
use num_bigint::BigInt;
use std::borrow::Cow;
use std::io;
use std::marker::PhantomData;
/// What the reader found when it classified the next token of input.
///
/// Produced by `read_classification` and held in the reader's one-slot cache until consumed.
enum Classification {
/// An atomic value that has already been read in full from the source.
Atom(Atom<'static>),
/// The opening delimiter of a compound value of the given class.
Compound(CompoundClass),
/// The `#!` marker introducing an embedded value.
Embedded,
/// A `;` marker: the rest of the line is the text of a comment annotation.
CommentAnnotation,
/// An `@` marker: the annotation value follows as an ordinary value.
OrdinaryAnnotation,
}
/// Summarises a cached classification as the coarser `NextToken` handed back to callers.
impl<'r> From<&'r Classification> for NextToken {
    fn from(classification: &'r Classification) -> Self {
        // Both annotation markers collapse to the same token; everything else is a value.
        let value_class = match classification {
            Classification::CommentAnnotation | Classification::OrdinaryAnnotation => {
                return NextToken::Annotation;
            }
            Classification::Atom(atom) => ValueClass::Atomic(atom.into()),
            Classification::Compound(kind) => ValueClass::Compound(kind.clone()),
            Classification::Embedded => ValueClass::Embedded,
        };
        NextToken::Value(value_class)
    }
}
/// Reader for the textual syntax, pulling bytes from a `BinarySource`.
pub struct TextReader<'de, S: BinarySource<'de>> {
/// The byte source being read.
pub source: S,
/// One-slot lookahead: a token that has been classified (via `peek_class`) but not yet
/// consumed by one of the `open_*` / `next_atom` methods.
classification_cache: Option<Classification>,
/// Ties the otherwise-unused `'de` lifetime parameter to the struct.
phantom: PhantomData<&'de ()>,
}
impl<'de, S: BinarySource<'de>> TextReader<'de, S>
{
pub fn new(source: S) -> Self {
TextReader {
source,
classification_cache: None,
phantom: PhantomData,
}
}
/// Builds a syntax error carrying `message` by delegating to the underlying source.
fn syntax_error(&mut self, message: &str) -> io::Error {
self.source.syntax_error(message)
}
/// Delegates to the source's `peek`.
/// Callers in this file treat `None` as end of input and do not expect the byte to be consumed.
fn peek(&mut self) -> io::Result<Option<u8>> {
self.source.peek()
}
/// Delegates to the source's `peek_noeof`.
/// NOTE(review): presumably like `peek` but reporting end of input as an error — confirm
/// against `BinarySource`.
#[inline(always)]
fn peek_noeof(&mut self) -> io::Result<u8> {
self.source.peek_noeof()
}
/// Delegates to the source's `skip`; used in this file to discard a byte that was just peeked.
fn skip(&mut self) -> io::Result<()> {
self.source.skip()
}
/// Reads and consumes the next byte by delegating to the source's `read`.
#[inline(always)]
fn next_byte(&mut self) -> io::Result<u8> {
self.source.read()
}
/// Consumes any run of separator bytes: space, tab, CR, LF and comma.
///
/// Stops at the first other byte, at end of input, or as soon as peeking fails.
fn skip_whitespace(&mut self) {
    // Deliberately swallows errors: a failing peek just ends the loop, and a failing skip
    // is ignored so the next peek decides what happens.
    while let Ok(Some(b' ' | b'\t' | b'\r' | b'\n' | b',')) = self.peek() {
        let _ = self.skip();
    }
}
/// Converts `bs` into a `String`, reporting malformed UTF-8 as a syntax error.
fn decode_utf8(&mut self, bs: Vec<u8>) -> io::Result<String> {
    match String::from_utf8(bs) {
        Ok(text) => Ok(text),
        Err(_) => Err(self.syntax_error("Invalid UTF-8")),
    }
}
/// Reads the text of a comment up to (and consuming) the first CR or LF byte.
///
/// The line terminator itself is not included in the returned text. Fails if the source
/// cannot supply another byte before a terminator is seen, or if the text is not UTF-8.
fn comment_line(&mut self) -> io::Result<String> {
    let mut line = Vec::new();
    loop {
        let byte = self.peek_noeof()?;
        self.skip()?;
        if matches!(byte, b'\r' | b'\n') {
            break;
        }
        line.push(byte);
    }
    self.decode_utf8(line)
}
/// Reads a hex-encoded floating-point literal whose `#xf` / `#xd` prefix is already consumed.
///
/// Expects an opening double quote followed by hex digits and a closing quote, decoding to
/// exactly `bytecount` bytes interpreted as a big-endian IEEE bit pattern: 4 bytes give
/// `Atom::Float`, 8 bytes give `Atom::Double`. Any other `bytecount` is a syntax error.
fn read_hex_float(&mut self, bytecount: usize) -> io::Result<Atom<'static>> {
    let opener = self.next_byte()?;
    if opener != b'"' {
        return Err(self.syntax_error("Missing open-double-quote in hex-encoded floating-point number"));
    }
    let payload = self.read_hex_binary()?;
    if payload.len() != bytecount {
        return Err(self.syntax_error("Incorrect number of bytes in hex-encoded floating-point number"));
    }
    // The length check above guarantees the array conversions below cannot fail.
    if bytecount == 4 {
        let bits = u32::from_be_bytes(payload.try_into().unwrap());
        Ok(Atom::Float(f32::from_bits(bits)))
    } else if bytecount == 8 {
        let bits = u64::from_be_bytes(payload.try_into().unwrap());
        Ok(Atom::Double(f64::from_bits(bits)))
    } else {
        Err(self.syntax_error("Unsupported byte count in hex-encoded floating-point number"))
    }
}
/// Generic driver for quoted, backslash-escaped literals (strings, symbols, byte strings).
///
/// Bytes are read until an unescaped `terminator`. Each ordinary or simply-escaped byte is
/// handed to `xform_item` together with the accumulator `seed`; when a backslash is followed
/// by `hexescape`, `hexescaper` is called to read and append the escape's payload itself.
/// Recognised simple escapes are the terminator, `\\`, `\/`, `\b`, `\f`, `\n`, `\r` and `\t`;
/// anything else is a syntax error. Returns the accumulator once the terminator is consumed.
fn read_stringlike<X, H, R>(
    &mut self,
    mut seed: R,
    xform_item: X,
    terminator: u8,
    hexescape: u8,
    hexescaper: H,
) -> io::Result<R>
where
    X: Fn(&mut Self, &mut R, u8) -> io::Result<()>,
    H: Fn(&mut Self, &mut R) -> io::Result<()>,
{
    loop {
        let byte = self.next_byte()?;
        if byte == terminator {
            return Ok(seed);
        }
        if byte != b'\\' {
            xform_item(self, &mut seed, byte)?;
            continue;
        }

        // Escape sequence. The hex escape takes priority over every other interpretation.
        let escaped = self.next_byte()?;
        if escaped == hexescape {
            hexescaper(self, &mut seed)?;
            continue;
        }
        let literal = if escaped == terminator || escaped == b'\\' || escaped == b'/' {
            escaped
        } else {
            match escaped {
                b'b' => b'\x08',
                b'f' => b'\x0c',
                b'n' => b'\x0a',
                b'r' => b'\x0d',
                b't' => b'\x09',
                _ => return Err(self.syntax_error("Invalid escape code")),
            }
        };
        xform_item(self, &mut seed, literal)?;
    }
}
/// Reads exactly `count` hexadecimal digits and returns their value, most significant first.
///
/// A non-hex byte yields a "Bad hex escape" syntax error.
fn hexnum(&mut self, count: usize) -> io::Result<u32> {
    let mut value: u32 = 0;
    for _ in 0..count {
        let byte = self.next_byte()?;
        let digit = match char::from(byte).to_digit(16) {
            Some(digit) => digit,
            None => return Err(self.syntax_error("Bad hex escape")),
        };
        value = (value << 4) | digit;
    }
    Ok(value)
}
/// Appends the UTF-8 encoding of code point `n` to `bs`.
///
/// Fails with a "Bad code point" syntax error when `n` is not a valid Unicode scalar value
/// (for example an unpaired surrogate).
fn append_codepoint(&mut self, bs: &mut Vec<u8>, n: u32) -> io::Result<()> {
    let ch = match char::from_u32(n) {
        Some(ch) => ch,
        None => return Err(self.syntax_error("Bad code point")),
    };
    let mut scratch = [0u8; 4];
    bs.extend_from_slice(ch.encode_utf8(&mut scratch).as_bytes());
    Ok(())
}
/// Reads the body of a quoted string or symbol whose opening `delimiter` is already consumed.
///
/// Supports the simple escapes handled by `read_stringlike` plus `\uXXXX`. A high surrogate
/// (`D800..=DBFF`) must be followed immediately by a second `\uXXXX` escape holding a low
/// surrogate (`DC00..=DFFF`); the pair is combined into one code point. The collected bytes
/// must form valid UTF-8.
fn read_string(&mut self, delimiter: u8) -> io::Result<String> {
    let bytes = self.read_stringlike(
        Vec::new(),
        |_, out, byte| {
            out.push(byte);
            Ok(())
        },
        delimiter,
        b'u',
        |reader, out| {
            let high = reader.hexnum(4)?;
            if !(0xd800..=0xdbff).contains(&high) {
                // Not the start of a surrogate pair: a complete code point on its own.
                return reader.append_codepoint(out, high);
            }
            // Short-circuits: the second byte is only read if the first was a backslash.
            if reader.next_byte()? != b'\\' || reader.next_byte()? != b'u' {
                return Err(reader.syntax_error("Missing second half of surrogate pair"));
            }
            let low = reader.hexnum(4)?;
            if !(0xdc00..=0xdfff).contains(&low) {
                return Err(reader.syntax_error("Bad second half of surrogate pair"));
            }
            let combined = ((high - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
            reader.append_codepoint(out, combined)
        },
    )?;
    self.decode_utf8(bytes)
}
/// Reads a `#"..."` byte-string literal whose opening `#"` is already consumed.
///
/// Bytes are taken verbatim; besides the simple escapes, `\xHH` inserts the byte with the
/// given two-digit hexadecimal value.
fn read_literal_binary(&mut self) -> io::Result<Atom<'static>> {
    let bytes = self.read_stringlike(
        Vec::new(),
        |_, out, byte| {
            out.push(byte);
            Ok(())
        },
        b'"',
        b'x',
        |reader, out| {
            let value = reader.hexnum(2)?;
            out.push(value as u8);
            Ok(())
        },
    )?;
    Ok(Atom::ByteString(Cow::Owned(bytes)))
}
/// Reads hex digit pairs up to a closing double quote and returns the decoded bytes.
///
/// Separator bytes (see `skip_whitespace`) are allowed between pairs but not inside a pair.
/// Any pair containing a non-hex byte is an "Invalid hex binary" syntax error.
fn read_hex_binary(&mut self) -> io::Result<Vec<u8>> {
    let mut digits = String::new();
    loop {
        self.skip_whitespace();
        let first = char::from(self.next_byte()?);
        if first == '"' {
            break;
        }
        let second = char::from(self.next_byte()?);
        if !first.is_ascii_hexdigit() || !second.is_ascii_hexdigit() {
            return Err(self.syntax_error("Invalid hex binary"));
        }
        digits.push(first);
        digits.push(second);
    }
    // `digits` holds only validated hex digits, in pairs, so strict decoding is expected
    // to succeed.
    Ok(hex::HexParser::Strict.decode(&digits).unwrap())
}
/// Reads a `#[...]` base64 byte-string literal whose opening `#[` is already consumed.
///
/// Separator bytes are skipped, the URL-safe characters `-` and `_` are mapped to `+` and
/// `/`, and `=` padding is dropped, before the text up to the closing `]` is decoded as
/// unpadded standard base64.
fn read_base64_binary(&mut self) -> io::Result<Atom<'static>> {
    let mut encoded = Vec::new();
    loop {
        self.skip_whitespace();
        match self.next_byte()? {
            b']' => break,
            b'=' => continue,
            b'-' => encoded.push(b'+'),
            b'_' => encoded.push(b'/'),
            other => encoded.push(other),
        }
    }
    let text = self.decode_utf8(encoded)?;
    let decoded = base64::decode_config(&text, base64::STANDARD_NO_PAD)
        .map_err(|_| self.syntax_error("Invalid base64 character"))?;
    Ok(Atom::ByteString(Cow::Owned(decoded)))
}
/// Reads the rest of a bare token and classifies it as a number or a symbol.
///
/// `bs` holds the bytes of the token consumed so far (callers pass the first byte). Further
/// bytes are accumulated until end of input, ASCII whitespace, or one of the delimiter bytes
/// `( ) { } [ ] < > " ; , @ # : |` is peeked; the terminating byte is left unconsumed.
///
/// The accumulated text is then matched against the number grammar:
/// - digits only (optionally signed): `Atom::SignedInteger`;
/// - with a fraction and/or exponent, and a trailing `f`/`F`: `Atom::Float`;
/// - with a fraction and/or exponent, and no suffix: `Atom::Double`;
/// - anything else: `Atom::Symbol`.
///
/// # Errors
///
/// Propagates I/O errors from the source, and returns a syntax error if the token is not
/// valid UTF-8 or its numeric text cannot be parsed.
fn read_raw_symbol_or_number(&mut self, mut bs: Vec<u8>) -> io::Result<Atom<'static>> {
    lazy_static! {
        static ref NUMBER_RE: regex::Regex = regex::Regex::new(
            r"^([-+]?\d+)(((\.\d+([eE][-+]?\d+)?)|([eE][-+]?\d+))([fF]?))?$").unwrap();
    }
    loop {
        let c = match self.peek()? {
            None => b' ',
            // Only ASCII bytes may terminate the token as whitespace. Casting a raw byte
            // to `char` reinterprets it as Latin-1, so the UTF-8 continuation bytes 0x85
            // and 0xA0 would otherwise look like NEL / NBSP and cut a multi-byte character
            // (e.g. "à" = C3 A0) in half, producing a spurious "Invalid UTF-8" error.
            Some(c) if c.is_ascii() && (c as char).is_whitespace() => b' ',
            Some(c) => c,
        };
        match c {
            b'(' | b')' | b'{' | b'}' | b'[' | b']' | b'<' | b'>' |
            b'"' | b';' | b',' | b'@' | b'#' | b':' | b'|' | b' ' => {
                let s = self.decode_utf8(bs)?;
                return match NUMBER_RE.captures(&s) {
                    None => Ok(Atom::Symbol(s.into())),
                    Some(m) => match m.get(2) {
                        // No fraction/exponent group: a plain integer.
                        None => Ok(Atom::SignedInteger(Cow::Owned(
                            s.parse::<BigInt>().map_err(
                                |_| self.syntax_error(&format!(
                                    "Invalid signed-integer number: {:?}", s)))?.into()))),
                        Some(_) => {
                            // Group 7 is the optional `f`/`F` suffix; it always participates
                            // (possibly empty) whenever group 2 matched.
                            if let Some(maybe_f) = m.get(7) {
                                // Integer part plus fraction/exponent, without the suffix.
                                let s = m[1].to_owned() + &m[3];
                                if maybe_f.range().is_empty() {
                                    Ok(Atom::Double(s.parse::<f64>().map_err(
                                        |_| self.syntax_error(&format!(
                                            "Invalid double-precision number: {:?}", s)))?))
                                } else {
                                    Ok(Atom::Float(s.parse::<f32>().map_err(
                                        |_| self.syntax_error(&format!(
                                            "Invalid single-precision number: {:?}", s)))?))
                                }
                            } else {
                                panic!("Internal error: cannot analyze number {:?}", s)
                            }
                        }
                    }
                }
            }
            c => {
                self.skip()?;
                bs.push(c)
            }
        }
    }
}
/// Skips separators, consumes the next token's leading byte(s) and classifies the token.
///
/// Atoms are read in full and returned inside `Classification::Atom`. For compounds,
/// embedded values and annotations only the opening marker is consumed. End of input, a
/// stray `:`, or an unmatched closing delimiter is reported as an error.
fn read_classification(&mut self) -> io::Result<Classification> {
    self.skip_whitespace();
    let lead = match self.peek()? {
        Some(byte) => byte,
        None => return Err(io_eof().into()),
    };
    self.skip()?;

    let classification = match lead {
        // Compound openers and annotation markers: nothing more to read here.
        b'<' => Classification::Compound(CompoundClass::Record),
        b'[' => Classification::Compound(CompoundClass::Sequence),
        b'{' => Classification::Compound(CompoundClass::Dictionary),
        b';' => Classification::CommentAnnotation,
        b'@' => Classification::OrdinaryAnnotation,

        // Quoted atoms.
        b'"' => {
            let text = self.read_string(b'"')?;
            Classification::Atom(Atom::String(Cow::Owned(text)))
        }
        b'|' => {
            let text = self.read_string(b'|')?;
            Classification::Atom(Atom::Symbol(Cow::Owned(text)))
        }

        // `#`-prefixed syntax.
        b'#' => match self.next_byte()? {
            b'f' => Classification::Atom(Atom::Boolean(false)),
            b't' => Classification::Atom(Atom::Boolean(true)),
            b'{' => Classification::Compound(CompoundClass::Set),
            b'"' => Classification::Atom(self.read_literal_binary()?),
            b'x' => match self.next_byte()? {
                b'"' => {
                    let bytes = self.read_hex_binary()?;
                    Classification::Atom(Atom::ByteString(Cow::Owned(bytes)))
                }
                b'f' => Classification::Atom(self.read_hex_float(4)?),
                b'd' => Classification::Atom(self.read_hex_float(8)?),
                _ => return Err(self.syntax_error("Invalid #x syntax")),
            },
            b'[' => Classification::Atom(self.read_base64_binary()?),
            b'!' => Classification::Embedded,
            other => {
                return Err(self.syntax_error(&format!("Invalid # syntax: {:?}", other)));
            }
        },

        // Bytes that may never start a value.
        b':' => {
            return Err(self.syntax_error("Unexpected key/value separator between items"));
        }
        b'>' => return Err(self.syntax_error("Unexpected >")),
        b']' => return Err(self.syntax_error("Unexpected ]")),
        b'}' => return Err(self.syntax_error("Unexpected }")),

        // Anything else starts a bare symbol or number.
        other => Classification::Atom(self.read_raw_symbol_or_number(vec![other])?),
    };
    Ok(classification)
}
}
impl<'de, S: BinarySource<'de>> Reader<'de> for TextReader<'de, S>
{
/// Reports the class of the next token without consuming it from the reader's point of view.
///
/// If nothing is cached, a token is classified and stored in the cache first. On success
/// the result is always `Some`; end of input surfaces as an error from `read_classification`.
fn peek_class(&mut self) -> io::Result<Option<NextToken>> {
    if self.classification_cache.is_none() {
        let fresh = self.read_classification()?;
        self.classification_cache = Some(fresh);
    }
    Ok(self.classification_cache.as_ref().map(|cached| cached.into()))
}
/// Skips any annotations, then consumes and returns the next value, which must be an atom.
///
/// A compound or embedded value in that position is a syntax error.
fn next_atom(&mut self) -> ReaderResult<Atom<'de>> {
    self.skip_annotations()?;
    // Prefer a token that was already classified by a previous peek.
    let classification = match self.classification_cache.take() {
        Some(cached) => cached,
        None => self.read_classification()?,
    };
    match classification {
        Classification::Atom(atom) => Ok(atom),
        Classification::Compound(_) => {
            Err(self.syntax_error("Unexpected compound value").into())
        }
        Classification::Embedded => {
            Err(self.syntax_error("Unexpected embedded value").into())
        }
        Classification::CommentAnnotation | Classification::OrdinaryAnnotation => {
            unreachable!("Annotations are supposed to have been skipped already")
        }
    }
}
fn open_record(&mut self) -> ReaderResult<()> {
self.skip_annotations()?;
if self.peek_class()? != Some(NextToken::Value(ValueClass::Compound(CompoundClass::Record))) {
return Err(Error::Expected(ExpectedKind::Record));
}
self.classification_cache = None;
Ok(())
}
fn open_sequence(&mut self) -> ReaderResult<()> {
self.skip_annotations()?;
if self.peek_class()? != Some(NextToken::Value(ValueClass::Compound(CompoundClass::Sequence))) {
return Err(Error::Expected(ExpectedKind::Sequence));
}
self.classification_cache = None;
Ok(())
}
fn open_set(&mut self) -> ReaderResult<()> {
self.skip_annotations()?;
if self.peek_class()? != Some(NextToken::Value(ValueClass::Compound(CompoundClass::Set))) {
return Err(Error::Expected(ExpectedKind::Set));
}
self.classification_cache = None;
Ok(())
}
fn open_dictionary(&mut self) -> ReaderResult<()> {
self.skip_annotations()?;
if self.peek_class()? != Some(NextToken::Value(ValueClass::Compound(CompoundClass::Dictionary))) {
return Err(Error::Expected(ExpectedKind::Dictionary));
}
self.classification_cache = None;
Ok(())
}
/// Consumes inter-item syntax required at boundary `b`.
///
/// Only the transition from a dictionary key to its value needs anything in the text
/// syntax: a `:` separator, optionally preceded by separator bytes. Every other boundary
/// is a no-op.
#[inline]
fn boundary(&mut self, b: &B::Type) -> ReaderResult<()> {
    if let B::Type {
        closing: Some(B::Item::DictionaryKey),
        opening: Some(B::Item::DictionaryValue),
    } = b
    {
        self.skip_whitespace();
        if self.next_byte()? != b':' {
            return Err(self.syntax_error("Missing expected key/value separator").into());
        }
    }
    Ok(())
}
/// Checks whether the current compound ends here.
///
/// After skipping separators, a `>`, `]` or `}` is consumed and `Ok(true)` is returned.
/// Otherwise another item follows: `b` is shifted to open item `i`, the resulting boundary
/// is processed, and `Ok(false)` is returned.
fn close_compound(&mut self, b: &mut B::Type, i: &B::Item) -> ReaderResult<bool> {
    self.skip_whitespace();
    let at_closer = matches!(self.peek_noeof()?, b'>' | b']' | b'}');
    if at_closer {
        self.skip()?;
        return Ok(true);
    }
    b.shift(Some(i.clone()));
    self.boundary(b)?;
    Ok(false)
}
fn open_embedded(&mut self) -> ReaderResult<()> {
self.skip_annotations()?;
if self.peek_class()? != Some(NextToken::Value(ValueClass::Embedded)) {
return Err(Error::Expected(ExpectedKind::Embedded));
}
self.classification_cache = None;
Ok(())
}
/// No-op: this reader consumes nothing when an embedded value ends.
fn close_embedded(&mut self) -> ReaderResult<()> {
Ok(())
}
/// Records the current source position so it can later be handed to `restore`.
///
/// # Panics
///
/// Panics if a classified token is sitting in the cache, since that token has already been
/// consumed from the source.
fn mark(&mut self) -> io::Result<usize> {
    assert!(
        self.classification_cache.is_none(),
        "Cannot mark with full classification_cache"
    );
    self.source.mark()
}
/// Rewinds the source to `mark`, discarding any cached classification first because it
/// would no longer correspond to the restored position.
fn restore(&mut self, mark: usize) -> io::Result<()> {
self.classification_cache = None;
self.source.restore(mark)
}
fn open_annotation(&mut self) -> ReaderResult<()> {
let _ = self.peek_class()?;
match self.classification_cache {
None => unreachable!("peek_class should have primed the cache"),
Some(Classification::CommentAnnotation) => {
self.classification_cache = Some(Classification::Atom(
Atom::String(Cow::Owned(self.comment_line()?))));
Ok(())
}
Some(Classification::OrdinaryAnnotation) => {
self.classification_cache = None;
Ok(())
}
Some(_) => Err(Error::Expected(ExpectedKind::Annotation))?,
}
}
/// No-op: this reader consumes nothing when an annotation ends.
fn close_annotation(&mut self) -> ReaderResult<()> {
Ok(())
}
}