// preserves/implementations/rust/preserves/src/text/reader.rs

use crate::Atom;
use crate::ValueClass;
use crate::error::Error;
use crate::error::ExpectedKind;
use crate::error::io_eof;

use crate::hex;

use crate::CompoundClass;
use crate::Reader;
use crate::boundary as B;
use crate::reader::NextToken;
use crate::reader::ReaderResult;
use crate::source::BinarySource;

use lazy_static::lazy_static;

use num_bigint::BigInt;

use std::borrow::Cow;
use std::io;
use std::marker::PhantomData;

/// What the reader found at the start of the next token.
///
/// Produced by `TextReader::read_classification` and held in
/// `TextReader::classification_cache` between a peek and the call that
/// consumes the token.
enum Classification {
    /// A complete atom; its text has already been consumed from the source.
    Atom(Atom<'static>),
    /// The opening delimiter of a compound of the given class has been consumed.
    Compound(CompoundClass),
    /// The `#!` marker introducing an embedded value has been consumed.
    Embedded,
    /// A `;` has been consumed; the comment text itself has not been read yet.
    CommentAnnotation,
    /// An `@` has been consumed; the annotation value follows.
    OrdinaryAnnotation,
}

/// Summarises a cached `Classification` as the coarser `NextToken` that is
/// reported to users of the `Reader` trait.
impl<'r> From<&'r Classification> for NextToken {
    fn from(classification: &'r Classification) -> Self {
        // Both annotation flavours collapse into a single token kind; all
        // remaining variants describe a value of some class.
        let value_class = match classification {
            Classification::CommentAnnotation | Classification::OrdinaryAnnotation => {
                return NextToken::Annotation;
            }
            Classification::Atom(atom) => ValueClass::Atomic(atom.into()),
            Classification::Compound(kind) => ValueClass::Compound(kind.clone()),
            Classification::Embedded => ValueClass::Embedded,
        };
        NextToken::Value(value_class)
    }
}

/// Reader for the textual syntax, pulling bytes from a `BinarySource`.
pub struct TextReader<'de, S: BinarySource<'de>> {
    /// The underlying byte source.
    pub source: S,
    /// Token classification read ahead by `peek_class` (or rewritten by
    /// `open_annotation`) and not yet consumed.
    classification_cache: Option<Classification>,
    /// Ties the otherwise-unused `'de` lifetime parameter to the struct.
    phantom: PhantomData<&'de ()>,
}

impl<'de, S: BinarySource<'de>> TextReader<'de, S>
{
    pub fn new(source: S) -> Self {
        TextReader {
            source,
            classification_cache: None,
            phantom: PhantomData,
        }
    }

    /// Builds an `io::Error` describing a syntax problem by delegating to the
    /// underlying source's `syntax_error`.
    fn syntax_error(&mut self, message: &str) -> io::Error {
        self.source.syntax_error(message)
    }

    /// Delegates to the source's `peek`. Callers in this file treat `Ok(None)`
    /// as end of input and pair a successful peek with `skip` to consume the byte.
    fn peek(&mut self) -> io::Result<Option<u8>> {
        self.source.peek()
    }

    /// Delegates to the source's `peek_noeof`.
    // NOTE(review): presumably reports end of input as an error rather than
    // `None` — confirm against `BinarySource`.
    #[inline(always)]
    fn peek_noeof(&mut self) -> io::Result<u8> {
        self.source.peek_noeof()
    }

    /// Delegates to the source's `skip`; used here after a peek to consume the
    /// byte that was just inspected.
    fn skip(&mut self) -> io::Result<()> {
        self.source.skip()
    }

    /// Reads one byte by delegating to the source's `read`.
    #[inline(always)]
    fn next_byte(&mut self) -> io::Result<u8> {
        self.source.read()
    }

    /// Consumes a run of space, tab, carriage return, newline and comma bytes.
    ///
    /// Errors from peeking or skipping are intentionally ignored: a failed
    /// peek simply ends the run, and the next real read surfaces the problem.
    fn skip_whitespace(&mut self) {
        while matches!(
            self.peek(),
            Ok(Some(b' ' | b'\t' | b'\r' | b'\n' | b','))
        ) {
            let _ = self.skip();
        }
    }

    /// Converts `bs` into a `String`, reporting a syntax error
    /// (`"Invalid UTF-8"`) when the bytes are not valid UTF-8.
    fn decode_utf8(&mut self, bs: Vec<u8>) -> io::Result<String> {
        match String::from_utf8(bs) {
            Ok(text) => Ok(text),
            Err(_) => Err(self.syntax_error("Invalid UTF-8")),
        }
    }

    /// Reads the remainder of a comment up to the first `\r` or `\n`.
    ///
    /// The terminating byte is consumed but not included in the returned text.
    /// Fails if `peek_noeof` or `skip` fails before a line terminator is seen,
    /// or if the collected bytes are not valid UTF-8.
    fn comment_line(&mut self) -> io::Result<String> {
        let mut line = Vec::new();
        loop {
            let byte = self.peek_noeof()?;
            self.skip()?;
            if matches!(byte, b'\r' | b'\n') {
                break;
            }
            line.push(byte);
        }
        self.decode_utf8(line)
    }

    /// Reads a hex-encoded floating-point literal: an opening `"` followed by
    /// hex digits up to the closing `"`.
    ///
    /// `bytecount` selects the width: 4 yields `Atom::Float`, 8 yields
    /// `Atom::Double`; the payload is interpreted as big-endian IEEE bits.
    /// A missing opening quote, a payload of the wrong length, or any other
    /// `bytecount` is a syntax error.
    fn read_hex_float(&mut self, bytecount: usize) -> io::Result<Atom<'static>> {
        let opening = self.next_byte()?;
        if opening != b'"' {
            return Err(self.syntax_error("Missing open-double-quote in hex-encoded floating-point number"));
        }

        let payload = self.read_hex_binary()?;
        if payload.len() != bytecount {
            return Err(self.syntax_error("Incorrect number of bytes in hex-encoded floating-point number"));
        }

        let atom = match bytecount {
            4 => {
                // Length was checked above, so the conversion cannot fail.
                let raw: [u8; 4] = payload.try_into().unwrap();
                Atom::Float(f32::from_bits(u32::from_be_bytes(raw)))
            }
            8 => {
                let raw: [u8; 8] = payload.try_into().unwrap();
                Atom::Double(f64::from_bits(u64::from_be_bytes(raw)))
            }
            _ => return Err(self.syntax_error("Unsupported byte count in hex-encoded floating-point number")),
        };
        Ok(atom)
    }

    /// Shared scanner for quoted, backslash-escaped literals.
    ///
    /// Reads bytes until an unescaped `terminator`, feeding each decoded byte
    /// to `xform_item` so it can be accumulated into `seed`, which is returned
    /// once the terminator is seen.
    ///
    /// Escapes handled after a backslash, in this order of precedence:
    /// * `hexescape` — control passes to `hexescaper`, which reads the rest of
    ///   the escape itself;
    /// * the terminator, `\` or `/` — passed through literally;
    /// * `b`, `f`, `n`, `r`, `t` — the corresponding control byte;
    /// * anything else — a syntax error (`"Invalid escape code"`).
    fn read_stringlike<X, H, R>(
        &mut self,
        mut seed: R,
        xform_item: X,
        terminator: u8,
        hexescape: u8,
        hexescaper: H,
    ) -> io::Result<R>
    where
        X: Fn(&mut Self, &mut R, u8) -> io::Result<()>,
        H: Fn(&mut Self, &mut R) -> io::Result<()>,
    {
        loop {
            let byte = self.next_byte()?;
            if byte == terminator {
                return Ok(seed);
            }
            if byte != b'\\' {
                xform_item(self, &mut seed, byte)?;
                continue;
            }

            let escaped = self.next_byte()?;
            if escaped == hexescape {
                hexescaper(self, &mut seed)?;
                continue;
            }

            let decoded = if escaped == terminator || escaped == b'\\' || escaped == b'/' {
                escaped
            } else {
                match escaped {
                    b'b' => b'\x08',
                    b'f' => b'\x0c',
                    b'n' => b'\x0a',
                    b'r' => b'\x0d',
                    b't' => b'\x09',
                    _ => return Err(self.syntax_error("Invalid escape code")),
                }
            };
            xform_item(self, &mut seed, decoded)?;
        }
    }

    /// Reads exactly `count` hexadecimal digits and returns their value,
    /// most significant digit first. A non-hex byte is a syntax error
    /// (`"Bad hex escape"`).
    fn hexnum(&mut self, count: usize) -> io::Result<u32> {
        let mut value = 0u32;
        for _ in 0..count {
            let byte = self.next_byte()?;
            let digit = char::from(byte)
                .to_digit(16)
                .ok_or_else(|| self.syntax_error("Bad hex escape"))?;
            value = (value << 4) | digit;
        }
        Ok(value)
    }

    /// Appends the UTF-8 encoding of code point `n` to `bs`.
    ///
    /// Values that are not valid `char`s (as judged by `char::from_u32`)
    /// produce a syntax error (`"Bad code point"`).
    fn append_codepoint(&mut self, bs: &mut Vec<u8>, n: u32) -> io::Result<()> {
        match char::from_u32(n) {
            Some(ch) => {
                let mut scratch = [0u8; 4];
                bs.extend_from_slice(ch.encode_utf8(&mut scratch).as_bytes());
                Ok(())
            }
            None => Err(self.syntax_error("Bad code point")),
        }
    }

    /// Reads the body of a text literal terminated by `delimiter` and returns
    /// it as a `String`.
    ///
    /// `\uXXXX` escapes are decoded; a high surrogate (`D800..=DBFF`) must be
    /// followed immediately by a second `\uXXXX` escape holding a low
    /// surrogate (`DC00..=DFFF`), and the pair is combined into one code point.
    fn read_string(&mut self, delimiter: u8) -> io::Result<String> {
        let raw = self.read_stringlike(
            Vec::new(),
            |_reader, out, byte| {
                out.push(byte);
                Ok(())
            },
            delimiter,
            b'u',
            |reader, out| {
                let first = reader.hexnum(4)?;
                if !(0xd800..=0xdbff).contains(&first) {
                    return reader.append_codepoint(out, first);
                }

                // A high surrogate must be followed directly by `\u`; the second
                // byte is only read when the first one was a backslash.
                if reader.next_byte()? != b'\\' || reader.next_byte()? != b'u' {
                    return Err(reader.syntax_error("Missing second half of surrogate pair"));
                }

                let second = reader.hexnum(4)?;
                if !(0xdc00..=0xdfff).contains(&second) {
                    return Err(reader.syntax_error("Bad second half of surrogate pair"));
                }

                let combined = ((first - 0xd800) << 10) + (second - 0xdc00) + 0x10000;
                reader.append_codepoint(out, combined)
            },
        )?;
        self.decode_utf8(raw)
    }

    /// Reads the body of a `#"..."` byte-string literal up to the closing `"`.
    ///
    /// Bytes are taken verbatim; `\xHH` escapes contribute the single byte
    /// given by the two hex digits.
    fn read_literal_binary(&mut self) -> io::Result<Atom<'static>> {
        let bytes = self.read_stringlike(
            Vec::new(),
            |_reader, acc, byte| {
                acc.push(byte);
                Ok(())
            },
            b'"',
            b'x',
            |reader, acc| {
                let value = reader.hexnum(2)?;
                acc.push(value as u8);
                Ok(())
            },
        )?;
        Ok(Atom::ByteString(Cow::Owned(bytes)))
    }

    /// Reads pairs of hex digits up to a closing `"` and returns the decoded
    /// bytes.
    ///
    /// Whitespace (as defined by `skip_whitespace`) is allowed before each
    /// pair and before the closing quote, but not between the two digits of a
    /// pair. Anything other than a hex-digit pair is a syntax error
    /// (`"Invalid hex binary"`).
    fn read_hex_binary(&mut self) -> io::Result<Vec<u8>> {
        let mut digits = String::new();
        loop {
            self.skip_whitespace();
            let high = char::from(self.next_byte()?);
            if high == '"' {
                break;
            }
            let low = char::from(self.next_byte()?);
            if !high.is_ascii_hexdigit() || !low.is_ascii_hexdigit() {
                return Err(self.syntax_error("Invalid hex binary"));
            }
            digits.push(high);
            digits.push(low);
        }
        // Every character was validated above, so decoding is not expected to fail.
        Ok(hex::HexParser::Strict.decode(&digits).unwrap())
    }

    /// Reads a base64 byte-string body up to the closing `]`.
    ///
    /// Whitespace is skipped, `=` padding is dropped, and the URL-safe
    /// characters `-` and `_` are mapped to `+` and `/` before the collected
    /// text is decoded with the unpadded standard alphabet.
    fn read_base64_binary(&mut self) -> io::Result<Atom<'static>> {
        let mut encoded = Vec::new();
        loop {
            self.skip_whitespace();
            match self.next_byte()? {
                b']' => {
                    let text = self.decode_utf8(encoded)?;
                    let decoded = base64::decode_config(&text, base64::STANDARD_NO_PAD)
                        .map_err(|_| self.syntax_error("Invalid base64 character"))?;
                    return Ok(Atom::ByteString(Cow::Owned(decoded)));
                }
                b'=' => {}
                b'-' => encoded.push(b'+'),
                b'_' => encoded.push(b'/'),
                other => encoded.push(other),
            }
        }
    }

    /// Reads the rest of a bare token and classifies it as a number or a symbol.
    ///
    /// `bs` holds the bytes of the token consumed so far. Bytes are appended
    /// until end of input, ASCII whitespace, or one of the delimiter bytes
    /// `( ) { } [ ] < > " ; , @ # : |` is peeked (the delimiter itself is left
    /// unconsumed). The accumulated text is then matched against `NUMBER_RE`:
    ///
    /// * no match — `Atom::Symbol`;
    /// * integer part only — `Atom::SignedInteger`;
    /// * fraction and/or exponent — `Atom::Double`, or `Atom::Float` when the
    ///   token ends in `f`/`F`.
    ///
    /// Returns a syntax error if the token is not valid UTF-8 or if a token
    /// that matched the number pattern fails to parse.
    fn read_raw_symbol_or_number(&mut self, mut bs: Vec<u8>) -> io::Result<Atom<'static>> {
        // NOTE(review): `\d` in the `regex` crate is Unicode-aware by default, so
        // non-ASCII digits match here and then fail numeric parsing — confirm
        // whether `[0-9]` is intended.
        lazy_static! {
            static ref NUMBER_RE: regex::Regex = regex::Regex::new(
                r"^([-+]?\d+)(((\.\d+([eE][-+]?\d+)?)|([eE][-+]?\d+))([fF]?))?$").unwrap();
        }
        loop {
            let c = match self.peek()? {
                None => b' ',
                // Only ASCII bytes may end the token as whitespace. Casting a raw
                // byte to `char` would otherwise make the UTF-8 continuation bytes
                // 0x85 and 0xA0 look like U+0085 / U+00A0 (both whitespace) and cut
                // a multi-byte character such as `à` (C3 A0) in half.
                Some(c) if c.is_ascii() && (c as char).is_whitespace() => b' ',
                Some(c) => c
            };
            match c {
                b'(' | b')' | b'{' | b'}' | b'[' | b']' | b'<' | b'>' |
                b'"' | b';' | b',' | b'@' | b'#' | b':' | b'|' | b' ' => {
                    let s = self.decode_utf8(bs)?;
                    return match NUMBER_RE.captures(&s) {
                        None => Ok(Atom::Symbol(s.into())),
                        // Group 2 is the optional fraction/exponent/suffix part.
                        Some(m) => match m.get(2) {
                            None => Ok(Atom::SignedInteger(Cow::Owned(
                                s.parse::<BigInt>().map_err(
                                    |_| self.syntax_error(&format!(
                                        "Invalid signed-integer number: {:?}", s)))?.into()))),
                            Some(_) => {
                                // Group 7 is the (possibly empty) `f`/`F` suffix.
                                if let Some(maybe_f) = m.get(7) {
                                    // Integer part plus fraction/exponent, without the suffix.
                                    let s = m[1].to_owned() + &m[3];
                                    if maybe_f.range().is_empty() {
                                        Ok(Atom::Double(s.parse::<f64>().map_err(
                                            |_| self.syntax_error(&format!(
                                                "Invalid double-precision number: {:?}", s)))?))
                                    } else {
                                        Ok(Atom::Float(s.parse::<f32>().map_err(
                                            |_| self.syntax_error(&format!(
                                                "Invalid single-precision number: {:?}", s)))?))
                                    }
                                } else {
                                    panic!("Internal error: cannot analyze number {:?}", s)
                                }
                            }
                        }
                    }
                }
                c => {
                    self.skip()?;
                    bs.push(c)
                }
            }
        }
    }

    /// Skips whitespace, consumes the start of the next token and classifies it.
    ///
    /// Atoms are read in full. For compounds, embedded values and annotations
    /// only the introducing delimiter is consumed. End of input yields the
    /// error produced by `io_eof`; a stray `:`, `>`, `]` or `}` and unknown
    /// `#`/`#x` forms are syntax errors.
    fn read_classification(&mut self) -> io::Result<Classification> {
        self.skip_whitespace();
        let lead = self.peek()?.ok_or_else(|| io_eof())?;
        self.skip()?;

        let atom = match lead {
            // Markers that classify without reading any further.
            b';' => return Ok(Classification::CommentAnnotation),
            b'@' => return Ok(Classification::OrdinaryAnnotation),
            b'<' => return Ok(Classification::Compound(CompoundClass::Record)),
            b'[' => return Ok(Classification::Compound(CompoundClass::Sequence)),
            b'{' => return Ok(Classification::Compound(CompoundClass::Dictionary)),

            // Punctuation that cannot start a value.
            b':' => return Err(self.syntax_error("Unexpected key/value separator between items")),
            b'>' => return Err(self.syntax_error("Unexpected >")),
            b']' => return Err(self.syntax_error("Unexpected ]")),
            b'}' => return Err(self.syntax_error("Unexpected }")),

            // Quoted atoms.
            b'"' => Atom::String(Cow::Owned(self.read_string(b'"')?)),
            b'|' => Atom::Symbol(Cow::Owned(self.read_string(b'|')?)),

            // `#`-prefixed forms.
            b'#' => match self.next_byte()? {
                b'f' => Atom::Boolean(false),
                b't' => Atom::Boolean(true),
                b'{' => return Ok(Classification::Compound(CompoundClass::Set)),
                b'!' => return Ok(Classification::Embedded),
                b'"' => self.read_literal_binary()?,
                b'[' => self.read_base64_binary()?,
                b'x' => match self.next_byte()? {
                    b'"' => Atom::ByteString(self.read_hex_binary()?.into()),
                    b'f' => self.read_hex_float(4)?,
                    b'd' => self.read_hex_float(8)?,
                    _ => return Err(self.syntax_error("Invalid #x syntax")),
                },
                other => {
                    return Err(self.syntax_error(&format!("Invalid # syntax: {:?}", other)));
                }
            },

            // Anything else starts a bare symbol or number.
            other => self.read_raw_symbol_or_number(vec![other])?,
        };
        Ok(Classification::Atom(atom))
    }
}

impl<'de, S: BinarySource<'de>> Reader<'de> for TextReader<'de, S>
{
    /// Reports the kind of the next token, reading and caching its
    /// classification if none is cached yet.
    ///
    /// As written this always returns `Ok(Some(_))` on success; end of input
    /// surfaces as an error from `read_classification`.
    fn peek_class(&mut self) -> io::Result<Option<NextToken>> {
        if self.classification_cache.is_none() {
            let fresh = self.read_classification()?;
            self.classification_cache = Some(fresh);
        }
        Ok(self.classification_cache.as_ref().map(|cached| cached.into()))
    }

    fn next_atom(&mut self) -> ReaderResult<Atom<'de>> {
        self.skip_annotations()?;

        let a = self.classification_cache.take().map_or_else(
            || self.read_classification(),
            |c| Ok(c))?;

        match a {
            Classification::Atom(a) => Ok(a),
            Classification::Compound(_) => Err(self.syntax_error("Unexpected compound value"))?,
            Classification::Embedded => Err(self.syntax_error("Unexpected embedded value"))?,
            Classification::CommentAnnotation | Classification::OrdinaryAnnotation =>
                unreachable!("Annotations are supposed to have been skipped already"),
        }
    }

    fn open_record(&mut self) -> ReaderResult<()> {
        self.skip_annotations()?;
        if self.peek_class()? != Some(NextToken::Value(ValueClass::Compound(CompoundClass::Record))) {
            return Err(Error::Expected(ExpectedKind::Record));
        }
        self.classification_cache = None;
        Ok(())
    }

    fn open_sequence(&mut self) -> ReaderResult<()> {
        self.skip_annotations()?;
        if self.peek_class()? != Some(NextToken::Value(ValueClass::Compound(CompoundClass::Sequence))) {
            return Err(Error::Expected(ExpectedKind::Sequence));
        }
        self.classification_cache = None;
        Ok(())
    }

    fn open_set(&mut self) -> ReaderResult<()> {
        self.skip_annotations()?;
        if self.peek_class()? != Some(NextToken::Value(ValueClass::Compound(CompoundClass::Set))) {
            return Err(Error::Expected(ExpectedKind::Set));
        }
        self.classification_cache = None;
        Ok(())
    }

    fn open_dictionary(&mut self) -> ReaderResult<()> {
        self.skip_annotations()?;
        if self.peek_class()? != Some(NextToken::Value(ValueClass::Compound(CompoundClass::Dictionary))) {
            return Err(Error::Expected(ExpectedKind::Dictionary));
        }
        self.classification_cache = None;
        Ok(())
    }

    #[inline]
    fn boundary(&mut self, b: &B::Type) -> ReaderResult<()> {
        match b {
            B::Type {
                closing: Some(B::Item::DictionaryKey),
                opening: Some(B::Item::DictionaryValue),
            } => {
                self.skip_whitespace();
                if self.next_byte()? != b':' {
                    Err(self.syntax_error("Missing expected key/value separator"))?;
                }
            },
            _ => (),
        }
        Ok(())
    }

    /// Checks whether the current compound ends here.
    ///
    /// After skipping whitespace, a `>`, `]` or `}` is consumed and `Ok(true)`
    /// is returned. Any of the three is accepted; this method does not check
    /// that the closer matches the opener. Otherwise `b` is shifted to open
    /// item `i`, the resulting boundary is processed, and `Ok(false)` is
    /// returned.
    fn close_compound(&mut self, b: &mut B::Type, i: &B::Item) -> ReaderResult<bool> {
        self.skip_whitespace();
        let upcoming = self.peek_noeof()?;
        if matches!(upcoming, b'>' | b']' | b'}') {
            self.skip()?;
            return Ok(true);
        }

        b.shift(Some(i.clone()));
        self.boundary(b)?;
        Ok(false)
    }

    fn open_embedded(&mut self) -> ReaderResult<()> {
        self.skip_annotations()?;
        if self.peek_class()? != Some(NextToken::Value(ValueClass::Embedded)) {
            return Err(Error::Expected(ExpectedKind::Embedded));
        }
        self.classification_cache = None;
        Ok(())
    }

    /// No-op: this reader consumes nothing when an embedded value ends.
    fn close_embedded(&mut self) -> ReaderResult<()> {
        Ok(())
    }

    /// Records the current position by delegating to the source's `mark`.
    ///
    /// # Panics
    ///
    /// Panics if a classification is currently cached.
    fn mark(&mut self) -> io::Result<usize> {
        assert!(
            self.classification_cache.is_none(),
            "Cannot mark with full classification_cache"
        );
        self.source.mark()
    }

    /// Discards any cached classification, then rewinds the source to `mark`
    /// by delegating to the source's `restore`.
    fn restore(&mut self, mark: usize) -> io::Result<()> {
        // Whatever was read ahead is no longer valid once the source rewinds.
        drop(self.classification_cache.take());
        self.source.restore(mark)
    }

    fn open_annotation(&mut self) -> ReaderResult<()> {
        let _ = self.peek_class()?;
        match self.classification_cache {
            None => unreachable!("peek_class should have primed the cache"),
            Some(Classification::CommentAnnotation) => {
                self.classification_cache = Some(Classification::Atom(
                    Atom::String(Cow::Owned(self.comment_line()?))));
                Ok(())
            }
            Some(Classification::OrdinaryAnnotation) => {
                self.classification_cache = None;
                Ok(())
            }
            Some(_) => Err(Error::Expected(ExpectedKind::Annotation))?,
        }
    }

    /// No-op: this reader consumes nothing when an annotation ends.
    fn close_annotation(&mut self) -> ReaderResult<()> {
        Ok(())
    }
}