use crate::Atom; use crate::ValueClass; use crate::error::Error; use crate::error::ExpectedKind; use crate::error::io_eof; use crate::hex; use crate::CompoundClass; use crate::Reader; use crate::boundary as B; use crate::reader::NextToken; use crate::reader::ReaderResult; use crate::source::BinarySource; use lazy_static::lazy_static; use num_bigint::BigInt; use std::borrow::Cow; use std::io; use std::marker::PhantomData; enum Classification { Atom(Atom<'static>), Compound(CompoundClass), Embedded, CommentAnnotation, OrdinaryAnnotation, } impl<'r> From<&'r Classification> for NextToken { fn from(c: &'r Classification) -> Self { match c { Classification::Atom(a) => NextToken::Value(ValueClass::Atomic(a.into())), Classification::Compound(c) => NextToken::Value(ValueClass::Compound(c.clone())), Classification::Embedded => NextToken::Value(ValueClass::Embedded), Classification::CommentAnnotation | Classification::OrdinaryAnnotation => NextToken::Annotation, } } } pub struct TextReader<'de, S: BinarySource<'de>> { pub source: S, classification_cache: Option, phantom: PhantomData<&'de ()>, } impl<'de, S: BinarySource<'de>> TextReader<'de, S> { pub fn new(source: S) -> Self { TextReader { source, classification_cache: None, phantom: PhantomData, } } fn syntax_error(&mut self, message: &str) -> io::Error { self.source.syntax_error(message) } fn peek(&mut self) -> io::Result> { self.source.peek() } #[inline(always)] fn peek_noeof(&mut self) -> io::Result { self.source.peek_noeof() } fn skip(&mut self) -> io::Result<()> { self.source.skip() } #[inline(always)] fn next_byte(&mut self) -> io::Result { self.source.read() } fn skip_whitespace(&mut self) { // Deliberately swallows errors. while let Ok(Some(c)) = self.peek() { match c { b' ' | b'\t' | b'\r' | b'\n' | b',' => { let _ = self.skip(); () } _ => break, } } } fn decode_utf8(&mut self, bs: Vec) -> io::Result { String::from_utf8(bs).map_err(|_| self.syntax_error("Invalid UTF-8")) } fn comment_line(&mut self) -> io::Result { let mut bs = Vec::new(); loop { let b = self.peek_noeof()?; self.skip()?; match b { b'\r' | b'\n' => return Ok(self.decode_utf8(bs)?), _ => bs.push(b), } } } fn read_hex_float(&mut self, bytecount: usize) -> io::Result> { if self.next_byte()? != b'"' { return Err(self.syntax_error("Missing open-double-quote in hex-encoded floating-point number")); } let bs = self.read_hex_binary()?; if bs.len() != bytecount { return Err(self.syntax_error("Incorrect number of bytes in hex-encoded floating-point number")); } match bytecount { 4 => Ok(Atom::Float(f32::from_bits(u32::from_be_bytes(bs.try_into().unwrap())))), 8 => Ok(Atom::Double(f64::from_bits(u64::from_be_bytes(bs.try_into().unwrap())))), _ => Err(self.syntax_error("Unsupported byte count in hex-encoded floating-point number")), } } fn read_stringlike( &mut self, mut seed: R, xform_item: X, terminator: u8, hexescape: u8, hexescaper: H, ) -> io::Result where X: Fn(&mut Self, &mut R, u8) -> io::Result<()>, H: Fn(&mut Self, &mut R) -> io::Result<()>, { loop { match self.next_byte()? { c if c == terminator => return Ok(seed), b'\\' => match self.next_byte()? { c if c == hexescape => hexescaper(self, &mut seed)?, c if c == terminator || c == b'\\' || c == b'/' => xform_item(self, &mut seed, c)?, b'b' => xform_item(self, &mut seed, b'\x08')?, b'f' => xform_item(self, &mut seed, b'\x0c')?, b'n' => xform_item(self, &mut seed, b'\x0a')?, b'r' => xform_item(self, &mut seed, b'\x0d')?, b't' => xform_item(self, &mut seed, b'\x09')?, _ => return Err(self.syntax_error("Invalid escape code")), }, c => xform_item(self, &mut seed, c)?, } } } fn hexnum(&mut self, count: usize) -> io::Result { let mut v: u32 = 0; for _ in 0 .. count { let c = self.next_byte()?; match (c as char).to_digit(16) { Some(d) => v = v << 4 | d, None => return Err(self.syntax_error("Bad hex escape")), } } Ok(v) } fn append_codepoint(&mut self, bs: &mut Vec, n: u32) -> io::Result<()> { let c = char::from_u32(n).ok_or_else(|| self.syntax_error("Bad code point"))?; let mut buf = [0; 4]; let _ = c.encode_utf8(&mut buf); bs.extend(&buf[0 .. c.len_utf8()]); Ok(()) } fn read_string(&mut self, delimiter: u8) -> io::Result { let raw = self.read_stringlike( Vec::new(), |_r, bs, c| Ok(bs.push(c)), delimiter, b'u', |r, bs| { let n1 = r.hexnum(4)?; if (0xd800 ..= 0xdbff).contains(&n1) { let mut ok = true; ok = ok && r.next_byte()? == b'\\'; ok = ok && r.next_byte()? == b'u'; if !ok { Err(r.syntax_error("Missing second half of surrogate pair")) } else { let n2 = r.hexnum(4)?; if (0xdc00 ..= 0xdfff).contains(&n2) { let n = ((n1 - 0xd800) << 10) + (n2 - 0xdc00) + 0x10000; r.append_codepoint(bs, n) } else { Err(r.syntax_error("Bad second half of surrogate pair")) } } } else { r.append_codepoint(bs, n1) } })?; self.decode_utf8(raw) } fn read_literal_binary(&mut self) -> io::Result> { Ok(Atom::ByteString(Cow::Owned(self.read_stringlike( Vec::new(), |_r, bs, b| Ok(bs.push(b)), b'"', b'x', |r, bs| Ok(bs.push(r.hexnum(2)? as u8)))?))) } fn read_hex_binary(&mut self) -> io::Result> { let mut s = String::new(); loop { self.skip_whitespace(); let c1 = self.next_byte()? as char; if c1 == '"' { return Ok(hex::HexParser::Strict.decode(&s).unwrap()); } let c2 = self.next_byte()? as char; if !(c1.is_digit(16) && c2.is_digit(16)) { return Err(self.syntax_error("Invalid hex binary")); } s.push(c1); s.push(c2); } } fn read_base64_binary(&mut self) -> io::Result> { let mut bs = Vec::new(); loop { self.skip_whitespace(); let mut c = self.next_byte()?; if c == b']' { let bs = base64::decode_config(&self.decode_utf8(bs)?, base64::STANDARD_NO_PAD) .map_err(|_| self.syntax_error("Invalid base64 character"))?; return Ok(Atom::ByteString(Cow::Owned(bs))); } if c == b'-' { c = b'+'; } if c == b'_' { c = b'/'; } if c == b'=' { continue; } bs.push(c); } } fn read_raw_symbol_or_number(&mut self, mut bs: Vec) -> io::Result> { lazy_static! { static ref NUMBER_RE: regex::Regex = regex::Regex::new( r"^([-+]?\d+)(((\.\d+([eE][-+]?\d+)?)|([eE][-+]?\d+))([fF]?))?$").unwrap(); } loop { let c = match self.peek()? { None => b' ', Some(c) if (c as char).is_whitespace() => b' ', Some(c) => c }; match c { b'(' | b')' | b'{' | b'}' | b'[' | b']' | b'<' | b'>' | b'"' | b';' | b',' | b'@' | b'#' | b':' | b'|' | b' ' => { let s = self.decode_utf8(bs)?; return match NUMBER_RE.captures(&s) { None => Ok(Atom::Symbol(s.into())), Some(m) => match m.get(2) { None => Ok(Atom::SignedInteger(s.parse::().map_err( |_| self.syntax_error(&format!( "Invalid signed-integer number: {:?}", s)))?.into())), Some(_) => { if let Some(maybe_f) = m.get(7) { let s = m[1].to_owned() + &m[3]; if maybe_f.range().is_empty() { Ok(Atom::Double(s.parse::().map_err( |_| self.syntax_error(&format!( "Invalid double-precision number: {:?}", s)))?)) } else { Ok(Atom::Float(s.parse::().map_err( |_| self.syntax_error(&format!( "Invalid single-precision number: {:?}", s)))?)) } } else { panic!("Internal error: cannot analyze number {:?}", s) } } } } } c => { self.skip()?; bs.push(c) } } } } fn read_classification(&mut self) -> io::Result { self.skip_whitespace(); let c = match self.peek()? { None => Err(io_eof())?, Some(c) => c, }; self.skip()?; Ok(match c { b'"' => Classification::Atom(Atom::String(Cow::Owned(self.read_string(b'"')?))), b'|' => Classification::Atom(Atom::Symbol(Cow::Owned(self.read_string(b'|')?))), b':' => Err(self.syntax_error("Unexpected key/value separator between items"))?, b';' => Classification::CommentAnnotation, b'@' => Classification::OrdinaryAnnotation, b'#' => { match self.next_byte()? { b'f' => Classification::Atom(Atom::Boolean(false)), b't' => Classification::Atom(Atom::Boolean(true)), b'{' => Classification::Compound(CompoundClass::Set), b'"' => Classification::Atom(self.read_literal_binary()?), b'x' => match self.next_byte()? { b'"' => Classification::Atom(Atom::ByteString(self.read_hex_binary()?.into())), b'f' => Classification::Atom(self.read_hex_float(4)?), b'd' => Classification::Atom(self.read_hex_float(8)?), _ => Err(self.syntax_error("Invalid #x syntax"))?, }, b'[' => Classification::Atom(self.read_base64_binary()?), b'!' => Classification::Embedded, other => Err(self.syntax_error(&format!("Invalid # syntax: {:?}", other)))?, } } b'<' => Classification::Compound(CompoundClass::Record), b'[' => Classification::Compound(CompoundClass::Sequence), b'{' => Classification::Compound(CompoundClass::Dictionary), b'>' => Err(self.syntax_error("Unexpected >"))?, b']' => Err(self.syntax_error("Unexpected ]"))?, b'}' => Err(self.syntax_error("Unexpected }"))?, other => Classification::Atom(self.read_raw_symbol_or_number(vec![other])?), }) } } impl<'de, S: BinarySource<'de>> Reader<'de> for TextReader<'de, S> { fn peek_class(&mut self) -> io::Result> { if let Some(a) = &self.classification_cache { Ok(Some(a.into())) } else { let a = self.read_classification()?; let result = (&a).into(); self.classification_cache = Some(a); Ok(Some(result)) } } fn next_atom(&mut self) -> ReaderResult> { self.skip_annotations()?; let a = self.classification_cache.take().map_or_else( || self.read_classification(), |c| Ok(c))?; match a { Classification::Atom(a) => Ok(a), Classification::Compound(_) => Err(self.syntax_error("Unexpected compound value"))?, Classification::Embedded => Err(self.syntax_error("Unexpected embedded value"))?, Classification::CommentAnnotation | Classification::OrdinaryAnnotation => unreachable!("Annotations are supposed to have been skipped already"), } } fn open_record(&mut self) -> ReaderResult<()> { self.skip_annotations()?; if self.peek_class()? != Some(NextToken::Value(ValueClass::Compound(CompoundClass::Record))) { return Err(Error::Expected(ExpectedKind::Record)); } self.classification_cache = None; Ok(()) } fn open_sequence(&mut self) -> ReaderResult<()> { self.skip_annotations()?; if self.peek_class()? != Some(NextToken::Value(ValueClass::Compound(CompoundClass::Sequence))) { return Err(Error::Expected(ExpectedKind::Sequence)); } self.classification_cache = None; Ok(()) } fn open_set(&mut self) -> ReaderResult<()> { self.skip_annotations()?; if self.peek_class()? != Some(NextToken::Value(ValueClass::Compound(CompoundClass::Set))) { return Err(Error::Expected(ExpectedKind::Set)); } self.classification_cache = None; Ok(()) } fn open_dictionary(&mut self) -> ReaderResult<()> { self.skip_annotations()?; if self.peek_class()? != Some(NextToken::Value(ValueClass::Compound(CompoundClass::Dictionary))) { return Err(Error::Expected(ExpectedKind::Dictionary)); } self.classification_cache = None; Ok(()) } #[inline] fn boundary(&mut self, b: &B::Type) -> ReaderResult<()> { match b { B::Type { closing: Some(B::Item::DictionaryKey), opening: Some(B::Item::DictionaryValue), } => { self.skip_whitespace(); if self.next_byte()? != b':' { Err(self.syntax_error("Missing expected key/value separator"))?; } }, _ => (), } Ok(()) } fn close_compound(&mut self, b: &mut B::Type, i: &B::Item) -> ReaderResult { self.skip_whitespace(); match self.peek_noeof()? { b'>' | b']' | b'}' => { self.skip()?; Ok(true) } _ => { b.shift(Some(i.clone())); self.boundary(b)?; Ok(false) } } } fn open_embedded(&mut self) -> ReaderResult<()> { self.skip_annotations()?; if self.peek_class()? != Some(NextToken::Value(ValueClass::Embedded)) { return Err(Error::Expected(ExpectedKind::Embedded)); } self.classification_cache = None; Ok(()) } fn close_embedded(&mut self) -> ReaderResult<()> { Ok(()) } fn mark(&mut self) -> io::Result { if self.classification_cache.is_some() { panic!("Cannot mark with full classification_cache"); } self.source.mark() } fn restore(&mut self, mark: usize) -> io::Result<()> { self.classification_cache = None; self.source.restore(mark) } fn open_annotation(&mut self) -> ReaderResult<()> { let _ = self.peek_class()?; match self.classification_cache { None => unreachable!("peek_class should have primed the cache"), Some(Classification::CommentAnnotation) => { self.classification_cache = Some(Classification::Atom( Atom::String(Cow::Owned(self.comment_line()?)))); Ok(()) } Some(Classification::OrdinaryAnnotation) => { self.classification_cache = None; Ok(()) } Some(_) => Err(Error::Expected(ExpectedKind::Annotation))?, } } fn close_annotation(&mut self) -> ReaderResult<()> { Ok(()) } }