use std::fmt::Display; use std::ops::Range; use std::sync::Arc; use itertools::Itertools; use ordered_float::NotNan; use super::LexerPlugin; use super::context::Context; use super::errors::{FloatPlacehPrio, NoCommentEnd}; use super::numeric::{parse_num, print_nat16, numstart}; use crate::ast::{PHClass, Placeholder}; use crate::error::{ProjectResult, ProjectError}; use crate::foreign::Atom; use crate::interner::Tok; use crate::parse::numeric::{numchar, lex_numeric}; use crate::parse::string::lex_string; use crate::systems::stl::Numeric; use crate::utils::pure_seq::next; use crate::utils::unwrap_or; use crate::{Location, VName}; #[derive(Clone, Debug)] pub struct Entry { pub lexeme: Lexeme, pub location: Location, } impl Entry { /// Checks if the lexeme is a comment or line break #[must_use] pub fn is_filler(&self) -> bool { matches!(self.lexeme, Lexeme::Comment(_) | Lexeme::BR) } #[must_use] pub fn is_keyword(&self) -> bool { false // matches!( // self.lexeme, // Lexeme::Const // | Lexeme::Export // | Lexeme::Import // | Lexeme::Macro // | Lexeme::Module // ) } #[must_use] pub fn location(&self) -> Location { self.location.clone() } #[must_use] pub fn range(&self) -> Range { self.location.range().expect("An Entry can only have a known location") } #[must_use] pub fn file(&self) -> Arc { self.location.file().expect("An Entry can only have a range location") } fn new(location: Location, lexeme: Lexeme) -> Self { Self { lexeme, location } } } impl Display for Entry { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.lexeme.fmt(f) } } impl AsRef for Entry { fn as_ref(&self) -> &Location { &self.location } } #[derive(Clone, Debug)] pub enum Lexeme { Atom(Atom), Name(Tok), Arrow(NotNan), /// Walrus operator (formerly shorthand macro) Walrus, /// Line break BR, /// Namespace separator NS, /// Left paren LP(char), /// Right paren RP(char), /// Backslash BS, At, // Dot, Type, // type operator Comment(Arc), // Export, // Import, // Module, // Macro, // Const, Placeh(Placeholder), } impl Display for Lexeme { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Atom(a) => write!(f, "{a:?}"), Self::Name(token) => write!(f, "{}", **token), Self::Walrus => write!(f, ":="), Self::Arrow(prio) => write!(f, "={}=>", print_nat16(*prio)), Self::NS => write!(f, "::"), Self::LP(l) => write!(f, "{}", l), Self::RP(l) => match l { '(' => write!(f, ")"), '[' => write!(f, "]"), '{' => write!(f, "}}"), _ => f.debug_tuple("RP").field(l).finish(), }, Self::BR => writeln!(f), Self::BS => write!(f, "\\"), Self::At => write!(f, "@"), Self::Type => write!(f, ":"), Self::Comment(text) => write!(f, "--[{}]--", text), // Self::Export => write!(f, "export"), // Self::Import => write!(f, "import"), // Self::Module => write!(f, "module"), // Self::Const => write!(f, "const"), // Self::Macro => write!(f, "macro"), Self::Placeh(Placeholder { name, class }) => match *class { PHClass::Scalar => write!(f, "${}", **name), PHClass::Vec { nonzero, prio } => { if nonzero { write!(f, "...") } else { write!(f, "..") }?; write!(f, "${}", **name)?; if prio != 0 { write!(f, ":{}", prio)?; }; Ok(()) }, }, } } } impl Lexeme { #[must_use] pub fn rule(prio: impl Into) -> Self { Lexeme::Arrow( NotNan::new(prio.into()).expect("Rule priority cannot be NaN"), ) } pub fn strict_eq(&self, other: &Self) -> bool { match (self, other) { (Self::Arrow(f1), Self::Arrow(f2)) => f1 == f2, (Self::At, Self::At) | (Self::BR, Self::BR) => true, (Self::BS, Self::BS) /*| (Self::Const, Self::Const)*/ => true, // (Self::Export, Self::Export) | (Self::Import, Self::Import) => true, // (Self::Macro, Self::Macro) | (Self::Module, Self::Module) => true, (Self::NS, Self::NS) | (Self::Type, Self::Type) => true, (Self::Walrus, Self::Walrus) => true, (Self::Atom(a1), Self::Atom(a2)) => a1.0.strict_eq(&a2.0), (Self::Comment(c1), Self::Comment(c2)) => c1 == c2, (Self::LP(p1), Self::LP(p2)) | (Self::RP(p1), Self::RP(p2)) => p1 == p2, (Self::Name(n1), Self::Name(n2)) => n1 == n2, (Self::Placeh(ph1), Self::Placeh(ph2)) => ph1 == ph2, (_, _) => false, } } } #[allow(unused)] pub fn format(lexed: &[Entry]) -> String { lexed.iter().join(" ") } pub fn namechar(c: char) -> bool { c.is_alphanumeric() | (c == '_') } pub fn namestart(c: char) -> bool { c.is_alphabetic() | (c == '_') } pub fn opchar(c: char) -> bool { !namestart(c) && !numstart(c) && !c.is_whitespace() && !"()[]{},".contains(c) } pub fn split_filter( s: &str, mut pred: impl FnMut(char) -> bool, ) -> (&str, &str) { s.find(|c| !pred(c)).map_or((s, ""), |i| s.split_at(i)) } fn lit_table() -> impl IntoIterator { [ ("\\", Lexeme::BS), ("@", Lexeme::At), ("(", Lexeme::LP('(')), ("[", Lexeme::LP('[')), ("{", Lexeme::LP('{')), (")", Lexeme::RP('(')), ("]", Lexeme::RP('[')), ("}", Lexeme::RP('{')), ("\n", Lexeme::BR), (":=", Lexeme::Walrus), ("::", Lexeme::NS), (":", Lexeme::Type), ] } static BUILTIN_ATOMS: &[&dyn LexerPlugin] = &[&lex_string, &lex_numeric]; pub fn lex( mut tokens: Vec, mut data: &str, ctx: &impl Context, ) -> ProjectResult> { let mut prev_len = data.len() + 1; 'tail:loop { if prev_len == data.len() { panic!("got stuck at {data:?}, parsed {:?}", tokens.last().unwrap()); } prev_len = data.len(); data = data.trim_start_matches(|c: char| c.is_whitespace() && c != '\n'); let (head, _) = match next(data.chars()) { Some((h, t)) => (h, t.as_str()), None => return Ok(tokens), }; for lexer in ctx.lexers().iter().chain(BUILTIN_ATOMS.iter()) { if let Some(res) = lexer(data, ctx) { let (atom, tail) = res?; if tail.len() == data.len() { panic!("lexer plugin consumed 0 characters") } let loc = ctx.location(data.len() - tail.len(), tail); tokens.push(Entry::new(loc, Lexeme::Atom(atom))); data = tail; continue 'tail; } } for (prefix, lexeme) in lit_table() { if let Some(tail) = data.strip_prefix(prefix) { tokens.push(Entry::new(ctx.location(prefix.len(), tail), lexeme.clone())); data = tail; continue 'tail; } } if let Some(tail) = data.strip_prefix(',') { let lexeme = Lexeme::Name(ctx.interner().i(",")); tokens.push(Entry::new(ctx.location(1, tail), lexeme)); data = tail; continue 'tail; } if let Some(tail) = data.strip_prefix("--[") { let (note, tail) = (tail.split_once("]--")) .ok_or_else(|| NoCommentEnd(ctx.location(tail.len(), "")).rc())?; let lexeme = Lexeme::Comment(Arc::new(note.to_string())); let location = ctx.location(note.len() + 3, tail); tokens.push(Entry::new(location, lexeme)); data = tail; continue 'tail; } if let Some(tail) = data.strip_prefix("--") { let (note, tail) = split_filter(tail, |c| c != '\n'); let lexeme = Lexeme::Comment(Arc::new(note.to_string())); let location = ctx.location(note.len(), tail); tokens.push(Entry::new(location, lexeme)); data = tail; continue 'tail; } if let Some(tail) = data.strip_prefix('=') { if tail.chars().next().map_or(false, numstart) { let (num, post_num) = split_filter(tail, numchar); if let Some(tail) = post_num.strip_prefix("=>") { let lexeme = Lexeme::Arrow(parse_num(num).map_err(|e| e.into_proj(num.len(), post_num, ctx))?.as_float()); let location = ctx.location(num.len() + 3, tail); tokens.push(Entry::new(location, lexeme)); data = tail; continue 'tail; } } } // todo: parse placeholders, don't forget vectorials! if let Some(tail) = data.strip_prefix('$') { let (name, tail) = split_filter(tail, namechar); if !name.is_empty() { let name = ctx.interner().i(name); let location = ctx.location(name.len() + 1, tail); let lexeme = Lexeme::Placeh(Placeholder { name, class: PHClass::Scalar }); tokens.push(Entry::new(location, lexeme)); data = tail; continue 'tail; } } if let Some(vec) = data.strip_prefix("..") { let (nonzero, tail) = vec.strip_prefix('.').map_or((false, vec), |t| (true, t)); if let Some(tail) = tail.strip_prefix('$') { let (name, tail) = split_filter(tail, namechar); if !name.is_empty() { let (prio, priolen, tail) = tail .strip_prefix(':') .map(|tail| split_filter(tail, numchar)) .filter(|(num, _)| !num.is_empty()) .map(|(num_str, tail)| { parse_num(num_str) .map_err(|e| e.into_proj(num_str.len(), tail, ctx)) .and_then(|num| { Ok(unwrap_or!(num => Numeric::Uint; { return Err(FloatPlacehPrio(ctx.location(num_str.len(), tail)).rc()) })) }) .map(|p| (p, num_str.len() + 1, tail)) }) .unwrap_or(Ok((0, 0, tail)))?; let byte_len = if nonzero { 4 } else { 3 } + priolen + name.len(); let name = ctx.interner().i(name); let class = PHClass::Vec { nonzero, prio }; let lexeme = Lexeme::Placeh(Placeholder { name, class }); tokens.push(Entry::new(ctx.location(byte_len, tail), lexeme)); data = tail; continue 'tail; } } } if namestart(head) { let (name, tail) = split_filter(data, namechar); if !name.is_empty() { let lexeme = Lexeme::Name(ctx.interner().i(name)); tokens.push(Entry::new(ctx.location(name.len(), tail), lexeme)); data = tail; continue 'tail; } } if opchar(head) { let (name, tail) = split_filter(data, opchar); if !name.is_empty() { let lexeme = Lexeme::Name(ctx.interner().i(name)); tokens.push(Entry::new(ctx.location(name.len(), tail), lexeme)); data = tail; continue 'tail; } } unreachable!(r#"opchar is pretty much defined as "not namechar" "#) } }