forked from Orchid/orchid
Custom lexers can now terminate operators
New constraint: when a custom lexer is used to terminate an operator that is itself nested inside another custom lexer, the terminating lexer's output is dropped, because the recursive call has to return exactly one lexeme
This commit is contained in:
@@ -1,7 +1,10 @@
|
||||
use std::collections::VecDeque;
|
||||
use std::ops::Range;
|
||||
|
||||
use futures::FutureExt;
|
||||
use futures::lock::Mutex;
|
||||
use orchid_base::clone;
|
||||
use orchid_base::error::{OrcErrv, OrcRes, mk_errv};
|
||||
use orchid_base::error::{OrcErrv, OrcRes, mk_errv, report};
|
||||
use orchid_base::interner::{IStr, is};
|
||||
use orchid_base::location::SrcRange;
|
||||
use orchid_base::name::Sym;
|
||||
@@ -23,10 +26,11 @@ pub struct LexCtx<'a> {
|
||||
pub tail: &'a str,
|
||||
pub sub_trees: &'a mut Vec<Expr>,
|
||||
pub ctx: &'a Ctx,
|
||||
pub produced: &'a mut VecDeque<ParsTokTree>,
|
||||
}
|
||||
impl<'a> LexCtx<'a> {
|
||||
#[must_use]
|
||||
pub fn push<'b>(&'b mut self, pos: u32) -> LexCtx<'b>
|
||||
pub fn sub<'b>(&'b mut self, pos: u32, produced: &'b mut VecDeque<ParsTokTree>) -> LexCtx<'b>
|
||||
where 'a: 'b {
|
||||
LexCtx {
|
||||
source: self.source,
|
||||
@@ -35,6 +39,7 @@ impl<'a> LexCtx<'a> {
|
||||
systems: self.systems,
|
||||
sub_trees: &mut *self.sub_trees,
|
||||
ctx: self.ctx,
|
||||
produced,
|
||||
}
|
||||
}
|
||||
#[must_use]
|
||||
@@ -44,6 +49,7 @@ impl<'a> LexCtx<'a> {
|
||||
/// Move the cursor to offset `pos`: `tail` becomes the part of `source`
/// starting at `pos`.
///
/// NOTE(review): the slice expression presumably panics on an out-of-range
/// offset, as `str` indexing does; confirm against the type of `source`.
pub fn set_pos(&mut self, pos: u32) { self.tail = &self.source[pos as usize..] }
|
||||
/// Advance the cursor by `delta`, i.e. `set_pos(get_pos() + delta)`.
pub fn push_pos(&mut self, delta: u32) { self.set_pos(self.get_pos() + delta) }
|
||||
/// Replace the remaining input with `tail` directly.
///
/// No check is made here that `tail` is actually a suffix of `source`.
pub fn set_tail(&mut self, tail: &'a str) { self.tail = tail }
|
||||
/// Compute the offset that corresponds to `tail`, as
/// `source.len() - tail.len()` cast to `u32`. Does not modify the context.
///
/// NOTE(review): assumes `tail` is a suffix of `source` and therefore not
/// longer than it; the subtraction is unchecked. Confirm at call sites.
pub fn pos_from(&self, tail: &'a str) -> u32 { (self.source.len() - tail.len()) as u32 }
|
||||
#[must_use]
|
||||
pub fn strip_prefix(&mut self, tgt: &str) -> bool {
|
||||
if let Some(src) = self.tail.strip_prefix(tgt) {
|
||||
@@ -79,23 +85,41 @@ impl<'a> LexCtx<'a> {
|
||||
self.tail = rest;
|
||||
matches
|
||||
}
|
||||
/// Remove and return the first character of the remaining input.
///
/// Returns `None` and leaves `tail` untouched when the remaining input is
/// empty, because the `?` exits before `tail` is reassigned.
pub fn pop_char(&mut self) -> Option<char> {
|
||||
let mut chars = self.tail.chars();
|
||||
let ret = chars.next()?;
|
||||
// Whatever the iterator has not consumed yet becomes the new tail.
self.tail = chars.as_str();
|
||||
Some(ret)
|
||||
}
|
||||
/// Build the source range that runs from `start` to the current position
/// (`get_pos()`), using [`Self::sr`].
pub fn sr_to(&self, start: u32) -> SrcRange { self.sr(start..self.get_pos()) }
|
||||
/// Build a [`SrcRange`] covering `range`, attached to this context's `path`.
pub fn sr(&self, range: Range<u32>) -> SrcRange { SrcRange::new(range, self.path) }
|
||||
}
|
||||
|
||||
pub async fn lex_once(ctx: &mut LexCtx<'_>) -> OrcRes<ParsTokTree> {
|
||||
pub async fn lex_once(ctx: &mut LexCtx<'_>) -> OrcRes<bool> {
|
||||
ctx.trim(unrep_space);
|
||||
if ctx.tail.is_empty() {
|
||||
return Ok(false);
|
||||
}
|
||||
let start = ctx.get_pos();
|
||||
assert!(
|
||||
!ctx.tail.is_empty() && !ctx.tail.starts_with(unrep_space),
|
||||
"Lexing empty string or whitespace to token!\n\
|
||||
Invocations of lex_tok should check for empty string"
|
||||
);
|
||||
let tok = if ctx.strip_prefix("\r\n") || ctx.strip_prefix("\r") || ctx.strip_prefix("\n") {
|
||||
ParsTok::BR
|
||||
} else if let Some(tail) = (ctx.tail.starts_with(name_start).then_some(ctx.tail))
|
||||
.and_then(|t| t.trim_start_matches(name_char).strip_prefix("::"))
|
||||
{
|
||||
let name = &ctx.tail[..ctx.tail.len() - tail.len() - "::".len()];
|
||||
ctx.set_tail(tail);
|
||||
let body = lex_once(ctx).boxed_local().await?;
|
||||
let mut produced = VecDeque::new();
|
||||
let mut sub_cx = ctx.sub(ctx.pos_from(tail), &mut produced);
|
||||
if !lex_once(&mut sub_cx).boxed_local().await? {
|
||||
return Err(mk_errv(
|
||||
is("Unexpected end of source text").await,
|
||||
":: cannot be the last token",
|
||||
[SrcRange::new(start..ctx.get_pos(), ctx.path)],
|
||||
));
|
||||
}
|
||||
let pos = sub_cx.get_pos();
|
||||
ctx.set_pos(pos);
|
||||
let body = produced.pop_front().expect("lex_once returned true");
|
||||
ctx.produced.extend(produced.into_iter());
|
||||
ParsTok::NS(is(name).await, Box::new(body))
|
||||
} else if ctx.strip_prefix("--[") {
|
||||
let Some((cmt, tail)) = ctx.tail.split_once("]--") else {
|
||||
@@ -113,96 +137,169 @@ pub async fn lex_once(ctx: &mut LexCtx<'_>) -> OrcRes<ParsTokTree> {
|
||||
ParsTok::Comment(is(&tail[2..end]).await)
|
||||
} else if let Some(tail) = ctx.tail.strip_prefix('\\').filter(|t| t.starts_with(name_start)) {
|
||||
// fanciness like \$placeh in templates is resolved in the macro engine.
|
||||
ctx.set_tail(tail);
|
||||
let arg = lex_once(ctx).boxed_local().await?;
|
||||
let start = ctx.get_pos();
|
||||
let mut produced = VecDeque::new();
|
||||
let mut sub_cx = ctx.sub(ctx.pos_from(tail), &mut produced);
|
||||
if !lex_once(&mut sub_cx).boxed_local().await? {
|
||||
return Err(mk_errv(
|
||||
is("Unexpected end of file").await,
|
||||
"Expected a lambda argument and body",
|
||||
[SrcRange::new(start..ctx.get_pos(), ctx.path)],
|
||||
));
|
||||
}
|
||||
let pos = sub_cx.get_pos();
|
||||
ctx.set_pos(pos);
|
||||
let arg = produced.pop_front().expect("lex_once returned true");
|
||||
ctx.produced.extend(produced);
|
||||
ctx.trim_ws();
|
||||
ParsTok::LambdaHead(Box::new(arg))
|
||||
} else if let Some((lp, rp, paren)) = PARENS.iter().find(|(lp, ..)| ctx.strip_char(*lp)) {
|
||||
let mut body = Vec::new();
|
||||
let mut body = VecDeque::new();
|
||||
ctx.trim_ws();
|
||||
while !ctx.strip_char(*rp) {
|
||||
if ctx.tail.is_empty() {
|
||||
let mut sub_cx = ctx.sub(ctx.get_pos(), &mut body);
|
||||
if !lex_once(&mut sub_cx).boxed_local().await? {
|
||||
return Err(mk_errv(
|
||||
is("unclosed paren").await,
|
||||
format!("this {lp} has no matching {rp}"),
|
||||
[SrcRange::new(start..start + 1, ctx.path)],
|
||||
));
|
||||
}
|
||||
body.push(lex_once(ctx).boxed_local().await?);
|
||||
let pos = sub_cx.get_pos();
|
||||
ctx.set_pos(pos);
|
||||
ctx.trim_ws();
|
||||
}
|
||||
ParsTok::S(*paren, body)
|
||||
ParsTok::S(*paren, body.into_iter().collect())
|
||||
} else if let Some(res) = sys_lex(ctx).await {
|
||||
let token = res?;
|
||||
ctx.produced.extend(token);
|
||||
return Ok(true);
|
||||
} else if ctx.tail.starts_with(name_start) {
|
||||
ParsTok::Name(is(ctx.get_start_matches(name_char)).await)
|
||||
} else if ctx.tail.starts_with(op_char) {
|
||||
let whole_tail = ctx.tail;
|
||||
ctx.pop_char().expect("The above check would have failed");
|
||||
let mut tail_after_op = ctx.tail;
|
||||
|
||||
let mut lookahead = Vec::new();
|
||||
while !ctx.tail.is_empty() && ctx.tail.starts_with(op_char) {
|
||||
match sys_lex(ctx).await {
|
||||
None => {
|
||||
ctx.pop_char();
|
||||
tail_after_op = ctx.tail;
|
||||
},
|
||||
Some(sys_res) => {
|
||||
match sys_res {
|
||||
Err(e) => report(e),
|
||||
Ok(tokv) => lookahead = tokv,
|
||||
}
|
||||
break;
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
let op_str = &whole_tail[0..whole_tail.len() - tail_after_op.len()];
|
||||
ctx.produced.push_back(ParsTok::Name(is(op_str).await).at(ctx.sr_to(start)));
|
||||
ctx.produced.extend(lookahead);
|
||||
return Ok(true);
|
||||
} else {
|
||||
for sys in ctx.systems {
|
||||
let mut errors = Vec::new();
|
||||
if ctx.tail.starts_with(|c| sys.can_lex(c)) {
|
||||
let (source, pos, path) = (ctx.source.clone(), ctx.get_pos(), ctx.path.clone());
|
||||
let temp_store = ctx.ctx.exprs.derive();
|
||||
let ctx_lck = &Mutex::new(&mut *ctx);
|
||||
let errors_lck = &Mutex::new(&mut errors);
|
||||
let temp_store_cb = temp_store.clone();
|
||||
let lx = sys
|
||||
.lex(source, path, pos, |pos| {
|
||||
clone!(temp_store_cb);
|
||||
async move {
|
||||
let mut ctx_g = ctx_lck.lock().await;
|
||||
match lex_once(&mut ctx_g.push(pos)).boxed_local().await {
|
||||
Ok(t) => Some(api::SubLexed {
|
||||
pos: t.sr.end(),
|
||||
tree: ctx_g.ser_subtree(t, temp_store_cb.clone()).await,
|
||||
}),
|
||||
Err(e) => {
|
||||
errors_lck.lock().await.push(e);
|
||||
None
|
||||
},
|
||||
}
|
||||
return Err(mk_errv(
|
||||
is("Unrecognized character").await,
|
||||
"The following syntax is meaningless.",
|
||||
[SrcRange::new(start..start + 1, ctx.path)],
|
||||
));
|
||||
};
|
||||
ctx.produced.push_back(ParsTokTree { tok, sr: ctx.sr_to(start) });
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
/// Parse one token via any of the systems, if we can
|
||||
///
|
||||
/// This function never writes lookahead: the lexed tokens are returned to the
/// caller instead of being pushed onto `ctx.produced`
|
||||
pub async fn sys_lex(ctx: &mut LexCtx<'_>) -> Option<OrcRes<Vec<ParsTokTree>>> {
|
||||
for sys in ctx.systems {
|
||||
let mut errors = Vec::new();
|
||||
if ctx.tail.starts_with(|c| sys.can_lex(c)) {
|
||||
let (source, pos, path) = (ctx.source.clone(), ctx.get_pos(), ctx.path.clone());
|
||||
let temp_store = ctx.ctx.exprs.derive();
|
||||
let ctx_lck = &Mutex::new(&mut *ctx);
|
||||
let errors_lck = &Mutex::new(&mut errors);
|
||||
let temp_store_cb = temp_store.clone();
|
||||
let lx = sys
|
||||
.lex(source, path, pos, |pos| {
|
||||
clone!(temp_store_cb);
|
||||
async move {
|
||||
let mut ctx_g = ctx_lck.lock().await;
|
||||
let mut produced = VecDeque::new();
|
||||
let mut sub_cx = ctx_g.sub(pos, &mut produced);
|
||||
let lex_res = lex_once(&mut sub_cx).boxed_local().await;
|
||||
let pos1 = sub_cx.get_pos();
|
||||
ctx_g.set_pos(pos1);
|
||||
match lex_res {
|
||||
Ok(false) => {
|
||||
errors_lck.lock().await.push(mk_errv(
|
||||
is("End of file").await,
|
||||
"Unexpected end of source text",
|
||||
[ctx_g.sr_to(pos)],
|
||||
));
|
||||
None
|
||||
},
|
||||
Ok(true) => {
|
||||
let tok = produced.pop_front().unwrap();
|
||||
Some(api::SubLexed {
|
||||
pos: tok.sr.end(),
|
||||
tree: ctx_g.ser_subtree(tok, temp_store_cb.clone()).await,
|
||||
})
|
||||
},
|
||||
Err(e) => {
|
||||
errors_lck.lock().await.push(e);
|
||||
None
|
||||
},
|
||||
}
|
||||
})
|
||||
.await;
|
||||
match lx {
|
||||
Err(e) => return Err(errors.into_iter().fold(OrcErrv::from_api(&e).await, |a, b| a + b)),
|
||||
Ok(Some(lexed)) => {
|
||||
ctx.set_pos(lexed.pos);
|
||||
let lexed_tree = ctx.des_subtree(&lexed.expr, temp_store).await;
|
||||
let stable_tree = recur(lexed_tree, &|tt, r| {
|
||||
}
|
||||
})
|
||||
.await;
|
||||
match lx {
|
||||
Err(e) =>
|
||||
return Some(Err(errors.into_iter().fold(OrcErrv::from_api(&e).await, |a, b| a + b))),
|
||||
Ok(Some(lexed)) => {
|
||||
ctx.set_pos(lexed.pos);
|
||||
let mut stable_trees = Vec::new();
|
||||
for tok in lexed.expr {
|
||||
stable_trees.push(recur(ctx.des_subtree(&tok, temp_store.clone()).await, &|tt, r| {
|
||||
if let ParsTok::NewExpr(expr) = tt.tok {
|
||||
return ParsTok::Handle(expr).at(tt.sr);
|
||||
}
|
||||
r(tt)
|
||||
});
|
||||
return Ok(stable_tree);
|
||||
},
|
||||
Ok(None) => match errors.into_iter().reduce(|a, b| a + b) {
|
||||
Some(errors) => return Err(errors),
|
||||
None => continue,
|
||||
},
|
||||
}
|
||||
}));
|
||||
}
|
||||
return Some(Ok(stable_trees));
|
||||
},
|
||||
Ok(None) => match errors.into_iter().reduce(|a, b| a + b) {
|
||||
Some(errors) => return Some(Err(errors)),
|
||||
None => continue,
|
||||
},
|
||||
}
|
||||
}
|
||||
if ctx.tail.starts_with(name_start) {
|
||||
ParsTok::Name(is(ctx.get_start_matches(name_char)).await)
|
||||
} else if ctx.tail.starts_with(op_char) {
|
||||
ParsTok::Name(is(ctx.get_start_matches(op_char)).await)
|
||||
} else {
|
||||
return Err(mk_errv(
|
||||
is("Unrecognized character").await,
|
||||
"The following syntax is meaningless.",
|
||||
[SrcRange::new(start..start + 1, ctx.path)],
|
||||
));
|
||||
}
|
||||
};
|
||||
Ok(ParsTokTree { tok, sr: SrcRange::new(start..ctx.get_pos(), ctx.path) })
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
pub async fn lex(text: IStr, path: Sym, systems: &[System], ctx: &Ctx) -> OrcRes<Vec<ParsTokTree>> {
|
||||
let mut sub_trees = Vec::new();
|
||||
let mut ctx =
|
||||
LexCtx { source: &text, sub_trees: &mut sub_trees, tail: &text[..], systems, path: &path, ctx };
|
||||
let mut tokv = Vec::new();
|
||||
let mut produced = VecDeque::new();
|
||||
let mut ctx = LexCtx {
|
||||
source: &text,
|
||||
sub_trees: &mut sub_trees,
|
||||
tail: &text[..],
|
||||
systems,
|
||||
path: &path,
|
||||
ctx,
|
||||
produced: &mut produced,
|
||||
};
|
||||
ctx.trim(unrep_space);
|
||||
while !ctx.tail.is_empty() {
|
||||
tokv.push(lex_once(&mut ctx).await?);
|
||||
while lex_once(&mut ctx).await? {
|
||||
ctx.trim(unrep_space);
|
||||
}
|
||||
Ok(tokv)
|
||||
Ok(produced.into())
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user