- Interner impls are logically separate from the API in orchid-base (the default host interner is still in base for testing).
- Error reporting, logging, and a variety of other features are passed down via context in the extension; this is not yet done in the host, to maintain a library-ish profile (should consider options).
- No global spawn mechanic: the host has a spawn function, but extensions only get a stash for enqueuing async work in sync callbacks, which is then popped and awaited explicitly, manually, and in strict order.
- Still deadlocks nondeterministically for some ungodly reason.
161 lines
5.1 KiB
Rust
161 lines
5.1 KiB
Rust
use itertools::Itertools;
|
|
use orchid_base::error::{OrcErr, OrcErrv, OrcRes, mk_errv};
|
|
use orchid_base::interner::is;
|
|
use orchid_base::location::SrcRange;
|
|
use orchid_base::name::Sym;
|
|
use orchid_base::sym;
|
|
use orchid_base::tree::{Paren, wrap_tokv};
|
|
use orchid_extension::gen_expr::sym_ref;
|
|
use orchid_extension::lexer::{LexContext, Lexer, err_not_applicable};
|
|
use orchid_extension::parser::p_tree2gen;
|
|
use orchid_extension::tree::{GenTok, GenTokTree, ref_tok, x_tok};
|
|
|
|
use super::str_atom::IntStrAtom;
|
|
|
|
/// The ways in which [parse_string] can reject a string literal body.
/// Carried inside [StringError] together with the offending position.
#[derive(Clone)]
enum StringErrorKind {
    /// A `\u` escape was not followed by four hexadecimal digits
    NotHex,
    /// The four hex digits of a `\u` escape did not form a valid `char`
    /// (the value was rejected by `char::from_u32`)
    BadCodePoint,
    /// A backslash was followed by a character that does not start any known
    /// escape sequence
    BadEscSeq,
}
|
|
|
|
/// Error produced by [parse_string]
|
|
#[derive(Clone)]
|
|
struct StringError {
|
|
/// Character where the error occured
|
|
pos: u32,
|
|
/// Reason for the error
|
|
kind: StringErrorKind,
|
|
}
|
|
|
|
impl StringError {
|
|
/// Convert into project error for reporting
|
|
pub async fn into_proj(self, path: &Sym, pos: u32) -> OrcErrv {
|
|
let start = pos + self.pos;
|
|
mk_errv(
|
|
is("Failed to parse string").await,
|
|
match self.kind {
|
|
StringErrorKind::NotHex => "Expected a hex digit",
|
|
StringErrorKind::BadCodePoint => "The specified number is not a Unicode code point",
|
|
StringErrorKind::BadEscSeq => "Unrecognized escape sequence",
|
|
},
|
|
[SrcRange::new(start..start + 1, path).pos()],
|
|
)
|
|
}
|
|
}
|
|
|
|
/// Process escape sequences in a string literal
|
|
fn parse_string(str: &str) -> Result<String, StringError> {
|
|
let mut target = String::new();
|
|
let mut iter = str.char_indices().map(|(i, c)| (i as u32, c));
|
|
while let Some((_, c)) = iter.next() {
|
|
if c != '\\' {
|
|
target.push(c);
|
|
continue;
|
|
}
|
|
let (mut pos, code) = iter.next().expect("lexer would have continued");
|
|
let next = match code {
|
|
c @ ('\\' | '"' | '\'' | '$') => c,
|
|
'b' => '\x08',
|
|
'f' => '\x0f',
|
|
'n' => '\n',
|
|
'r' => '\r',
|
|
't' => '\t',
|
|
'\n' => 'skipws: loop {
|
|
match iter.next() {
|
|
None => return Ok(target),
|
|
Some((_, c)) =>
|
|
if !c.is_whitespace() {
|
|
break 'skipws c;
|
|
},
|
|
}
|
|
},
|
|
'u' => {
|
|
let acc = ((0..4).rev())
|
|
.map(|radical| {
|
|
let (j, c) = (iter.next()).ok_or(StringError { pos, kind: StringErrorKind::NotHex })?;
|
|
pos = j;
|
|
let b = u32::from_str_radix(&String::from(c), 16)
|
|
.map_err(|_| StringError { pos, kind: StringErrorKind::NotHex })?;
|
|
Ok(16u32.pow(radical) + b)
|
|
})
|
|
.fold_ok(0, u32::wrapping_add)?;
|
|
char::from_u32(acc).ok_or(StringError { pos, kind: StringErrorKind::BadCodePoint })?
|
|
},
|
|
_ => return Err(StringError { pos, kind: StringErrorKind::BadEscSeq }),
|
|
};
|
|
target.push(next);
|
|
}
|
|
Ok(target)
|
|
}
|
|
|
|
#[derive(Debug, Default)]
|
|
pub struct StringLexer;
|
|
impl Lexer for StringLexer {
|
|
const CHAR_FILTER: &'static [std::ops::RangeInclusive<char>] = &['"'..='"', '`'..='`'];
|
|
async fn lex<'a>(all: &'a str, lctx: &'a LexContext<'a>) -> OrcRes<(&'a str, GenTokTree)> {
|
|
let Some(mut tail) = all.strip_prefix('"') else {
|
|
return Err(err_not_applicable().await);
|
|
};
|
|
let mut ret = None;
|
|
let mut cur = String::new();
|
|
let mut errors = vec![];
|
|
async fn str_to_gen<'a>(
|
|
str: &mut String,
|
|
tail: &str,
|
|
err: &mut Vec<OrcErr>,
|
|
ctx: &'a LexContext<'a>,
|
|
) -> GenTokTree {
|
|
let str_val_res = parse_string(&str.split_off(0));
|
|
if let Err(e) = &str_val_res {
|
|
err.extend(e.clone().into_proj(ctx.src(), ctx.pos(tail) - str.len() as u32).await);
|
|
}
|
|
let str_val = str_val_res.unwrap_or_default();
|
|
x_tok(IntStrAtom::from(is(&str_val).await)).await.at(ctx.pos_lt(str.len() as u32, tail))
|
|
as GenTokTree
|
|
}
|
|
let add_frag = |prev: Option<GenTokTree>, new: GenTokTree| async {
|
|
let Some(prev) = prev else { return new };
|
|
let concat_fn =
|
|
ref_tok(sym!(std::string::concat)).await.at(SrcRange::zw(prev.sr.path(), prev.sr.start()));
|
|
wrap_tokv([concat_fn, prev, new])
|
|
};
|
|
loop {
|
|
if let Some(rest) = tail.strip_prefix('"') {
|
|
return Ok((
|
|
rest,
|
|
add_frag(ret, str_to_gen(&mut cur, tail, &mut errors, lctx).await).await,
|
|
));
|
|
} else if let Some(rest) = tail.strip_prefix('$') {
|
|
ret = Some(add_frag(ret, str_to_gen(&mut cur, tail, &mut errors, lctx).await).await);
|
|
let (new_tail, tree) = lctx.recurse(rest).await?;
|
|
tail = new_tail;
|
|
// wrap the received token in a call to to_str
|
|
let to_str = sym_ref(sym!(std::string::to_str));
|
|
let sr = tree.sr();
|
|
let inj_to_str_tok = GenTok::NewExpr(to_str).at(sr.map_range(|_| sr.start()..sr.start()));
|
|
let to_str_call = GenTok::S(Paren::Round, vec![inj_to_str_tok, p_tree2gen(tree)]).at(sr);
|
|
ret = Some(add_frag(ret, to_str_call).await);
|
|
} else if tail.starts_with('\\') {
|
|
// parse_string will deal with it, we just have to skip the next char
|
|
tail = &tail[2..];
|
|
} else {
|
|
let mut ch = tail.chars();
|
|
if let Some(c) = ch.next() {
|
|
cur.push(c);
|
|
tail = ch.as_str();
|
|
} else {
|
|
let range = lctx.pos(all)..lctx.pos("");
|
|
return Err(mk_errv(is("No string end").await, "String never terminated with \"", [
|
|
SrcRange::new(range.clone(), lctx.src()),
|
|
]));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|