Files
orchid/orchid-std/src/std/string/str_lexer.rs
Lawrence Bethlenfalvy 32d6237dc5 task_local context over context objects
- interner impls logically separate from API in orchid-base (default host interner still in base for testing)
- error reporting, logging, and a variety of other features passed down via context in extension, not yet in host to maintain library-ish profile, should consider options
- no global spawn mechanic, the host has a spawn function but extensions only get a stash for enqueuing async work in sync callbacks which is then explicitly, manually, and with strict order popped and awaited
- still deadlocks nondeterministically for some ungodly reason
2026-01-01 14:54:29 +00:00

161 lines
5.1 KiB
Rust

use itertools::Itertools;
use orchid_base::error::{OrcErr, OrcErrv, OrcRes, mk_errv};
use orchid_base::interner::is;
use orchid_base::location::SrcRange;
use orchid_base::name::Sym;
use orchid_base::sym;
use orchid_base::tree::{Paren, wrap_tokv};
use orchid_extension::gen_expr::sym_ref;
use orchid_extension::lexer::{LexContext, Lexer, err_not_applicable};
use orchid_extension::parser::p_tree2gen;
use orchid_extension::tree::{GenTok, GenTokTree, ref_tok, x_tok};
use super::str_atom::IntStrAtom;
/// Reasons why [parse_string] might fail. See [StringError]
#[derive(Clone)]
enum StringErrorKind {
/// A unicode escape sequence wasn't followed by 4 hex digits
NotHex,
/// A unicode escape sequence contained an unassigned code point
BadCodePoint,
/// An unrecognized escape sequence was found
BadEscSeq,
}
/// Error produced by [parse_string]
#[derive(Clone)]
struct StringError {
/// Character where the error occured
pos: u32,
/// Reason for the error
kind: StringErrorKind,
}
impl StringError {
/// Convert into project error for reporting
pub async fn into_proj(self, path: &Sym, pos: u32) -> OrcErrv {
let start = pos + self.pos;
mk_errv(
is("Failed to parse string").await,
match self.kind {
StringErrorKind::NotHex => "Expected a hex digit",
StringErrorKind::BadCodePoint => "The specified number is not a Unicode code point",
StringErrorKind::BadEscSeq => "Unrecognized escape sequence",
},
[SrcRange::new(start..start + 1, path).pos()],
)
}
}
/// Process escape sequences in a string literal
fn parse_string(str: &str) -> Result<String, StringError> {
let mut target = String::new();
let mut iter = str.char_indices().map(|(i, c)| (i as u32, c));
while let Some((_, c)) = iter.next() {
if c != '\\' {
target.push(c);
continue;
}
let (mut pos, code) = iter.next().expect("lexer would have continued");
let next = match code {
c @ ('\\' | '"' | '\'' | '$') => c,
'b' => '\x08',
'f' => '\x0f',
'n' => '\n',
'r' => '\r',
't' => '\t',
'\n' => 'skipws: loop {
match iter.next() {
None => return Ok(target),
Some((_, c)) =>
if !c.is_whitespace() {
break 'skipws c;
},
}
},
'u' => {
let acc = ((0..4).rev())
.map(|radical| {
let (j, c) = (iter.next()).ok_or(StringError { pos, kind: StringErrorKind::NotHex })?;
pos = j;
let b = u32::from_str_radix(&String::from(c), 16)
.map_err(|_| StringError { pos, kind: StringErrorKind::NotHex })?;
Ok(16u32.pow(radical) + b)
})
.fold_ok(0, u32::wrapping_add)?;
char::from_u32(acc).ok_or(StringError { pos, kind: StringErrorKind::BadCodePoint })?
},
_ => return Err(StringError { pos, kind: StringErrorKind::BadEscSeq }),
};
target.push(next);
}
Ok(target)
}
#[derive(Debug, Default)]
pub struct StringLexer;
impl Lexer for StringLexer {
const CHAR_FILTER: &'static [std::ops::RangeInclusive<char>] = &['"'..='"', '`'..='`'];
async fn lex<'a>(all: &'a str, lctx: &'a LexContext<'a>) -> OrcRes<(&'a str, GenTokTree)> {
let Some(mut tail) = all.strip_prefix('"') else {
return Err(err_not_applicable().await);
};
let mut ret = None;
let mut cur = String::new();
let mut errors = vec![];
async fn str_to_gen<'a>(
str: &mut String,
tail: &str,
err: &mut Vec<OrcErr>,
ctx: &'a LexContext<'a>,
) -> GenTokTree {
let str_val_res = parse_string(&str.split_off(0));
if let Err(e) = &str_val_res {
err.extend(e.clone().into_proj(ctx.src(), ctx.pos(tail) - str.len() as u32).await);
}
let str_val = str_val_res.unwrap_or_default();
x_tok(IntStrAtom::from(is(&str_val).await)).await.at(ctx.pos_lt(str.len() as u32, tail))
as GenTokTree
}
let add_frag = |prev: Option<GenTokTree>, new: GenTokTree| async {
let Some(prev) = prev else { return new };
let concat_fn =
ref_tok(sym!(std::string::concat)).await.at(SrcRange::zw(prev.sr.path(), prev.sr.start()));
wrap_tokv([concat_fn, prev, new])
};
loop {
if let Some(rest) = tail.strip_prefix('"') {
return Ok((
rest,
add_frag(ret, str_to_gen(&mut cur, tail, &mut errors, lctx).await).await,
));
} else if let Some(rest) = tail.strip_prefix('$') {
ret = Some(add_frag(ret, str_to_gen(&mut cur, tail, &mut errors, lctx).await).await);
let (new_tail, tree) = lctx.recurse(rest).await?;
tail = new_tail;
// wrap the received token in a call to to_str
let to_str = sym_ref(sym!(std::string::to_str));
let sr = tree.sr();
let inj_to_str_tok = GenTok::NewExpr(to_str).at(sr.map_range(|_| sr.start()..sr.start()));
let to_str_call = GenTok::S(Paren::Round, vec![inj_to_str_tok, p_tree2gen(tree)]).at(sr);
ret = Some(add_frag(ret, to_str_call).await);
} else if tail.starts_with('\\') {
// parse_string will deal with it, we just have to skip the next char
tail = &tail[2..];
} else {
let mut ch = tail.chars();
if let Some(c) = ch.next() {
cur.push(c);
tail = ch.as_str();
} else {
let range = lctx.pos(all)..lctx.pos("");
return Err(mk_errv(is("No string end").await, "String never terminated with \"", [
SrcRange::new(range.clone(), lctx.src()),
]));
}
}
}
}
}