//! The lex module lexes the source code into tokens.
//!
//! For error handling, there is a single `Error` token, which contains the error. The lexer
//! is an iterator and can therefore be used without any allocations.
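//!
//! A minimal usage sketch (kept as an `ignore` block, not a compiled doctest; it uses
//! `Lexer::new` and the unsafe `RtAlloc::new` the same way as the tests at the bottom of
//! this file):
//!
//! ```ignore
//! // SAFETY: the allocator only lives for this small scope
//! let mut runtime = unsafe { RtAlloc::new() };
//! let tokens: Vec<Token> = Lexer::new("let x = 1", &mut runtime).collect();
//! ```
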
use crate::errors::{CompilerError, Span};
use crate::gc::Symbol;
use crate::RtAlloc;
use std::iter::Peekable;
use std::str::CharIndices;

/// A single token generated from the lexer
///
/// For example `for`, `"hello"`, `main` or `.`
#[derive(Debug, Clone)]
pub struct Token {
    pub span: Span,
    pub kind: TokenKind,
}

impl Token {
    fn single_span(start: usize, kind: TokenKind) -> Token {
        Self {
            span: Span::single(start),
            kind,
        }
    }

    fn new(span: Span, kind: TokenKind) -> Token {
        Self { span, kind }
    }
}

#[derive(Debug, Clone, PartialEq)]
pub enum TokenKind {
    // keywords
    Let,
    Print,
    Fn,
    If,
    Else,
    Loop,
    While,
    For,
    Break,
    Return,
    True,
    False,
    Null,
    And,
    Or,
    Not,
    // literals
    String(Symbol),
    Number(f64),
    // ident
    Ident(Symbol),
    // punctuation
    /// ;
    Semi,
    /// +
    Plus,
    /// -
    Minus,
    /// *
    Asterisk,
    /// /
    Slash,
    /// %
    Percent,
    /// {
    BraceO,
    /// }
    BraceC,
    /// [
    BracketO,
    /// ]
    BracketC,
    /// (
    ParenO,
    /// )
    ParenC,
    /// .
    Dot,
    /// ,
    Comma,
    /// =
    Equal,
    /// ==
    EqualEqual,
    /// !=
    BangEqual,
    /// >
    Greater,
    /// <
    Less,
    /// >=
    GreaterEqual,
    /// <=
    LessEqual,

    /// An error occurred. It's boxed to save space, since `CompilerError` is > 6 `usize` big
    Error(Box<CompilerError>),
}

#[derive(Debug)]
pub struct Lexer<'code, 'gc> {
    code: Peekable<CharIndices<'code>>,
    src: &'code str,
    rt_alloc: &'gc mut RtAlloc,
}

impl<'code, 'gc> Lexer<'code, 'gc> {
    pub fn new(code: &'code str, rt_alloc: &'gc mut RtAlloc) -> Self {
        Self {
            code: code.char_indices().peekable(),
            src: code,
            rt_alloc,
        }
    }

    /// Returns whether the next character is `expected`, without consuming it
    fn expect(&mut self, expected: char) -> bool {
        self.code
            .peek()
            .map_or(false, |(_, char)| *char == expected)
    }

    /// Lexes a one- or two-character token: if the next character is `expect_char`,
    /// it is consumed and `true_type` is produced, otherwise `false_type` is produced
    fn maybe_next_char(
        &mut self,
        expect_char: char,
        true_type: TokenKind,
        false_type: TokenKind,
        start: usize,
    ) -> Token {
        if self.expect(expect_char) {
            let _ = self.code.next(); // consume the second char of the operator
            Token {
                span: Span::start_len(start, 2),
                kind: true_type,
            }
        } else {
            Token {
                span: Span::single(start),
                kind: false_type,
            }
        }
    }

    fn keyword_or_ident(&mut self, name: &str) -> TokenKind {
        match name {
            "loop" => TokenKind::Loop,
            "let" => TokenKind::Let,
            "fn" => TokenKind::Fn,
            "for" => TokenKind::For,
            "false" => TokenKind::False,
            "if" => TokenKind::If,
            "else" => TokenKind::Else,
            "while" => TokenKind::While,
            "break" => TokenKind::Break,
            "return" => TokenKind::Return,
            "true" => TokenKind::True,
            "null" => TokenKind::Null,
            "not" => TokenKind::Not,
            "and" => TokenKind::And,
            "or" => TokenKind::Or,
            "print" => TokenKind::Print,
            _ => TokenKind::Ident(self.rt_alloc.intern_string(name)),
        }
    }
}

impl<'code, 'gc> Iterator for Lexer<'code, 'gc> {
    type Item = Token;

    fn next(&mut self) -> Option<Self::Item> {
        let token = loop {
            let (start, char) = self.code.next()?;
            match char {
                _ if char.is_whitespace() => {}
                '#' => {
                    // `#` starts a line comment, `##` starts a multi-line comment that runs
                    // until the next `##`.
                    // only peek so we don't skip the \n if the # is at the end
                    if let Some((_, '#')) = self.code.peek() {
                        let _ = self.code.next();
                        loop {
                            if let Some((_, '#')) | None = self.code.next() {
                                if let Some((_, '#')) | None = self.code.next() {
                                    break;
                                }
                            }
                        }
                    } else {
                        loop {
                            if let Some((_, '\n')) | None = self.code.next() {
                                break;
                            }
                        }
                    }
                }
                ';' => break Token::single_span(start, TokenKind::Semi),
                '+' => break Token::single_span(start, TokenKind::Plus),
                '-' => break Token::single_span(start, TokenKind::Minus),
                '*' => break Token::single_span(start, TokenKind::Asterisk),
                '/' => break Token::single_span(start, TokenKind::Slash),
                '%' => break Token::single_span(start, TokenKind::Percent),
                '{' => break Token::single_span(start, TokenKind::BraceO),
                '}' => break Token::single_span(start, TokenKind::BraceC),
                '[' => break Token::single_span(start, TokenKind::BracketO),
                ']' => break Token::single_span(start, TokenKind::BracketC),
                '(' => break Token::single_span(start, TokenKind::ParenO),
                ')' => break Token::single_span(start, TokenKind::ParenC),
                '.' => break Token::single_span(start, TokenKind::Dot),
                ',' => break Token::single_span(start, TokenKind::Comma),
                '=' => {
                    break self.maybe_next_char(
                        '=',
                        TokenKind::EqualEqual,
                        TokenKind::Equal,
                        start,
                    );
                }
                '!' => {
                    break if self.expect('=') {
                        let _ = self.code.next(); // consume the '='
                        Token::new(Span::start_len(start, 2), TokenKind::BangEqual)
                    } else {
                        Token::new(
                            Span::single(start),
                            TokenKind::Error(Box::new(CompilerError::with_note(
                                Span::single(start),
                                "Expected '=' after '!'".to_string(),
                                "If you meant to use it for negation, use `not`".to_string(),
                            ))),
                        )
                    };
                }
                '>' => {
                    break self.maybe_next_char(
                        '=',
                        TokenKind::GreaterEqual,
                        TokenKind::Greater,
                        start,
                    );
                }
                '<' => {
                    break self.maybe_next_char('=', TokenKind::LessEqual, TokenKind::Less, start);
                }
                '"' => {
                    let mut buffer = String::new();
                    let mut escaped = false;
                    let end = loop {
                        match self.code.next() {
                            Some((end, '"')) if !escaped => break end,
                            Some((_, '\\')) if !escaped => escaped = true,
                            Some((_, char)) => {
                                escaped = false;
                                buffer.push(char);
                            }
                            None => {
                                return Some(Token::new(
                                    Span::single(start),
                                    TokenKind::Error(Box::new(CompilerError::with_note(
                                        Span::single(start), // do not show the whole literal, that would not make sense
"String literal not closed".to_string(),
|
|
"Close the literal using '\"'".to_string(),
|
|
))),
|
|
));
|
|
}
|
|
}
|
|
};
|
|
break Token::new(
|
|
Span::start_end(start, end),
|
|
TokenKind::String(self.rt_alloc.intern_string(&buffer)),
|
|
);
|
|
}
|
|
char => {
|
|
if char.is_ascii_digit() {
|
|
let mut had_dot = false;
|
|
let end = loop {
|
|
// peek here because the character signaling the end should not be consumed
|
|
match self.code.peek() {
|
|
Some((_, '.')) if !had_dot => {
|
|
let _ = self.code.next();
|
|
had_dot = true;
|
|
}
|
|
Some((_, next_char)) if next_char.is_ascii_digit() => {
|
|
let _ = self.code.next();
|
|
}
|
|
Some((end, _)) => break *end,
|
|
None => break self.src.len(), // reached EOF, so parse this number
|
|
}
|
|
};
|
|
let number_str = &self.src[start..end];
|
|
let span = Span::start_end(start, end);
|
|
let number = number_str.parse::<f64>();
|
|
break match number {
|
|
Ok(number) if number.is_infinite() => {
|
|
Token::new(span, TokenKind::Error(Box::new(CompilerError::with_note(
|
|
span,
|
|
"Number literal too long".to_string(),
|
|
"A number literal cannot be larger than a 64 bit float can represent"
|
|
.to_string(),
|
|
))))
|
|
}
|
|
Ok(number) => Token::new(span, TokenKind::Number(number)),
|
|
Err(err) => Token::new(span, TokenKind::Error(Box::new(CompilerError::with_note(
|
|
span,
|
|
"Invalid number".to_string(),
|
|
err.to_string(),
|
|
)))),
|
|
};
|
|
} else if is_valid_ident_start(char) {
|
|
// it must be an identifier
|
|
let end = loop {
|
|
match self.code.peek() {
|
|
Some((_, char)) if is_valid_ident_part(*char) => {
|
|
let _ = self.code.next(); // consume identifier part
|
|
}
|
|
Some((end, _)) => break *end,
|
|
None => break self.src.len(),
|
|
}
|
|
};
|
|
break Token::new(
|
|
Span::start_end(start, end),
|
|
self.keyword_or_ident(&self.src[start..end]),
|
|
);
|
|
} else {
|
|
break Token::new(
|
|
Span::single(start),
|
|
TokenKind::Error(Box::new(CompilerError::with_note(
|
|
Span::single(start),
|
|
format!("Unexpected character: '{}'", char),
|
|
"Character is not allowed outside of string literals and comments"
|
|
.to_string(),
|
|
))),
|
|
);
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
Some(token)
|
|
}
|
|
}
|
|
|
|
fn is_valid_ident_part(char: char) -> bool {
|
|
char.is_alphanumeric() || char == '_'
|
|
}
|
|
|
|
fn is_valid_ident_start(char: char) -> bool {
|
|
char.is_alphabetic() || char == '_'
|
|
}
#[cfg(test)]
mod test {
    use crate::lex::Lexer;
    use crate::RtAlloc;

    type StdString = std::string::String;

    fn lex_test(code: &str) {
        // SAFETY: we only work in this tiny scope
        let mut runtime = unsafe { RtAlloc::new() };

        let lexer = Lexer::new(code, &mut runtime);
        let tokens = lexer.map(|token| token.kind).collect::<Vec<_>>();

        insta::assert_debug_snapshot!(tokens);
    }

    #[test]
    fn smiley_face() {
        lex_test(">>.<<");
    }

    #[test]
    fn greater_than_less_than_equal() {
        lex_test(">= <= == < < >=");
    }

    #[test]
    fn no_no_no() {
        lex_test("!= != = !=");
    }

    #[test]
    fn braces_brackets_parens() {
        lex_test("{([]]}");
    }

    #[test]
    fn braces_brackets_parens_whitespace() {
        lex_test(
            "{ ( [ ] ]

        }",
        );
    }

    #[test]
    fn fancy_stuff() {
        lex_test(". ,- * -, .");
    }

    #[test]
    fn comments() {
        lex_test("fn # fn");
    }

    #[test]
    fn long_multiline_comment() {
        lex_test(
            "fn ## hello i am something

        i span multiple lines

        will you love me? 🥺🥺🥺🥺🥺

        pls :) o(* ̄▽ ̄*)ブ

        i like the indentation here ngl | sneak for -> ## for ## <- sneak for
        ## and",
        );
    }

    #[test]
    fn terminate_multiline_comment_correctly() {
        lex_test(
            "fn ## # no not here :( ## let # ## <- this is commented out
        # so no multiline comment
        ##

        here it starts
        # let #
        # # and
        ## or
        ",
        );
    }

    #[test]
    fn greeting() {
        lex_test("-.- /%");
    }

    #[test]
    fn countdown() {
        lex_test("3 . . 2 . . 1 . . 0");
    }

    #[test]
    fn larger_numbers() {
        lex_test("123456789, 123456789.1234, 64785903");
    }

    #[test]
    fn string() {
        lex_test(r#""uwu""#);
    }

    #[test]
    fn strings() {
        lex_test(r#"( "hi" "uwu" "\"uwu\"" "no \\ u" )"#);
    }

    #[test]
    fn keywords() {
        lex_test("let fn if else loop while break for true false null and not or print");
    }

    #[test]
    fn keyword_and_ident() {
        lex_test("let variable be a loop if false is true");
    }

    #[test]
    fn not_quite_a_keyword() {
        let words = [
            "letter",
            "fori",
            "fnfn",
            "iffy",
            "bloop",
            "loopy_yeah",
            "whileTrue",
            "truefalse",
            "falsetrue",
            "nullability",
            "rot",
            "ornot",
            "nor",
            "andnowQuestionMark",
            "notOrAnd",
            "breakMe",
            "Ibreak",
        ];
        let sentences = words
            .iter()
            .map(|word| format!("{} ", word))
            .collect::<StdString>();

        lex_test(&sentences);
    }

    #[test]
    fn serious_program() {
        lex_test(
            r#"let string = "hallol"
        let number = 5
        let me out ._.
        fn world() {
            if number == 5 or true == false and not false {
                println("Hello \\ World!")
            }
        }"#,
        );
    }
}