mirror of
https://github.com/Noratrieb/dilaria.git
synced 2026-01-14 09:25:02 +01:00
all lex tests pass 🎉
This commit is contained in:
parent
1bd999bb9b
commit
1ed076a5d3
4 changed files with 256 additions and 24 deletions
|
|
@ -58,6 +58,14 @@ Comments using `#`
|
|||
# hi!
|
||||
```
|
||||
|
||||
Multiline comments using `##` until `##`
|
||||
```
|
||||
##
|
||||
hi
|
||||
comment
|
||||
##
|
||||
```
|
||||
|
||||
There are many native functions that can easily be customized, added, or removed by the host
|
||||
|
||||
```
|
||||
|
|
|
|||
13
src/alloc.rs
13
src/alloc.rs
|
|
@ -1,3 +1,5 @@
|
|||
#![allow(dead_code)]
|
||||
|
||||
use std::rc::Rc;
|
||||
|
||||
pub struct Alloc {
|
||||
|
|
@ -9,7 +11,8 @@ pub enum Object {
|
|||
}
|
||||
|
||||
/// Reference to an interned String
|
||||
struct IStr {
|
||||
#[derive(Debug)]
|
||||
pub struct IStr {
|
||||
/// This will be changed to a raw pointer once a tracing GC is implemented
|
||||
data: Rc<str>,
|
||||
hash: u64,
|
||||
|
|
@ -27,13 +30,5 @@ mod table {
|
|||
#[derive(Debug, Default)]
|
||||
struct StringHashBuilder;
|
||||
|
||||
impl std::hash::BuildHasher for StringHashBuilder {
|
||||
type Hasher = ();
|
||||
|
||||
fn build_hasher(&self) -> Self::Hasher {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
struct PrimitveHasher {}
|
||||
}
|
||||
|
|
|
|||
258
src/lex.rs
258
src/lex.rs
|
|
@ -1,5 +1,3 @@
|
|||
#![allow(dead_code)]
|
||||
|
||||
use std::iter::Peekable;
|
||||
use std::str::CharIndices;
|
||||
|
||||
|
|
@ -55,7 +53,7 @@ pub enum TokenType<'code> {
|
|||
Or,
|
||||
Not,
|
||||
// literals
|
||||
String(&'code str),
|
||||
String(String),
|
||||
Number(f64),
|
||||
// ident
|
||||
Ident(&'code str),
|
||||
|
|
@ -153,6 +151,25 @@ impl<'code> Iterator for Lexer<'code> {
|
|||
let (start, char) = self.code.next()?;
|
||||
match char {
|
||||
_ if char.is_whitespace() => {}
|
||||
'#' => {
|
||||
// only peek so we don't skip the \n if the # is at the end
|
||||
if let Some((_, '#')) = self.code.peek() {
|
||||
let _ = self.code.next();
|
||||
loop {
|
||||
if let Some((_, '#')) | None = self.code.next() {
|
||||
if let Some((_, '#')) | None = self.code.next() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
loop {
|
||||
if let Some((_, '\n')) | None = self.code.next() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
'+' => break Token::single_span(start, TokenType::Plus),
|
||||
'-' => break Token::single_span(start, TokenType::Minus),
|
||||
'*' => break Token::single_span(start, TokenType::Asterisk),
|
||||
|
|
@ -202,12 +219,16 @@ impl<'code> Iterator for Lexer<'code> {
|
|||
);
|
||||
}
|
||||
'"' => {
|
||||
let mut buffer = String::new();
|
||||
let mut escaped = false;
|
||||
let end = loop {
|
||||
match self.code.next() {
|
||||
Some((end, '"')) if !escaped => break end,
|
||||
Some((_, '\\')) if !escaped => escaped = true,
|
||||
Some((_, _)) => escaped = false,
|
||||
Some((_, char)) => {
|
||||
escaped = false;
|
||||
buffer.push(char);
|
||||
}
|
||||
None => {
|
||||
return Some(Err(LexError(
|
||||
"reached EOF expecting '\"'".to_string(),
|
||||
|
|
@ -215,10 +236,7 @@ impl<'code> Iterator for Lexer<'code> {
|
|||
}
|
||||
}
|
||||
};
|
||||
break Token::new(
|
||||
Span::new(start, end - start),
|
||||
TokenType::String(&self.src[start + 1..end]),
|
||||
);
|
||||
break Token::new(Span::new(start, end - start), TokenType::String(buffer));
|
||||
}
|
||||
char => {
|
||||
if char.is_ascii_digit() {
|
||||
|
|
@ -250,6 +268,23 @@ impl<'code> Iterator for Lexer<'code> {
|
|||
}
|
||||
Err(err) => return Some(Err(err)),
|
||||
}
|
||||
} else if is_valid_ident_start(char) {
|
||||
// it must be an identifier
|
||||
let end = loop {
|
||||
match self.code.peek() {
|
||||
Some((_, char)) if is_valid_ident_part(*char) => {
|
||||
let _ = self.code.next(); // consume identifier part
|
||||
}
|
||||
Some((end, _)) => break *end,
|
||||
None => break self.src.len(),
|
||||
}
|
||||
};
|
||||
break Token::new(
|
||||
Span::new(start, end),
|
||||
keyword_or_ident(&self.src[start..end]),
|
||||
);
|
||||
} else {
|
||||
return Some(Err(LexError(format!("Invalid character: {}", char))));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -259,8 +294,54 @@ impl<'code> Iterator for Lexer<'code> {
|
|||
}
|
||||
}
|
||||
|
||||
fn keyword_or_ident(name: &str) -> TokenType {
|
||||
// make this efficient using the trie pattern
|
||||
// ignore that unicode exists, because all keywords are in ascii
|
||||
// we need to use bytes though instead of indexing into the string directly to avoid panics
|
||||
let bs = name.as_bytes();
|
||||
let len = bs.len();
|
||||
// there are no single letter keywords
|
||||
if len < 2 {
|
||||
return TokenType::Ident(name);
|
||||
}
|
||||
match bs[0] {
|
||||
// loop && let
|
||||
b'l' => match bs[1] {
|
||||
b'o' if len == 4 && bs[2..4] == *b"op" => TokenType::Loop,
|
||||
b'e' if len == 3 && bs[2] == b't' => TokenType::Let,
|
||||
_ => TokenType::Ident(name),
|
||||
},
|
||||
// for && fn && false
|
||||
b'f' => match bs[1] {
|
||||
b'n' if len == 2 => TokenType::Fn,
|
||||
b'o' if len == 3 && bs[2] == b'r' => TokenType::For,
|
||||
b'a' if len == 5 && bs[2..5] == *b"lse" => TokenType::False,
|
||||
_ => TokenType::Ident(name),
|
||||
},
|
||||
// if
|
||||
b'i' if len == 2 && bs[1] == b'f' => TokenType::If,
|
||||
// else
|
||||
b'e' if len == 4 && bs[1..4] == *b"lse" => TokenType::Else,
|
||||
// while
|
||||
b'w' if len == 5 && bs[1..5] == *b"hile" => TokenType::While,
|
||||
// true
|
||||
b't' if len == 4 && bs[1..4] == *b"rue" => TokenType::True,
|
||||
// null && not
|
||||
b'n' => match bs[1] {
|
||||
b'u' if len == 4 && bs[2..4] == *b"ll" => TokenType::Null,
|
||||
b'o' if len == 3 && bs[2] == b't' => TokenType::Not,
|
||||
_ => TokenType::Ident(name),
|
||||
},
|
||||
// and
|
||||
b'a' if len == 3 && bs[1..3] == *b"nd" => TokenType::And,
|
||||
// or
|
||||
b'o' if len == 2 && bs[1] == b'r' => TokenType::Or,
|
||||
_ => TokenType::Ident(name),
|
||||
}
|
||||
}
|
||||
|
||||
fn is_valid_ident_part(char: char) -> bool {
|
||||
char.is_alphanumeric()
|
||||
char.is_alphanumeric() || char == '_'
|
||||
}
|
||||
|
||||
fn is_valid_ident_start(char: char) -> bool {
|
||||
|
|
@ -275,6 +356,8 @@ mod test {
|
|||
use crate::lex::Lexer;
|
||||
use crate::lex::TokenType::{self, *};
|
||||
|
||||
type StdString = std::string::String;
|
||||
|
||||
fn lex_types(str: &str) -> Vec<TokenType> {
|
||||
let lexer = Lexer::lex(str);
|
||||
lexer.map(|token| token.unwrap().kind).collect::<Vec<_>>()
|
||||
|
|
@ -338,6 +421,44 @@ mod test {
|
|||
)
|
||||
}
|
||||
|
||||
/// A `#` starts a line comment, so the second `fn` must not produce a token.
#[test]
fn comments() {
    let expected = vec![Fn];
    lex_test("fn # fn", expected);
}
|
||||
|
||||
// Block comments run from one `##` to the next `##` and may span many lines.
// The input holds two such comments; only the `fn` before the first, the `for`
// between them, and the `and` after the second are expected as tokens.
#[test]
|
||||
fn long_multiline_comment() {
|
||||
lex_test(
|
||||
"fn ## hello i am something
|
||||
|
||||
i span multiple lines
|
||||
|
||||
will you love me? 🥺🥺🥺🥺🥺
|
||||
|
||||
pls :) o(* ̄▽ ̄*)ブ
|
||||
|
||||
i like the indentation here ngl | sneak for -> ## for ## <- sneak for
|
||||
## and",
|
||||
|
||||
vec![Fn, For, And],
|
||||
)
|
||||
}
|
||||
|
||||
// Checks how `#` and `##` interact: a lone `#` inside a block comment does not
// close it, and a `##` that appears after a line-comment `#` stays part of that
// line comment. Only `fn`, `let`, and `or` are expected to survive as tokens.
#[test]
|
||||
fn terminate_multiline_comment_correctly() {
|
||||
lex_test(
|
||||
"fn ## # no not here :( ## let # ## <- this is commented out
|
||||
# so no multiline comment
|
||||
##
|
||||
|
||||
here it starts
|
||||
# let #
|
||||
# # and
|
||||
## or
|
||||
",
|
||||
vec![Fn, Let, Or],
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn greeting() {
|
||||
lex_test("-.- /%", vec![Minus, Dot, Minus, Slash, Percent])
|
||||
|
|
@ -378,7 +499,7 @@ mod test {
|
|||
|
||||
#[test]
|
||||
fn string() {
|
||||
lex_test(r#""uwu""#, vec![String("uwu")])
|
||||
lex_test(r#""uwu""#, vec![String("uwu".to_string())])
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -387,12 +508,121 @@ mod test {
|
|||
r#"( "hi" "uwu" "\"uwu\"" "no \\ u" )"#,
|
||||
vec![
|
||||
ParenO,
|
||||
String("hi"),
|
||||
String("uwu"),
|
||||
String("\"uwu\""),
|
||||
String("no \\ u"),
|
||||
String("hi".to_string()),
|
||||
String("uwu".to_string()),
|
||||
String("\"uwu\"".to_string()),
|
||||
String("no \\ u".to_string()),
|
||||
ParenC,
|
||||
],
|
||||
)
|
||||
}
|
||||
|
||||
/// Every reserved word, separated by single spaces, must lex to its own token kind.
#[test]
fn keywords() {
    let source = "let fn if else loop while for true false null and not or";
    let expected = vec![
        Let, Fn, If, Else, Loop, While, For, True, False, Null, And, Not, Or,
    ];

    lex_test(source, expected)
}
|
||||
|
||||
/// Keywords and ordinary identifiers mixed in one sentence must be told apart word by word.
#[test]
fn keyword_and_ident() {
    let source = "let variable be a loop if false is true";
    let expected = vec![
        Let,
        Ident("variable"),
        Ident("be"),
        Ident("a"),
        Loop,
        If,
        False,
        Ident("is"),
        True,
    ];

    lex_test(source, expected)
}
|
||||
|
||||
/// Words that only contain or start with a keyword must still lex as identifiers.
#[test]
fn not_quite_a_keyword() {
    let words = [
        "letter",
        "fori",
        "fnfn",
        "iffy",
        "bloop",
        "loopy_yeah",
        "whileTrue",
        "truefalse",
        "falsetrue",
        "nullability",
        "rot",
        "ornot",
        "nor",
        "andnowQuestionMark",
        "notOrAnd",
    ];

    // Each word is followed by exactly one space, including the last one.
    let mut sentences = StdString::new();
    for word in &words {
        sentences.push_str(word);
        sentences.push(' ');
    }

    // One `Ident` per word, in the same order as the input.
    let expected: Vec<TokenType> = words.iter().copied().map(TokenType::Ident).collect();

    lex_test(&sentences, expected)
}
|
||||
|
||||
// End-to-end check on a small program: `let` bindings with a string and a
// number, a stray `._.`, and a function whose body holds an `if` with a
// compound condition and a call taking a string with an escaped backslash.
// Here `String(..)` is the `TokenType::String` variant, which carries an owned
// string, hence the `.to_string()` calls in the expected list.
#[test]
|
||||
fn serious_program() {
|
||||
lex_test(
|
||||
r#"let string = "hallol"
|
||||
let number = 5
|
||||
let me out ._.
|
||||
fn world() {
|
||||
if number == 5 or true == false and not false {
|
||||
print("Hello \\ World!")
|
||||
}
|
||||
}"#,
|
||||
vec![
|
||||
Let,
|
||||
Ident("string"),
|
||||
Equal,
|
||||
String("hallol".to_string()),
|
||||
Let,
|
||||
Ident("number"),
|
||||
Equal,
|
||||
Number(5.0),
|
||||
Let,
|
||||
Ident("me"),
|
||||
Ident("out"),
|
||||
Dot,
|
||||
Ident("_"),
|
||||
Dot,
|
||||
Fn,
|
||||
Ident("world"),
|
||||
ParenO,
|
||||
ParenC,
|
||||
BraceO,
|
||||
If,
|
||||
Ident("number"),
|
||||
EqualEqual,
|
||||
Number(5.0),
|
||||
Or,
|
||||
True,
|
||||
EqualEqual,
|
||||
False,
|
||||
And,
|
||||
Not,
|
||||
False,
|
||||
BraceO,
|
||||
Ident("print"),
|
||||
ParenO,
|
||||
String("Hello \\ World!".to_string()),
|
||||
ParenC,
|
||||
BraceC,
|
||||
BraceC,
|
||||
],
|
||||
)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,6 @@
|
|||
mod alloc;
|
||||
mod lex;
|
||||
mod parse;
|
||||
mod string;
|
||||
|
||||
pub fn run_program(program: &str) {
|
||||
let lexer = lex::Lexer::lex(program);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue