all lex tests pass 🎉

nora 2021-10-07 20:22:44 +02:00
parent 1bd999bb9b
commit 1ed076a5d3
4 changed files with 256 additions and 24 deletions

View file

@ -58,6 +58,14 @@ Comments using `#`
# hi!
```
Multiline comments using `##` until `##`
```
##
hi
comment
##
```
There are many native functions that can easily be customized, added, or removed by the host
```

View file

@ -1,3 +1,5 @@
#![allow(dead_code)]
use std::rc::Rc;
pub struct Alloc {
@ -9,7 +11,8 @@ pub enum Object {
}
/// Reference to an interned String
struct IStr {
#[derive(Debug)]
pub struct IStr {
/// This will be changed to a raw pointer once a tracing GC is implemented
data: Rc<str>,
hash: u64,
@ -27,13 +30,5 @@ mod table {
#[derive(Debug, Default)]
struct StringHashBuilder;
impl std::hash::BuildHasher for StringHashBuilder {
    type Hasher = ();
    fn build_hasher(&self) -> Self::Hasher {
        todo!()
    }
}
struct PrimitveHasher {}
}
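
For context on where this table is headed: since `IStr` caches its `hash`, a working replacement for the removed `todo!()` stub could simply pass that precomputed value through instead of hashing the string again. A minimal sketch under that assumption — the `PrecomputedHasher` name and the `write_u64` convention are illustrative, not taken from the diff:
```
use std::hash::{BuildHasher, Hasher};

/// Pass-through hasher for keys that already carry a precomputed hash.
#[derive(Default)]
struct PrecomputedHasher(u64);

impl Hasher for PrecomputedHasher {
    fn finish(&self) -> u64 {
        self.0
    }
    fn write(&mut self, _bytes: &[u8]) {
        unreachable!("keys are expected to hash themselves via write_u64");
    }
    fn write_u64(&mut self, hash: u64) {
        self.0 = hash;
    }
}

#[derive(Debug, Default)]
struct StringHashBuilder;

impl BuildHasher for StringHashBuilder {
    type Hasher = PrecomputedHasher;
    fn build_hasher(&self) -> Self::Hasher {
        PrecomputedHasher::default()
    }
}
```
`IStr`'s `Hash` impl would then forward its cached `hash` field through `state.write_u64(self.hash)`.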

View file

@ -1,5 +1,3 @@
#![allow(dead_code)]
use std::iter::Peekable;
use std::str::CharIndices;
@ -55,7 +53,7 @@ pub enum TokenType<'code> {
Or,
Not,
// literals
String(&'code str),
String(String),
Number(f64),
// ident
Ident(&'code str),
@ -153,6 +151,25 @@ impl<'code> Iterator for Lexer<'code> {
let (start, char) = self.code.next()?;
match char {
_ if char.is_whitespace() => {}
'#' => {
    // only peek here, so we don't consume the \n when the # sits at the end of a line
    if let Some((_, '#')) = self.code.peek() {
        let _ = self.code.next();
        // multiline comment: runs until two reads in a row both return '#' (or EOF)
        loop {
            if let Some((_, '#')) | None = self.code.next() {
                if let Some((_, '#')) | None = self.code.next() {
                    break;
                }
            }
        }
    } else {
        // line comment: skip everything up to the end of the line
        loop {
            if let Some((_, '\n')) | None = self.code.next() {
                break;
            }
        }
    }
}
'+' => break Token::single_span(start, TokenType::Plus),
'-' => break Token::single_span(start, TokenType::Minus),
'*' => break Token::single_span(start, TokenType::Asterisk),
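
The nested `if let`s in the multiline-comment arm above encode a small scan: keep reading until two reads in a row both return `#`, or the input runs out. The same idea as a standalone sketch, with a hypothetical helper name:
```
fn skip_multiline_comment(chars: &mut impl Iterator<Item = char>) {
    loop {
        // a lone '#' only terminates the comment if the very next char is also '#'
        if let Some('#') | None = chars.next() {
            if let Some('#') | None = chars.next() {
                break;
            }
        }
    }
}
```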
@ -202,12 +219,16 @@ impl<'code> Iterator for Lexer<'code> {
);
}
'"' => {
let mut buffer = String::new();
let mut escaped = false;
let end = loop {
match self.code.next() {
Some((end, '"')) if !escaped => break end,
Some((_, '\\')) if !escaped => escaped = true,
Some((_, _)) => escaped = false,
Some((_, char)) => {
escaped = false;
buffer.push(char);
}
None => {
return Some(Err(LexError(
"reached EOF expecting '\"'".to_string(),
@ -215,10 +236,7 @@ impl<'code> Iterator for Lexer<'code> {
}
}
};
break Token::new(
Span::new(start, end - start),
TokenType::String(&self.src[start + 1..end]),
);
break Token::new(Span::new(start, end - start), TokenType::String(buffer));
}
char => {
if char.is_ascii_digit() {
@ -250,6 +268,23 @@ impl<'code> Iterator for Lexer<'code> {
}
Err(err) => return Some(Err(err)),
}
} else if is_valid_ident_start(char) {
    // it must be an identifier
    let end = loop {
        match self.code.peek() {
            Some((_, char)) if is_valid_ident_part(*char) => {
                let _ = self.code.next(); // consume identifier part
            }
            Some((end, _)) => break *end,
            None => break self.src.len(),
        }
    };
    break Token::new(
        // Span::new takes (start, length), matching the string case above
        Span::new(start, end - start),
        keyword_or_ident(&self.src[start..end]),
    );
} else {
    return Some(Err(LexError(format!("Invalid character: {}", char))));
}
}
}
@ -259,8 +294,54 @@ impl<'code> Iterator for Lexer<'code> {
}
}
fn keyword_or_ident(name: &str) -> TokenType {
    // kept efficient using the trie pattern: dispatch on the first byte, then
    // compare only the remaining bytes of the keywords that can still match.
    // We can ignore that unicode exists, because all keywords are ASCII, but we
    // have to match on bytes instead of indexing into the string directly to avoid panics.
    let bs = name.as_bytes();
    let len = bs.len();
    // there are no single-letter keywords
    if len < 2 {
        return TokenType::Ident(name);
    }
    match bs[0] {
        // loop && let
        b'l' => match bs[1] {
            b'o' if len == 4 && bs[2..4] == *b"op" => TokenType::Loop,
            b'e' if len == 3 && bs[2] == b't' => TokenType::Let,
            _ => TokenType::Ident(name),
        },
        // for && fn && false
        b'f' => match bs[1] {
            b'n' if len == 2 => TokenType::Fn,
            b'o' if len == 3 && bs[2] == b'r' => TokenType::For,
            b'a' if len == 5 && bs[2..5] == *b"lse" => TokenType::False,
            _ => TokenType::Ident(name),
        },
        // if
        b'i' if len == 2 && bs[1] == b'f' => TokenType::If,
        // else
        b'e' if len == 4 && bs[1..4] == *b"lse" => TokenType::Else,
        // while
        b'w' if len == 5 && bs[1..5] == *b"hile" => TokenType::While,
        // true
        b't' if len == 4 && bs[1..4] == *b"rue" => TokenType::True,
        // null && not
        b'n' => match bs[1] {
            b'u' if len == 4 && bs[2..4] == *b"ll" => TokenType::Null,
            b'o' if len == 3 && bs[2] == b't' => TokenType::Not,
            _ => TokenType::Ident(name),
        },
        // and
        b'a' if len == 3 && bs[1..3] == *b"nd" => TokenType::And,
        // or
        b'o' if len == 2 && bs[1] == b'r' => TokenType::Or,
        _ => TokenType::Ident(name),
    }
}
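
For comparison, the trie is behaviorally identical to a plain whole-string match; a sketch of that naive version (hypothetical name), covering the same thirteen keywords the tests exercise below:
```
fn keyword_or_ident_naive(name: &str) -> TokenType {
    match name {
        "let" => TokenType::Let,
        "fn" => TokenType::Fn,
        "if" => TokenType::If,
        "else" => TokenType::Else,
        "loop" => TokenType::Loop,
        "while" => TokenType::While,
        "for" => TokenType::For,
        "true" => TokenType::True,
        "false" => TokenType::False,
        "null" => TokenType::Null,
        "and" => TokenType::And,
        "not" => TokenType::Not,
        "or" => TokenType::Or,
        _ => TokenType::Ident(name),
    }
}
```
As written, the hand-rolled trie inspects at most two or three bytes before rejecting a non-keyword, while the naive match may compare against several full keyword strings.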
fn is_valid_ident_part(char: char) -> bool {
    char.is_alphanumeric()
    char.is_alphanumeric() || char == '_'
}
fn is_valid_ident_start(char: char) -> bool {
@ -275,6 +356,8 @@ mod test {
use crate::lex::Lexer;
use crate::lex::TokenType::{self, *};
type StdString = std::string::String;
fn lex_types(str: &str) -> Vec<TokenType> {
let lexer = Lexer::lex(str);
lexer.map(|token| token.unwrap().kind).collect::<Vec<_>>()
@ -338,6 +421,44 @@ mod test {
)
}
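
The `lex_test` helper used throughout is defined outside these hunks; judging by `lex_types` above, it presumably amounts to:
```
fn lex_test(str: &str, expected: Vec<TokenType>) {
    assert_eq!(lex_types(str), expected);
}
```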
#[test]
fn comments() {
lex_test("fn # fn", vec![Fn]);
}
#[test]
fn long_multiline_comment() {
lex_test(
"fn ## hello i am something
i span multiple lines
will you love me? 🥺🥺🥺🥺🥺
pls :) o(**)
i like the indentation here ngl | sneak for -> ## for ## <- sneak for
## and",
vec![Fn, For, And],
)
}
#[test]
fn terminate_multiline_comment_correctly() {
lex_test(
"fn ## # no not here :( ## let # ## <- this is commented out
# so no multiline comment
##
here it starts
# let #
# # and
## or
",
vec![Fn, Let, Or],
)
}
#[test]
fn greeting() {
lex_test("-.- /%", vec![Minus, Dot, Minus, Slash, Percent])
@ -378,7 +499,7 @@ mod test {
#[test]
fn string() {
lex_test(r#""uwu""#, vec![String("uwu")])
lex_test(r#""uwu""#, vec![String("uwu".to_string())])
}
#[test]
@ -387,12 +508,121 @@ mod test {
r#"( "hi" "uwu" "\"uwu\"" "no \\ u" )"#,
vec![
ParenO,
String("hi"),
String("uwu"),
String("\"uwu\""),
String("no \\ u"),
String("hi".to_string()),
String("uwu".to_string()),
String("\"uwu\"".to_string()),
String("no \\ u".to_string()),
ParenC,
],
)
}
#[test]
fn keywords() {
lex_test(
"let fn if else loop while for true false null and not or",
vec![
Let, Fn, If, Else, Loop, While, For, True, False, Null, And, Not, Or,
],
)
}
#[test]
fn keyword_and_ident() {
lex_test(
"let variable be a loop if false is true",
vec![
Let,
Ident("variable"),
Ident("be"),
Ident("a"),
Loop,
If,
False,
Ident("is"),
True,
],
)
}
#[test]
fn not_quite_a_keyword() {
let words = [
"letter",
"fori",
"fnfn",
"iffy",
"bloop",
"loopy_yeah",
"whileTrue",
"truefalse",
"falsetrue",
"nullability",
"rot",
"ornot",
"nor",
"andnowQuestionMark",
"notOrAnd",
];
let sentences = words
.iter()
.map(|word| format!("{} ", word))
.collect::<StdString>();
let expected = words.map(TokenType::Ident).to_vec();
lex_test(&sentences, expected)
}
#[test]
fn serious_program() {
lex_test(
r#"let string = "hallol"
let number = 5
let me out ._.
fn world() {
if number == 5 or true == false and not false {
print("Hello \\ World!")
}
}"#,
vec![
Let,
Ident("string"),
Equal,
String("hallol".to_string()),
Let,
Ident("number"),
Equal,
Number(5.0),
Let,
Ident("me"),
Ident("out"),
Dot,
Ident("_"),
Dot,
Fn,
Ident("world"),
ParenO,
ParenC,
BraceO,
If,
Ident("number"),
EqualEqual,
Number(5.0),
Or,
True,
EqualEqual,
False,
And,
Not,
False,
BraceO,
Ident("print"),
ParenO,
String("Hello \\ World!".to_string()),
ParenC,
BraceC,
BraceC,
],
)
}
}

View file

@ -1,7 +1,6 @@
mod alloc;
mod lex;
mod parse;
mod string;
pub fn run_program(program: &str) {
let lexer = lex::Lexer::lex(program);