all lex tests pass 🎉

nora 2021-10-07 20:22:44 +02:00
parent 1bd999bb9b
commit 1ed076a5d3
4 changed files with 256 additions and 24 deletions

View file

@ -58,6 +58,14 @@ Comments using `#`
# hi!
```
Multiline comments using `##` until `##`
```
##
hi
comment
##
```
There are many native functions that can easily be customized, added, or removed by the host
```

View file

@ -1,3 +1,5 @@
#![allow(dead_code)]
use std::rc::Rc;
pub struct Alloc {
@ -9,7 +11,8 @@ pub enum Object {
}
/// Reference to an interned String
struct IStr {
#[derive(Debug)]
pub struct IStr {
/// This will be changed to a raw pointer once a tracing GC is implemented
data: Rc<str>,
hash: u64,
@ -27,13 +30,5 @@ mod table {
#[derive(Debug, Default)]
struct StringHashBuilder;
impl std::hash::BuildHasher for StringHashBuilder {
    type Hasher = ();
    fn build_hasher(&self) -> Self::Hasher {
        todo!()
    }
}
struct PrimitveHasher {}
}
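
For context on where this table is headed: since `IStr` caches its `hash`, a working replacement for the removed `todo!()` stub could simply pass that precomputed value through instead of hashing the string again. A minimal sketch under that assumption — the `PrecomputedHasher` name and the `write_u64` convention are illustrative, not taken from the diff:
```
use std::hash::{BuildHasher, Hasher};

/// Pass-through hasher for keys that already carry a precomputed hash.
#[derive(Default)]
struct PrecomputedHasher(u64);

impl Hasher for PrecomputedHasher {
    fn finish(&self) -> u64 {
        self.0
    }
    fn write(&mut self, _bytes: &[u8]) {
        unreachable!("keys are expected to hash themselves via write_u64");
    }
    fn write_u64(&mut self, hash: u64) {
        self.0 = hash;
    }
}

#[derive(Debug, Default)]
struct StringHashBuilder;

impl BuildHasher for StringHashBuilder {
    type Hasher = PrecomputedHasher;
    fn build_hasher(&self) -> Self::Hasher {
        PrecomputedHasher::default()
    }
}
```
`IStr`'s `Hash` impl would then forward its cached `hash` field through `state.write_u64(self.hash)`.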

View file

@ -1,5 +1,3 @@
#![allow(dead_code)]
use std::iter::Peekable;
use std::str::CharIndices;
@ -55,7 +53,7 @@ pub enum TokenType<'code> {
Or,
Not,
// literals
String(&'code str),
String(String),
Number(f64),
// ident
Ident(&'code str),
@ -153,6 +151,25 @@ impl<'code> Iterator for Lexer<'code> {
let (start, char) = self.code.next()?;
match char {
_ if char.is_whitespace() => {}
'#' => {
    // only peek here, so we don't consume the \n when the # sits at the end of a line
    if let Some((_, '#')) = self.code.peek() {
        let _ = self.code.next();
        // multiline comment: runs until two reads in a row both return '#' (or EOF)
        loop {
            if let Some((_, '#')) | None = self.code.next() {
                if let Some((_, '#')) | None = self.code.next() {
                    break;
                }
            }
        }
    } else {
        // line comment: skip everything up to the end of the line
        loop {
            if let Some((_, '\n')) | None = self.code.next() {
                break;
            }
        }
    }
}
'+' => break Token::single_span(start, TokenType::Plus),
'-' => break Token::single_span(start, TokenType::Minus),
'*' => break Token::single_span(start, TokenType::Asterisk),
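
The nested `if let`s in the multiline-comment arm above encode a small scan: keep reading until two reads in a row both return `#`, or the input runs out. The same idea as a standalone sketch, with a hypothetical helper name:
```
fn skip_multiline_comment(chars: &mut impl Iterator<Item = char>) {
    loop {
        // a lone '#' only terminates the comment if the very next char is also '#'
        if let Some('#') | None = chars.next() {
            if let Some('#') | None = chars.next() {
                break;
            }
        }
    }
}
```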
@ -202,12 +219,16 @@ impl<'code> Iterator for Lexer<'code> {
);
}
'"' => {
let mut buffer = String::new();
let mut escaped = false;
let end = loop {
match self.code.next() {
Some((end, '"')) if !escaped => break end,
Some((_, '\\')) if !escaped => escaped = true,
Some((_, _)) => escaped = false,
Some((_, char)) => {
escaped = false;
buffer.push(char);
}
None => {
return Some(Err(LexError(
"reached EOF expecting '\"'".to_string(),
@ -215,10 +236,7 @@ impl<'code> Iterator for Lexer<'code> {
}
}
};
break Token::new(
Span::new(start, end - start),
TokenType::String(&self.src[start + 1..end]),
);
break Token::new(Span::new(start, end - start), TokenType::String(buffer));
}
char => {
if char.is_ascii_digit() {
@ -250,6 +268,23 @@ impl<'code> Iterator for Lexer<'code> {
}
Err(err) => return Some(Err(err)),
}
} else if is_valid_ident_start(char) {
    // it must be an identifier
    let end = loop {
        match self.code.peek() {
            Some((_, char)) if is_valid_ident_part(*char) => {
                let _ = self.code.next(); // consume identifier part
            }
            Some((end, _)) => break *end,
            None => break self.src.len(),
        }
    };
    break Token::new(
        // Span::new takes (start, length), matching the string case above
        Span::new(start, end - start),
        keyword_or_ident(&self.src[start..end]),
    );
} else {
    return Some(Err(LexError(format!("Invalid character: {}", char))));
}
}
}
@ -259,8 +294,54 @@ impl<'code> Iterator for Lexer<'code> {
}
}
fn keyword_or_ident(name: &str) -> TokenType {
    // kept efficient using the trie pattern: dispatch on the first byte, then
    // compare only the remaining bytes of the keywords that can still match.
    // We can ignore that unicode exists, because all keywords are ASCII, but we
    // have to match on bytes instead of indexing into the string directly to avoid panics.
    let bs = name.as_bytes();
    let len = bs.len();
    // there are no single-letter keywords
    if len < 2 {
        return TokenType::Ident(name);
    }
    match bs[0] {
        // loop && let
        b'l' => match bs[1] {
            b'o' if len == 4 && bs[2..4] == *b"op" => TokenType::Loop,
            b'e' if len == 3 && bs[2] == b't' => TokenType::Let,
            _ => TokenType::Ident(name),
        },
        // for && fn && false
        b'f' => match bs[1] {
            b'n' if len == 2 => TokenType::Fn,
            b'o' if len == 3 && bs[2] == b'r' => TokenType::For,
            b'a' if len == 5 && bs[2..5] == *b"lse" => TokenType::False,
            _ => TokenType::Ident(name),
        },
        // if
        b'i' if len == 2 && bs[1] == b'f' => TokenType::If,
        // else
        b'e' if len == 4 && bs[1..4] == *b"lse" => TokenType::Else,
        // while
        b'w' if len == 5 && bs[1..5] == *b"hile" => TokenType::While,
        // true
        b't' if len == 4 && bs[1..4] == *b"rue" => TokenType::True,
        // null && not
        b'n' => match bs[1] {
            b'u' if len == 4 && bs[2..4] == *b"ll" => TokenType::Null,
            b'o' if len == 3 && bs[2] == b't' => TokenType::Not,
            _ => TokenType::Ident(name),
        },
        // and
        b'a' if len == 3 && bs[1..3] == *b"nd" => TokenType::And,
        // or
        b'o' if len == 2 && bs[1] == b'r' => TokenType::Or,
        _ => TokenType::Ident(name),
    }
}
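
For comparison, the trie is behaviorally identical to a plain whole-string match; a sketch of that naive version (hypothetical name), covering the same thirteen keywords the tests exercise below:
```
fn keyword_or_ident_naive(name: &str) -> TokenType {
    match name {
        "let" => TokenType::Let,
        "fn" => TokenType::Fn,
        "if" => TokenType::If,
        "else" => TokenType::Else,
        "loop" => TokenType::Loop,
        "while" => TokenType::While,
        "for" => TokenType::For,
        "true" => TokenType::True,
        "false" => TokenType::False,
        "null" => TokenType::Null,
        "and" => TokenType::And,
        "not" => TokenType::Not,
        "or" => TokenType::Or,
        _ => TokenType::Ident(name),
    }
}
```
As written, the hand-rolled trie inspects at most two or three bytes before rejecting a non-keyword, while the naive match may compare against several full keyword strings.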
fn is_valid_ident_part(char: char) -> bool {
    char.is_alphanumeric()
    char.is_alphanumeric() || char == '_'
}
fn is_valid_ident_start(char: char) -> bool {
@ -275,6 +356,8 @@ mod test {
use crate::lex::Lexer;
use crate::lex::TokenType::{self, *};
type StdString = std::string::String;
fn lex_types(str: &str) -> Vec<TokenType> {
let lexer = Lexer::lex(str);
lexer.map(|token| token.unwrap().kind).collect::<Vec<_>>()
@ -338,6 +421,44 @@ mod test {
)
}
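
The `lex_test` helper used throughout is defined outside these hunks; judging by `lex_types` above, it presumably amounts to:
```
fn lex_test(str: &str, expected: Vec<TokenType>) {
    assert_eq!(lex_types(str), expected);
}
```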
#[test]
fn comments() {
lex_test("fn # fn", vec![Fn]);
}
#[test]
fn long_multiline_comment() {
lex_test(
"fn ## hello i am something
i span multiple lines
will you love me? 🥺🥺🥺🥺🥺
pls :) o(**)
i like the indentation here ngl | sneak for -> ## for ## <- sneak for
## and",
vec![Fn, For, And],
)
}
#[test]
fn terminate_multiline_comment_correctly() {
lex_test(
"fn ## # no not here :( ## let # ## <- this is commented out
# so no multiline comment
##
here it starts
# let #
# # and
## or
",
vec![Fn, Let, Or],
)
}
#[test]
fn greeting() {
lex_test("-.- /%", vec![Minus, Dot, Minus, Slash, Percent])
@ -378,7 +499,7 @@ mod test {
#[test]
fn string() {
lex_test(r#""uwu""#, vec![String("uwu")])
lex_test(r#""uwu""#, vec![String("uwu".to_string())])
}
#[test]
@ -387,12 +508,121 @@ mod test {
r#"( "hi" "uwu" "\"uwu\"" "no \\ u" )"#,
vec![
ParenO,
String("hi"),
String("uwu"),
String("\"uwu\""),
String("no \\ u"),
String("hi".to_string()),
String("uwu".to_string()),
String("\"uwu\"".to_string()),
String("no \\ u".to_string()),
ParenC,
],
)
}
#[test]
fn keywords() {
lex_test(
"let fn if else loop while for true false null and not or",
vec![
Let, Fn, If, Else, Loop, While, For, True, False, Null, And, Not, Or,
],
)
}
#[test]
fn keyword_and_ident() {
lex_test(
"let variable be a loop if false is true",
vec![
Let,
Ident("variable"),
Ident("be"),
Ident("a"),
Loop,
If,
False,
Ident("is"),
True,
],
)
}
#[test]
fn not_quite_a_keyword() {
let words = [
"letter",
"fori",
"fnfn",
"iffy",
"bloop",
"loopy_yeah",
"whileTrue",
"truefalse",
"falsetrue",
"nullability",
"rot",
"ornot",
"nor",
"andnowQuestionMark",
"notOrAnd",
];
let sentences = words
.iter()
.map(|word| format!("{} ", word))
.collect::<StdString>();
let expected = words.map(TokenType::Ident).to_vec();
lex_test(&sentences, expected)
}
#[test]
fn serious_program() {
lex_test(
r#"let string = "hallol"
let number = 5
let me out ._.
fn world() {
if number == 5 or true == false and not false {
print("Hello \\ World!")
}
}"#,
vec![
Let,
Ident("string"),
Equal,
String("hallol".to_string()),
Let,
Ident("number"),
Equal,
Number(5.0),
Let,
Ident("me"),
Ident("out"),
Dot,
Ident("_"),
Dot,
Fn,
Ident("world"),
ParenO,
ParenC,
BraceO,
If,
Ident("number"),
EqualEqual,
Number(5.0),
Or,
True,
EqualEqual,
False,
And,
Not,
False,
BraceO,
Ident("print"),
ParenO,
String("Hello \\ World!".to_string()),
ParenC,
BraceC,
BraceC,
],
)
}
}

View file

@ -1,7 +1,6 @@
mod alloc;
mod lex;
mod parse;
mod string;
pub fn run_program(program: &str) {
let lexer = lex::Lexer::lex(program);